@@ -6072,11 +6072,6 @@ struct Luv2RGB_b
6072
6072
v_scale = vdupq_n_f32 (255 .f );
6073
6073
v_alpha = vdup_n_u8 (ColorChannel<uchar>::max ());
6074
6074
#elif CV_SSE2
6075
- v_scale_inv = _mm_set1_ps (100 .f /255 .f );
6076
- v_coeff1 = _mm_set1_ps (1 .388235294117647f );
6077
- v_coeff2 = _mm_set1_ps (1 .027450980392157f );
6078
- v_134 = _mm_set1_ps (134 .f );
6079
- v_140 = _mm_set1_ps (140 .f );
6080
6075
v_scale = _mm_set1_ps (255 .f );
6081
6076
v_zero = _mm_setzero_si128 ();
6082
6077
haveSIMD = checkHardwareSupport (CV_CPU_SSE2);
@@ -6086,6 +6081,7 @@ struct Luv2RGB_b
6086
6081
#if CV_SSE2
6087
6082
// 16s x 8
6088
6083
void process (__m128i v_l, __m128i v_u, __m128i v_v,
6084
+ __m128 v_coeffs, __m128 v_res,
6089
6085
float * buf) const
6090
6086
{
6091
6087
__m128 v_l0 = _mm_cvtepi32_ps (_mm_unpacklo_epi16 (v_l, v_zero));
@@ -6096,15 +6092,26 @@ struct Luv2RGB_b
6096
6092
__m128 v_u1 = _mm_cvtepi32_ps (_mm_unpackhi_epi16 (v_u, v_zero));
6097
6093
__m128 v_v1 = _mm_cvtepi32_ps (_mm_unpackhi_epi16 (v_v, v_zero));
6098
6094
6099
- v_l0 = _mm_mul_ps (v_l0, v_scale_inv);
6100
- v_l1 = _mm_mul_ps (v_l1, v_scale_inv);
6095
+ v_l0 = _mm_mul_ps (v_l0, v_coeffs);
6096
+ v_u1 = _mm_mul_ps (v_u1, v_coeffs);
6097
+ v_l0 = _mm_sub_ps (v_l0, v_res);
6098
+ v_u1 = _mm_sub_ps (v_u1, v_res);
6099
+
6100
+ v_coeffs = _mm_castsi128_ps (_mm_shuffle_epi32 (_mm_castps_si128 (v_coeffs), 0x49 ));
6101
+ v_res = _mm_castsi128_ps (_mm_shuffle_epi32 (_mm_castps_si128 (v_res), 0x49 ));
6101
6102
6102
- v_u0 = _mm_sub_ps ( _mm_mul_ps (v_u0, v_coeff1), v_134 );
6103
- v_u1 = _mm_sub_ps ( _mm_mul_ps (v_u1, v_coeff1), v_134 );
6104
- v_v0 = _mm_sub_ps (_mm_mul_ps (v_v0, v_coeff2), v_140 );
6105
- v_v1 = _mm_sub_ps (_mm_mul_ps (v_v1, v_coeff2), v_140 );
6103
+ v_l1 = _mm_mul_ps (v_l1, v_coeffs );
6104
+ v_v0 = _mm_mul_ps (v_v0, v_coeffs );
6105
+ v_l1 = _mm_sub_ps (v_l1, v_res );
6106
+ v_v0 = _mm_sub_ps (v_v0, v_res );
6106
6107
6107
- _mm_interleave_ps (v_l0, v_l1, v_u0, v_u1, v_v0, v_v1);
6108
+ v_coeffs = _mm_castsi128_ps (_mm_shuffle_epi32 (_mm_castps_si128 (v_coeffs), 0x49 ));
6109
+ v_res = _mm_castsi128_ps (_mm_shuffle_epi32 (_mm_castps_si128 (v_res), 0x49 ));
6110
+
6111
+ v_u0 = _mm_mul_ps (v_u0, v_coeffs);
6112
+ v_v1 = _mm_mul_ps (v_v1, v_coeffs);
6113
+ v_u0 = _mm_sub_ps (v_u0, v_res);
6114
+ v_v1 = _mm_sub_ps (v_v1, v_res);
6108
6115
6109
6116
_mm_store_ps (buf, v_l0);
6110
6117
_mm_store_ps (buf + 4 , v_l1);
@@ -6121,6 +6128,11 @@ struct Luv2RGB_b
6121
6128
uchar alpha = ColorChannel<uchar>::max ();
6122
6129
float CV_DECL_ALIGNED (16 ) buf[3 *BLOCK_SIZE];
6123
6130
6131
+ #if CV_SSE2
6132
+ __m128 v_coeffs = _mm_set_ps (100 .f /255 .f , 1 .027450980392157f , 1 .388235294117647f , 100 .f /255 .f );
6133
+ __m128 v_res = _mm_set_ps (0 .f , 140 .f , 134 .f , 0 .f );
6134
+ #endif
6135
+
6124
6136
for ( i = 0 ; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
6125
6137
{
6126
6138
int dn = std::min (n - i, (int )BLOCK_SIZE);
@@ -6148,36 +6160,16 @@ struct Luv2RGB_b
6148
6160
#elif CV_SSE2
6149
6161
if (haveSIMD)
6150
6162
{
6151
- for ( ; j <= (dn - 32 ) * 3 ; j += 96 )
6163
+ for ( ; j <= (dn - 8 ) * 3 ; j += 24 )
6152
6164
{
6153
- __m128i v_r0 = _mm_loadu_si128 ((__m128i const *)(src + j));
6154
- __m128i v_r1 = _mm_loadu_si128 ((__m128i const *)(src + j + 16 ));
6155
- __m128i v_g0 = _mm_loadu_si128 ((__m128i const *)(src + j + 32 ));
6156
- __m128i v_g1 = _mm_loadu_si128 ((__m128i const *)(src + j + 48 ));
6157
- __m128i v_b0 = _mm_loadu_si128 ((__m128i const *)(src + j + 64 ));
6158
- __m128i v_b1 = _mm_loadu_si128 ((__m128i const *)(src + j + 80 ));
6165
+ __m128i v_src0 = _mm_loadu_si128 ((__m128i const *)(src + j));
6166
+ __m128i v_src1 = _mm_loadl_epi64 ((__m128i const *)(src + j + 16 ));
6159
6167
6160
- _mm_deinterleave_epi8 (v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
6161
-
6162
- process (_mm_unpacklo_epi8 (v_r0, v_zero),
6163
- _mm_unpacklo_epi8 (v_g0, v_zero),
6164
- _mm_unpacklo_epi8 (v_b0, v_zero),
6168
+ process (_mm_unpacklo_epi8 (v_src0, v_zero),
6169
+ _mm_unpackhi_epi8 (v_src0, v_zero),
6170
+ _mm_unpacklo_epi8 (v_src1, v_zero),
6171
+ v_coeffs, v_res,
6165
6172
buf + j);
6166
-
6167
- process (_mm_unpackhi_epi8 (v_r0, v_zero),
6168
- _mm_unpackhi_epi8 (v_g0, v_zero),
6169
- _mm_unpackhi_epi8 (v_b0, v_zero),
6170
- buf + j + 24 );
6171
-
6172
- process (_mm_unpacklo_epi8 (v_r1, v_zero),
6173
- _mm_unpacklo_epi8 (v_g1, v_zero),
6174
- _mm_unpacklo_epi8 (v_b1, v_zero),
6175
- buf + j + 48 );
6176
-
6177
- process (_mm_unpackhi_epi8 (v_r1, v_zero),
6178
- _mm_unpackhi_epi8 (v_g1, v_zero),
6179
- _mm_unpackhi_epi8 (v_b1, v_zero),
6180
- buf + j + 72 );
6181
6173
}
6182
6174
}
6183
6175
#endif
@@ -6261,7 +6253,7 @@ struct Luv2RGB_b
6261
6253
float32x4_t v_scale, v_scale_inv, v_coeff1, v_coeff2, v_134, v_140;
6262
6254
uint8x8_t v_alpha;
6263
6255
#elif CV_SSE2
6264
- __m128 v_scale, v_scale_inv, v_coeff1, v_coeff2, v_134, v_140 ;
6256
+ __m128 v_scale;
6265
6257
__m128i v_zero;
6266
6258
bool haveSIMD;
6267
6259
#endif
0 commit comments