Skip to content

Commit 4540260

Browse files
author
k-shinotsuka
committed
improve Luv2RGB_b()
1 parent dc3d0cb commit 4540260

File tree

1 file changed

+32
-40
lines changed

modules/imgproc/src/color.cpp

Lines changed: 32 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -6072,11 +6072,6 @@ struct Luv2RGB_b
60726072
v_scale = vdupq_n_f32(255.f);
60736073
v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
60746074
#elif CV_SSE2
6075-
v_scale_inv = _mm_set1_ps(100.f/255.f);
6076-
v_coeff1 = _mm_set1_ps(1.388235294117647f);
6077-
v_coeff2 = _mm_set1_ps(1.027450980392157f);
6078-
v_134 = _mm_set1_ps(134.f);
6079-
v_140 = _mm_set1_ps(140.f);
60806075
v_scale = _mm_set1_ps(255.f);
60816076
v_zero = _mm_setzero_si128();
60826077
haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
@@ -6086,6 +6081,7 @@ struct Luv2RGB_b
60866081
#if CV_SSE2
60876082
// 16s x 8
60886083
void process(__m128i v_l, __m128i v_u, __m128i v_v,
6084+
__m128 v_coeffs, __m128 v_res,
60896085
float * buf) const
60906086
{
60916087
__m128 v_l0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_l, v_zero));
@@ -6096,15 +6092,26 @@ struct Luv2RGB_b
60966092
__m128 v_u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_u, v_zero));
60976093
__m128 v_v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_v, v_zero));
60986094

6099-
v_l0 = _mm_mul_ps(v_l0, v_scale_inv);
6100-
v_l1 = _mm_mul_ps(v_l1, v_scale_inv);
6095+
v_l0 = _mm_mul_ps(v_l0, v_coeffs);
6096+
v_u1 = _mm_mul_ps(v_u1, v_coeffs);
6097+
v_l0 = _mm_sub_ps(v_l0, v_res);
6098+
v_u1 = _mm_sub_ps(v_u1, v_res);
6099+
6100+
v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x49));
6101+
v_res = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_res), 0x49));
61016102

6102-
v_u0 = _mm_sub_ps(_mm_mul_ps(v_u0, v_coeff1), v_134);
6103-
v_u1 = _mm_sub_ps(_mm_mul_ps(v_u1, v_coeff1), v_134);
6104-
v_v0 = _mm_sub_ps(_mm_mul_ps(v_v0, v_coeff2), v_140);
6105-
v_v1 = _mm_sub_ps(_mm_mul_ps(v_v1, v_coeff2), v_140);
6103+
v_l1 = _mm_mul_ps(v_l1, v_coeffs);
6104+
v_v0 = _mm_mul_ps(v_v0, v_coeffs);
6105+
v_l1 = _mm_sub_ps(v_l1, v_res);
6106+
v_v0 = _mm_sub_ps(v_v0, v_res);
61066107

6107-
_mm_interleave_ps(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1);
6108+
v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x49));
6109+
v_res = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_res), 0x49));
6110+
6111+
v_u0 = _mm_mul_ps(v_u0, v_coeffs);
6112+
v_v1 = _mm_mul_ps(v_v1, v_coeffs);
6113+
v_u0 = _mm_sub_ps(v_u0, v_res);
6114+
v_v1 = _mm_sub_ps(v_v1, v_res);
61086115

61096116
_mm_store_ps(buf, v_l0);
61106117
_mm_store_ps(buf + 4, v_l1);
@@ -6121,6 +6128,11 @@ struct Luv2RGB_b
61216128
uchar alpha = ColorChannel<uchar>::max();
61226129
float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
61236130

6131+
#if CV_SSE2
6132+
__m128 v_coeffs = _mm_set_ps(100.f/255.f, 1.027450980392157f, 1.388235294117647f, 100.f/255.f);
6133+
__m128 v_res = _mm_set_ps(0.f, 140.f, 134.f, 0.f);
6134+
#endif
6135+
61246136
for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
61256137
{
61266138
int dn = std::min(n - i, (int)BLOCK_SIZE);
@@ -6148,36 +6160,16 @@ struct Luv2RGB_b
61486160
#elif CV_SSE2
61496161
if (haveSIMD)
61506162
{
6151-
for ( ; j <= (dn - 32) * 3; j += 96)
6163+
for ( ; j <= (dn - 8) * 3; j += 24)
61526164
{
6153-
__m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j));
6154-
__m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16));
6155-
__m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32));
6156-
__m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48));
6157-
__m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64));
6158-
__m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80));
6165+
__m128i v_src0 = _mm_loadu_si128((__m128i const *)(src + j));
6166+
__m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src + j + 16));
61596167

6160-
_mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
6161-
6162-
process(_mm_unpacklo_epi8(v_r0, v_zero),
6163-
_mm_unpacklo_epi8(v_g0, v_zero),
6164-
_mm_unpacklo_epi8(v_b0, v_zero),
6168+
process(_mm_unpacklo_epi8(v_src0, v_zero),
6169+
_mm_unpackhi_epi8(v_src0, v_zero),
6170+
_mm_unpacklo_epi8(v_src1, v_zero),
6171+
v_coeffs, v_res,
61656172
buf + j);
6166-
6167-
process(_mm_unpackhi_epi8(v_r0, v_zero),
6168-
_mm_unpackhi_epi8(v_g0, v_zero),
6169-
_mm_unpackhi_epi8(v_b0, v_zero),
6170-
buf + j + 24);
6171-
6172-
process(_mm_unpacklo_epi8(v_r1, v_zero),
6173-
_mm_unpacklo_epi8(v_g1, v_zero),
6174-
_mm_unpacklo_epi8(v_b1, v_zero),
6175-
buf + j + 48);
6176-
6177-
process(_mm_unpackhi_epi8(v_r1, v_zero),
6178-
_mm_unpackhi_epi8(v_g1, v_zero),
6179-
_mm_unpackhi_epi8(v_b1, v_zero),
6180-
buf + j + 72);
61816173
}
61826174
}
61836175
#endif
@@ -6261,7 +6253,7 @@ struct Luv2RGB_b
62616253
float32x4_t v_scale, v_scale_inv, v_coeff1, v_coeff2, v_134, v_140;
62626254
uint8x8_t v_alpha;
62636255
#elif CV_SSE2
6264-
__m128 v_scale, v_scale_inv, v_coeff1, v_coeff2, v_134, v_140;
6256+
__m128 v_scale;
62656257
__m128i v_zero;
62666258
bool haveSIMD;
62676259
#endif

0 commit comments

Comments
 (0)