Skip to content

Commit f8f4afe

Browse files
Optimize vector8_has_le() on AArch64.
Presently, the SIMD implementation of this function uses unsigned saturating subtraction to find bytes less than or equal to the given value, which is a workaround for the lack of unsigned comparison instructions on some architectures. However, Neon offers vminvq_u8(), which returns the minimum (unsigned) value in the vector. This commit adds a Neon-specific implementation that uses vminvq_u8() to optimize vector8_has_le() on AArch64.

In passing, adjust the SSE2 implementation to use vector8_min() and vector8_eq() to find values less than or equal to the given value. This was the only use of vector8_ssub(), so it has been removed.

Reviewed-by: John Naylor <[email protected]>
Discussion: https://postgr.es/m/aNHDNDSHleq0ogC_%40nathan
1 parent 74b41f5 commit f8f4afe

File tree

1 file changed

+10
-27
lines changed

1 file changed

+10
-27
lines changed

src/include/port/simd.h

Lines changed: 10 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,6 @@ static inline uint32 vector8_highbit_mask(const Vector8 v);
8686
static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2);
8787
#ifndef USE_NO_SIMD
8888
static inline Vector32 vector32_or(const Vector32 v1, const Vector32 v2);
89-
static inline Vector8 vector8_ssub(const Vector8 v1, const Vector8 v2);
9089
#endif
9190

9291
/*
@@ -213,6 +212,10 @@ static inline bool
213212
vector8_has_le(const Vector8 v, const uint8 c)
214213
{
215214
bool result = false;
215+
#ifdef USE_SSE2
216+
Vector8 umin;
217+
Vector8 cmpe;
218+
#endif
216219

217220
/* pre-compute the result for assert checking */
218221
#ifdef USE_ASSERT_CHECKING
@@ -250,14 +253,12 @@ vector8_has_le(const Vector8 v, const uint8 c)
250253
}
251254
}
252255
}
253-
#else
254-
255-
/*
256-
* Use saturating subtraction to find bytes <= c, which will present as
257-
* NUL bytes. This approach is a workaround for the lack of unsigned
258-
* comparison instructions on some architectures.
259-
*/
260-
result = vector8_has_zero(vector8_ssub(v, vector8_broadcast(c)));
256+
#elif defined(USE_SSE2)
257+
umin = vector8_min(v, vector8_broadcast(c));
258+
cmpe = vector8_eq(umin, v);
259+
result = vector8_is_highbit_set(cmpe);
260+
#elif defined(USE_NEON)
261+
result = vminvq_u8(v) <= c;
261262
#endif
262263

263264
Assert(assert_result == result);
@@ -358,24 +359,6 @@ vector32_or(const Vector32 v1, const Vector32 v2)
358359
}
359360
#endif /* ! USE_NO_SIMD */
360361

361-
/*
362-
* Return the result of subtracting the respective elements of the input
363-
* vectors using saturation (i.e., if the operation would yield a value less
364-
* than zero, zero is returned instead). For more information on saturation
365-
* arithmetic, see https://en.wikipedia.org/wiki/Saturation_arithmetic
366-
*/
367-
#ifndef USE_NO_SIMD
368-
static inline Vector8
369-
vector8_ssub(const Vector8 v1, const Vector8 v2)
370-
{
371-
#ifdef USE_SSE2
372-
return _mm_subs_epu8(v1, v2);
373-
#elif defined(USE_NEON)
374-
return vqsubq_u8(v1, v2);
375-
#endif
376-
}
377-
#endif /* ! USE_NO_SIMD */
378-
379362
/*
380363
* Return a vector with all bits set in each lane where the corresponding
381364
* lanes in the inputs are equal.

0 commit comments

Comments
 (0)