// Copyright (C) 2024 Loongson Technology Corporation Limited.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only

#include <private/qdrawhelper_loongarch64_p.h>

#ifdef QT_COMPILER_SUPPORTS_LSX

#include <private/qdrawingprimitive_lsx_p.h>
#include <private/qpaintengine_raster_p.h>

QT_BEGIN_NAMESPACE

void qt_blend_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
                                   const uchar *srcPixels, int sbpl,
                                   int w, int h,
                                   int const_alpha)
{
    const quint32 *src = (const quint32 *) srcPixels;
    quint32 *dst = (quint32 *) destPixels;
    if (const_alpha == 256) {
        for (int y = 0; y < h; ++y) {
            BLEND_SOURCE_OVER_ARGB32_LSX(dst, src, w);
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    } else if (const_alpha != 0) {
        // dest = (s + d * sia) * ca + d * cia
        //      = s * ca + d * (sia * ca + cia)
        //      = s * ca + d * (1 - sa*ca)
        const_alpha = (const_alpha * 255) >> 8;
        for (int y = 0; y < h; ++y) {
            BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(dst, src, w, const_alpha);
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    }
}

// qblendfunctions.cpp
void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl,
                             const uchar *srcPixels, int sbpl,
                             int w, int h,
                             int const_alpha);

void qt_blend_rgb32_on_rgb32_lsx(uchar *destPixels, int dbpl,
                                 const uchar *srcPixels, int sbpl,
                                 int w, int h,
                                 int const_alpha)
{
    const quint32 *src = (const quint32 *) srcPixels;
    quint32 *dst = (quint32 *) destPixels;
    if (const_alpha != 256) {
        if (const_alpha != 0) {
            const __m128i half = __lsx_vreplgr2vr_h(0x80);
            const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);

            const_alpha = (const_alpha * 255) >> 8;
            int one_minus_const_alpha = 255 - const_alpha;
            const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);
            const __m128i oneMinusConstAlpha = __lsx_vreplgr2vr_h(one_minus_const_alpha);
            for (int y = 0; y < h; ++y) {
                int x = 0;

                // First, align dest to 16 bytes:
                ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) {
                    dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
                }

                for (; x < w-3; x += 4) {
                    __m128i srcVector = __lsx_vld(&src[x], 0);
                    __m128i dstVector = __lsx_vld(&dst[x], 0);
                    INTERPOLATE_PIXEL_255_LSX(srcVector, dstVector, constAlphaVector,
                                              oneMinusConstAlpha, colorMask, half);
                    __lsx_vst(dstVector, &dst[x], 0);
                }
                SIMD_EPILOGUE(x, w, 3)
                    dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
                dst = (quint32 *)(((uchar *) dst) + dbpl);
                src = (const quint32 *)(((const uchar *) src) + sbpl);
            }
        }
    } else {
        qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
    }
}

void QT_FASTCALL comp_func_SourceOver_lsx(uint *destPixels, const uint *srcPixels,
                                          int length, uint const_alpha)
{
    Q_ASSERT(const_alpha < 256);

    const quint32 *src = (const quint32 *) srcPixels;
    quint32 *dst = (quint32 *) destPixels;

    if (const_alpha == 255) {
        BLEND_SOURCE_OVER_ARGB32_LSX(dst, src, length);
    } else {
        BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(dst, src, length, const_alpha);
    }
}
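
// The composition functions below all share one structure: a scalar prologue
// (ALIGNMENT_PROLOGUE_16BYTES) runs until dst reaches 16-byte alignment, a
// vector loop then handles four ARGB32 pixels per __m128i, and SIMD_EPILOGUE
// finishes the trailing 0-3 pixels with the same scalar per-pixel expression,
// so all three phases must agree on the per-pixel formula.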

void QT_FASTCALL comp_func_Plus_lsx(uint *dst, const uint *src, int length, uint const_alpha)
{
    int x = 0;

    if (const_alpha == 255) {
        // 1) Prologue: align destination on 16 bytes
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]);

        // 2) composition with LSX
        for (; x < length - 3; x += 4) {
            const __m128i srcVector = __lsx_vld(&src[x], 0);
            const __m128i dstVector = __lsx_vld(&dst[x], 0);

            const __m128i result = __lsx_vsadd_bu(srcVector, dstVector);
            __lsx_vst(result, &dst[x], 0);
        }

        // 3) Epilogue:
        SIMD_EPILOGUE(x, length, 3)
            dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]);
    } else {
        const int one_minus_const_alpha = 255 - const_alpha;
        const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);
        const __m128i oneMinusConstAlpha = __lsx_vreplgr2vr_h(one_minus_const_alpha);

        // 1) Prologue: align destination on 16 bytes
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha);

        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
        // 2) composition with LSX
        for (; x < length - 3; x += 4) {
            const __m128i srcVector = __lsx_vld(&src[x], 0);
            __m128i dstVector = __lsx_vld(&dst[x], 0);

            __m128i result = __lsx_vsadd_bu(srcVector, dstVector);
            INTERPOLATE_PIXEL_255_LSX(result, dstVector, constAlphaVector,
                                      oneMinusConstAlpha, colorMask, half);
            __lsx_vst(dstVector, &dst[x], 0);
        }

        // 3) Epilogue:
        SIMD_EPILOGUE(x, length, 3)
            dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha);
    }
}

void QT_FASTCALL comp_func_Source_lsx(uint *dst, const uint *src, int length, uint const_alpha)
{
    if (const_alpha == 255) {
        ::memcpy(dst, src, length * sizeof(uint));
    } else {
        const int ialpha = 255 - const_alpha;

        int x = 0;

        // 1) prologue, align on 16 bytes
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);

        // 2) interpolate pixels with LSX
        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
        const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);
        const __m128i oneMinusConstAlpha = __lsx_vreplgr2vr_h(ialpha);
        for (; x < length - 3; x += 4) {
            const __m128i srcVector = __lsx_vld(&src[x], 0);
            __m128i dstVector = __lsx_vld(&dst[x], 0);
            INTERPOLATE_PIXEL_255_LSX(srcVector, dstVector, constAlphaVector,
                                      oneMinusConstAlpha, colorMask, half);
            __lsx_vst(dstVector, &dst[x], 0);
        }

        // 3) Epilogue
        SIMD_EPILOGUE(x, length, 3)
            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
    }
}

static Q_NEVER_INLINE
void Q_DECL_VECTORCALL qt_memfillXX_aligned(void *dest, __m128i value128, quintptr bytecount)
{
    __m128i *dst128 = reinterpret_cast<__m128i *>(dest);
    __m128i *end128 = reinterpret_cast<__m128i *>(static_cast<uchar *>(dest) + bytecount);

    while (dst128 + 4 <= end128) {
        __lsx_vst(value128, dst128 + 0, 0);
        __lsx_vst(value128, dst128 + 1, 0);
        __lsx_vst(value128, dst128 + 2, 0);
        __lsx_vst(value128, dst128 + 3, 0);
        dst128 += 4;
    }

    bytecount %= 4 * sizeof(__m128i);
    switch (bytecount / sizeof(__m128i)) {
    case 3: __lsx_vst(value128, dst128++, 0); Q_FALLTHROUGH();
    case 2: __lsx_vst(value128, dst128++, 0); Q_FALLTHROUGH();
    case 1: __lsx_vst(value128, dst128++, 0);
    }
}

void qt_memfill64_lsx(quint64 *dest, quint64 value, qsizetype count)
{
    quintptr misaligned = quintptr(dest) % sizeof(__m128i);
    if (misaligned && count) {
        *dest++ = value;
        --count;
    }

    if (count % 2) {
        dest[count - 1] = value;
        --count;
    }

    qt_memfillXX_aligned(dest, __lsx_vreplgr2vr_d(value), count * sizeof(quint64));
}
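
// A quint32 pointer can be misaligned by 4, 8 or 12 bytes relative to a
// 16-byte boundary, so the 32-bit fill below may need up to three scalar
// stores before the vector body (the alignment switch falls through), plus
// up to three trailing stores, before delegating to qt_memfillXX_aligned.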

void qt_memfill32_lsx(quint32 *dest, quint32 value, qsizetype count)
{
    if (count < 4) {
        // this simplifies the code below: the first switch can fall through
        // without checking the value of count
        switch (count) {
        case 3: *dest++ = value; Q_FALLTHROUGH();
        case 2: *dest++ = value; Q_FALLTHROUGH();
        case 1: *dest = value;
        }
        return;
    }

    const int align = (quintptr)(dest) & 0xf;
    switch (align) {
    case 4:  *dest++ = value; --count; Q_FALLTHROUGH();
    case 8:  *dest++ = value; --count; Q_FALLTHROUGH();
    case 12: *dest++ = value; --count;
    }

    const int rest = count & 0x3;
    if (rest) {
        switch (rest) {
        case 3: dest[count - 3] = value; Q_FALLTHROUGH();
        case 2: dest[count - 2] = value; Q_FALLTHROUGH();
        case 1: dest[count - 1] = value;
        }
    }

    qt_memfillXX_aligned(dest, __lsx_vreplgr2vr_w(value), count * sizeof(quint32));
}

void QT_FASTCALL comp_func_solid_Source_lsx(uint *destPixels, int length, uint color, uint const_alpha)
{
    if (const_alpha == 255) {
        qt_memfill32(destPixels, color, length);
    } else {
        const quint32 ialpha = 255 - const_alpha;
        color = BYTE_MUL(color, const_alpha);
        int x = 0;

        quint32 *dst = (quint32 *) destPixels;
        const __m128i colorVector = __lsx_vreplgr2vr_w(color);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i iAlphaVector = __lsx_vreplgr2vr_h(ialpha);

        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            destPixels[x] = color + BYTE_MUL(destPixels[x], ialpha);

        for (; x < length - 3; x += 4) {
            __m128i dstVector = __lsx_vld(&dst[x], 0);
            BYTE_MUL_LSX(dstVector, iAlphaVector, colorMask, half);
            dstVector = __lsx_vadd_b(colorVector, dstVector);
            __lsx_vst(dstVector, &dst[x], 0);
        }
        SIMD_EPILOGUE(x, length, 3)
            destPixels[x] = color + BYTE_MUL(destPixels[x], ialpha);
    }
}

void QT_FASTCALL comp_func_solid_SourceOver_lsx(uint *destPixels, int length, uint color, uint const_alpha)
{
    if ((const_alpha & qAlpha(color)) == 255) {
        qt_memfill32(destPixels, color, length);
    } else {
        if (const_alpha != 255)
            color = BYTE_MUL(color, const_alpha);

        const quint32 minusAlphaOfColor = qAlpha(~color);
        int x = 0;

        quint32 *dst = (quint32 *) destPixels;
        const __m128i colorVector = __lsx_vreplgr2vr_w(color);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i minusAlphaOfColorVector = __lsx_vreplgr2vr_h(minusAlphaOfColor);

        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);

        for (; x < length - 3; x += 4) {
            __m128i dstVector = __lsx_vld(&dst[x], 0);
            BYTE_MUL_LSX(dstVector, minusAlphaOfColorVector, colorMask, half);
            dstVector = __lsx_vadd_b(colorVector, dstVector);
            __lsx_vst(dstVector, &dst[x], 0);
        }
        SIMD_EPILOGUE(x, length, 3)
            destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
    }
}
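
// The bitmap blits below expand one mask byte into per-pixel selection masks:
// the byte is broadcast to all lanes, ANDed with a single-bit-per-lane
// constant, and an additive constant moves that bit into the lane's sign bit,
// so __lsx_vslti_b(x, 0) yields 0xff exactly for set bits. E.g. for the 0x40
// bit: (s & 0x40) + 0x40 is 0x80 (negative as a signed byte) iff the bit is set.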

void qt_bitmapblit32_lsx_base(QRasterBuffer *rasterBuffer, int x, int y,
                              quint32 color,
                              const uchar *src, int width, int height, int stride)
{
    quint32 *dest = reinterpret_cast<quint32 *>(rasterBuffer->scanLine(y)) + x;
    const int destStride = rasterBuffer->stride<quint32>();

    const __m128i c128 = __lsx_vreplgr2vr_w(color);
    const __m128i maskmask1 = (__m128i)(v4u32){0x80808080, 0x40404040,
                                               0x20202020, 0x10101010};
    const __m128i maskadd1 = (__m128i)(v4i32){0x00000000, 0x40404040,
                                              0x60606060, 0x70707070};

    if (width > 4) {
        const __m128i maskmask2 = (__m128i)(v4i32){0x08080808, 0x04040404,
                                                   0x02020202, 0x01010101};
        const __m128i maskadd2 = (__m128i)(v4i32){0x78787878, 0x7c7c7c7c,
                                                  0x7e7e7e7e, 0x7f7f7f7f};
        while (height--) {
            for (int x = 0; x < width; x += 8) {
                const quint8 s = src[x >> 3];
                if (!s)
                    continue;
                __m128i mask1 = __lsx_vreplgr2vr_b(s);
                __m128i mask2 = mask1;

                mask1 = __lsx_vand_v(mask1, maskmask1);
                mask1 = __lsx_vadd_b(mask1, maskadd1);

                __m128i destSrc1 = __lsx_vld((char*)(dest + x), 0);
                mask1 = __lsx_vslti_b(mask1, 0);
                destSrc1 = __lsx_vbitsel_v(destSrc1, c128, mask1);
                __lsx_vst(destSrc1, (char*)(dest + x), 0);

                __m128i destSrc2 = __lsx_vld((char*)(dest + x + 4), 0);
                mask2 = __lsx_vand_v(mask2, maskmask2);
                mask2 = __lsx_vadd_b(mask2, maskadd2);
                mask2 = __lsx_vslti_b(mask2, 0);
                destSrc2 = __lsx_vbitsel_v(destSrc2, c128, mask2);
                __lsx_vst(destSrc2, (char*)(dest + x + 4), 0);
            }
            dest += destStride;
            src += stride;
        }
    } else {
        while (height--) {
            const quint8 s = *src;
            if (s) {
                __m128i mask1 = __lsx_vreplgr2vr_b(s);

                __m128i destSrc1 = __lsx_vld((char*)(dest), 0);
                mask1 = __lsx_vand_v(mask1, maskmask1);
                mask1 = __lsx_vadd_b(mask1, maskadd1);
                mask1 = __lsx_vslti_b(mask1, 0);
                destSrc1 = __lsx_vbitsel_v(destSrc1, c128, mask1);
                __lsx_vst(destSrc1, (char*)(dest), 0);
            }
            dest += destStride;
            src += stride;
        }
    }
}

void qt_bitmapblit32_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                         const QRgba64 &color,
                         const uchar *src, int width, int height, int stride)
{
    qt_bitmapblit32_lsx_base(rasterBuffer, x, y, color.toArgb32(),
                             src, width, height, stride);
}

void qt_bitmapblit8888_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                           const QRgba64 &color,
                           const uchar *src, int width, int height, int stride)
{
    qt_bitmapblit32_lsx_base(rasterBuffer, x, y, ARGB2RGBA(color.toArgb32()),
                             src, width, height, stride);
}

void qt_bitmapblit16_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                         const QRgba64 &color,
                         const uchar *src, int width, int height, int stride)
{
    const quint16 c = qConvertRgb32To16(color.toArgb32());
    quint16 *dest = reinterpret_cast<quint16 *>(rasterBuffer->scanLine(y)) + x;
    const int destStride = rasterBuffer->stride<quint16>();

    const __m128i c128 = __lsx_vreplgr2vr_h(c);
    const __m128i maskmask = (__m128i)(v8u16){0x8080, 0x4040, 0x2020, 0x1010,
                                              0x0808, 0x0404, 0x0202, 0x0101};
    const __m128i maskadd = (__m128i)(v8i16){0x0000, 0x4040, 0x6060, 0x7070,
                                             0x7878, 0x7c7c, 0x7e7e, 0x7f7f};

    while (--height >= 0) {
        for (int x = 0; x < width; x += 8) {
            const quint8 s = src[x >> 3];
            if (!s)
                continue;
            __m128i mask = __lsx_vreplgr2vr_b(s);
            __m128i destSrc = __lsx_vld((char*)(dest + x), 0);
            mask = __lsx_vand_v(mask, maskmask);
            mask = __lsx_vadd_b(mask, maskadd);
            mask = __lsx_vslti_b(mask, 0);
            destSrc = __lsx_vbitsel_v(destSrc, c128, mask);
            __lsx_vst(destSrc, (char*)(dest + x), 0);
        }
        dest += destStride;
        src += stride;
    }
}
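
// QSimdLsx maps the generic vector operations used by Qt's shared SIMD
// templates (v_dup, v_add, v_toInt, ...) onto LSX intrinsics, so the radial
// gradient fetcher below can reuse the common QRadialFetchSimd implementation.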

class QSimdLsx
{
public:
    typedef __m128i Int32x4;
    typedef __m128 Float32x4;

    union Vect_buffer_i { Int32x4 v; int i[4]; };
    union Vect_buffer_f { Float32x4 v; float f[4]; };

    static inline Float32x4 Q_DECL_VECTORCALL v_dup(float x) { return __lsx_vreplfr2vr_s(x); }
    static inline Float32x4 Q_DECL_VECTORCALL v_dup(double x) { return __lsx_vreplfr2vr_s(x); }
    static inline Int32x4 Q_DECL_VECTORCALL v_dup(int x) { return __lsx_vreplgr2vr_w(x); }
    static inline Int32x4 Q_DECL_VECTORCALL v_dup(uint x) { return __lsx_vreplgr2vr_w(x); }

    static inline Float32x4 Q_DECL_VECTORCALL v_add(Float32x4 a, Float32x4 b) { return __lsx_vfadd_s(a, b); }
    static inline Int32x4 Q_DECL_VECTORCALL v_add(Int32x4 a, Int32x4 b) { return __lsx_vadd_w(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_max(Float32x4 a, Float32x4 b) { return __lsx_vfmax_s(a, b); }
    static inline Float32x4 Q_DECL_VECTORCALL v_min(Float32x4 a, Float32x4 b) { return __lsx_vfmin_s(a, b); }
    static inline Int32x4 Q_DECL_VECTORCALL v_min_16(Int32x4 a, Int32x4 b) { return __lsx_vmin_h(a, b); }

    static inline Int32x4 Q_DECL_VECTORCALL v_and(Int32x4 a, Int32x4 b) { return __lsx_vand_v(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_sub(Float32x4 a, Float32x4 b) { return __lsx_vfsub_s(a, b); }
    static inline Int32x4 Q_DECL_VECTORCALL v_sub(Int32x4 a, Int32x4 b) { return __lsx_vsub_w(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_mul(Float32x4 a, Float32x4 b) { return __lsx_vfmul_s(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_sqrt(Float32x4 x) { return __lsx_vfsqrt_s(x); }

    static inline Int32x4 Q_DECL_VECTORCALL v_toInt(Float32x4 x) { return __lsx_vftintrz_w_s(x); }

    static inline Int32x4 Q_DECL_VECTORCALL v_greaterOrEqual(Float32x4 a, Float32x4 b) { return __lsx_vfcmp_clt_s(b, a); }
};

const uint * QT_FASTCALL qt_fetch_radial_gradient_lsx(uint *buffer, const Operator *op,
                                                      const QSpanData *data,
                                                      int y, int x, int length)
{
    return qt_fetch_radial_gradient_template<QRadialFetchSimd<QSimdLsx>, uint>(buffer, op, data, y, x, length);
}
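
// The scaler below steps through the source in 16.16 fixed point: ix and iy
// are the per-destination-pixel increments (scale * 65536) and srcx >> 16
// recovers the integer source column. For example, a 2x upscale gives
// sx = 0.5 and ix = 0x8000, mapping destination columns to source columns
// 0, 0, 1, 1, ...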

void qt_scale_image_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
                                         const uchar *srcPixels, int sbpl, int srch,
                                         const QRectF &targetRect,
                                         const QRectF &sourceRect,
                                         const QRect &clip,
                                         int const_alpha)
{
    if (const_alpha != 256) {
        // from qblendfunctions.cpp
        extern void qt_scale_image_argb32_on_argb32(uchar *destPixels, int dbpl,
                                                    const uchar *srcPixels, int sbpl, int srch,
                                                    const QRectF &targetRect,
                                                    const QRectF &sourceRect,
                                                    const QRect &clip,
                                                    int const_alpha);
        return qt_scale_image_argb32_on_argb32(destPixels, dbpl, srcPixels, sbpl, srch,
                                               targetRect, sourceRect, clip, const_alpha);
    }

    qreal sx = sourceRect.width() / (qreal)targetRect.width();
    qreal sy = sourceRect.height() / (qreal)targetRect.height();

    const int ix = 0x00010000 * sx;
    const int iy = 0x00010000 * sy;

    QRect tr = targetRect.normalized().toRect();
    tr = tr.intersected(clip);
    if (tr.isEmpty())
        return;
    const int tx1 = tr.left();
    const int ty1 = tr.top();
    int h = tr.height();
    int w = tr.width();

    quint32 basex;
    quint32 srcy;

    if (sx < 0) {
        int dstx = qFloor((tx1 + qreal(0.5) - targetRect.right()) * sx * 65536) + 1;
        basex = quint32(sourceRect.right() * 65536) + dstx;
    } else {
        int dstx = qCeil((tx1 + qreal(0.5) - targetRect.left()) * sx * 65536) - 1;
        basex = quint32(sourceRect.left() * 65536) + dstx;
    }
    if (sy < 0) {
        int dsty = qFloor((ty1 + qreal(0.5) - targetRect.bottom()) * sy * 65536) + 1;
        srcy = quint32(sourceRect.bottom() * 65536) + dsty;
    } else {
        int dsty = qCeil((ty1 + qreal(0.5) - targetRect.top()) * sy * 65536) - 1;
        srcy = quint32(sourceRect.top() * 65536) + dsty;
    }

    quint32 *dst = ((quint32 *) (destPixels + ty1 * dbpl)) + tx1;

    const __m128i nullVector = __lsx_vreplgr2vr_w(0);
    const __m128i half = __lsx_vreplgr2vr_h(0x80);
    const __m128i one = __lsx_vreplgr2vr_h(0xff);
    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
    const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
    const __m128i ixVector = __lsx_vreplgr2vr_w(4*ix);

    // this bounds check here is required as floating point rounding above might
    // in some cases lead to w/h values that are one pixel too large, falling
    // outside of the valid image area.
    const int ystart = srcy >> 16;
    if (ystart >= srch && iy < 0) {
        srcy += iy;
        --h;
    }
    const int xstart = basex >> 16;
    if (xstart >= (int)(sbpl/sizeof(quint32)) && ix < 0) {
        basex += ix;
        --w;
    }
    int yend = (srcy + iy * (h - 1)) >> 16;
    if (yend < 0 || yend >= srch)
        --h;
    int xend = (basex + ix * (w - 1)) >> 16;
    if (xend < 0 || xend >= (int)(sbpl/sizeof(quint32)))
        --w;

    while (--h >= 0) {
        const uint *src = (const quint32 *) (srcPixels + (srcy >> 16) * sbpl);
        int srcx = basex;
        int x = 0;

        ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) {
            uint s = src[srcx >> 16];
            dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
            srcx += ix;
        }

        __m128i srcxVector = (__m128i)(v4i32){srcx + ix + ix + ix, srcx + ix + ix, srcx + ix, srcx};

        for (; x < (w - 3); x += 4) {
            const int idx0 = __lsx_vpickve2gr_h(srcxVector, 1);
            const int idx1 = __lsx_vpickve2gr_h(srcxVector, 3);
            const int idx2 = __lsx_vpickve2gr_h(srcxVector, 5);
            const int idx3 = __lsx_vpickve2gr_h(srcxVector, 7);
            srcxVector = __lsx_vadd_w(srcxVector, ixVector);

            const __m128i srcVector = (__m128i)((v4u32){src[idx3], src[idx2], src[idx1], src[idx0]});

            BLEND_SOURCE_OVER_ARGB32_LSX_helper(dst, x, srcVector, nullVector, half, one, colorMask, alphaMask);
        }

        SIMD_EPILOGUE(x, w, 3) {
            uint s = src[(basex + x*ix) >> 16];
            dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
        }
        dst = (quint32 *)(((uchar *) dst) + dbpl);
        srcy += iy;
    }
}

const uint *QT_FASTCALL fetchPixelsBPP24_lsx(uint *buffer, const uchar *src, int index, int count)
{
    const quint24 *s = reinterpret_cast<const quint24 *>(src);
    for (int i = 0; i < count; ++i)
        buffer[i] = s[index + i];
    return buffer;
}

const uint * QT_FASTCALL qt_fetchUntransformed_888_lsx(uint *buffer, const Operator *,
                                                       const QSpanData *data,
                                                       int y, int x, int length)
{
    const uchar *line = data->texture.scanLine(y) + x * 3;
    // from image/qimage_lsx.cpp
    extern void QT_FASTCALL qt_convert_rgb888_to_rgb32_lsx(quint32 *dst, const uchar *src, int len);
    qt_convert_rgb888_to_rgb32_lsx(buffer, line, length);
    return buffer;
}
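
// A 24bpp pixel does not divide a 16-byte vector evenly; qt_memfill24_lsx
// therefore builds three shuffled vectors whose 48-byte concatenation (the
// LCM of the 3-byte pixel and the 16-byte store) is the color repeated
// 16 times, and writes them as whole blocks.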

void qt_memfill24_lsx(quint24 *dest, quint24 color, qsizetype count)
{
    // LCM of 12 and 16 bytes is 48 bytes (16 px)
    quint32 v = color;
    __m128i m = __lsx_vinsgr2vr_w(__lsx_vldi(0), v, 0);
    quint24 *end = dest + count;

    constexpr uchar x = 2, y = 1, z = 0;
    alignas(__m128i) static const uchar
    shuffleMask[16 + 1] = { x, y, z, x, y, z, x, y, z, x, y, z, x, y, z, x, y };

    __m128i indexMask = (__m128i)(v16i8){2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
    __m128i mval1 = __lsx_vshuf_b(m, m, __lsx_vld(reinterpret_cast<const __m128i *>(shuffleMask), 0));
    __m128i mval2 = __lsx_vshuf_b(m, m, __lsx_vld(reinterpret_cast<const __m128i *>(shuffleMask + 1), 0));
    __m128i mval3 = __lsx_vshuf_b(mval2, mval1, indexMask);

    for ( ; dest + 16 <= end; dest += 16) {
        __lsx_vst(mval1, reinterpret_cast<__m128i *>(dest) + 0, 0);
        __lsx_vst(mval2, reinterpret_cast<__m128i *>(dest) + 1, 0);
        __lsx_vst(mval3, reinterpret_cast<__m128i *>(dest) + 2, 0);
    }

    if (count < 3) {
        if (count > 1)
            end[-2] = v;
        if (count)
            end[-1] = v;
        return;
    }

    // less than 16px/48B left
    uchar *ptr = reinterpret_cast<uchar *>(dest);
    uchar *ptr_end = reinterpret_cast<uchar *>(end);
    qptrdiff left = ptr_end - ptr;
    if (left >= 24) {
        // 8px/24B or more left
        __lsx_vst(mval1, reinterpret_cast<__m128i *>(ptr) + 0, 0);
        __lsx_vstelm_d(mval2, reinterpret_cast<__m128i *>(ptr) + 1, 0, 0);
        ptr += 24;
        left -= 24;
    }

    // less than 8px/24B left
    if (left >= 16) {
        // but more than 5px/15B left
        __lsx_vst(mval1, reinterpret_cast<__m128i *>(ptr), 0);
    } else if (left >= 8) {
        // but more than 2px/6B left
        __lsx_vstelm_d(mval1, reinterpret_cast<__m128i *>(ptr), 0, 0);
    }

    if (left) {
        // 1 or 2px left
        // store 8 bytes ending with the right values (will overwrite a bit)
        __lsx_vstelm_d(mval2, reinterpret_cast<__m128i *>(ptr_end - 8), 0, 0);
    }
}

void QT_FASTCALL rbSwap_888_lsx(uchar *dst, const uchar *src, int count)
{
    int i = 0;

    const static __m128i shuffleMask1 = (__m128i)(v16i8){2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 15};
    const static __m128i shuffleMask2 = (__m128i)(v16i8){0, 1, 4, 3, 2, 7, 6, 5, 10, 9, 8, 13, 12, 11, 14, 15};
    const static __m128i shuffleMask3 = (__m128i)(v16i8){0, 3, 2, 1, 6, 5, 4, 9, 8, 7, 12, 11, 10, 15, 14, 13};

    for (; i + 15 < count; i += 16) {
        __m128i s1 = __lsx_vld(src, 0);
        __m128i s2 = __lsx_vld((src + 16), 0);
        __m128i s3 = __lsx_vld((src + 32), 0);
        s1 = __lsx_vshuf_b(s1, s1, shuffleMask1);
        s2 = __lsx_vshuf_b(s2, s2, shuffleMask2);
        s3 = __lsx_vshuf_b(s3, s3, shuffleMask3);
        __lsx_vst(s1, dst, 0);
        __lsx_vst(s2, (dst + 16), 0);
        __lsx_vst(s3, (dst + 32), 0);

        // Now fix the last four misplaced values
        std::swap(dst[15], dst[17]);
        std::swap(dst[30], dst[32]);

        src += 48;
        dst += 48;
    }

    if (src != dst) {
        SIMD_EPILOGUE(i, count, 15) {
            dst[0] = src[2];
            dst[1] = src[1];
            dst[2] = src[0];
            dst += 3;
            src += 3;
        }
    } else {
        SIMD_EPILOGUE(i, count, 15) {
            std::swap(dst[0], dst[2]);
            dst += 3;
        }
    }
}
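
// Premultiplying scales r/g/b by alpha/255; the vector path below uses the
// exact division-by-255 identity on 16-bit lanes:
//     x = c * a;  x += x >> 8;  x += 0x80;  x >>= 8;
// For c = 0xff, a = 0x80: 0x7f80 -> 0x7fff -> 0x807f -> 0x80, matching
// qPremultiply() in the scalar epilogue.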

template<bool RGBA>
static void convertARGBToARGB32PM_lsx(uint *buffer, const uint *src, int count)
{
    int i = 0;
    const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
    const __m128i rgbaMask = (__m128i)(v16i8){2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
    const __m128i shuffleMask = (__m128i)(v16i8){6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15};
    const __m128i half = __lsx_vreplgr2vr_h(0x0080);
    const __m128i zero = __lsx_vldi(0);

    for (; i < count - 3; i += 4) {
        __m128i srcVector = __lsx_vld(&src[i], 0);
        const v4i32 testz = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector, alphaMask));
        if (testz[0] != 0) {
            const v4i32 testc = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector, alphaMask));
            if (testc[0] != 0) {
                if (RGBA)
                    srcVector = __lsx_vshuf_b(zero, srcVector, rgbaMask);
                __m128i src1 = __lsx_vilvl_b(zero, srcVector);
                __m128i src2 = __lsx_vilvh_b(zero, srcVector);
                __m128i alpha1 = __lsx_vshuf_b(zero, src1, shuffleMask);
                __m128i alpha2 = __lsx_vshuf_b(zero, src2, shuffleMask);
                src1 = __lsx_vmul_h(src1, alpha1);
                src2 = __lsx_vmul_h(src2, alpha2);
                src1 = __lsx_vadd_h(src1, __lsx_vsrli_h(src1, 8));
                src2 = __lsx_vadd_h(src2, __lsx_vsrli_h(src2, 8));
                src1 = __lsx_vadd_h(src1, half);
                src2 = __lsx_vadd_h(src2, half);
                src1 = __lsx_vsrli_h(src1, 8);
                src2 = __lsx_vsrli_h(src2, 8);
                __m128i blendMask = (__m128i)(v8i16){0, 1, 2, 11, 4, 5, 6, 15};
                src1 = __lsx_vshuf_h(blendMask, alpha1, src1);
                src2 = __lsx_vshuf_h(blendMask, alpha2, src2);
                src1 = __lsx_vmaxi_h(src1, 0);
                src2 = __lsx_vmaxi_h(src2, 0);
                srcVector = __lsx_vpickev_b(__lsx_vsat_hu(src2, 7), __lsx_vsat_hu(src1, 7));
                __lsx_vst(srcVector, &buffer[i], 0);
            } else {
                if (RGBA)
                    __lsx_vst(__lsx_vshuf_b(zero, srcVector, rgbaMask), &buffer[i], 0);
                else if (buffer != src)
                    __lsx_vst(srcVector, &buffer[i], 0);
            }
        } else {
            __lsx_vst(zero, &buffer[i], 0);
        }
    }

    SIMD_EPILOGUE(i, count, 3) {
        uint v = qPremultiply(src[i]);
        buffer[i] = RGBA ? RGBA2ARGB(v) : v;
    }
}

template<bool RGBA>
static void convertARGBToRGBA64PM_lsx(QRgba64 *buffer, const uint *src, int count)
{
    int i = 0;
    const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
    const __m128i rgbaMask = (__m128i)(v16i8){2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
    const __m128i shuffleMask = (__m128i)(v16i8){6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15};
    const __m128i zero = __lsx_vldi(0);

    for (; i < count - 3; i += 4) {
        __m128i srcVector = __lsx_vld(&src[i], 0);
        const v4i32 testz = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector, alphaMask));
        if (testz[0] != 0) {
            const v4i32 testc = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector, alphaMask));
            if (!RGBA)
                srcVector = __lsx_vshuf_b(zero, srcVector, rgbaMask);
            const __m128i src1 = __lsx_vilvl_b(srcVector, srcVector);
            const __m128i src2 = __lsx_vilvh_b(srcVector, srcVector);
            if (testc[0] != 0) {
                __m128i alpha1 = __lsx_vshuf_b(zero, src1, shuffleMask);
                __m128i alpha2 = __lsx_vshuf_b(zero, src2, shuffleMask);
                __m128i dst1 = __lsx_vmuh_hu(src1, alpha1);
                __m128i dst2 = __lsx_vmuh_hu(src2, alpha2);
                // Map 0->0xfffe to 0->0xffff
                dst1 = __lsx_vadd_h(dst1, __lsx_vsrli_h(dst1, 15));
                dst2 = __lsx_vadd_h(dst2, __lsx_vsrli_h(dst2, 15));
                // correct alpha value:
                const __m128i blendMask = (__m128i)(v8i16){0, 1, 2, 11, 4, 5, 6, 15};
                dst1 = __lsx_vshuf_h(blendMask, src1, dst1);
                dst2 = __lsx_vshuf_h(blendMask, src2, dst2);
                __lsx_vst(dst1, &buffer[i], 0);
                __lsx_vst(dst2, &buffer[i + 2], 0);
            } else {
                __lsx_vst(src1, &buffer[i], 0);
                __lsx_vst(src2, &buffer[i + 2], 0);
            }
        } else {
            __lsx_vst(zero, &buffer[i], 0);
            __lsx_vst(zero, &buffer[i + 2], 0);
        }
    }

    SIMD_EPILOGUE(i, count, 3) {
        const uint s = RGBA ? RGBA2ARGB(src[i]) : src[i];
        buffer[i] = QRgba64::fromArgb32(s).premultiplied();
    }
}
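
// Unpremultiplying needs a real division, so the store paths below convert
// the four alphas to float and multiply by 255/a via reciprocal_mul_ps()
// (presumably a reciprocal estimate refined by one Newton-Raphson step, as in
// the other SIMD backends), rounding back with __lsx_vftintrne_w_s and
// masking alpha == 0 pixels to zero afterwards.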

template<bool RGBx, bool RGBA>
static inline void convertARGBFromARGB32PM_lsx(uint *buffer, const uint *src, int count)
{
    int i = 0;
    const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
    const __m128i rgbaMask = (__m128i)(v16i8){2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
    const __m128i zero = __lsx_vldi(0);

    for (; i < count - 3; i += 4) {
        __m128i srcVector = __lsx_vld(&src[i], 0);
        const v4i32 testz = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector, alphaMask));
        if (testz[0] != 0) {
            const v4i32 testc = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector, alphaMask));
            if (testc[0] != 0) {
                __m128i srcVectorAlpha = __lsx_vsrli_w(srcVector, 24);
                if (RGBA)
                    srcVector = __lsx_vshuf_b(zero, srcVector, rgbaMask);
                const __m128 a = __lsx_vffint_s_w(srcVectorAlpha);
                const __m128 ia = reciprocal_mul_ps(a, 255.0f);
                __m128i src1 = __lsx_vilvl_b(zero, srcVector);
                __m128i src3 = __lsx_vilvh_b(zero, srcVector);
                __m128i src2 = __lsx_vilvh_h(zero, src1);
                __m128i src4 = __lsx_vilvh_h(zero, src3);
                src1 = __lsx_vilvl_h(zero, src1);
                src3 = __lsx_vilvl_h(zero, src3);
                __m128 ia1 = (__m128)__lsx_vreplvei_w(ia, 0);
                __m128 ia2 = (__m128)__lsx_vreplvei_w(ia, 1);
                __m128 ia3 = (__m128)__lsx_vreplvei_w(ia, 2);
                __m128 ia4 = (__m128)__lsx_vreplvei_w(ia, 3);
                src1 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src1), ia1));
                src2 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src2), ia2));
                src3 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src3), ia3));
                src4 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src4), ia4));
                src1 = __lsx_vpickev_h(__lsx_vsat_wu(src2, 15), __lsx_vsat_wu(src1, 15));
                src3 = __lsx_vpickev_h(__lsx_vsat_wu(src4, 15), __lsx_vsat_wu(src3, 15));
                src1 = __lsx_vmaxi_h(src1, 0);
                src3 = __lsx_vmaxi_h(src3, 0);
                src1 = __lsx_vpickev_b(__lsx_vsat_hu(src3, 7), __lsx_vsat_hu(src1, 7));
                // Handle potential alpha == 0 values:
                __m128i srcVectorAlphaMask = __lsx_vseq_w(srcVectorAlpha, zero);
                src1 = __lsx_vandn_v(srcVectorAlphaMask, src1);
                // Fixup alpha values:
                if (RGBx)
                    srcVector = __lsx_vor_v(src1, alphaMask);
                else
                    srcVector = __lsx_vbitsel_v(src1, srcVector, __lsx_vslti_b(alphaMask, 0));
                __lsx_vst(srcVector, &buffer[i], 0);
            } else {
                if (RGBA)
                    __lsx_vst(__lsx_vshuf_b(zero, srcVector, rgbaMask), &buffer[i], 0);
                else if (buffer != src)
                    __lsx_vst(srcVector, &buffer[i], 0);
            }
        } else {
            if (RGBx)
                __lsx_vst(alphaMask, &buffer[i], 0);
            else
                __lsx_vst(zero, &buffer[i], 0);
        }
    }

    SIMD_EPILOGUE(i, count, 3) {
        uint v = qUnpremultiply_lsx(src[i]);
        if (RGBx)
            v = 0xff000000 | v;
        if (RGBA)
            v = ARGB2RGBA(v);
        buffer[i] = v;
    }
}
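
// Narrowing a 16-bit channel x to 8 bits uses the rounding sequence
//     y = x + 128;  y = (y - (y >> 8)) >> 8;
// which maps 0..65535 onto 0..255 (e.g. 65535 -> 255, 0 -> 0); the same
// idiom recurs in the opaque fast path of the next function.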

template<bool RGBA>
static inline void convertARGBFromRGBA64PM_lsx(uint *buffer, const QRgba64 *src, int count)
{
    int i = 0;
    const __m128i alphaMask = __lsx_vreplgr2vr_d(qint64(Q_UINT64_C(0xffff) << 48));
    const __m128i alphaMask32 = __lsx_vreplgr2vr_w(0xff000000);
    const __m128i rgbaMask = (__m128i)(v16i8){2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
    const __m128i zero = __lsx_vldi(0);

    for (; i < count - 3; i += 4) {
        __m128i srcVector1 = __lsx_vld(&src[i], 0);
        __m128i srcVector2 = __lsx_vld(&src[i + 2], 0);
        const v4i32 testz1 = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector1, alphaMask));
        bool transparent1 = testz1[0] == 0;
        const v4i32 testc1 = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector1, alphaMask));
        bool opaque1 = testc1[0] == 0;
        const v4i32 testz2 = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector2, alphaMask));
        bool transparent2 = testz2[0] == 0;
        const v4i32 testc2 = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector2, alphaMask));
        bool opaque2 = testc2[0] == 0;

        if (!(transparent1 && transparent2)) {
            if (!(opaque1 && opaque2)) {
                __m128i srcVector1Alpha = __lsx_vsrli_d(srcVector1, 48);
                __m128i srcVector2Alpha = __lsx_vsrli_d(srcVector2, 48);
                __m128i srcVectorAlpha = __lsx_vpickev_h(__lsx_vsat_wu(srcVector2Alpha, 15),
                                                         __lsx_vsat_wu(srcVector1Alpha, 15));
                const __m128 a = __lsx_vffint_s_w(srcVectorAlpha);
                // Convert srcVectorAlpha to final 8-bit alpha channel
                srcVectorAlpha = __lsx_vadd_w(srcVectorAlpha, __lsx_vreplgr2vr_w(128));
                srcVectorAlpha = __lsx_vsub_w(srcVectorAlpha, __lsx_vsrli_w(srcVectorAlpha, 8));
                srcVectorAlpha = __lsx_vsrli_w(srcVectorAlpha, 8);
                srcVectorAlpha = __lsx_vslli_w(srcVectorAlpha, 24);
                const __m128 ia = reciprocal_mul_ps(a, 255.0f);
                __m128i src1 = __lsx_vilvl_h(zero, srcVector1);
                __m128i src2 = __lsx_vilvh_h(zero, srcVector1);
                __m128i src3 = __lsx_vilvl_h(zero, srcVector2);
                __m128i src4 = __lsx_vilvh_h(zero, srcVector2);
                __m128 ia1 = (__m128)__lsx_vreplvei_w(ia, 0);
                __m128 ia2 = (__m128)__lsx_vreplvei_w(ia, 1);
                __m128 ia3 = (__m128)__lsx_vreplvei_w(ia, 2);
                __m128 ia4 = (__m128)__lsx_vreplvei_w(ia, 3);
                src1 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src1), ia1));
                src2 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src2), ia2));
                src3 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src3), ia3));
                src4 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src4), ia4));
                src1 = __lsx_vpickev_h(__lsx_vsat_wu(src2, 15), __lsx_vsat_wu(src1, 15));
                src3 = __lsx_vpickev_h(__lsx_vsat_wu(src4, 15), __lsx_vsat_wu(src3, 15));
                // Handle potential alpha == 0 values:
                __m128i srcVector1AlphaMask = __lsx_vseq_d(srcVector1Alpha, zero);
                __m128i srcVector2AlphaMask = __lsx_vseq_d(srcVector2Alpha, zero);
                src1 = __lsx_vandn_v(srcVector1AlphaMask, src1);
                src3 = __lsx_vandn_v(srcVector2AlphaMask, src3);
                src1 = __lsx_vmaxi_h(src1, 0);
                src3 = __lsx_vmaxi_h(src3, 0);
                src1 = __lsx_vpickev_b(__lsx_vsat_hu(src3, 7), __lsx_vsat_hu(src1, 7));
                // Fixup alpha values:
                src1 = __lsx_vbitsel_v(src1, srcVectorAlpha, __lsx_vslti_b(alphaMask32, 0));
                // Fix RGB order
                if (!RGBA)
                    src1 = __lsx_vshuf_b(zero, src1, rgbaMask);
                __lsx_vst(src1, (__m128i *)&buffer[i], 0);
            } else {
                __m128i src1 = __lsx_vilvl_h(zero, srcVector1);
                __m128i src2 = __lsx_vilvh_h(zero, srcVector1);
                __m128i src3 = __lsx_vilvl_h(zero, srcVector2);
                __m128i src4 = __lsx_vilvh_h(zero, srcVector2);
                src1 = __lsx_vadd_w(src1, __lsx_vreplgr2vr_w(128));
                src2 = __lsx_vadd_w(src2, __lsx_vreplgr2vr_w(128));
                src3 = __lsx_vadd_w(src3, __lsx_vreplgr2vr_w(128));
                src4 = __lsx_vadd_w(src4, __lsx_vreplgr2vr_w(128));
                src1 = __lsx_vsub_w(src1, __lsx_vsrli_w(src1, 8));
                src2 = __lsx_vsub_w(src2, __lsx_vsrli_w(src2, 8));
                src3 = __lsx_vsub_w(src3, __lsx_vsrli_w(src3, 8));
                src4 = __lsx_vsub_w(src4, __lsx_vsrli_w(src4, 8));
                src1 = __lsx_vsrli_w(src1, 8);
                src2 = __lsx_vsrli_w(src2, 8);
                src3 = __lsx_vsrli_w(src3, 8);
                src4 = __lsx_vsrli_w(src4, 8);
                src1 = __lsx_vpickev_h(__lsx_vsat_wu(src2, 15), __lsx_vsat_wu(src1, 15));
                src3 = __lsx_vpickev_h(__lsx_vsat_wu(src4, 15), __lsx_vsat_wu(src3, 15));
                src1 = __lsx_vmaxi_h(src1, 0);
                src3 = __lsx_vmaxi_h(src3, 0);
                src1 = __lsx_vpickev_b(__lsx_vsat_hu(src3, 7), __lsx_vsat_hu(src1, 7));
                if (!RGBA)
                    src1 = __lsx_vshuf_b(zero, src1, rgbaMask);
                __lsx_vst(src1, &buffer[i], 0);
            }
        } else {
            __lsx_vst(zero, &buffer[i], 0);
        }
    }

    SIMD_EPILOGUE(i, count, 3)
        buffer[i] = qConvertRgba64ToRgb32_lsx<RGBA ? PixelOrderRGB : PixelOrderBGR>(src[i]);
}
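
// Same unpremultiply structure as above, but the output stays 16 bits per
// channel: the reciprocal is scaled by 65535 and the "mask" template
// parameter forces alpha to 0xffff for the RGBx64 store variant.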

template<bool mask>
static inline void convertRGBA64FromRGBA64PM_lsx(QRgba64 *buffer, const QRgba64 *src, int count)
{
    int i = 0;
    const __m128i alphaMask = __lsx_vreplgr2vr_d(qint64(Q_UINT64_C(0xffff) << 48));
    const __m128i zero = __lsx_vldi(0);

    for (; i < count - 3; i += 4) {
        __m128i srcVector1 = __lsx_vld(&src[i + 0], 0);
        __m128i srcVector2 = __lsx_vld(&src[i + 2], 0);
        const v4i32 testz1 = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector1, alphaMask));
        bool transparent1 = testz1[0] == 0;
        const v4i32 testc1 = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector1, alphaMask));
        bool opaque1 = testc1[0] == 0;
        const v4i32 testz2 = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector2, alphaMask));
        bool transparent2 = testz2[0] == 0;
        const v4i32 testc2 = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector2, alphaMask));
        bool opaque2 = testc2[0] == 0;

        if (!(transparent1 && transparent2)) {
            if (!(opaque1 && opaque2)) {
                __m128i srcVector1Alpha = __lsx_vsrli_d(srcVector1, 48);
                __m128i srcVector2Alpha = __lsx_vsrli_d(srcVector2, 48);
                __m128i srcVectorAlpha = __lsx_vpickev_h(__lsx_vsat_wu(srcVector2Alpha, 15),
                                                         __lsx_vsat_wu(srcVector1Alpha, 15));
                const __m128 a = __lsx_vffint_s_w(srcVectorAlpha);
                const __m128 ia = reciprocal_mul_ps(a, 65535.0f);
                __m128i src1 = __lsx_vilvl_h(zero, srcVector1);
                __m128i src2 = __lsx_vilvh_h(zero, srcVector1);
                __m128i src3 = __lsx_vilvl_h(zero, srcVector2);
                __m128i src4 = __lsx_vilvh_h(zero, srcVector2);
                __m128 ia1 = (__m128)__lsx_vreplvei_w(ia, 0);
                __m128 ia2 = (__m128)__lsx_vreplvei_w(ia, 1);
                __m128 ia3 = (__m128)__lsx_vreplvei_w(ia, 2);
                __m128 ia4 = (__m128)__lsx_vreplvei_w(ia, 3);
                src1 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src1), ia1));
                src2 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src2), ia2));
                src3 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src3), ia3));
                src4 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src4), ia4));
                src1 = __lsx_vpickev_h(__lsx_vsat_wu(src2, 15), __lsx_vsat_wu(src1, 15));
                src3 = __lsx_vpickev_h(__lsx_vsat_wu(src4, 15), __lsx_vsat_wu(src3, 15));
                // Handle potential alpha == 0 values:
                __m128i srcVector1AlphaMask = __lsx_vseq_d(srcVector1Alpha, zero);
                __m128i srcVector2AlphaMask = __lsx_vseq_d(srcVector2Alpha, zero);
                src1 = __lsx_vandn_v(srcVector1AlphaMask, src1);
                src3 = __lsx_vandn_v(srcVector2AlphaMask, src3);
                // Fixup alpha values:
                if (mask) {
                    src1 = __lsx_vor_v(src1, alphaMask);
                    src3 = __lsx_vor_v(src3, alphaMask);
                } else {
                    src1 = __lsx_vbitsel_v(src1, srcVector1, __lsx_vslti_b(alphaMask, 0));
                    src3 = __lsx_vbitsel_v(src3, srcVector2, __lsx_vslti_b(alphaMask, 0));
                }
                __lsx_vst(src1, &buffer[i + 0], 0);
                __lsx_vst(src3, &buffer[i + 2], 0);
            } else {
                if (mask) {
                    srcVector1 = __lsx_vor_v(srcVector1, alphaMask);
                    srcVector2 = __lsx_vor_v(srcVector2, alphaMask);
                }
                if (mask || src != buffer) {
                    __lsx_vst(srcVector1, &buffer[i + 0], 0);
                    __lsx_vst(srcVector2, &buffer[i + 2], 0);
                }
            }
        } else {
            __lsx_vst(zero, &buffer[i + 0], 0);
            __lsx_vst(zero, &buffer[i + 2], 0);
        }
    }

    SIMD_EPILOGUE(i, count, 3) {
        QRgba64 v = src[i].unpremultiplied();
        if (mask)
            v.setAlpha(65535);
        buffer[i] = v;
    }
}

void QT_FASTCALL convertARGB32ToARGB32PM_lsx(uint *buffer, int count, const QList<QRgb> *)
{
    convertARGBToARGB32PM_lsx<false>(buffer, buffer, count);
}

void QT_FASTCALL convertRGBA8888ToARGB32PM_lsx(uint *buffer, int count, const QList<QRgb> *)
{
    convertARGBToARGB32PM_lsx<true>(buffer, buffer, count);
}

const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_lsx(QRgba64 *buffer, const uint *src, int count,
                                                        const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_lsx<false>(buffer, src, count);
    return buffer;
}

const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_lsx(QRgba64 *buffer, const uint *src, int count,
                                                          const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_lsx<true>(buffer, src, count);
    return buffer;
}

const uint *QT_FASTCALL fetchARGB32ToARGB32PM_lsx(uint *buffer, const uchar *src, int index, int count,
                                                  const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToARGB32PM_lsx<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}

const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_lsx(uint *buffer, const uchar *src, int index, int count,
                                                    const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToARGB32PM_lsx<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}

const QRgba64 *QT_FASTCALL fetchARGB32ToRGBA64PM_lsx(QRgba64 *buffer, const uchar *src, int index, int count,
                                                     const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_lsx<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}

const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_lsx(QRgba64 *buffer, const uchar *src, int index, int count,
                                                       const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_lsx<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}

void QT_FASTCALL storeRGB32FromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count,
                                            const QList<QRgb> *, QDitherInfo *)
{
    uint *d = reinterpret_cast<uint *>(dest) + index;
    convertARGBFromARGB32PM_lsx<true, false>(d, src, count);
}

void QT_FASTCALL storeARGB32FromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count,
                                             const QList<QRgb> *, QDitherInfo *)
{
    uint *d = reinterpret_cast<uint *>(dest) + index;
    convertARGBFromARGB32PM_lsx<false, false>(d, src, count);
}

void QT_FASTCALL storeRGBA8888FromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count,
                                               const QList<QRgb> *, QDitherInfo *)
{
    uint *d = reinterpret_cast<uint *>(dest) + index;
    convertARGBFromARGB32PM_lsx<false, true>(d, src, count);
}

void QT_FASTCALL storeRGBXFromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count,
                                           const QList<QRgb> *, QDitherInfo *)
{
    uint *d = reinterpret_cast<uint *>(dest) + index;
    convertARGBFromARGB32PM_lsx<true, true>(d, src, count);
}
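
// The remaining entry points are thin wrappers choosing template arguments.
// qConvertArgb32ToA2rgb30_lsx() packs into the 30-bit formats: 10 bits per
// color channel and 2 bits of alpha, with channel order given by PixelOrder.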

template<QtPixelOrder PixelOrder>
void QT_FASTCALL storeA2RGB30PMFromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count,
                                                const QList<QRgb> *, QDitherInfo *)
{
    uint *d = reinterpret_cast<uint *>(dest) + index;
    for (int i = 0; i < count; ++i)
        d[i] = qConvertArgb32ToA2rgb30_lsx<PixelOrder>(src[i]);
}

#if QT_CONFIG(raster_64bit)
void QT_FASTCALL destStore64ARGB32_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                                       const QRgba64 *buffer, int length)
{
    uint *dest = (uint*)rasterBuffer->scanLine(y) + x;
    convertARGBFromRGBA64PM_lsx<false>(dest, buffer, length);
}

void QT_FASTCALL destStore64RGBA8888_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                                         const QRgba64 *buffer, int length)
{
    uint *dest = (uint*)rasterBuffer->scanLine(y) + x;
    convertARGBFromRGBA64PM_lsx<true>(dest, buffer, length);
}
#endif

void QT_FASTCALL storeARGB32FromRGBA64PM_lsx(uchar *dest, const QRgba64 *src, int index, int count,
                                             const QList<QRgb> *, QDitherInfo *)
{
    uint *d = (uint*)dest + index;
    convertARGBFromRGBA64PM_lsx<false>(d, src, count);
}

void QT_FASTCALL storeRGBA8888FromRGBA64PM_lsx(uchar *dest, const QRgba64 *src, int index, int count,
                                               const QList<QRgb> *, QDitherInfo *)
{
    uint *d = (uint*)dest + index;
    convertARGBFromRGBA64PM_lsx<true>(d, src, count);
}

template
void QT_FASTCALL storeA2RGB30PMFromARGB32PM_lsx<PixelOrderBGR>(uchar *dest, const uint *src,
                                                               int index, int count,
                                                               const QList<QRgb> *, QDitherInfo *);
template
void QT_FASTCALL storeA2RGB30PMFromARGB32PM_lsx<PixelOrderRGB>(uchar *dest, const uint *src,
                                                               int index, int count,
                                                               const QList<QRgb> *, QDitherInfo *);

void QT_FASTCALL storeRGBA64FromRGBA64PM_lsx(uchar *dest, const QRgba64 *src, int index, int count,
                                             const QList<QRgb> *, QDitherInfo *)
{
    QRgba64 *d = (QRgba64 *)dest + index;
    convertRGBA64FromRGBA64PM_lsx<false>(d, src, count);
}

void QT_FASTCALL storeRGBx64FromRGBA64PM_lsx(uchar *dest, const QRgba64 *src, int index, int count,
                                             const QList<QRgb> *, QDitherInfo *)
{
    QRgba64 *d = (QRgba64 *)dest + index;
    convertRGBA64FromRGBA64PM_lsx<true>(d, src, count);
}

#if QT_CONFIG(raster_fp)
const QRgbaFloat32 *QT_FASTCALL fetchRGBA32FToRGBA32F_lsx(QRgbaFloat32 *buffer, const uchar *src,
                                                          int index, int count,
                                                          const QList<QRgb> *, QDitherInfo *)
{
    const QRgbaFloat32 *s = reinterpret_cast<const QRgbaFloat32 *>(src) + index;
    for (int i = 0; i < count; ++i) {
        __m128 vsf = (__m128)__lsx_vld(reinterpret_cast<const float *>(s + i), 0);
        __m128 vsa = (__m128)__lsx_vreplvei_w(vsf, 3);
        vsf = __lsx_vfmul_s(vsf, vsa);
        vsf = (__m128)__lsx_vextrins_w(vsf, vsa, 0x30);
        __lsx_vst(vsf, reinterpret_cast<float *>(buffer + i), 0);
    }
    return buffer;
}

void QT_FASTCALL storeRGBX32FFromRGBA32F_lsx(uchar *dest, const QRgbaFloat32 *src, int index, int count,
                                             const QList<QRgb> *, QDitherInfo *)
{
    QRgbaFloat32 *d = reinterpret_cast<QRgbaFloat32 *>(dest) + index;
    const __m128 zero = (__m128)(v4f32){0.0f, 0.0f, 0.0f, 1.0f};
    for (int i = 0; i < count; ++i) {
        __m128 vsf = (__m128)__lsx_vld(reinterpret_cast<const float *>(src + i), 0);
        const __m128 vsa = (__m128)__lsx_vreplvei_w(vsf, 3);
        FloatInt a;
        a.i = __lsx_vpickve2gr_w(vsa, 0);
        if (a.f == 1.0f)
        { }
        else if (a.f == 0.0f)
            vsf = zero;
        else {
            __m128 vsr = __lsx_vfrecip_s(vsa);
            vsr = __lsx_vfsub_s(__lsx_vfadd_s(vsr, vsr),
                                __lsx_vfmul_s(vsr, __lsx_vfmul_s(vsr, vsa)));
            vsf = __lsx_vfmul_s(vsf, vsr);
            FloatInt b = {.f = 1.0f};
            vsf = (__m128)__lsx_vinsgr2vr_w(vsf, b.i, 3);
        }
        __lsx_vst(vsf, reinterpret_cast<float *>(d + i), 0);
    }
}
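
// As in storeRGBX32FFromRGBA32F_lsx above, division by alpha is done with
// __lsx_vfrecip_s followed by one Newton-Raphson step,
//     r' = 2*r - r*r*a,
// which refines the reciprocal estimate before the final multiply.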

void QT_FASTCALL storeRGBA32FFromRGBA32F_lsx(uchar *dest, const QRgbaFloat32 *src, int index, int count,
                                             const QList<QRgb> *, QDitherInfo *)
{
    QRgbaFloat32 *d = reinterpret_cast<QRgbaFloat32 *>(dest) + index;
    const __m128 zero = (__m128)__lsx_vldi(0);
    for (int i = 0; i < count; ++i) {
        __m128 vsf = (__m128)__lsx_vld(reinterpret_cast<const float *>(src + i), 0);
        const __m128 vsa = (__m128)__lsx_vreplvei_w(vsf, 3);
        FloatInt a;
        a.i = __lsx_vpickve2gr_w(vsa, 0);
        if (a.f == 1.0f)
        { }
        else if (a.f == 0.0f)
            vsf = zero;
        else {
            __m128 vsr = __lsx_vfrecip_s(vsa);
            vsr = __lsx_vfsub_s(__lsx_vfadd_s(vsr, vsr),
                                __lsx_vfmul_s(vsr, __lsx_vfmul_s(vsr, vsa)));
            FloatInt b = {.f = 1.0f};
            vsr = (__m128)__lsx_vinsgr2vr_w(vsr, b.i, 3);
            vsf = __lsx_vfmul_s(vsf, vsr);
        }
        __lsx_vst(vsf, reinterpret_cast<float *>(d + i), 0);
    }
}
#endif

QT_END_NAMESPACE

#endif // QT_COMPILER_SUPPORTS_LSX