#include <winpr/sysinfo.h>

#include <freerdp/config.h>

#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <freerdp/log.h>

#include "prim_internal.h"
#include "../codec/color.h"

#include <freerdp/codec/color.h>

#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <immintrin.h>
   35static inline __m256i mm256_set_epu32(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3,
 
   36                                      uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7)
 
   38  return _mm256_set_epi32((int32_t)i0, (int32_t)i1, (int32_t)i2, (int32_t)i3, (int32_t)i4,
 
   39                          (int32_t)i5, (int32_t)i6, (int32_t)i7);
 
   42static inline pstatus_t avx2_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
 
   43                                                     UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
 
   45                                                     const BYTE* WINPR_RESTRICT pSrcData,
 
   46                                                     UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
 
   47                                                     int64_t srcVMultiplier, int64_t srcVOffset,
 
   48                                                     int64_t dstVMultiplier, int64_t dstVOffset)
 
   51  const int64_t srcByte = 3;
 
   52  const int64_t dstByte = 4;
 
   54  const __m256i mask = mm256_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000,
 
   55                                       0xFF000000, 0xFF000000, 0xFF000000);
 
   56  const __m256i smask = mm256_set_epu32(0xff171615, 0xff141312, 0xff1110ff, 0xffffffff,
 
   57                                        0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
 
   58  const __m256i shelpmask = mm256_set_epu32(0xffffffff, 0xffffffff, 0xffffff1f, 0xff1e1d1c,
 
   59                                            0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
 
   60  const UINT32 rem = nWidth % 8;
 
   61  const int64_t width = nWidth - rem;
 
   63  for (int64_t y = 0; y < nHeight; y++)
 
   65    const BYTE* WINPR_RESTRICT srcLine =
 
   66        &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
 
   67    BYTE* WINPR_RESTRICT dstLine =
 
   68        &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
 
   73    for (; x < width; x += 8)
 
   75      const __m256i* src = (
const __m256i*)&srcLine[(x + nXSrc) * srcByte];
 
   76      __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
 
   77      const __m256i s0 = _mm256_loadu_si256(src);
 
   78      __m256i s1 = _mm256_shuffle_epi8(s0, smask);
 
   82      const __m256i sx = _mm256_broadcastsi128_si256(_mm256_extractf128_si256(s0, 0));
 
   83      const __m256i sxx = _mm256_shuffle_epi8(sx, shelpmask);
 
   84      const __m256i bmask = _mm256_set_epi32(0x00000000, 0x00000000, 0x000000FF, 0x00FFFFFF,
 
   85                                             0x00000000, 0x00000000, 0x00000000, 0x00000000);
 
   86      const __m256i merged = _mm256_blendv_epi8(s1, sxx, bmask);
 
   88      const __m256i s2 = _mm256_loadu_si256(dst);
 
   89      __m256i d0 = _mm256_blendv_epi8(merged, s2, mask);
 
   90      _mm256_storeu_si256(dst, d0);
 
   93    for (; x < nWidth; x++)
 
   95      const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
 
   96      BYTE* dst = &dstLine[(x + nXDst) * dstByte];
 
  103  return PRIMITIVES_SUCCESS;
 
  106static inline pstatus_t avx2_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData,
 
  107                                                      UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
 
  108                                                      UINT32 nWidth, UINT32 nHeight,
 
  109                                                      const BYTE* WINPR_RESTRICT pSrcData,
 
  110                                                      UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
 
  111                                                      int64_t srcVMultiplier, int64_t srcVOffset,
 
  112                                                      int64_t dstVMultiplier, int64_t dstVOffset)
 
  115  const int64_t srcByte = 4;
 
  116  const int64_t dstByte = 4;
 
  118  const __m256i mask = _mm256_setr_epi8(
 
  119      (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00, (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00,
 
  120      (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00, (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00,
 
  121      (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00, (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00,
 
  122      (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00, (
char)0xFF, (
char)0xFF, (
char)0xFF, 0x00);
 
  123  const UINT32 rem = nWidth % 8;
 
  124  const int64_t width = nWidth - rem;
 
  125  for (int64_t y = 0; y < nHeight; y++)
 
  127    const BYTE* WINPR_RESTRICT srcLine =
 
  128        &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
 
  129    BYTE* WINPR_RESTRICT dstLine =
 
  130        &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
 
  133    for (; x < width; x += 8)
 
  135      const __m256i* src = (
const __m256i*)&srcLine[(x + nXSrc) * srcByte];
 
  136      __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
 
  137      const __m256i s0 = _mm256_loadu_si256(src);
 
  138      const __m256i s1 = _mm256_loadu_si256(dst);
 
  139      __m256i d0 = _mm256_blendv_epi8(s1, s0, mask);
 
  140      _mm256_storeu_si256(dst, d0);
 
  143    for (; x < nWidth; x++)
 
  145      const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
 
  146      BYTE* dst = &dstLine[(x + nXDst) * dstByte];
 
  153  return PRIMITIVES_SUCCESS;
 
  156static pstatus_t avx2_image_copy_no_overlap_dst_alpha(
 
  157    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
 
  158    UINT32 nWidth, UINT32 nHeight, 
const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
 
  159    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, 
const gdiPalette* WINPR_RESTRICT palette,
 
  160    UINT32 flags, int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier,
 
  163  WINPR_ASSERT(pDstData);
 
  164  WINPR_ASSERT(pSrcData);
 
  168    case PIXEL_FORMAT_BGR24:
 
  171        case PIXEL_FORMAT_BGRX32:
 
  172        case PIXEL_FORMAT_BGRA32:
 
  173          return avx2_image_copy_bgr24_bgrx32(
 
  174              pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
 
  175              nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
 
  180    case PIXEL_FORMAT_BGRX32:
 
  181    case PIXEL_FORMAT_BGRA32:
 
  184        case PIXEL_FORMAT_BGRX32:
 
  185        case PIXEL_FORMAT_BGRA32:
 
  186          return avx2_image_copy_bgrx32_bgrx32(
 
  187              pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
 
  188              nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
 
  193    case PIXEL_FORMAT_RGBX32:
 
  194    case PIXEL_FORMAT_RGBA32:
 
  197        case PIXEL_FORMAT_RGBX32:
 
  198        case PIXEL_FORMAT_RGBA32:
 
  199          return avx2_image_copy_bgrx32_bgrx32(
 
  200              pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
 
  201              nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
 
  211  return gen->
copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
 
  212                              pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
 
  215static pstatus_t avx2_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
 
  216                                            UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
 
  217                                            UINT32 nWidth, UINT32 nHeight,
 
  218                                            const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
 
  219                                            UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
 
  220                                            const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
 
  222  const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) ? TRUE : FALSE;
 
  223  int64_t srcVOffset = 0;
 
  224  int64_t srcVMultiplier = 1;
 
  225  int64_t dstVOffset = 0;
 
  226  int64_t dstVMultiplier = 1;
 
  228  if ((nWidth == 0) || (nHeight == 0))
 
  229    return PRIMITIVES_SUCCESS;
 
  231  if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
 
  234  if (!pDstData || !pSrcData)
 
  238    nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
 
  241    nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);
 
  245    srcVOffset = (nHeight - 1ll) * nSrcStep;
 
  249  if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
 
  250    return avx2_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
 
  251                                                nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
 
  252                                                nXSrc, nYSrc, palette, flags, srcVMultiplier,
 
  253                                                srcVOffset, dstVMultiplier, dstVOffset);
 
  254  else if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
 
  255    return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
 
  256                                                nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
 
  257                                                nXSrc, nYSrc, palette, srcVMultiplier,
 
  258                                                srcVOffset, dstVMultiplier, dstVOffset, flags);
 
  262    return gen->
copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
 
  263                                pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
 
  269void primitives_init_copy_avx2_int(
primitives_t* WINPR_RESTRICT prims)
 
  271#if defined(SSE_AVX_INTRINSICS_ENABLED) 
  272  WLog_VRB(PRIM_TAG, 
"AVX2 optimizations");
 
  273  prims->copy_no_overlap = avx2_image_copy_no_overlap;
 
  275  WLog_VRB(PRIM_TAG, 
"undefined WITH_SIMD or WITH_AVX2 or AVX2 intrinsics not available");
 
/* The copy_no_overlap slot assigned above is declared as: fn_copy_no_overlap_t copy_no_overlap */