diff --git a/source/Lib/CommonLib/Buffer.cpp b/source/Lib/CommonLib/Buffer.cpp index 73c21e22779daab047821e0830ee7c880ace63be..7af9b1fdf06e828e665bca22880b3f2c5230ff8f 100644 --- a/source/Lib/CommonLib/Buffer.cpp +++ b/source/Lib/CommonLib/Buffer.cpp @@ -177,8 +177,13 @@ void gradFilterCore(Pel* pSrc, int srcStride, int width, int height, int gradStr { for (int x = 0; x < (width - 2 * BIO_EXTEND_SIZE); x++) { +#if JVET_O0570_GRAD_SIMP + gradYTmp[x] = ( srcTmp[x + srcStride] >> shift1 ) - ( srcTmp[x - srcStride] >> shift1 ); + gradXTmp[x] = ( srcTmp[x + 1] >> shift1 ) - ( srcTmp[x - 1] >> shift1 ); +#else gradYTmp[x] = (srcTmp[x + srcStride] - srcTmp[x - srcStride]) >> shift1; gradXTmp[x] = (srcTmp[x + 1] - srcTmp[x - 1]) >> shift1; +#endif } gradXTmp += gradStride; gradYTmp += gradStride; diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h index fc95ff8b85e487da1c7b77001ead05c22a1c4e28..c5c6ca9c2418eace0749e97e68f2ef3a8d7ebd1a 100644 --- a/source/Lib/CommonLib/TypeDef.h +++ b/source/Lib/CommonLib/TypeDef.h @@ -53,6 +53,8 @@ #define JVET_O0070_PROF 1 // JVET-O0070 method 4-2.1a: Prediction refinement with optical flow for affine mode +#define JVET_O0570_GRAD_SIMP 1 // JVET-O0570/JVET-O0211, SIMD-friendly spatial gradient calculation + #define JVET_O1170_IBC_VIRTUAL_BUFFER 1 // JVET-O1170/O1171: IBC virtual buffer #if JVET_O1170_IBC_VIRTUAL_BUFFER #define JVET_O1170_CHECK_BV_AT_DECODER 1 // For decoder to check if a BV is valid or not diff --git a/source/Lib/CommonLib/x86/BufferX86.h b/source/Lib/CommonLib/x86/BufferX86.h index d9c88fa8e1b305e159cb18bd7f74c563365c3606..6e82e3f411a3b3666cc0ec707988be68d023abd2 100644 --- a/source/Lib/CommonLib/x86/BufferX86.h +++ b/source/Lib/CommonLib/x86/BufferX86.h @@ -491,7 +491,9 @@ template< X86_VEXT vext > #endif void gradFilter_SSE(Pel* src, int srcStride, int width, int height, int gradStride, Pel* gradX, Pel* gradY, const int bitDepth) { +#if !JVET_O0570_GRAD_SIMP __m128i vzero = _mm_setzero_si128(); 
+#endif Pel* srcTmp = src + srcStride + 1; Pel* gradXTmp = gradX + gradStride + 1; Pel* gradYTmp = gradY + gradStride + 1; @@ -499,32 +501,79 @@ void gradFilter_SSE(Pel* src, int srcStride, int width, int height, int gradStri int widthInside = width - 2 * BIO_EXTEND_SIZE; int heightInside = height - 2 * BIO_EXTEND_SIZE; int shift1 = std::max<int>(6, bitDepth - 6); - +#if JVET_O0570_GRAD_SIMP + __m128i mmShift1 = _mm_cvtsi32_si128( shift1 ); +#endif assert((widthInside & 3) == 0); - for (int y = 0; y < heightInside; y++) +#if JVET_O0570_GRAD_SIMP + if ( ( widthInside & 7 ) == 0 ) { - int x = 0; - for (; x < widthInside; x += 4) +#endif + for (int y = 0; y < heightInside; y++) { - __m128i mmPixTop = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x - srcStride))); - __m128i mmPixBottom = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x + srcStride))); - __m128i mmPixLeft = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x - 1))); - __m128i mmPixRight = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x + 1))); - - __m128i mmGradVer = _mm_sra_epi32(_mm_sub_epi32(mmPixBottom, mmPixTop), _mm_cvtsi32_si128(shift1)); - __m128i mmGradHor = _mm_sra_epi32(_mm_sub_epi32(mmPixRight, mmPixLeft), _mm_cvtsi32_si128(shift1)); - mmGradVer = _mm_packs_epi32(mmGradVer, vzero); - mmGradHor = _mm_packs_epi32(mmGradHor, vzero); - - _mm_storel_epi64((__m128i *)(gradYTmp + x), mmGradVer); - _mm_storel_epi64((__m128i *)(gradXTmp + x), mmGradHor); - } + int x = 0; +#if JVET_O0570_GRAD_SIMP + for ( ; x < widthInside; x += 8 ) + { + __m128i mmPixTop = _mm_sra_epi16( _mm_loadu_si128( ( __m128i* ) ( srcTmp + x - srcStride ) ), mmShift1 ); + __m128i mmPixBottom = _mm_sra_epi16( _mm_loadu_si128( ( __m128i* ) ( srcTmp + x + srcStride ) ), mmShift1 ); + __m128i mmPixLeft = _mm_sra_epi16( _mm_loadu_si128( ( __m128i* ) ( srcTmp + x - 1 ) ), mmShift1 ); + __m128i mmPixRight = _mm_sra_epi16( _mm_loadu_si128( ( __m128i* ) ( srcTmp + x + 1 ) ), mmShift1 ); - gradXTmp += 
gradStride; - gradYTmp += gradStride; - srcTmp += srcStride; + __m128i mmGradVer = _mm_sub_epi16( mmPixBottom, mmPixTop ); + __m128i mmGradHor = _mm_sub_epi16( mmPixRight, mmPixLeft ); + + _mm_storeu_si128( ( __m128i * ) ( gradYTmp + x ), mmGradVer ); + _mm_storeu_si128( ( __m128i * ) ( gradXTmp + x ), mmGradHor ); + } +#else + for (; x < widthInside; x += 4) + { + __m128i mmPixTop = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x - srcStride))); + __m128i mmPixBottom = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x + srcStride))); + __m128i mmPixLeft = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x - 1))); + __m128i mmPixRight = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x + 1))); + + __m128i mmGradVer = _mm_sra_epi32(_mm_sub_epi32(mmPixBottom, mmPixTop), _mm_cvtsi32_si128(shift1)); + __m128i mmGradHor = _mm_sra_epi32(_mm_sub_epi32(mmPixRight, mmPixLeft), _mm_cvtsi32_si128(shift1)); + mmGradVer = _mm_packs_epi32(mmGradVer, vzero); + mmGradHor = _mm_packs_epi32(mmGradHor, vzero); + + _mm_storel_epi64((__m128i *)(gradYTmp + x), mmGradVer); + _mm_storel_epi64((__m128i *)(gradXTmp + x), mmGradHor); + } +#endif + gradXTmp += gradStride; + gradYTmp += gradStride; + srcTmp += srcStride; + } +#if JVET_O0570_GRAD_SIMP } + else + { + __m128i mmPixTop = _mm_sra_epi16( _mm_unpacklo_epi64( _mm_loadl_epi64( (__m128i*) ( srcTmp - srcStride ) ), _mm_loadl_epi64( (__m128i*) ( srcTmp ) ) ), mmShift1 ); + for ( int y = 0; y < heightInside; y += 2 ) + { + __m128i mmPixBottom = _mm_sra_epi16( _mm_unpacklo_epi64( _mm_loadl_epi64( (__m128i*) ( srcTmp + srcStride ) ), _mm_loadl_epi64( (__m128i*) ( srcTmp + ( srcStride << 1 ) ) ) ), mmShift1 ); + __m128i mmPixLeft = _mm_sra_epi16( _mm_unpacklo_epi64( _mm_loadl_epi64( (__m128i*) ( srcTmp - 1 ) ), _mm_loadl_epi64( (__m128i*) ( srcTmp - 1 + srcStride ) ) ), mmShift1 ); + __m128i mmPixRight = _mm_sra_epi16( _mm_unpacklo_epi64( _mm_loadl_epi64( (__m128i*) ( srcTmp + 1 ) ), _mm_loadl_epi64( (__m128i*) ( 
srcTmp + 1 + srcStride ) ) ), mmShift1 ); + + __m128i mmGradVer = _mm_sub_epi16( mmPixBottom, mmPixTop ); + __m128i mmGradHor = _mm_sub_epi16( mmPixRight, mmPixLeft ); + + _mm_storel_epi64( (__m128i *) gradYTmp, mmGradVer ); + _mm_storel_epi64( (__m128i *) ( gradYTmp + gradStride ), _mm_unpackhi_epi64( mmGradVer, mmGradHor ) ); + _mm_storel_epi64( (__m128i *) gradXTmp, mmGradHor ); + _mm_storel_epi64( (__m128i *) ( gradXTmp + gradStride ), _mm_unpackhi_epi64( mmGradHor, mmGradVer ) ); + + mmPixTop = mmPixBottom; + gradXTmp += gradStride << 1; + gradYTmp += gradStride << 1; + srcTmp += srcStride << 1; + } + } +#endif #if JVET_O0070_PROF if (PAD)