diff --git a/source/Lib/CommonLib/Buffer.cpp b/source/Lib/CommonLib/Buffer.cpp index 7c1bb9f1d7e080fe27d9de64c9f4e60af4f49830..586f8728f27d08de6ab6402c1bfc3ab34668b687 100644 --- a/source/Lib/CommonLib/Buffer.cpp +++ b/source/Lib/CommonLib/Buffer.cpp @@ -42,7 +42,11 @@ #include "Buffer.h" #include "InterpolationFilter.h" +#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING +void applyPROFCore(Pel* dst, int dstStride, const Pel* src, int srcStride, int width, int height, const Pel* gradX, const Pel* gradY, int gradStride, const int* dMvX, const int* dMvY, int dMvStride, const bool& bi, int shiftNum, Pel offset, const ClpRng& clpRng) +#else void applyPROFCore(Pel* dst, int dstStride, const Pel* src, int srcStride, int width, int height, const Pel* gradX, const Pel* gradY, int gradStride, const int* dMvX, const int* dMvY, int dMvStride, int shiftNum, Pel offset, const ClpRng& clpRng) +#endif { int idx = 0; #if !JVET_P0057_BDOF_PROF_HARMONIZATION @@ -63,10 +67,16 @@ void applyPROFCore(Pel* dst, int dstStride, const Pel* src, int srcStride, int w #endif #if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING dI = Clip3(-dILimit, dILimit - 1, dI); -#endif - + dst[w] = src[w] + dI; + if (!bi) + { + dst[w] = (dst[w] + offset) >> shiftNum; + dst[w] = ClipPel(dst[w], clpRng); + } +#else dI = (src[w] + dI + offset) >> shiftNum; dst[w] = (Pel)ClipPel(dI, clpRng); +#endif idx++; } @@ -77,6 +87,7 @@ void applyPROFCore(Pel* dst, int dstStride, const Pel* src, int srcStride, int w } } +#if !JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING template<bool l1PROFEnabled = true> void applyBiPROFCore (Pel* dst, int dstStride, const Pel* src0, const Pel* src1, int srcStride, int width, int height, const Pel* gradX0, const Pel* gradY0, const Pel* gradX1, const Pel* gradY1, int gradStride, const int* dMvX0, const int* dMvY0, const int* dMvX1, const int* dMvY1, int dMvStride, const int8_t w0, const ClpRng& clpRng) { @@ -142,6 +153,7 @@ void applyBiPROFCore (Pel* dst, int dstStride, const Pel* src0, const Pel* src1, src1 += srcStride; } } +#endif template< typename T > void addAvgCore( const T* src1, int src1Stride, const T* src2, int src2Stride, T* dest, int dstStride, int width, int height, int rshift, int offset, const ClpRng& clpRng ) @@ -433,8 +445,10 @@ PelBufferOps::PelBufferOps() profGradFilter = gradFilterCore <false>; applyPROF = applyPROFCore; +#if !JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING applyBiPROF[1] = applyBiPROFCore; applyBiPROF[0] = applyBiPROFCore <false>; +#endif roundIntVector = nullptr; } diff --git a/source/Lib/CommonLib/Buffer.h b/source/Lib/CommonLib/Buffer.h index f4ea3ad6c5d7af238669cd71de3e037635788db0..4f3b95c4c2e5a575bf1e498cd74dbda2286189a8 100644 --- a/source/Lib/CommonLib/Buffer.h +++ b/source/Lib/CommonLib/Buffer.h @@ -81,8 +81,12 @@ struct PelBufferOps void ( *removeHighFreq4) ( Pel* src0, int src0Stride, const Pel* src1, int src1Stride, int width, int height); #endif void (*profGradFilter) (Pel* pSrc, int srcStride, int width, int height, int gradStride, Pel* gradX, Pel* gradY, const int bitDepth); +#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING + void (*applyPROF) (Pel* dst, int dstStride, const Pel* src, int srcStride, int width, int height, const Pel* gradX, const Pel* gradY, int gradStride, const int* dMvX, const int* dMvY, int dMvStride, const bool& bi, int shiftNum, Pel offset, const ClpRng& clpRng); +#else void (*applyPROF) (Pel* dst, int dstStride, const Pel* src, int srcStride, int width, int height, const Pel* gradX, const Pel* gradY, int gradStride, const int* dMvX, const int* dMvY, int dMvStride, int shiftNum, Pel offset, const ClpRng& clpRng); void (*applyBiPROF[2]) (Pel* dst, int dstStride, const Pel* src0, const Pel* src1, int srcStride, int width, int height, const Pel* gradX0, const Pel* gradY0, const Pel* gradX1, const Pel* gradY1, int gradStride, const int* dMvX0, const int* dMvY0, const int* dMvX1, const int* dMvY1, int dMvStride, const int8_t gbiWeightL0, const ClpRng& clpRng); +#endif void (*roundIntVector) (int* v, int size, unsigned int nShift, const int dmvLimit); }; diff --git a/source/Lib/CommonLib/InterPrediction.cpp b/source/Lib/CommonLib/InterPrediction.cpp index d5a970c43a9b80c2de5bfdad4293c4d1c18da0d3..f686ac6822a717a354e7e1be30d316b3a6aed584 100644 --- a/source/Lib/CommonLib/InterPrediction.cpp +++ b/source/Lib/CommonLib/InterPrediction.cpp @@ -862,19 +862,28 @@ void InterPrediction::xPredAffineBlk( const ComponentID& compID, const Predictio enablePROF &= !m_encOnly || pu.cu->slice->getCheckLDC() || iDMvHorX > profThres || iDMvHorY > profThres || iDMvVerX > profThres || iDMvVerY > profThres || iDMvHorX < -profThres || iDMvHorY < -profThres || iDMvVerX < -profThres || iDMvVerY < -profThres; enablePROF &= pu.cs->pps->getPicWidthInLumaSamples() == refPic->getPicWidthInLumaSamples() && pu.cs->pps->getPicHeightInLumaSamples() == refPic->getPicHeightInLumaSamples(); +#if !JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING if (compID == COMPONENT_Y) { m_applyPROF[m_iRefListIdx] = enablePROF; } +#endif bool isLast = enablePROF ? false : !bi; +#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING + const int cuExtW = AFFINE_MIN_BLOCK_SIZE + PROF_BORDER_EXT_W * 2; + const int cuExtH = AFFINE_MIN_BLOCK_SIZE + PROF_BORDER_EXT_H * 2; + + PelBuf gradXExt(m_gradBuf[0], cuExtW, cuExtH); + PelBuf gradYExt(m_gradBuf[1], cuExtW, cuExtH); +#else const int cuExtW = pu.blocks[compID].width + PROF_BORDER_EXT_W * 2; const int cuExtH = pu.blocks[compID].height + PROF_BORDER_EXT_H * 2; PelBuf gradXExt(m_gradBuf[m_iRefListIdx][0], cuExtW, cuExtH); PelBuf gradYExt(m_gradBuf[m_iRefListIdx][1], cuExtW, cuExtH); - +#endif const int MAX_FILTER_SIZE = std::max<int>(NTAPS_LUMA, NTAPS_CHROMA); const int dstExtW = ((blockWidth + PROF_BORDER_EXT_W * 2 + 7) >> 3) << 3; const int dstExtH = blockHeight + PROF_BORDER_EXT_H * 2; @@ -888,7 +897,11 @@ void InterPrediction::xPredAffineBlk( const ComponentID& compID, const Predictio int *dMvScaleHor = m_dMvBuf[m_iRefListIdx]; int *dMvScaleVer = m_dMvBuf[m_iRefListIdx] + 16; +#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING + if (enablePROF) +#else if (enablePROF && !bi) +#endif { int* dMvH = dMvScaleHor; int* dMvV = dMvScaleVer; @@ -1120,8 +1133,13 @@ void InterPrediction::xPredAffineBlk( const ComponentID& compID, const Predictio dstPel[blockWidth] = leftShift_round(refPel[blockWidth], shift) - (Pel)IF_INTERNAL_OFFS; } +#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING + PelBuf gradXBuf = gradXExt.subBuf(0, 0, blockWidth + 2, blockHeight + 2); + PelBuf gradYBuf = gradYExt.subBuf(0, 0, blockWidth + 2, blockHeight + 2); +#else PelBuf gradXBuf = gradXExt.subBuf(w, h, blockWidth + 2, blockHeight + 2); PelBuf gradYBuf = gradYExt.subBuf(w, h, blockWidth + 2, blockHeight + 2); +#endif g_pelBufOP.profGradFilter(dstExtBuf.buf, dstExtBuf.stride, blockWidth + 2, blockHeight + 2, gradXBuf.stride, gradXBuf.buf, gradYBuf.buf, clpRng.bd); const int shiftNum = std::max<int>(2, (IF_INTERNAL_PREC - clpRng.bd)); @@ -1132,6 +1150,9 @@ void InterPrediction::xPredAffineBlk( const ComponentID& compID, const Predictio Pel * dstY = dstBuf.bufAt(w, h); +#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING + g_pelBufOP.applyPROF(dstY, dstBuf.stride, src, dstExtBuf.stride, blockWidth, blockHeight, gX, gY, gradXBuf.stride, dMvScaleHor, dMvScaleVer, blockWidth, bi, shiftNum, offset, clpRng); +#else if (!bi) { g_pelBufOP.applyPROF(dstY, dstBuf.stride, src, dstExtBuf.stride, blockWidth, blockHeight, gX, gY, gradXBuf.stride, dMvScaleHor, dMvScaleVer, blockWidth, shiftNum, offset, clpRng); @@ -1142,6 +1163,7 @@ void InterPrediction::xPredAffineBlk( const ComponentID& compID, const Predictio PelBuf destBuf(dstY, dstBuf.stride, Size(blockWidth, blockHeight)); destBuf.copyFrom(srcExtBuf); } +#endif } } } @@ -1300,6 +1322,7 @@ void InterPrediction::xWeightedAverage(const PredictionUnit& pu, const CPelUnitB if( iRefIdx0 >= 0 && iRefIdx1 >= 0 ) { +#if !JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING if (pu.cu->affine && (m_applyPROF[0] || m_applyPROF[1])) { xApplyBiPROF(pu, pcYuvSrc0.bufs[COMPONENT_Y], pcYuvSrc1.bufs[COMPONENT_Y], pcYuvDst.bufs[COMPONENT_Y], clpRngs.comp[COMPONENT_Y]); @@ -1307,6 +1330,7 @@ void InterPrediction::xWeightedAverage(const PredictionUnit& pu, const CPelUnitB CHECK(yuvDstTmp, "yuvDstTmp is disallowed with PROF"); return; } +#endif if( pu.cu->GBiIdx != GBI_DEFAULT && (yuvDstTmp || !pu.mhIntraFlag) ) { CHECK(bioApplied, "GBi is disallowed with BIO"); @@ -1389,6 +1413,7 @@ void InterPrediction::xWeightedAverage(const PredictionUnit& pu, const CPelUnitB } } +#if !JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING void InterPrediction::xApplyBiPROF(const PredictionUnit &pu, const CPelBuf& pcYuvSrc0, const CPelBuf& pcYuvSrc1, PelBuf& pcYuvDst, const ClpRng& clpRng) { int blockWidth = AFFINE_MIN_BLOCK_SIZE; @@ -1532,6 +1557,7 @@ void InterPrediction::xApplyBiPROF(const PredictionUnit &pu, const CPelBuf& pcYu else g_pelBufOP.applyBiPROF[0](dstY, pcYuvDst.stride, srcY1, srcY0, pcYuvSrc0.stride, width, height, gX1, gY1, gX0, gY0, gradXExt0.stride, dMvX1, dMvY1, dMvX0, dMvY0, blockWidth, getGbiWeight(pu.cu->GBiIdx, REF_PIC_LIST_1), clpRng); } +#endif void InterPrediction::motionCompensation( PredictionUnit &pu, PelUnitBuf &predBuf, const RefPicList &eRefPicList , const bool luma, const bool chroma diff --git a/source/Lib/CommonLib/InterPrediction.h b/source/Lib/CommonLib/InterPrediction.h index b28b49dd3c93969837a8ef870ff9575f4ba593e8..d9e2b2599840be92d7787290243ef13a49af8089 100644 --- a/source/Lib/CommonLib/InterPrediction.h +++ b/source/Lib/CommonLib/InterPrediction.h @@ -102,9 +102,15 @@ protected: Mv(-2, 2), Mv(-1, 2), Mv(0, 2), Mv(1, 2), Mv(2, 2) }; uint64_t m_SADsArray[((2 * DMVR_NUM_ITERATION) + 1) * ((2 * DMVR_NUM_ITERATION) + 1)]; +#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING + Pel m_gradBuf[2][(AFFINE_MIN_BLOCK_SIZE + 2) * (AFFINE_MIN_BLOCK_SIZE + 2)]; +#else Pel m_gradBuf[2][2][(MAX_CU_SIZE + 2) * (MAX_CU_SIZE + 2)]; +#endif int m_dMvBuf[2][16 * 2]; +#if !JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING bool m_applyPROF[2]; +#endif bool m_skipPROF; bool m_encOnly; bool m_isBi; @@ -141,7 +147,9 @@ protected: void xCalcBIOPar (const Pel* srcY0Temp, const Pel* srcY1Temp, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0, const Pel* gradY1, int* dotProductTemp1, int* dotProductTemp2, int* dotProductTemp3, int* dotProductTemp5, int* dotProductTemp6, const int src0Stride, const int src1Stride, const int gradStride, const int widthG, const int heightG, int bitDepth); void xCalcBlkGradient (int sx, int sy, int *arraysGx2, int *arraysGxGy, int *arraysGxdI, int *arraysGy2, int *arraysGydI, int &sGx2, int &sGy2, int &sGxGy, int &sGxdI, int &sGydI, int width, int height, int unitSize); void xWeightedAverage ( const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths, const ClpRngs& clpRngs, const bool& bioApplied, PelUnitBuf* yuvDstTmp = NULL ); +#if !JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING void xApplyBiPROF (const PredictionUnit& pu, const CPelBuf& pcYuvSrc0, const CPelBuf& pcYuvSrc1, PelBuf& pcYuvDst, const ClpRng& clpRng); +#endif void xPredAffineBlk ( const ComponentID& compID, const PredictionUnit& pu, const Picture* refPic, const Mv* _mv, PelUnitBuf& dstPic, const bool& bi, const ClpRng& clpRng, const std::pair<int, int> scalingRatio = SCALE_1X ); void xWeightedTriangleBlk ( const PredictionUnit &pu, const uint32_t width, const uint32_t height, const ComponentID compIdx, const bool splitDir, PelUnitBuf& predDst, PelUnitBuf& predSrc0, PelUnitBuf& predSrc1 ); diff --git a/source/Lib/CommonLib/x86/BufferX86.h b/source/Lib/CommonLib/x86/BufferX86.h index 3275097e88c78882ffcd4fda69fce38dfb80da3f..ae44540199073b88010b619e5d22261730164e0e 100644 --- a/source/Lib/CommonLib/x86/BufferX86.h +++ b/source/Lib/CommonLib/x86/BufferX86.h @@ -351,10 +351,45 @@ void calcBIOSums_SSE(const Pel* srcY0Tmp, const Pel* srcY1Tmp, Pel* gradX0, Pel* } template< X86_VEXT vext > +#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING +void applyPROF_SSE(Pel* dstPel, int dstStride, const Pel* srcPel, int srcStride, int width, int height, const Pel* gradX, const Pel* gradY, int gradStride, const int* dMvX, const int* dMvY, int dMvStride, const bool& bi, int shiftNum, Pel offset, const ClpRng& clpRng) +#else void applyPROF_SSE(Pel* dstPel, int dstStride, const Pel* srcPel, int srcStride, int width, int height, const Pel* gradX, const Pel* gradY, int gradStride, const int* dMvX, const int* dMvY, int dMvStride, int shiftNum, Pel offset, const ClpRng& clpRng) +#endif { CHECKD((width & 3), "block width error!"); +#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING + const int dILimit = 1 << std::max<int>(clpRng.bd + 1, 13); + +#ifdef USE_AVX2 + __m256i mm_dmvx, mm_dmvy, mm_gradx, mm_grady, mm_dI, mm_dI0, mm_src; +#if !JVET_P0057_BDOF_PROF_HARMONIZATION + __m256i mm_dIoffset = _mm256_set1_epi32(1); +#endif + __m256i mm_offset = _mm256_set1_epi16(offset); + __m256i vibdimin = _mm256_set1_epi16(clpRng.min); + __m256i vibdimax = _mm256_set1_epi16(clpRng.max); + __m256i mm_dimin = _mm256_set1_epi32(-dILimit); + __m256i mm_dimax = _mm256_set1_epi32(dILimit - 1); +#else + __m128i mm_dmvx, mm_dmvy, mm_gradx, mm_grady, mm_dI, mm_dI0; +#if !JVET_P0057_BDOF_PROF_HARMONIZATION + __m128i mm_dIoffset = _mm_set1_epi32(1); +#endif + __m128i mm_offset = _mm_set1_epi16(offset); + __m128i vibdimin = _mm_set1_epi16(clpRng.min); + __m128i vibdimax = _mm_set1_epi16(clpRng.max); + __m128i mm_dimin = _mm_set1_epi32(-dILimit); + __m128i mm_dimax = _mm_set1_epi32(dILimit - 1); +#endif + +#if USE_AVX2 + for (int h = 0; h < height; h += 4) +#else + for (int h = 0; h < height; h += 2) +#endif +#else __m128i mm_dmvx, mm_dmvy, mm_gradx, mm_grady, mm_dI, mm_src; #if !JVET_P0057_BDOF_PROF_HARMONIZATION __m128i mm_dIoffset = _mm_set1_epi32(1); @@ -364,13 +399,8 @@ void applyPROF_SSE(Pel* dstPel, int dstStride, const Pel* srcPel, int srcStride, __m128i vibdimax = _mm_set1_epi32(clpRng.max); __m128i vzero = _mm_setzero_si128(); -#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING - const int dILimit = 1 << std::max<int>(clpRng.bd + 1, 13); - __m128i vdImin = _mm_set1_epi32(-dILimit); - __m128i vdImax = _mm_set1_epi32(dILimit - 1); -#endif - for (int h = 0; h < height; h++) +#endif { const int* vX = dMvX; const int* vY = dMvY; @@ -381,6 +411,100 @@ void applyPROF_SSE(Pel* dstPel, int dstStride, const Pel* srcPel, int srcStride, for (int w = 0; w < width; w += 4) { +#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING +#if USE_AVX2 + const int *vX0 = vX, *vY0 = vY; + const Pel *gX0 = gX, *gY0 = gY; + + // first two rows + mm_dmvx = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)vX0)), _mm_loadu_si128((const __m128i *)(vX0 + dMvStride)), 1); + mm_dmvy = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)vY0)), _mm_loadu_si128((const __m128i *)(vY0 + dMvStride)), 1); + mm_gradx = _mm256_inserti128_si256( + _mm256_castsi128_si256(_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)gX0))), + _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(gX0 + gradStride))), 1); + mm_grady = _mm256_inserti128_si256( + _mm256_castsi128_si256(_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)gY0))), + _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(gY0 + gradStride))), 1); + mm_dI0 = _mm256_add_epi32(_mm256_mullo_epi32(mm_dmvx, mm_gradx), _mm256_mullo_epi32(mm_dmvy, mm_grady)); +#if !JVET_P0057_BDOF_PROF_HARMONIZATION + mm_dI0 = _mm256_srai_epi32(_mm256_add_epi32(mm_dI0, mm_dIoffset), 1); +#endif + mm_dI0 = _mm256_min_epi32(mm_dimax, _mm256_max_epi32(mm_dimin, mm_dI0)); + + // next two rows + vX0 += (dMvStride << 1); vY0 += (dMvStride << 1); gX0 += (gradStride << 1); gY0 += (gradStride << 1); + mm_dmvx = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)vX0)), _mm_loadu_si128((const __m128i *)(vX0 + dMvStride)), 1); + mm_dmvy = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)vY0)), _mm_loadu_si128((const __m128i *)(vY0 + dMvStride)), 1); + mm_gradx = _mm256_inserti128_si256( + _mm256_castsi128_si256(_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)gX0))), + _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(gX0 + gradStride))), 1); + mm_grady = _mm256_inserti128_si256( + _mm256_castsi128_si256(_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)gY0))), + _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(gY0 + gradStride))), 1); + mm_dI = _mm256_add_epi32(_mm256_mullo_epi32(mm_dmvx, mm_gradx), _mm256_mullo_epi32(mm_dmvy, mm_grady)); +#if !JVET_P0057_BDOF_PROF_HARMONIZATION + mm_dI = _mm256_srai_epi32(_mm256_add_epi32(mm_dI, mm_dIoffset), 1); +#endif + mm_dI = _mm256_min_epi32(mm_dimax, _mm256_max_epi32(mm_dimin, mm_dI)); + + // combine four rows + mm_dI = _mm256_packs_epi32(mm_dI0, mm_dI); + const Pel* src0 = src + srcStride; + mm_src = _mm256_inserti128_si256( + _mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)src), _mm_loadl_epi64((const __m128i *)(src + (srcStride << 1))))), + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)src0), _mm_loadl_epi64((const __m128i *)(src0 + (srcStride << 1)))), + 1 + ); + mm_dI = _mm256_add_epi16(mm_dI, mm_src); + if (!bi) + { + mm_dI = _mm256_srai_epi16(_mm256_add_epi16(mm_dI, mm_offset), shiftNum); + mm_dI = _mm256_min_epi16(vibdimax, _mm256_max_epi16(vibdimin, mm_dI)); + } + + // store final results + __m128i dITmp = _mm256_extractf128_si256(mm_dI, 1); + Pel* dst0 = dst; + _mm_storel_epi64((__m128i *)dst0, _mm256_castsi256_si128(mm_dI)); + dst0 += dstStride; _mm_storel_epi64((__m128i *)dst0, dITmp); + dst0 += dstStride; _mm_storel_epi64((__m128i *)dst0, _mm_unpackhi_epi64(_mm256_castsi256_si128(mm_dI), _mm256_castsi256_si128(mm_dI))); + dst0 += dstStride; _mm_storel_epi64((__m128i *)dst0, _mm_unpackhi_epi64(dITmp, dITmp)); +#else + // first row + mm_dmvx = _mm_loadu_si128((const __m128i *)vX); + mm_dmvy = _mm_loadu_si128((const __m128i *)vY); + mm_gradx = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)gX)); + mm_grady = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)gY)); + mm_dI0 = _mm_add_epi32(_mm_mullo_epi32(mm_dmvx, mm_gradx), _mm_mullo_epi32(mm_dmvy, mm_grady)); +#if !JVET_P0057_BDOF_PROF_HARMONIZATION + mm_dI0 = _mm_srai_epi32(_mm_add_epi32(mm_dI0, mm_dIoffset), 1); +#endif + mm_dI0 = _mm_min_epi32(mm_dimax, _mm_max_epi32(mm_dimin, mm_dI0)); + + // second row + mm_dmvx = _mm_loadu_si128((const __m128i *)(vX + dMvStride)); + mm_dmvy = _mm_loadu_si128((const __m128i *)(vY + dMvStride)); + mm_gradx = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(gX + gradStride))); + mm_grady = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(gY + gradStride))); + mm_dI = _mm_add_epi32(_mm_mullo_epi32(mm_dmvx, mm_gradx), _mm_mullo_epi32(mm_dmvy, mm_grady)); +#if !JVET_P0057_BDOF_PROF_HARMONIZATION + mm_dI = _mm_srai_epi32(_mm_add_epi32(mm_dI, mm_dIoffset), 1); +#endif + mm_dI = _mm_min_epi32(mm_dimax, _mm_max_epi32(mm_dimin, mm_dI)); + + // combine both rows + mm_dI = _mm_packs_epi32(mm_dI0, mm_dI); + mm_dI = _mm_add_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)src), _mm_loadl_epi64((const __m128i *)(src + srcStride))), mm_dI); + if (!bi) + { + mm_dI = _mm_srai_epi16(_mm_add_epi16(mm_dI, mm_offset), shiftNum); + mm_dI = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, mm_dI)); + } + + _mm_storel_epi64((__m128i *)dst, mm_dI); + _mm_storel_epi64((__m128i *)(dst + dstStride), _mm_unpackhi_epi64(mm_dI, mm_dI)); +#endif +#else mm_dmvx = _mm_loadu_si128((const __m128i *)vX); mm_dmvy = _mm_loadu_si128((const __m128i *)vY); mm_gradx = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)gX)); @@ -391,24 +515,43 @@ void applyPROF_SSE(Pel* dstPel, int dstStride, const Pel* srcPel, int srcStride, #if !JVET_P0057_BDOF_PROF_HARMONIZATION mm_dI = _mm_srai_epi32(_mm_add_epi32(mm_dI, mm_dIoffset), 1); #endif -#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING - mm_dI = _mm_min_epi32(vdImax, _mm_max_epi32(vdImin, mm_dI)); -#endif + mm_dI = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(mm_dI, mm_src), mm_offset), shiftNum); mm_dI = _mm_packs_epi32(_mm_min_epi32(vibdimax, _mm_max_epi32(vibdimin, mm_dI)), vzero); _mm_storel_epi64((__m128i *)dst, mm_dI); +#endif vX += 4; vY += 4; gX += 4; gY += 4; src += 4; dst += 4; } + +#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING +#if USE_AVX2 + dMvX += (dMvStride << 2); + dMvY += (dMvStride << 2); + gradX += (gradStride << 2); + gradY += (gradStride << 2); + srcPel += (srcStride << 2); + dstPel += (dstStride << 2); +#else + dMvX += (dMvStride << 1); + dMvY += (dMvStride << 1); + gradX += (gradStride << 1); + gradY += (gradStride << 1); + srcPel += (srcStride << 1); + dstPel += (dstStride << 1); +#endif +#else dMvX += dMvStride; dMvY += dMvStride; gradX += gradStride; gradY += gradStride; srcPel += srcStride; dstPel += dstStride; +#endif } } +#if !JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING template< X86_VEXT vext, bool l1PROFEnabled = true> void applyBiPROF_SSE(Pel* dst, int dstStride, const Pel* src0, const Pel* src1, int srcStride, int width, int height, const Pel* gradX0, const Pel* gradY0, const Pel* gradX1, const Pel* gradY1, int gradStride, const int* dMvX0, const int* dMvY0, const int* dMvX1, const int* dMvY1, int dMvStride, const int8_t w0, const ClpRng& clpRng) { @@ -526,6 +669,7 @@ void applyBiPROF_SSE(Pel* dst, int dstStride, const Pel* src0, const Pel* src1, dst += dstStride; } } +#endif template< X86_VEXT vext > void roundIntVector_SIMD(int* v, int size, unsigned int nShift, const int dmvLimit) @@ -1295,8 +1439,10 @@ void PelBufferOps::_initPelBufOpsX86() #endif profGradFilter = gradFilter_SSE<vext, false>; applyPROF = applyPROF_SSE<vext>; +#if !JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING applyBiPROF[1] = applyBiPROF_SSE<vext>; applyBiPROF[0] = applyBiPROF_SSE<vext, false>; +#endif roundIntVector = roundIntVector_SIMD<vext>; }