diff --git a/source/Lib/CommonLib/Buffer.cpp b/source/Lib/CommonLib/Buffer.cpp index 2e9c1a42a3935f1e992b83663e57b2b895965168..2454c501ae976e3885395f6efcc3169531db8c9b 100644 --- a/source/Lib/CommonLib/Buffer.cpp +++ b/source/Lib/CommonLib/Buffer.cpp @@ -63,7 +63,7 @@ void addAvgCore( const T* src1, int src1Stride, const T* src2, int src2Stride, T } #if JVET_L0256_BIO -void addBIOAvgCore(const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, const Pel *pGradX0, const Pel *pGradX1, const Pel *pGradY0, const Pel*pGradY1, int gradStride, int width, int height, int tmpx, int tmpy, int shift, int offset, const ClpRng& clpRng) +void addBIOAvgCore(const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, const Pel *gradX0, const Pel *gradX1, const Pel *gradY0, const Pel*gradY1, int gradStride, int width, int height, int tmpx, int tmpy, int shift, int offset, const ClpRng& clpRng) { int b = 0; @@ -71,125 +71,125 @@ void addBIOAvgCore(const Pel* src0, int src0Stride, const Pel* src1, int src1Str { for (int x = 0; x < width; x += 4) { - b = tmpx * (pGradX0[x] - pGradX1[x]) + tmpy * (pGradY0[x] - pGradY1[x]); + b = tmpx * (gradX0[x] - gradX1[x]) + tmpy * (gradY0[x] - gradY1[x]); b = ((b + 1) >> 1); dst[x] = ClipPel((int16_t)rightShift((src0[x] + src1[x] + b + offset), shift), clpRng); - b = tmpx * (pGradX0[x + 1] - pGradX1[x + 1]) + tmpy * (pGradY0[x + 1] - pGradY1[x + 1]); + b = tmpx * (gradX0[x + 1] - gradX1[x + 1]) + tmpy * (gradY0[x + 1] - gradY1[x + 1]); b = ((b + 1) >> 1); dst[x + 1] = ClipPel((int16_t)rightShift((src0[x + 1] + src1[x + 1] + b + offset), shift), clpRng); - b = tmpx * (pGradX0[x + 2] - pGradX1[x + 2]) + tmpy * (pGradY0[x + 2] - pGradY1[x + 2]); + b = tmpx * (gradX0[x + 2] - gradX1[x + 2]) + tmpy * (gradY0[x + 2] - gradY1[x + 2]); b = ((b + 1) >> 1); dst[x + 2] = ClipPel((int16_t)rightShift((src0[x + 2] + src1[x + 2] + b + offset), shift), clpRng); - b = tmpx * (pGradX0[x + 3] - pGradX1[x + 3]) + tmpy * (pGradY0[x + 3] - pGradY1[x + 3]); + b = tmpx * (gradX0[x + 3] - gradX1[x + 3]) + tmpy * (gradY0[x + 3] - gradY1[x + 3]); b = ((b + 1) >> 1); dst[x + 3] = ClipPel((int16_t)rightShift((src0[x + 3] + src1[x + 3] + b + offset), shift), clpRng); } dst += dstStride; src0 += src0Stride; src1 += src1Stride; - pGradX0 += gradStride; pGradX1 += gradStride; pGradY0 += gradStride; pGradY1 += gradStride; + gradX0 += gradStride; gradX1 += gradStride; gradY0 += gradStride; gradY1 += gradStride; } } -void gradFilterCore(Pel* pSrc, int srcStride, int width, int height, int gradStride, Pel* pGradX, Pel* pGradY) +void gradFilterCore(Pel* pSrc, int srcStride, int width, int height, int gradStride, Pel* gradX, Pel* gradY) { - Pel* piSrcTmp = pSrc + srcStride + 1; - Pel* piGradXTmp = pGradX + gradStride + 1; - Pel* piGradYTmp = pGradY + gradStride + 1; + Pel* srcTmp = pSrc + srcStride + 1; + Pel* gradXTmp = gradX + gradStride + 1; + Pel* gradYTmp = gradY + gradStride + 1; - for (int y = 0; y < (height - 2 * JVET_L0256_BIO_EXTEND_SIZE); y++) + for (int y = 0; y < (height - 2 * BIO_EXTEND_SIZE); y++) { - for (int x = 0; x < (width - 2 * JVET_L0256_BIO_EXTEND_SIZE); x++) + for (int x = 0; x < (width - 2 * BIO_EXTEND_SIZE); x++) { - piGradYTmp[x] = (piSrcTmp[x + srcStride] - piSrcTmp[x - srcStride]) >> 4; - piGradXTmp[x] = (piSrcTmp[x + 1] - piSrcTmp[x - 1]) >> 4; + gradYTmp[x] = (srcTmp[x + srcStride] - srcTmp[x - srcStride]) >> 4; + gradXTmp[x] = (srcTmp[x + 1] - srcTmp[x - 1]) >> 4; } - piGradXTmp += gradStride; - piGradYTmp += gradStride; - piSrcTmp += srcStride; + gradXTmp += gradStride; + gradYTmp += gradStride; + srcTmp += srcStride; } - piGradXTmp = pGradX + gradStride + 1; - piGradYTmp = pGradY + gradStride + 1; - for (int y = 0; y < (height - 2 * JVET_L0256_BIO_EXTEND_SIZE); y++) + gradXTmp = gradX + gradStride + 1; + gradYTmp = gradY + gradStride + 1; + for (int y = 0; y < (height - 2 * BIO_EXTEND_SIZE); y++) { - piGradXTmp[-1] = piGradXTmp[0]; - piGradXTmp[width - 2 * JVET_L0256_BIO_EXTEND_SIZE] = piGradXTmp[width - 2 * JVET_L0256_BIO_EXTEND_SIZE - 1]; - piGradXTmp += gradStride; + gradXTmp[-1] = gradXTmp[0]; + gradXTmp[width - 2 * BIO_EXTEND_SIZE] = gradXTmp[width - 2 * BIO_EXTEND_SIZE - 1]; + gradXTmp += gradStride; - piGradYTmp[-1] = piGradYTmp[0]; - piGradYTmp[width - 2 * JVET_L0256_BIO_EXTEND_SIZE] = piGradYTmp[width - 2 * JVET_L0256_BIO_EXTEND_SIZE - 1]; - piGradYTmp += gradStride; + gradYTmp[-1] = gradYTmp[0]; + gradYTmp[width - 2 * BIO_EXTEND_SIZE] = gradYTmp[width - 2 * BIO_EXTEND_SIZE - 1]; + gradYTmp += gradStride; } - piGradXTmp = pGradX + gradStride; - piGradYTmp = pGradY + gradStride; - ::memcpy(piGradXTmp - gradStride, piGradXTmp, sizeof(Pel)*(width)); - ::memcpy(piGradXTmp + (height - 2 * JVET_L0256_BIO_EXTEND_SIZE)*gradStride, piGradXTmp + (height - 2 * JVET_L0256_BIO_EXTEND_SIZE - 1)*gradStride, sizeof(Pel)*(width)); - ::memcpy(piGradYTmp - gradStride, piGradYTmp, sizeof(Pel)*(width)); - ::memcpy(piGradYTmp + (height - 2 * JVET_L0256_BIO_EXTEND_SIZE)*gradStride, piGradYTmp + (height - 2 * JVET_L0256_BIO_EXTEND_SIZE - 1)*gradStride, sizeof(Pel)*(width)); + gradXTmp = gradX + gradStride; + gradYTmp = gradY + gradStride; + ::memcpy(gradXTmp - gradStride, gradXTmp, sizeof(Pel)*(width)); + ::memcpy(gradXTmp + (height - 2 * BIO_EXTEND_SIZE)*gradStride, gradXTmp + (height - 2 * BIO_EXTEND_SIZE - 1)*gradStride, sizeof(Pel)*(width)); + ::memcpy(gradYTmp - gradStride, gradYTmp, sizeof(Pel)*(width)); + ::memcpy(gradYTmp + (height - 2 * BIO_EXTEND_SIZE)*gradStride, gradYTmp + (height - 2 * BIO_EXTEND_SIZE - 1)*gradStride, sizeof(Pel)*(width)); } -void calcBIOParCore(const Pel* pSrcY0Temp, const Pel* pSrcY1Temp, const Pel* pGradX0, const Pel* pGradX1, const Pel* pGradY0, const Pel* pGradY1, int* m_piDotProductTemp1, int* m_piDotProductTemp2, int* m_piDotProductTemp3, int* m_piDotProductTemp5, int* m_piDotProductTemp6, const int iSrc0Stride, const int iSrc1Stride, const int iGradStride, const int iWidthG, const int iHeightG) +void calcBIOParCore(const Pel* srcY0Temp, const Pel* srcY1Temp, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0, const Pel* gradY1, int* dotProductTemp1, int* dotProductTemp2, int* dotProductTemp3, int* dotProductTemp5, int* dotProductTemp6, const int src0Stride, const int src1Stride, const int gradStride, const int widthG, const int heightG) { - for (int y = 0; y < iHeightG; y++) + for (int y = 0; y < heightG; y++) { - for (int x = 0; x < iWidthG; x++) + for (int x = 0; x < widthG; x++) { - int temp = (pSrcY0Temp[x] >> 6) - (pSrcY1Temp[x] >> 6); - int tempX = (pGradX0[x] + pGradX1[x]) >> 3; - int tempY = (pGradY0[x] + pGradY1[x]) >> 3; - m_piDotProductTemp1[x] = tempX * tempX; - m_piDotProductTemp2[x] = tempX * tempY; - m_piDotProductTemp3[x] = -tempX * temp; - m_piDotProductTemp5[x] = tempY * tempY; - m_piDotProductTemp6[x] = -tempY * temp; + int temp = (srcY0Temp[x] >> 6) - (srcY1Temp[x] >> 6); + int tempX = (gradX0[x] + gradX1[x]) >> 3; + int tempY = (gradY0[x] + gradY1[x]) >> 3; + dotProductTemp1[x] = tempX * tempX; + dotProductTemp2[x] = tempX * tempY; + dotProductTemp3[x] = -tempX * temp; + dotProductTemp5[x] = tempY * tempY; + dotProductTemp6[x] = -tempY * temp; } - pSrcY0Temp += iSrc0Stride; - pSrcY1Temp += iSrc1Stride; - pGradX0 += iGradStride; - pGradX1 += iGradStride; - pGradY0 += iGradStride; - pGradY1 += iGradStride; - m_piDotProductTemp1 += iWidthG; - m_piDotProductTemp2 += iWidthG; - m_piDotProductTemp3 += iWidthG; - m_piDotProductTemp5 += iWidthG; - m_piDotProductTemp6 += iWidthG; + srcY0Temp += src0Stride; + srcY1Temp += src1Stride; + gradX0 += gradStride; + gradX1 += gradStride; + gradY0 += gradStride; + gradY1 += gradStride; + dotProductTemp1 += widthG; + dotProductTemp2 += widthG; + dotProductTemp3 += widthG; + dotProductTemp5 += widthG; + dotProductTemp6 += widthG; } } void calcBlkGradientCore(int sx, int sy, int *arraysGx2, int *arraysGxGy, int *arraysGxdI, int *arraysGy2, int *arraysGydI, int &sGx2, int &sGy2, int &sGxGy, int &sGxdI, int &sGydI, int width, int height, int unitSize) { - int *pGx2 = arraysGx2; - int *pGy2 = arraysGy2; - int *pGxGy = arraysGxGy; - int *pGxdI = arraysGxdI; - int *pGydI = arraysGydI; + int *Gx2 = arraysGx2; + int *Gy2 = arraysGy2; + int *GxGy = arraysGxGy; + int *GxdI = arraysGxdI; + int *GydI = arraysGydI; // set to the above row due to JVET_K0485_BIO_EXTEND_SIZE - pGx2 -= (JVET_L0256_BIO_EXTEND_SIZE*width); - pGy2 -= (JVET_L0256_BIO_EXTEND_SIZE*width); - pGxGy -= (JVET_L0256_BIO_EXTEND_SIZE*width); - pGxdI -= (JVET_L0256_BIO_EXTEND_SIZE*width); - pGydI -= (JVET_L0256_BIO_EXTEND_SIZE*width); + Gx2 -= (BIO_EXTEND_SIZE*width); + Gy2 -= (BIO_EXTEND_SIZE*width); + GxGy -= (BIO_EXTEND_SIZE*width); + GxdI -= (BIO_EXTEND_SIZE*width); + GydI -= (BIO_EXTEND_SIZE*width); - for (int y = -JVET_L0256_BIO_EXTEND_SIZE; y < unitSize + JVET_L0256_BIO_EXTEND_SIZE; y++) + for (int y = -BIO_EXTEND_SIZE; y < unitSize + BIO_EXTEND_SIZE; y++) { - for (int x = -JVET_L0256_BIO_EXTEND_SIZE; x < unitSize + JVET_L0256_BIO_EXTEND_SIZE; x++) + for (int x = -BIO_EXTEND_SIZE; x < unitSize + BIO_EXTEND_SIZE; x++) { - sGx2 += pGx2[x]; - sGy2 += pGy2[x]; - sGxGy += pGxGy[x]; - sGxdI += pGxdI[x]; - sGydI += pGydI[x]; + sGx2 += Gx2[x]; + sGy2 += Gy2[x]; + sGxGy += GxGy[x]; + sGxdI += GxdI[x]; + sGydI += GydI[x]; } - pGx2 += width; - pGy2 += width; - pGxGy += width; - pGxdI += width; - pGydI += width; + Gx2 += width; + Gy2 += width; + GxGy += width; + GxdI += width; + GydI += width; } } #endif diff --git a/source/Lib/CommonLib/Buffer.h b/source/Lib/CommonLib/Buffer.h index b8e69315f295aac1b1c4732b06dae3bd5e2e8c5a..a0142743f3efd1b287efa3c039b80e038aeb6db6 100644 --- a/source/Lib/CommonLib/Buffer.h +++ b/source/Lib/CommonLib/Buffer.h @@ -69,9 +69,9 @@ struct PelBufferOps void ( *linTf4 ) ( const Pel* src0, int src0Stride, Pel *dst, int dstStride, int width, int height, int scale, int shift, int offset, const ClpRng& clpRng, bool bClip ); void ( *linTf8 ) ( const Pel* src0, int src0Stride, Pel *dst, int dstStride, int width, int height, int scale, int shift, int offset, const ClpRng& clpRng, bool bClip ); #if JVET_L0256_BIO - void(*addBIOAvg4) (const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, const Pel *pGradX0, const Pel *pGradX1, const Pel *pGradY0, const Pel*pGradY1, int gradStride, int width, int height, int tmpx, int tmpy, int shift, int offset, const ClpRng& clpRng); - void(*bioGradFilter) (Pel* pSrc, int srcStride, int width, int height, int gradStride, Pel* pGradX, Pel* pGradY); - void(*calcBIOPar) (const Pel* pSrcY0Temp, const Pel* pSrcY1Temp, const Pel* pGradX0, const Pel* pGradX1, const Pel* pGradY0, const Pel* pGradY1, int* m_piDotProductTemp1, int* m_piDotProductTemp2, int* m_piDotProductTemp3, int* m_piDotProductTemp5, int* m_piDotProductTemp6, const int iSrc0Stride, const int iSrc1Stride, const int iGradStride, const int iWidthG, const int iHeightG); + void(*addBIOAvg4) (const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, const Pel *gradX0, const Pel *gradX1, const Pel *gradY0, const Pel*gradY1, int gradStride, int width, int height, int tmpx, int tmpy, int shift, int offset, const ClpRng& clpRng); + void(*bioGradFilter) (Pel* pSrc, int srcStride, int width, int height, int gradStride, Pel* gradX, Pel* gradY); + void(*calcBIOPar) (const Pel* srcY0Temp, const Pel* srcY1Temp, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0, const Pel* gradY1, int* dotProductTemp1, int* dotProductTemp2, int* dotProductTemp3, int* dotProductTemp5, int* dotProductTemp6, const int src0Stride, const int src1Stride, const int gradStride, const int widthG, const int heightG); void(*calcBlkGradient)(int sx, int sy, int *arraysGx2, int *arraysGxGy, int *arraysGxdI, int *arraysGy2, int *arraysGydI, int &sGx2, int &sGy2, int &sGxGy, int &sGxdI, int &sGydI, int width, int height, int unitSize); #endif #if ENABLE_SIMD_OPT_GBI diff --git a/source/Lib/CommonLib/CommonDef.h b/source/Lib/CommonLib/CommonDef.h index 98174dbeb777a52e254f4a21c4897c03b7cc0f68..41d8be8c326c47a95fdbd4bf4792a44a5f2bd618 100644 --- a/source/Lib/CommonLib/CommonDef.h +++ b/source/Lib/CommonLib/CommonDef.h @@ -297,6 +297,12 @@ static const int MAX_NUM_GT2_BINS_4x4SUBBLOCK = 4; ///< max static const int MAX_NUM_REG_BINS_2x2SUBBLOCK = 8; ///< max number of context-coded bins (incl. gt2 bins) per 2x2 subblock (chroma) static const int MAX_NUM_GT2_BINS_2x2SUBBLOCK = 2; ///< max number of gt2 bins per 2x2 subblock (chroma) #endif + +#if JVET_L0256_BIO +static const int BIO_EXTEND_SIZE = 1; +static const int BIO_TEMP_BUFFER_SIZE = (MAX_CU_SIZE + 2 * BIO_EXTEND_SIZE) * (MAX_CU_SIZE + 2 * BIO_EXTEND_SIZE); +#endif + #if JVET_L0646_GBI static const int GBI_NUM = 5; ///< the number of weight options static const int GBI_DEFAULT = ((uint8_t)(GBI_NUM >> 1)); ///< Default weighting index representing for w=0.5 diff --git a/source/Lib/CommonLib/InterPrediction.cpp b/source/Lib/CommonLib/InterPrediction.cpp index 4720c41efb5bcff76a85033f6ab5bca045bcc612..d980b24cc05c874241e46be5acb47342d03830ad 100644 --- a/source/Lib/CommonLib/InterPrediction.cpp +++ b/source/Lib/CommonLib/InterPrediction.cpp @@ -56,10 +56,10 @@ InterPrediction::InterPrediction() , m_maxCompIDToPred ( MAX_NUM_COMPONENT ) , m_pcRdCost ( nullptr ) #if JVET_L0256_BIO -, m_pGradX0(nullptr) -, m_pGradY0(nullptr) -, m_pGradX1(nullptr) -, m_pGradY1(nullptr) +, m_gradX0(nullptr) +, m_gradY0(nullptr) +, m_gradX1(nullptr) +, m_gradY1(nullptr) , m_subPuMC(false) #endif { @@ -118,10 +118,10 @@ void InterPrediction::destroy() } #if JVET_L0256_BIO - xFree(m_pGradX0); m_pGradX0 = nullptr; - xFree(m_pGradY0); m_pGradY0 = nullptr; - xFree(m_pGradX1); m_pGradX1 = nullptr; - xFree(m_pGradY1); m_pGradY1 = nullptr; + xFree(m_gradX0); m_gradX0 = nullptr; + xFree(m_gradY0); m_gradY0 = nullptr; + xFree(m_gradX1); m_gradX1 = nullptr; + xFree(m_gradY1); m_gradY1 = nullptr; #endif } @@ -142,8 +142,8 @@ void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chromaFormatIDC ) for( uint32_t c = 0; c < MAX_NUM_COMPONENT; c++ ) { #if JVET_L0256_BIO - int extWidth = MAX_CU_SIZE + (2 * JVET_L0256_BIO_EXTEND_SIZE + 2) + 16; - int extHeight = MAX_CU_SIZE + (2 * JVET_L0256_BIO_EXTEND_SIZE + 2) + 1; + int extWidth = MAX_CU_SIZE + (2 * BIO_EXTEND_SIZE + 2) + 16; + int extHeight = MAX_CU_SIZE + (2 * BIO_EXTEND_SIZE + 2) + 1; #else int extWidth = MAX_CU_SIZE + 16; int extHeight = MAX_CU_SIZE + 1; @@ -169,10 +169,10 @@ void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chromaFormatIDC ) m_iRefListIdx = -1; #if JVET_L0256_BIO - m_pGradX0 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE); - m_pGradY0 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE); - m_pGradX1 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE); - m_pGradY1 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE); + m_gradX0 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE); + m_gradY0 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE); + m_gradX1 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE); + m_gradY1 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE); #endif } @@ -336,7 +336,7 @@ void InterPrediction::xSubPuMC( PredictionUnit& pu, PelUnitBuf& predBuf, const R void InterPrediction::xPredInterUni(const PredictionUnit& pu, const RefPicList& eRefPicList, PelUnitBuf& pcYuvPred, const bool& bi #if JVET_L0256_BIO - ,const bool& bBIOApplied /*=false*/ + ,const bool& bioApplied /*=false*/ #endif ) { @@ -368,7 +368,7 @@ void InterPrediction::xPredInterUni(const PredictionUnit& pu, const RefPicList& if ( pu.cu->affine ) { #if JVET_L0256_BIO - CHECK( bBIOApplied, "BIO is not allowed with affine" ); + CHECK( bioApplied, "BIO is not allowed with affine" ); #endif xPredAffineBlk( compID, pu, pu.cu->slice->getRefPic( eRefPicList, iRefIdx ), mv, pcYuvPred, bi, pu.cu->slice->clpRng( compID ) ); } @@ -376,7 +376,7 @@ void InterPrediction::xPredInterUni(const PredictionUnit& pu, const RefPicList& { xPredInterBlk( compID, pu, pu.cu->slice->getRefPic( eRefPicList, iRefIdx ), mv[0], pcYuvPred, bi, pu.cu->slice->clpRng( compID ) #if JVET_L0256_BIO - ,bBIOApplied + ,bioApplied #endif ); } @@ -389,31 +389,31 @@ void InterPrediction::xPredInterBi(PredictionUnit& pu, PelUnitBuf &pcYuvPred) const Slice &slice = *pu.cs->slice; #if JVET_L0256_BIO - bool bBIOApplied = false; + bool bioApplied = false; if (pu.cs->sps->getSpsNext().getUseBIO()) { if (pu.cu->affine || m_subPuMC) { - bBIOApplied = false; + bioApplied = false; } else { - const bool bBIOcheck0 = !(pps.getWPBiPred() && slice.getSliceType() == B_SLICE); - const bool bBIOcheck1 = !(pps.getUseWP() && slice.getSliceType() == P_SLICE); - if (bBIOcheck0 - && bBIOcheck1 + const bool biocheck0 = !(pps.getWPBiPred() && slice.getSliceType() == B_SLICE); + const bool biocheck1 = !(pps.getUseWP() && slice.getSliceType() == P_SLICE); + if (biocheck0 + && biocheck1 && PU::isBiPredFromDifferentDir(pu) && !(pu.Y().height == 4 || (pu.Y().width == 4 && pu.Y().height == 8)) ) { - bBIOApplied = true; + bioApplied = true; } } #if JVET_L0646_GBI - if (pu.cu->cs->sps->getSpsNext().getUseGBi() && bBIOApplied && pu.cu->GBiIdx != GBI_DEFAULT) + if (pu.cu->cs->sps->getSpsNext().getUseGBi() && bioApplied && pu.cu->GBiIdx != GBI_DEFAULT) { - bBIOApplied = false; + bioApplied = false; } #endif } @@ -439,7 +439,7 @@ void InterPrediction::xPredInterBi(PredictionUnit& pu, PelUnitBuf &pcYuvPred) { xPredInterUni ( pu, eRefPicList, pcMbBuf, true #if JVET_L0256_BIO - ,bBIOApplied + ,bioApplied #endif ); } @@ -474,7 +474,7 @@ void InterPrediction::xPredInterBi(PredictionUnit& pu, PelUnitBuf &pcYuvPred) else { #if JVET_L0256_BIO - xWeightedAverage( pu, srcPred0, srcPred1, pcYuvPred, slice.getSPS()->getBitDepths(), slice.clpRngs(), bBIOApplied ); + xWeightedAverage( pu, srcPred0, srcPred1, pcYuvPred, slice.getSPS()->getBitDepths(), slice.clpRngs(), bioApplied ); #else xWeightedAverage( pu, srcPred0, srcPred1, pcYuvPred, slice.getSPS()->getBitDepths(), slice.clpRngs() ); #endif @@ -483,7 +483,7 @@ void InterPrediction::xPredInterBi(PredictionUnit& pu, PelUnitBuf &pcYuvPred) void InterPrediction::xPredInterBlk ( const ComponentID& compID, const PredictionUnit& pu, const Picture* refPic, const Mv& _mv, PelUnitBuf& dstPic, const bool& bi, const ClpRng& clpRng #if JVET_L0256_BIO - ,const bool& bBIOApplied /*=false*/ + ,const bool& bioApplied /*=false*/ #endif ) { @@ -533,10 +533,10 @@ void InterPrediction::xPredInterBlk ( const ComponentID& compID, const Predictio Pel *backupDstBufPtr = dstBuf.buf; int backupDstBufStride = dstBuf.stride; - if (bBIOApplied && compID == COMPONENT_Y) + if (bioApplied && compID == COMPONENT_Y) { - width = width + 2 * JVET_L0256_BIO_EXTEND_SIZE + 2; - height = height + 2 * JVET_L0256_BIO_EXTEND_SIZE + 2; + width = width + 2 * BIO_EXTEND_SIZE + 2; + height = height + 2 * BIO_EXTEND_SIZE + 2; // change MC output dstBuf.stride = width; @@ -582,7 +582,7 @@ void InterPrediction::xPredInterBlk ( const ComponentID& compID, const Predictio JVET_J0090_SET_CACHE_ENABLE( true ); } #if JVET_L0256_BIO - if (bBIOApplied && compID == COMPONENT_Y) + if (bioApplied && compID == COMPONENT_Y) { refBuf.buf = refBuf.buf - refBuf.stride - 1; dstBuf.buf = m_filteredBlockTmp[2 + m_iRefListIdx][compID] + dstBuf.stride + 1; @@ -744,48 +744,48 @@ int getMSB( unsigned x ) } #if JVET_L0256_BIO -void InterPrediction::applyBiOptFlow(const PredictionUnit &pu, const CPelUnitBuf &pcYuvSrc0, const CPelUnitBuf &pcYuvSrc1, const int &iRefIdx0, const int &iRefIdx1, PelUnitBuf &pcYuvDst, const BitDepths &clipBitDepths) +void InterPrediction::applyBiOptFlow(const PredictionUnit &pu, const CPelUnitBuf &yuvSrc0, const CPelUnitBuf &yuvSrc1, const int &refIdx0, const int &refIdx1, PelUnitBuf &yuvDst, const BitDepths &clipBitDepths) { - const int iHeight = pcYuvDst.Y().height; - const int iWidth = pcYuvDst.Y().width; - int iHeightG = iHeight + 2 * JVET_L0256_BIO_EXTEND_SIZE; - int iWidthG = iWidth + 2 * JVET_L0256_BIO_EXTEND_SIZE; - int offsetPos = iWidthG*JVET_L0256_BIO_EXTEND_SIZE + JVET_L0256_BIO_EXTEND_SIZE; - - Pel* pGradX0 = m_pGradX0; - Pel* pGradX1 = m_pGradX1; - Pel* pGradY0 = m_pGradY0; - Pel* pGradY1 = m_pGradY1; - - int stridePredMC = iWidthG + 2; - const Pel* pSrcY0 = m_filteredBlockTmp[2][COMPONENT_Y] + stridePredMC + 1; - const Pel* pSrcY1 = m_filteredBlockTmp[3][COMPONENT_Y] + stridePredMC + 1; - const int iSrc0Stride = stridePredMC; - const int iSrc1Stride = stridePredMC; - - Pel* pDstY = pcYuvDst.Y().buf; - const int iDstStride = pcYuvDst.Y().stride; - const Pel* pSrcY0Temp = pSrcY0; - const Pel* pSrcY1Temp = pSrcY1; + const int height = yuvDst.Y().height; + const int width = yuvDst.Y().width; + int heightG = height + 2 * BIO_EXTEND_SIZE; + int widthG = width + 2 * BIO_EXTEND_SIZE; + int offsetPos = widthG*BIO_EXTEND_SIZE + BIO_EXTEND_SIZE; + + Pel* gradX0 = m_gradX0; + Pel* gradX1 = m_gradX1; + Pel* gradY0 = m_gradY0; + Pel* gradY1 = m_gradY1; + + int stridePredMC = widthG + 2; + const Pel* srcY0 = m_filteredBlockTmp[2][COMPONENT_Y] + stridePredMC + 1; + const Pel* srcY1 = m_filteredBlockTmp[3][COMPONENT_Y] + stridePredMC + 1; + const int src0Stride = stridePredMC; + const int src1Stride = stridePredMC; + + Pel* dstY = yuvDst.Y().buf; + const int dstStride = yuvDst.Y().stride; + const Pel* srcY0Temp = srcY0; + const Pel* srcY1Temp = srcY1; for (int refList = 0; refList < NUM_REF_PIC_LIST_01; refList++) { Pel* dstTempPtr = m_filteredBlockTmp[2 + refList][COMPONENT_Y] + stridePredMC + 1; - Pel* pGradY = (refList == 0) ? m_pGradY0 : m_pGradY1; - Pel* pGradX = (refList == 0) ? m_pGradX0 : m_pGradX1; + Pel* gradY = (refList == 0) ? m_gradY0 : m_gradY1; + Pel* gradX = (refList == 0) ? m_gradX0 : m_gradX1; - g_pelBufOP.bioGradFilter(dstTempPtr, stridePredMC, iWidthG, iHeightG, iWidthG, pGradX, pGradY); - Pel* pcPadStr = m_filteredBlockTmp[2 + refList][COMPONENT_Y] + 2 * stridePredMC + 2; - for (int y = 0; y< iHeight; y++) + g_pelBufOP.bioGradFilter(dstTempPtr, stridePredMC, widthG, heightG, widthG, gradX, gradY); + Pel* padStr = m_filteredBlockTmp[2 + refList][COMPONENT_Y] + 2 * stridePredMC + 2; + for (int y = 0; y< height; y++) { - pcPadStr[-1] = pcPadStr[0]; - pcPadStr[iWidth] = pcPadStr[iWidth - 1]; - pcPadStr += stridePredMC; + padStr[-1] = padStr[0]; + padStr[width] = padStr[width - 1]; + padStr += stridePredMC; } - pcPadStr = m_filteredBlockTmp[2 + refList][COMPONENT_Y] + 2 * stridePredMC + 1; - ::memcpy(pcPadStr - stridePredMC, pcPadStr, sizeof(Pel)*(iWidthG)); - ::memcpy(pcPadStr + iHeight*stridePredMC, pcPadStr + (iHeight - 1)*stridePredMC, sizeof(Pel)*(iWidthG)); + padStr = m_filteredBlockTmp[2 + refList][COMPONENT_Y] + 2 * stridePredMC + 1; + ::memcpy(padStr - stridePredMC, padStr, sizeof(Pel)*(widthG)); + ::memcpy(padStr + height*stridePredMC, padStr + (height - 1)*stridePredMC, sizeof(Pel)*(widthG)); } const ClpRng& clpRng = pu.cu->cs->slice->clpRng(COMPONENT_Y); @@ -794,20 +794,20 @@ void InterPrediction::applyBiOptFlow(const PredictionUnit &pu, const CPelUnitBuf const int offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS; const int limit = ((int)1 << (4 + IF_INTERNAL_PREC - bitDepth - 5)); - int* m_piDotProductTemp1 = m_piDotProduct1; - int* m_piDotProductTemp2 = m_piDotProduct2; - int* m_piDotProductTemp3 = m_piDotProduct3; - int* m_piDotProductTemp5 = m_piDotProduct5; - int* m_piDotProductTemp6 = m_piDotProduct6; + int* dotProductTemp1 = m_dotProduct1; + int* dotProductTemp2 = m_dotProduct2; + int* dotProductTemp3 = m_dotProduct3; + int* dotProductTemp5 = m_dotProduct5; + int* dotProductTemp6 = m_dotProduct6; - g_pelBufOP.calcBIOPar(pSrcY0Temp, pSrcY1Temp, pGradX0, pGradX1, pGradY0, pGradY1, m_piDotProductTemp1, m_piDotProductTemp2, m_piDotProductTemp3, m_piDotProductTemp5, m_piDotProductTemp6, iSrc0Stride, iSrc1Stride, iWidthG, iWidthG, iHeightG); + g_pelBufOP.calcBIOPar(srcY0Temp, srcY1Temp, gradX0, gradX1, gradY0, gradY1, dotProductTemp1, dotProductTemp2, dotProductTemp3, dotProductTemp5, dotProductTemp6, src0Stride, src1Stride, widthG, widthG, heightG); - int xUnit = (iWidth >> 2); - int yUnit = (iHeight >> 2); + int xUnit = (width >> 2); + int yUnit = (height >> 2); - Pel *pDstY0 = pDstY; - pGradX0 = m_pGradX0; pGradX1 = m_pGradX1; - pGradY0 = m_pGradY0; pGradY1 = m_pGradY1; + Pel *dstY0 = dstY; + gradX0 = m_gradX0; gradX1 = m_gradX1; + gradY0 = m_gradY0; gradY1 = m_gradY1; for (int yu = 0; yu < yUnit; yu++) { @@ -815,23 +815,23 @@ void InterPrediction::applyBiOptFlow(const PredictionUnit &pu, const CPelUnitBuf { if (m_bioPredSubBlkDist[yu*xUnit + xu] < m_bioSubBlkDistThres) { - pSrcY0Temp = pSrcY0 + (stridePredMC + 1) + ((yu*iSrc0Stride + xu) << 2); - pSrcY1Temp = pSrcY1 + (stridePredMC + 1) + ((yu*iSrc1Stride + xu) << 2); - pDstY0 = pDstY + ((yu*iDstStride + xu) << 2); - g_pelBufOP.addAvg4(pSrcY0Temp, iSrc0Stride, pSrcY1Temp, iSrc1Stride, pDstY0, iDstStride, (1 << 2), (1 << 2), shiftNum, offset, clpRng); + srcY0Temp = srcY0 + (stridePredMC + 1) + ((yu*src0Stride + xu) << 2); + srcY1Temp = srcY1 + (stridePredMC + 1) + ((yu*src1Stride + xu) << 2); + dstY0 = dstY + ((yu*dstStride + xu) << 2); + g_pelBufOP.addAvg4(srcY0Temp, src0Stride, srcY1Temp, src1Stride, dstY0, dstStride, (1 << 2), (1 << 2), shiftNum, offset, clpRng); continue; } int sGxdI = 0, sGydI = 0, sGxGy = 0, sGx2 = 0, sGy2 = 0; int tmpx = 0, tmpy = 0; - m_piDotProductTemp1 = m_piDotProduct1 + offsetPos + ((yu*iWidthG + xu) << 2); - m_piDotProductTemp2 = m_piDotProduct2 + offsetPos + ((yu*iWidthG + xu) << 2); - m_piDotProductTemp3 = m_piDotProduct3 + offsetPos + ((yu*iWidthG + xu) << 2); - m_piDotProductTemp5 = m_piDotProduct5 + offsetPos + ((yu*iWidthG + xu) << 2); - m_piDotProductTemp6 = m_piDotProduct6 + offsetPos + ((yu*iWidthG + xu) << 2); + dotProductTemp1 = m_dotProduct1 + offsetPos + ((yu*widthG + xu) << 2); + dotProductTemp2 = m_dotProduct2 + offsetPos + ((yu*widthG + xu) << 2); + dotProductTemp3 = m_dotProduct3 + offsetPos + ((yu*widthG + xu) << 2); + dotProductTemp5 = m_dotProduct5 + offsetPos + ((yu*widthG + xu) << 2); + dotProductTemp6 = m_dotProduct6 + offsetPos + ((yu*widthG + xu) << 2); - g_pelBufOP.calcBlkGradient(xu << 2, yu << 2, m_piDotProductTemp1, m_piDotProductTemp2, m_piDotProductTemp3, m_piDotProductTemp5, m_piDotProductTemp6, sGx2, sGy2, sGxGy, sGxdI, sGydI, iWidthG, iHeightG, (1 << 2)); + g_pelBufOP.calcBlkGradient(xu << 2, yu << 2, dotProductTemp1, dotProductTemp2, dotProductTemp3, dotProductTemp5, dotProductTemp6, sGx2, sGy2, sGxGy, sGxdI, sGydI, widthG, heightG, (1 << 2)); if (sGx2 > 0) { @@ -848,15 +848,15 @@ void InterPrediction::applyBiOptFlow(const PredictionUnit &pu, const CPelUnitBuf tmpy = Clip3(-limit, limit, tmpy); } - pSrcY0Temp = pSrcY0 + (stridePredMC + 1) + ((yu*iSrc0Stride + xu) << 2); - pSrcY1Temp = pSrcY1 + (stridePredMC + 1) + ((yu*iSrc0Stride + xu) << 2); - pGradX0 = m_pGradX0 + offsetPos + ((yu*iWidthG + xu) << 2); - pGradX1 = m_pGradX1 + offsetPos + ((yu*iWidthG + xu) << 2); - pGradY0 = m_pGradY0 + offsetPos + ((yu*iWidthG + xu) << 2); - pGradY1 = m_pGradY1 + offsetPos + ((yu*iWidthG + xu) << 2); + srcY0Temp = srcY0 + (stridePredMC + 1) + ((yu*src0Stride + xu) << 2); + srcY1Temp = srcY1 + (stridePredMC + 1) + ((yu*src0Stride + xu) << 2); + gradX0 = m_gradX0 + offsetPos + ((yu*widthG + xu) << 2); + gradX1 = m_gradX1 + offsetPos + ((yu*widthG + xu) << 2); + gradY0 = m_gradY0 + offsetPos + ((yu*widthG + xu) << 2); + gradY1 = m_gradY1 + offsetPos + ((yu*widthG + xu) << 2); - pDstY0 = pDstY + ((yu*iDstStride + xu) << 2); - g_pelBufOP.addBIOAvg4(pSrcY0Temp, iSrc0Stride, pSrcY1Temp, iSrc1Stride, pDstY0, iDstStride, pGradX0, pGradX1, pGradY0, pGradY1, iWidthG, (1 << 2), (1 << 2), (int)tmpx, (int)tmpy, shiftNum, offset, clpRng); + dstY0 = dstY + ((yu*dstStride + xu) << 2); + g_pelBufOP.addBIOAvg4(srcY0Temp, src0Stride, srcY1Temp, src1Stride, dstY0, dstStride, gradX0, gradX1, gradY0, gradY1, widthG, (1 << 2), (1 << 2), (int)tmpx, (int)tmpy, shiftNum, offset, clpRng); } // xu } // yu } @@ -957,7 +957,7 @@ bool InterPrediction::xCalcBiPredSubBlkDist(const PredictionUnit &pu, const Pel* #endif #if JVET_L0256_BIO -void InterPrediction::xWeightedAverage(const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths, const ClpRngs& clpRngs, const bool& bBIOApplied ) +void InterPrediction::xWeightedAverage(const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths, const ClpRngs& clpRngs, const bool& bioApplied ) #else void InterPrediction::xWeightedAverage( const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths, const ClpRngs& clpRngs ) #endif @@ -971,17 +971,17 @@ void InterPrediction::xWeightedAverage( const PredictionUnit& pu, const CPelUnit if( pu.cu->GBiIdx != GBI_DEFAULT ) { #if JVET_L0256_BIO - CHECK(bBIOApplied, "GBi is disallowed with BIO"); + CHECK(bioApplied, "GBi is disallowed with BIO"); #endif pcYuvDst.addWeightedAvg(pcYuvSrc0, pcYuvSrc1, clpRngs, pu.cu->GBiIdx); return; } #endif #if JVET_L0256_BIO - if (bBIOApplied) + if (bioApplied) { - const int src0Stride = pu.lwidth() + 2 * JVET_L0256_BIO_EXTEND_SIZE + 2; - const int src1Stride = pu.lwidth() + 2 * JVET_L0256_BIO_EXTEND_SIZE + 2; + const int src0Stride = pu.lwidth() + 2 * BIO_EXTEND_SIZE + 2; + const int src1Stride = pu.lwidth() + 2 * BIO_EXTEND_SIZE + 2; const Pel* pSrcY0 = m_filteredBlockTmp[2][COMPONENT_Y] + 2 * src0Stride + 2; const Pel* pSrcY1 = m_filteredBlockTmp[3][COMPONENT_Y] + 2 * src1Stride + 2; @@ -995,7 +995,7 @@ void InterPrediction::xWeightedAverage( const PredictionUnit& pu, const CPelUnit pcYuvDst.bufs[0].addAvg(CPelBuf(pSrcY0, src0Stride, pu.lumaSize()), CPelBuf(pSrcY1, src1Stride, pu.lumaSize()), clpRngs.comp[0]); } } - pcYuvDst.addAvg(pcYuvSrc0, pcYuvSrc1, clpRngs, bBIOApplied); + pcYuvDst.addAvg(pcYuvSrc0, pcYuvSrc1, clpRngs, bioApplied); #else pcYuvDst.addAvg( pcYuvSrc0, pcYuvSrc1, clpRngs ); #endif diff --git a/source/Lib/CommonLib/InterPrediction.h b/source/Lib/CommonLib/InterPrediction.h index 01b13c84e527ecac4a179f20a389041dd64dba22..0bff85b70a5f525b26f8734c6e3b2c682cf48846 100644 --- a/source/Lib/CommonLib/InterPrediction.h +++ b/source/Lib/CommonLib/InterPrediction.h @@ -60,10 +60,6 @@ class Mv; // Class definition // ==================================================================================================================== -#if JVET_L0256_BIO -#define BIO_TEMP_BUFFER_SIZE ( MAX_CU_SIZE+2*JVET_L0256_BIO_EXTEND_SIZE ) * ( MAX_CU_SIZE+2*JVET_L0256_BIO_EXTEND_SIZE ) -#endif - class InterPrediction : public WeightPrediction { private: @@ -73,11 +69,11 @@ private: Distortion m_bioSubBlkDistThres; Distortion m_bioPredSubBlkDist[MAX_NUM_PARTS_IN_CTU]; - int m_piDotProduct1[BIO_TEMP_BUFFER_SIZE]; - int m_piDotProduct2[BIO_TEMP_BUFFER_SIZE]; - int m_piDotProduct3[BIO_TEMP_BUFFER_SIZE]; - int m_piDotProduct5[BIO_TEMP_BUFFER_SIZE]; - int m_piDotProduct6[BIO_TEMP_BUFFER_SIZE]; + int m_dotProduct1[BIO_TEMP_BUFFER_SIZE]; + int m_dotProduct2[BIO_TEMP_BUFFER_SIZE]; + int m_dotProduct3[BIO_TEMP_BUFFER_SIZE]; + int m_dotProduct5[BIO_TEMP_BUFFER_SIZE]; + int m_dotProduct6[BIO_TEMP_BUFFER_SIZE]; #endif protected: @@ -97,32 +93,32 @@ protected: int m_iRefListIdx; #if JVET_L0256_BIO - Pel* m_pGradX0; - Pel* m_pGradY0; - Pel* m_pGradX1; - Pel* m_pGradY1; + Pel* m_gradX0; + Pel* m_gradY0; + Pel* m_gradX1; + Pel* m_gradY1; bool m_subPuMC; int rightShiftMSB(int numer, int denom); - void applyBiOptFlow(const PredictionUnit &pu, const CPelUnitBuf &pcYuvSrc0, const CPelUnitBuf &pcYuvSrc1, const int &iRefIdx0, const int &iRefIdx1, PelUnitBuf &pcYuvDst, const BitDepths &clipBitDepths); - bool xCalcBiPredSubBlkDist(const PredictionUnit &pu, const Pel* pYuvSrc0, const int src0Stride, const Pel* pYuvSrc1, const int src1Stride, const BitDepths &clipBitDepths); + void applyBiOptFlow(const PredictionUnit &pu, const CPelUnitBuf &yuvSrc0, const CPelUnitBuf &yuvSrc1, const int &refIdx0, const int &refIdx1, PelUnitBuf &yuvDst, const BitDepths &clipBitDepths); + bool xCalcBiPredSubBlkDist(const PredictionUnit &pu, const Pel* yuvSrc0, const int src0Stride, const Pel* yuvSrc1, const int src1Stride, const BitDepths &clipBitDepths); void bioSampleExtendBilinearFilter(Pel const* src, int srcStride, Pel *dst, int dstStride, int width, int height, int dim, int fracX, int fracY, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng); #endif void xPredInterUni ( const PredictionUnit& pu, const RefPicList& eRefPicList, PelUnitBuf& pcYuvPred, const bool& bi #if JVET_L0256_BIO - ,const bool& bBIOApplied = false + ,const bool& bioApplied = false #endif ); void xPredInterBi ( PredictionUnit& pu, PelUnitBuf &pcYuvPred ); void xPredInterBlk ( const ComponentID& compID, const PredictionUnit& pu, const Picture* refPic, const Mv& _mv, PelUnitBuf& dstPic, const bool& bi, const ClpRng& clpRng #if JVET_L0256_BIO - ,const bool& bBIOApplied = false + ,const bool& bioApplied = false #endif ); #if JVET_L0256_BIO - void xWeightedAverage ( const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths, const ClpRngs& clpRngs, const bool& bBIOApplied ); + void xWeightedAverage ( const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths, const ClpRngs& clpRngs, const bool& bioApplied ); #else void xWeightedAverage ( const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths, const ClpRngs& clpRngs ); #endif diff --git a/source/Lib/CommonLib/RdCost.cpp b/source/Lib/CommonLib/RdCost.cpp index 85306088a19032ef2405bd1742c08198b66f32dc..572eacb8ba30219900a0137a0d55f05157511430 100644 --- a/source/Lib/CommonLib/RdCost.cpp +++ b/source/Lib/CommonLib/RdCost.cpp @@ -323,7 +323,7 @@ void RdCost::setDistParam( DistParam &rcDP, const CPelBuf &org, const CPelBuf &c } #if JVET_L0256_BIO -void RdCost::setDistParam( DistParam &rcDP, const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShiftMode, int step, bool useHadamard, bool bBIOApplied ) +void RdCost::setDistParam( DistParam &rcDP, const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShiftMode, int step, bool useHadamard, bool bioApplied ) #else void RdCost::setDistParam( DistParam &rcDP, const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShiftMode, int step, bool useHadamard ) #endif @@ -348,7 +348,7 @@ void RdCost::setDistParam( DistParam &rcDP, const Pel* pOrg, const Pel* piRefY, CHECK( useHadamard || rcDP.useMR || subShiftMode > 0, "only used in xDirectMCCost with these default parameters (so far...)" ); #if JVET_L0256_BIO - if ( bBIOApplied ) + if ( bioApplied ) { rcDP.distFunc = m_afpDistortFunc[ DF_SAD_INTERMEDIATE_BITDEPTH ]; return; diff --git a/source/Lib/CommonLib/RdCost.h b/source/Lib/CommonLib/RdCost.h index 07b9f69395c7e622cb6499acf201c4d16603df79..3c95f90adf54158c5cfce66572b7dedc82613f7d 100644 --- a/source/Lib/CommonLib/RdCost.h +++ b/source/Lib/CommonLib/RdCost.h @@ -102,9 +102,6 @@ private: // for distortion static FpDistFunc m_afpDistortFunc[DF_TOTAL_FUNCTIONS]; // [eDFunc] -#if JVET_L0256_BIO - -#endif CostMode m_costMode; double m_distortionWeight[MAX_NUM_COMPONENT]; // only chroma values are used. double m_dLambda; @@ -158,7 +155,7 @@ public: void setDistParam( DistParam &rcDP, const CPelBuf &org, const Pel* piRefY , int iRefStride, int bitDepth, ComponentID compID, int subShiftMode = 0, int step = 1, bool useHadamard = false ); void setDistParam( DistParam &rcDP, const CPelBuf &org, const CPelBuf &cur, int bitDepth, ComponentID compID, bool useHadamard = false ); #if JVET_L0256_BIO - void setDistParam( DistParam &rcDP, const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShiftMode = 0, int step = 1, bool useHadamard = false, bool bBIOApplied = false ); + void setDistParam( DistParam &rcDP, const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShiftMode = 0, int step = 1, bool useHadamard = false, bool bioApplied = false ); #else void setDistParam( DistParam &rcDP, const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShiftMode = 0, int step = 1, bool useHadamard = false ); #endif @@ -273,7 +270,7 @@ private: static Distortion xGetSAD_SIMD ( const DistParam& pcDtParam ); template< int iWidth, X86_VEXT vext > static Distortion xGetSAD_NxN_SIMD( const DistParam& pcDtParam ); -#if JVET_L0256_BIO +#if ENABLE_SIMD_OPT_BIO template< X86_VEXT vext > static Distortion xGetSAD_IBD_SIMD(const DistParam& pcDtParam); #endif diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h index af2cf490b7a86a5545535951e2bb5a30a7ca446b..e6ad40144ce200256e7f02b07bc582741f44c798 100644 --- a/source/Lib/CommonLib/TypeDef.h +++ b/source/Lib/CommonLib/TypeDef.h @@ -76,9 +76,6 @@ #define L0074_SUBBLOCK_DEBLOCKING 1 #define JVET_L0256_BIO 1 -#if JVET_L0256_BIO -#define JVET_L0256_BIO_EXTEND_SIZE 1 -#endif #define JVET_L0646_GBI 1 // Generalized bi-prediction (GBi) @@ -258,6 +255,7 @@ #if ENABLE_SIMD_OPT_BUFFER #define ENABLE_SIMD_OPT_GBI 1 ///< SIMD optimization for GBi #endif +#define ENABLE_SIMD_OPT_BIO ( JVET_L0256_BIO && ENABLE_SIMD_OPT ) ///< SIMD optimization for BIO // End of SIMD optimizations diff --git a/source/Lib/CommonLib/x86/BufferX86.h b/source/Lib/CommonLib/x86/BufferX86.h index 1add0996f854645da46a4f205a1c0e6076883f76..92f52655a47940016d708dadc68a152a00a1315b 100644 --- a/source/Lib/CommonLib/x86/BufferX86.h +++ b/source/Lib/CommonLib/x86/BufferX86.h @@ -128,9 +128,9 @@ void addAvg_SSE( const int16_t* src0, int src0Stride, const int16_t* src1, int s } } -#if JVET_L0256_BIO +#if ENABLE_SIMD_OPT_BIO template< X86_VEXT vext > -void addBIOAvg4_SSE(const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, const Pel *pGradX0, const Pel *pGradX1, const Pel *pGradY0, const Pel*pGradY1, int iGradStride, int width, int height, int tmpx, int tmpy, int shift, int offset, const ClpRng& clpRng) +void addBIOAvg4_SSE(const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, const Pel *gradX0, const Pel *gradX1, const Pel *gradY0, const Pel*gradY1, int gradStride, int width, int height, int tmpx, int tmpy, int shift, int offset, const ClpRng& clpRng) { __m128i mm_tmpx = _mm_unpacklo_epi64(_mm_set1_epi16(tmpx), _mm_set1_epi16(tmpy)); __m128i mm_boffset = _mm_set1_epi32(1); @@ -142,8 +142,8 @@ void addBIOAvg4_SSE(const Pel* src0, int src0Stride, const Pel* src1, int src1St { for (int x = 0; x < width; x += 4) { - __m128i mm_a = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(pGradX0 + x)), _mm_loadl_epi64((const __m128i *)(pGradY0 + x))); - __m128i mm_b = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(pGradX1 + x)), _mm_loadl_epi64((const __m128i *)(pGradY1 + x))); + __m128i mm_a = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(gradX0 + x)), _mm_loadl_epi64((const __m128i *)(gradY0 + x))); + __m128i mm_b = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(gradX1 + x)), _mm_loadl_epi64((const __m128i *)(gradY1 + x))); mm_a = _mm_sub_epi16(mm_a, mm_b); mm_b = _mm_mulhi_epi16(mm_a, mm_tmpx); mm_a = _mm_mullo_epi16(mm_a, mm_tmpx); @@ -158,82 +158,82 @@ void addBIOAvg4_SSE(const Pel* src0, int src0Stride, const Pel* src1, int src1St _mm_storel_epi64((__m128i *)(dst + x), mm_sum); } dst += dstStride; src0 += src0Stride; src1 += src1Stride; - pGradX0 += iGradStride; pGradX1 += iGradStride; pGradY0 += iGradStride; pGradY1 += iGradStride; + gradX0 += gradStride; gradX1 += gradStride; gradY0 += gradStride; gradY1 += gradStride; } } template< X86_VEXT vext > -void gradFilter_SSE(Pel* piSrc, int iSrcStride, int iWidth, int iHeight, int iGradStride, Pel* piGradX, Pel* piGradY) +void gradFilter_SSE(Pel* src, int srcStride, int width, int height, int gradStride, Pel* gradX, Pel* gradY) { __m128i vzero = _mm_setzero_si128(); - Pel* piSrcTmp = piSrc + iSrcStride + 1; - Pel* piGradXTmp = piGradX + iGradStride + 1; - Pel* piGradYTmp = piGradY + iGradStride + 1; + Pel* srcTmp = src + srcStride + 1; + Pel* gradXTmp = gradX + gradStride + 1; + Pel* gradYTmp = gradY + gradStride + 1; - int iWidthInside = iWidth - 2 * JVET_L0256_BIO_EXTEND_SIZE; - int iHeightInside = iHeight - 2 * JVET_L0256_BIO_EXTEND_SIZE; + int widthInside = width - 2 * BIO_EXTEND_SIZE; + int heightInside = height - 2 * BIO_EXTEND_SIZE; - assert((iWidthInside & 3) == 0); + assert((widthInside & 3) == 0); - for (int y = 0; y < iHeightInside; y++) + for (int y = 0; y < heightInside; y++) { int x = 0; - for (; x < iWidthInside; x += 4) + for (; x < widthInside; x += 4) { - __m128i mmPixTop = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(piSrcTmp + x - iSrcStride))); - __m128i mmPixBottom = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(piSrcTmp + x + iSrcStride))); - __m128i mmPixLeft = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(piSrcTmp + x - 1))); - __m128i mmPixRight = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(piSrcTmp + x + 1))); + __m128i mmPixTop = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x - srcStride))); + __m128i mmPixBottom = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x + srcStride))); + __m128i mmPixLeft = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x - 1))); + __m128i mmPixRight = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x + 1))); __m128i mmGradVer = _mm_srai_epi32(_mm_sub_epi32(mmPixBottom, mmPixTop), 4); __m128i mmGradHor = _mm_srai_epi32(_mm_sub_epi32(mmPixRight, mmPixLeft), 4); mmGradVer = _mm_packs_epi32(mmGradVer, vzero); mmGradHor = _mm_packs_epi32(mmGradHor, vzero); - _mm_storel_epi64((__m128i *)(piGradYTmp + x), mmGradVer); - _mm_storel_epi64((__m128i *)(piGradXTmp + x), mmGradHor); + _mm_storel_epi64((__m128i *)(gradYTmp + x), mmGradVer); + _mm_storel_epi64((__m128i *)(gradXTmp + x), mmGradHor); } - piGradXTmp += iGradStride; - piGradYTmp += iGradStride; - piSrcTmp += iSrcStride; + gradXTmp += gradStride; + gradYTmp += gradStride; + srcTmp += srcStride; } - piGradXTmp = piGradX + iGradStride + 1; - piGradYTmp = piGradY + iGradStride + 1; - for (int y = 0; y < iHeightInside; y++) + gradXTmp = gradX + gradStride + 1; + gradYTmp = gradY + gradStride + 1; + for (int y = 0; y < heightInside; y++) { - piGradXTmp[-1] = piGradXTmp[0]; - piGradXTmp[iWidthInside] = piGradXTmp[iWidthInside - 1]; - piGradXTmp += iGradStride; + gradXTmp[-1] = gradXTmp[0]; + gradXTmp[widthInside] = gradXTmp[widthInside - 1]; + gradXTmp += gradStride; - piGradYTmp[-1] = piGradYTmp[0]; - piGradYTmp[iWidthInside] = piGradYTmp[iWidthInside - 1]; - piGradYTmp += iGradStride; + gradYTmp[-1] = gradYTmp[0]; + gradYTmp[widthInside] = gradYTmp[widthInside - 1]; + gradYTmp += gradStride; } - piGradXTmp = piGradX + iGradStride; - piGradYTmp = piGradY + iGradStride; - ::memcpy(piGradXTmp - iGradStride, piGradXTmp, sizeof(Pel)*(iWidth)); - ::memcpy(piGradXTmp + iHeightInside*iGradStride, piGradXTmp + (iHeightInside - 1)*iGradStride, sizeof(Pel)*(iWidth)); - ::memcpy(piGradYTmp - iGradStride, piGradYTmp, sizeof(Pel)*(iWidth)); - ::memcpy(piGradYTmp + iHeightInside*iGradStride, piGradYTmp + (iHeightInside - 1)*iGradStride, sizeof(Pel)*(iWidth)); + gradXTmp = gradX + gradStride; + gradYTmp = gradY + gradStride; + ::memcpy(gradXTmp - gradStride, gradXTmp, sizeof(Pel)*(width)); + ::memcpy(gradXTmp + heightInside*gradStride, gradXTmp + (heightInside - 1)*gradStride, sizeof(Pel)*(width)); + ::memcpy(gradYTmp - gradStride, gradYTmp, sizeof(Pel)*(width)); + ::memcpy(gradYTmp + heightInside*gradStride, gradYTmp + (heightInside - 1)*gradStride, sizeof(Pel)*(width)); } template< X86_VEXT vext > -void calcBIOPar_SSE(const Pel* pSrcY0Temp, const Pel* pSrcY1Temp, const Pel* pGradX0, const Pel* pGradX1, const Pel* pGradY0, const Pel* pGradY1, int* m_piDotProductTemp1, int* m_piDotProductTemp2, int* m_piDotProductTemp3, int* m_piDotProductTemp5, int* m_piDotProductTemp6, const int iSrc0Stride, const int iSrc1Stride, const int iGradStride, const int iWidthG, const int iHeightG) +void calcBIOPar_SSE(const Pel* srcY0Temp, const Pel* srcY1Temp, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0, const Pel* gradY1, int* dotProductTemp1, int* dotProductTemp2, int* dotProductTemp3, int* dotProductTemp5, int* dotProductTemp6, const int src0Stride, const int src1Stride, const int gradStride, const int widthG, const int heightG) { - for (int y = 0; y < iHeightG; y++) + for (int y = 0; y < heightG; y++) { int x = 0; - for (; x < ((iWidthG >> 3) << 3); x += 8) + for (; x < ((widthG >> 3) << 3); x += 8) { - __m128i mmSrcY0Temp = _mm_srai_epi16(_mm_loadu_si128((__m128i*)(pSrcY0Temp + x)), 6); - __m128i mmSrcY1Temp = _mm_srai_epi16(_mm_loadu_si128((__m128i*)(pSrcY1Temp + x)), 6); - __m128i mmGradX0 = _mm_loadu_si128((__m128i*)(pGradX0 + x)); - __m128i mmGradX1 = _mm_loadu_si128((__m128i*)(pGradX1 + x)); - __m128i mmGradY0 = _mm_loadu_si128((__m128i*)(pGradY0 + x)); - __m128i mmGradY1 = _mm_loadu_si128((__m128i*)(pGradY1 + x)); + __m128i mmSrcY0Temp = _mm_srai_epi16(_mm_loadu_si128((__m128i*)(srcY0Temp + x)), 6); + __m128i mmSrcY1Temp = _mm_srai_epi16(_mm_loadu_si128((__m128i*)(srcY1Temp + x)), 6); + __m128i mmGradX0 = _mm_loadu_si128((__m128i*)(gradX0 + x)); + __m128i mmGradX1 = _mm_loadu_si128((__m128i*)(gradX1 + x)); + __m128i mmGradY0 = _mm_loadu_si128((__m128i*)(gradY0 + x)); + __m128i mmGradY1 = _mm_loadu_si128((__m128i*)(gradY1 + x)); __m128i mmTemp1 = _mm_sub_epi16(mmSrcY1Temp, mmSrcY0Temp); __m128i mmTempX = _mm_srai_epi16(_mm_add_epi16(mmGradX0, mmGradX1), 3); @@ -246,8 +246,8 @@ void calcBIOPar_SSE(const Pel* pSrcY0Temp, const Pel* pSrcY1Temp, const Pel* pGr __m128i mm_l = _mm_unpacklo_epi16(mm_a, mm_b); __m128i mm_h = _mm_unpackhi_epi16(mm_a, mm_b); - _mm_storeu_si128((__m128i *)(m_piDotProductTemp1 + x), mm_l); - _mm_storeu_si128((__m128i *)(m_piDotProductTemp1 + x + 4), mm_h); + _mm_storeu_si128((__m128i *)(dotProductTemp1 + x), mm_l); + _mm_storeu_si128((__m128i *)(dotProductTemp1 + x + 4), mm_h); // m_piDotProductTemp2 mm_b = _mm_mulhi_epi16(mmTempX, mmTempY); @@ -256,8 +256,8 @@ void calcBIOPar_SSE(const Pel* pSrcY0Temp, const Pel* pSrcY1Temp, const Pel* pGr mm_l = _mm_unpacklo_epi16(mm_a, mm_b); mm_h = _mm_unpackhi_epi16(mm_a, mm_b); - _mm_storeu_si128((__m128i *)(m_piDotProductTemp2 + x), mm_l); - _mm_storeu_si128((__m128i *)(m_piDotProductTemp2 + x + 4), mm_h); + _mm_storeu_si128((__m128i *)(dotProductTemp2 + x), mm_l); + _mm_storeu_si128((__m128i *)(dotProductTemp2 + x + 4), mm_h); // m_piDotProductTemp3 mm_b = _mm_mulhi_epi16(mmTempX, mmTemp1); @@ -266,8 +266,8 @@ void calcBIOPar_SSE(const Pel* pSrcY0Temp, const Pel* pSrcY1Temp, const Pel* pGr mm_l = _mm_unpacklo_epi16(mm_a, mm_b); mm_h = _mm_unpackhi_epi16(mm_a, mm_b); - _mm_storeu_si128((__m128i *)(m_piDotProductTemp3 + x), mm_l); - _mm_storeu_si128((__m128i *)(m_piDotProductTemp3 + x + 4), mm_h); + _mm_storeu_si128((__m128i *)(dotProductTemp3 + x), mm_l); + _mm_storeu_si128((__m128i *)(dotProductTemp3 + x + 4), mm_h); // m_piDotProductTemp5 mm_b = _mm_mulhi_epi16(mmTempY, mmTempY); @@ -276,8 +276,8 @@ void calcBIOPar_SSE(const Pel* pSrcY0Temp, const Pel* pSrcY1Temp, const Pel* pGr mm_l = _mm_unpacklo_epi16(mm_a, mm_b); mm_h = _mm_unpackhi_epi16(mm_a, mm_b); - _mm_storeu_si128((__m128i *)(m_piDotProductTemp5 + x), mm_l); - _mm_storeu_si128((__m128i *)(m_piDotProductTemp5 + x + 4), mm_h); + _mm_storeu_si128((__m128i *)(dotProductTemp5 + x), mm_l); + _mm_storeu_si128((__m128i *)(dotProductTemp5 + x + 4), mm_h); // m_piDotProductTemp6 mm_b = _mm_mulhi_epi16(mmTempY, mmTemp1); @@ -286,18 +286,18 @@ void calcBIOPar_SSE(const Pel* pSrcY0Temp, const Pel* pSrcY1Temp, const Pel* pGr mm_l = _mm_unpacklo_epi16(mm_a, mm_b); mm_h = _mm_unpackhi_epi16(mm_a, mm_b); - _mm_storeu_si128((__m128i *)(m_piDotProductTemp6 + x), mm_l); - _mm_storeu_si128((__m128i *)(m_piDotProductTemp6 + x + 4), mm_h); + _mm_storeu_si128((__m128i *)(dotProductTemp6 + x), mm_l); + _mm_storeu_si128((__m128i *)(dotProductTemp6 + x + 4), mm_h); } - for (; x < ((iWidthG >> 2) << 2); x += 4) + for (; x < ((widthG >> 2) << 2); x += 4) { - __m128i mmSrcY0Temp = _mm_srai_epi16(_mm_loadl_epi64((__m128i*)(pSrcY0Temp + x)), 6); - __m128i mmSrcY1Temp = _mm_srai_epi16(_mm_loadl_epi64((__m128i*)(pSrcY1Temp + x)), 6); - __m128i mmGradX0 = _mm_loadl_epi64((__m128i*)(pGradX0 + x)); - __m128i mmGradX1 = _mm_loadl_epi64((__m128i*)(pGradX1 + x)); - __m128i mmGradY0 = _mm_loadl_epi64((__m128i*)(pGradY0 + x)); - __m128i mmGradY1 = _mm_loadl_epi64((__m128i*)(pGradY1 + x)); + __m128i mmSrcY0Temp = _mm_srai_epi16(_mm_loadl_epi64((__m128i*)(srcY0Temp + x)), 6); + __m128i mmSrcY1Temp = _mm_srai_epi16(_mm_loadl_epi64((__m128i*)(srcY1Temp + x)), 6); + __m128i mmGradX0 = _mm_loadl_epi64((__m128i*)(gradX0 + x)); + __m128i mmGradX1 = _mm_loadl_epi64((__m128i*)(gradX1 + x)); + __m128i mmGradY0 = _mm_loadl_epi64((__m128i*)(gradY0 + x)); + __m128i mmGradY1 = _mm_loadl_epi64((__m128i*)(gradY1 + x)); __m128i mmTemp1 = _mm_sub_epi16(mmSrcY1Temp, mmSrcY0Temp); __m128i mmTempX = _mm_srai_epi16(_mm_add_epi16(mmGradX0, mmGradX1), 3); @@ -308,78 +308,78 @@ void calcBIOPar_SSE(const Pel* pSrcY0Temp, const Pel* pSrcY1Temp, const Pel* pGr __m128i mm_a = _mm_mullo_epi16(mmTempX, mmTempX); __m128i mm_l = _mm_unpacklo_epi16(mm_a, mm_b); - _mm_storeu_si128((__m128i *)(m_piDotProductTemp1 + x), mm_l); + _mm_storeu_si128((__m128i *)(dotProductTemp1 + x), mm_l); // m_piDotProductTemp2 mm_b = _mm_mulhi_epi16(mmTempX, mmTempY); mm_a = _mm_mullo_epi16(mmTempX, mmTempY); mm_l = _mm_unpacklo_epi16(mm_a, mm_b); - _mm_storeu_si128((__m128i *)(m_piDotProductTemp2 + x), mm_l); + _mm_storeu_si128((__m128i *)(dotProductTemp2 + x), mm_l); // m_piDotProductTemp3 mm_b = _mm_mulhi_epi16(mmTempX, mmTemp1); mm_a = _mm_mullo_epi16(mmTempX, mmTemp1); mm_l = _mm_unpacklo_epi16(mm_a, mm_b); - _mm_storeu_si128((__m128i *)(m_piDotProductTemp3 + x), mm_l); + _mm_storeu_si128((__m128i *)(dotProductTemp3 + x), mm_l); // m_piDotProductTemp5 mm_b = _mm_mulhi_epi16(mmTempY, mmTempY); mm_a = _mm_mullo_epi16(mmTempY, mmTempY); mm_l = _mm_unpacklo_epi16(mm_a, mm_b); - _mm_storeu_si128((__m128i *)(m_piDotProductTemp5 + x), mm_l); + _mm_storeu_si128((__m128i *)(dotProductTemp5 + x), mm_l); // m_piDotProductTemp6 mm_b = _mm_mulhi_epi16(mmTempY, mmTemp1); mm_a = _mm_mullo_epi16(mmTempY, mmTemp1); mm_l = _mm_unpacklo_epi16(mm_a, mm_b); - _mm_storeu_si128((__m128i *)(m_piDotProductTemp6 + x), mm_l); + _mm_storeu_si128((__m128i *)(dotProductTemp6 + x), mm_l); } - for (; x < iWidthG; x++) + for (; x < widthG; x++) { - int temp = (pSrcY0Temp[x] >> 6) - (pSrcY1Temp[x] >> 6); - int tempX = (pGradX0[x] + pGradX1[x]) >> 3; - int tempY = (pGradY0[x] + pGradY1[x]) >> 3; - m_piDotProductTemp1[x] = tempX * tempX; - m_piDotProductTemp2[x] = tempX * tempY; - m_piDotProductTemp3[x] = -tempX * temp; - m_piDotProductTemp5[x] = tempY * tempY; - m_piDotProductTemp6[x] = -tempY * temp; + int temp = (srcY0Temp[x] >> 6) - (srcY1Temp[x] >> 6); + int tempX = (gradX0[x] + gradX1[x]) >> 3; + int tempY = (gradY0[x] + gradY1[x]) >> 3; + dotProductTemp1[x] = tempX * tempX; + dotProductTemp2[x] = tempX * tempY; + dotProductTemp3[x] = -tempX * temp; + dotProductTemp5[x] = tempY * tempY; + dotProductTemp6[x] = -tempY * temp; } - pSrcY0Temp += iSrc0Stride; - pSrcY1Temp += iSrc1Stride; - pGradX0 += iGradStride; - pGradX1 += iGradStride; - pGradY0 += iGradStride; - pGradY1 += iGradStride; - m_piDotProductTemp1 += iWidthG; - m_piDotProductTemp2 += iWidthG; - m_piDotProductTemp3 += iWidthG; - m_piDotProductTemp5 += iWidthG; - m_piDotProductTemp6 += iWidthG; + srcY0Temp += src0Stride; + srcY1Temp += src1Stride; + gradX0 += gradStride; + gradX1 += gradStride; + gradY0 += gradStride; + gradY1 += gradStride; + dotProductTemp1 += widthG; + dotProductTemp2 += widthG; + dotProductTemp3 += widthG; + dotProductTemp5 += widthG; + dotProductTemp6 += widthG; } } template< X86_VEXT vext > void calcBlkGradient_SSE(int sx, int sy, int *arraysGx2, int *arraysGxGy, int *arraysGxdI, int *arraysGy2, int *arraysGydI, int &sGx2, int &sGy2, int &sGxGy, int &sGxdI, int &sGydI, int width, int height, int unitSize) { - int *pGx2 = arraysGx2; - int *pGy2 = arraysGy2; - int *pGxGy = arraysGxGy; - int *pGxdI = arraysGxdI; - int *pGydI = arraysGydI; + int *Gx2 = arraysGx2; + int *Gy2 = arraysGy2; + int *GxGy = arraysGxGy; + int *GxdI = arraysGxdI; + int *GydI = arraysGydI; // set to the above row due to JVET_K0485_BIO_EXTEND_SIZE - pGx2 -= (JVET_L0256_BIO_EXTEND_SIZE*width); - pGy2 -= (JVET_L0256_BIO_EXTEND_SIZE*width); - pGxGy -= (JVET_L0256_BIO_EXTEND_SIZE*width); - pGxdI -= (JVET_L0256_BIO_EXTEND_SIZE*width); - pGydI -= (JVET_L0256_BIO_EXTEND_SIZE*width); + Gx2 -= (BIO_EXTEND_SIZE*width); + Gy2 -= (BIO_EXTEND_SIZE*width); + GxGy -= (BIO_EXTEND_SIZE*width); + GxdI -= (BIO_EXTEND_SIZE*width); + GydI -= (BIO_EXTEND_SIZE*width); __m128i vzero = _mm_setzero_si128(); __m128i mmGx2Total = _mm_setzero_si128(); @@ -388,13 +388,13 @@ void calcBlkGradient_SSE(int sx, int sy, int *arraysGx2, int *arraysGxGy __m128i mmGxdITotal = _mm_setzero_si128(); __m128i mmGydITotal = _mm_setzero_si128(); - for (int y = -JVET_L0256_BIO_EXTEND_SIZE; y < unitSize + JVET_L0256_BIO_EXTEND_SIZE; y++) + for (int y = -BIO_EXTEND_SIZE; y < unitSize + BIO_EXTEND_SIZE; y++) { - __m128i mmsGx2 = _mm_loadu_si128((__m128i*)(pGx2 - 1)); __m128i mmsGx2Sec = _mm_loadl_epi64((__m128i*)(pGx2 + 3)); - __m128i mmsGy2 = _mm_loadu_si128((__m128i*)(pGy2 - 1)); __m128i mmsGy2Sec = _mm_loadl_epi64((__m128i*)(pGy2 + 3)); - __m128i mmsGxGy = _mm_loadu_si128((__m128i*)(pGxGy - 1)); __m128i mmsGxGySec = _mm_loadl_epi64((__m128i*)(pGxGy + 3)); - __m128i mmsGxdI = _mm_loadu_si128((__m128i*)(pGxdI - 1)); __m128i mmsGxdISec = _mm_loadl_epi64((__m128i*)(pGxdI + 3)); - __m128i mmsGydI = _mm_loadu_si128((__m128i*)(pGydI - 1)); __m128i mmsGydISec = _mm_loadl_epi64((__m128i*)(pGydI + 3)); + __m128i mmsGx2 = _mm_loadu_si128((__m128i*)(Gx2 - 1)); __m128i mmsGx2Sec = _mm_loadl_epi64((__m128i*)(Gx2 + 3)); + __m128i mmsGy2 = _mm_loadu_si128((__m128i*)(Gy2 - 1)); __m128i mmsGy2Sec = _mm_loadl_epi64((__m128i*)(Gy2 + 3)); + __m128i mmsGxGy = _mm_loadu_si128((__m128i*)(GxGy - 1)); __m128i mmsGxGySec = _mm_loadl_epi64((__m128i*)(GxGy + 3)); + __m128i mmsGxdI = _mm_loadu_si128((__m128i*)(GxdI - 1)); __m128i mmsGxdISec = _mm_loadl_epi64((__m128i*)(GxdI + 3)); + __m128i mmsGydI = _mm_loadu_si128((__m128i*)(GydI - 1)); __m128i mmsGydISec = _mm_loadl_epi64((__m128i*)(GydI + 3)); mmsGx2 = _mm_add_epi32(mmsGx2, mmsGx2Sec); mmsGy2 = _mm_add_epi32(mmsGy2, mmsGy2Sec); @@ -409,11 +409,11 @@ void calcBlkGradient_SSE(int sx, int sy, int *arraysGx2, int *arraysGxGy mmGxdITotal = _mm_add_epi32(mmGxdITotal, mmsGxdI); mmGydITotal = _mm_add_epi32(mmGydITotal, mmsGydI); - pGx2 += width; - pGy2 += width; - pGxGy += width; - pGxdI += width; - pGydI += width; + Gx2 += width; + Gy2 += width; + GxGy += width; + GxdI += width; + GydI += width; } mmGx2Total = _mm_hadd_epi32(_mm_hadd_epi32(mmGx2Total, vzero), vzero); @@ -798,7 +798,7 @@ void PelBufferOps::_initPelBufOpsX86() addAvg8 = addAvg_SSE<vext, 8>; addAvg4 = addAvg_SSE<vext, 4>; -#if JVET_L0256_BIO +#if ENABLE_SIMD_OPT_BIO addBIOAvg4 = addBIOAvg4_SSE<vext>; bioGradFilter = gradFilter_SSE<vext>; calcBIOPar = calcBIOPar_SSE<vext>; diff --git a/source/Lib/CommonLib/x86/RdCostX86.h b/source/Lib/CommonLib/x86/RdCostX86.h index ab54737fa5a9e3e47c64660d9f4ab73d06624827..4d87189ca052e0d8ab14a17980b92c2e1406c13e 100644 --- a/source/Lib/CommonLib/x86/RdCostX86.h +++ b/source/Lib/CommonLib/x86/RdCostX86.h @@ -297,7 +297,7 @@ Distortion RdCost::xGetSAD_SIMD( const DistParam &rcDtParam ) return uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth); } -#if JVET_L0256_BIO +#if ENABLE_SIMD_OPT_BIO template< X86_VEXT vext > Distortion RdCost::xGetSAD_IBD_SIMD(const DistParam &rcDtParam) { @@ -308,14 +308,14 @@ Distortion RdCost::xGetSAD_IBD_SIMD(const DistParam &rcDtParam) const short* src1 = (const short*)rcDtParam.cur.buf; int width = rcDtParam.org.height; int height = rcDtParam.org.width; - int iSubShift = rcDtParam.subShift; - int iSubStep = (1 << iSubShift); - const int src0Stride = rcDtParam.org.stride * iSubStep; - const int src1Stride = rcDtParam.cur.stride * iSubStep; + int subShift = rcDtParam.subShift; + int subStep = (1 << subShift); + const int src0Stride = rcDtParam.org.stride * subStep; + const int src1Stride = rcDtParam.cur.stride * subStep; __m128i vtotalsum32 = _mm_setzero_si128(); __m128i vzero = _mm_setzero_si128(); - for (int y = 0; y < height; y += iSubStep) + for (int y = 0; y < height; y += subStep) { for (int x = 0; x < width; x += 4) { @@ -332,7 +332,7 @@ Distortion RdCost::xGetSAD_IBD_SIMD(const DistParam &rcDtParam) vtotalsum32 = _mm_hadd_epi32(vtotalsum32, vzero); Distortion uiSum = _mm_cvtsi128_si32(vtotalsum32); - uiSum <<= iSubShift; + uiSum <<= subShift; return uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth); } #endif @@ -2462,7 +2462,7 @@ void RdCost::_initRdCostX86() m_afpDistortFunc[DF_HAD64] = RdCost::xGetHADs_SIMD<Pel, Pel, vext>; m_afpDistortFunc[DF_HAD16N] = RdCost::xGetHADs_SIMD<Pel, Pel, vext>; -#if JVET_L0256_BIO +#if ENABLE_SIMD_OPT_BIO m_afpDistortFunc[DF_SAD_INTERMEDIATE_BITDEPTH] = RdCost::xGetSAD_IBD_SIMD<vext>; #endif }