From 8352de9704e430b66b8058ff684fce10d2365dd5 Mon Sep 17 00:00:00 2001 From: xiuxx <xiaoyu.xiu@interdigital.com> Date: Wed, 24 Oct 2018 17:18:15 -0700 Subject: [PATCH] JVET_L0256 check-in for the adoption of the BIO --- cfg/encoder_randomaccess_vtm.cfg | 3 +- source/App/EncoderApp/EncApp.cpp | 3 + source/App/EncoderApp/EncAppCfg.cpp | 13 +- source/App/EncoderApp/EncAppCfg.h | 3 + source/Lib/CommonLib/Buffer.cpp | 139 +++++++ source/Lib/CommonLib/Buffer.h | 6 + source/Lib/CommonLib/CommonDef.h | 4 + source/Lib/CommonLib/InterPrediction.cpp | 398 ++++++++++++++++++- source/Lib/CommonLib/InterPrediction.h | 41 +- source/Lib/CommonLib/InterpolationFilter.cpp | 44 ++ source/Lib/CommonLib/InterpolationFilter.h | 8 + source/Lib/CommonLib/RdCost.cpp | 16 + source/Lib/CommonLib/RdCost.h | 11 + source/Lib/CommonLib/Slice.cpp | 3 + source/Lib/CommonLib/Slice.h | 7 + source/Lib/CommonLib/TypeDef.h | 20 +- source/Lib/CommonLib/x86/BufferX86.h | 309 ++++++++++++++ source/Lib/CommonLib/x86/RdCostX86.h | 43 ++ source/Lib/DecoderLib/VLCReader.cpp | 3 + source/Lib/EncoderLib/EncCfg.h | 7 + source/Lib/EncoderLib/EncLib.cpp | 3 + source/Lib/EncoderLib/VLCWriter.cpp | 3 + 22 files changed, 1067 insertions(+), 20 deletions(-) diff --git a/cfg/encoder_randomaccess_vtm.cfg b/cfg/encoder_randomaccess_vtm.cfg index 63ff58df9..29f790277 100644 --- a/cfg/encoder_randomaccess_vtm.cfg +++ b/cfg/encoder_randomaccess_vtm.cfg @@ -142,7 +142,8 @@ DepQuant : 1 IMV : 2 ALF : 1 GBi : 1 -GBiFast : 1 +GBiFast : 1 +BIO : 1 # Fast tools PBIntraFast : 1 diff --git a/source/App/EncoderApp/EncApp.cpp b/source/App/EncoderApp/EncApp.cpp index 3c6efc537..dba92e5f4 100644 --- a/source/App/EncoderApp/EncApp.cpp +++ b/source/App/EncoderApp/EncApp.cpp @@ -227,6 +227,9 @@ void EncApp::xInitLibCfg() m_cEncLib.setAffineType ( m_AffineType ); #if !REMOVE_MV_ADAPT_PREC m_cEncLib.setHighPrecisionMv (m_highPrecisionMv); +#endif +#if JVET_L0256_BIO + m_cEncLib.setBIO (m_BIO); #endif m_cEncLib.setDisableMotionCompression ( 
m_DisableMotionCompression ); m_cEncLib.setMTTMode ( m_MTT ); diff --git a/source/App/EncoderApp/EncAppCfg.cpp b/source/App/EncoderApp/EncAppCfg.cpp index 0b7da7327..9b4797641 100644 --- a/source/App/EncoderApp/EncAppCfg.cpp +++ b/source/App/EncoderApp/EncAppCfg.cpp @@ -818,8 +818,11 @@ bool EncAppCfg::parseCfg( int argc, char* argv[] ) #if !REMOVE_MV_ADAPT_PREC ("HighPrecMv", m_highPrecisionMv, false, "High precision motion vectors for temporal merging (0:off, 1:on) [default: off]") #endif - ("Affine", m_Affine, false, "Enable affine prediction (0:off, 1:on) [default: off]") - ( "AffineType", m_AffineType, true, "Enable affine type prediction (0:off, 1:on) [default: on]" ) + ("Affine", m_Affine, false, "Enable affine prediction (0:off, 1:on) [default: off]") + ("AffineType", m_AffineType, true, "Enable affine type prediction (0:off, 1:on) [default: on]" ) +#if JVET_L0256_BIO + ("BIO", m_BIO, false, "Enable bi-directional optical flow") +#endif ("DisableMotCompression", m_DisableMotionCompression, false, "Disable motion data compression for all modes") ("IMV", m_ImvMode, 2, "Adaptive MV precision Mode (IMV)\n" "\t0: disabled IMV\n" @@ -1913,6 +1916,9 @@ bool EncAppCfg::xCheckParameter() #if !REMOVE_MV_ADAPT_PREC xConfirmPara( m_highPrecisionMv, "High precision MV for temporal merging can only be used with NEXT profile" ); xConfirmPara( m_Affine, "Affine is only allowed with NEXT profile" ); +#endif +#if JVET_L0256_BIO + xConfirmPara( m_BIO, "BIO only allowed with NEXT profile" ); #endif xConfirmPara( m_DisableMotionCompression, "Disable motion data compression only allowed with NEXT profile" ); xConfirmPara( m_MTT, "Multi type tree is only allowed with NEXT profile" ); @@ -3110,6 +3116,9 @@ void EncAppCfg::xPrintParameter() if( !m_QTBT ) msg( VERBOSE, "IMVMaxCand:%d ", m_ImvMaxCand ); #if !REMOVE_MV_ADAPT_PREC msg(VERBOSE, "HighPrecMv:%d ", m_highPrecisionMv); +#endif +#if JVET_L0256_BIO + msg( VERBOSE, "BIO:%d ", m_BIO ); #endif msg( VERBOSE, "DisMDC:%d ", 
m_DisableMotionCompression ); msg( VERBOSE, "MTT:%d ", m_MTT ); diff --git a/source/App/EncoderApp/EncAppCfg.h b/source/App/EncoderApp/EncAppCfg.h index a9eb00f22..42a6aaf61 100644 --- a/source/App/EncoderApp/EncAppCfg.h +++ b/source/App/EncoderApp/EncAppCfg.h @@ -210,6 +210,9 @@ protected: bool m_AffineType; #if !REMOVE_MV_ADAPT_PREC bool m_highPrecisionMv; +#endif +#if JVET_L0256_BIO + bool m_BIO; #endif bool m_DisableMotionCompression; unsigned m_MTT; diff --git a/source/Lib/CommonLib/Buffer.cpp b/source/Lib/CommonLib/Buffer.cpp index f31a22044..2e9c1a42a 100644 --- a/source/Lib/CommonLib/Buffer.cpp +++ b/source/Lib/CommonLib/Buffer.cpp @@ -62,6 +62,138 @@ void addAvgCore( const T* src1, int src1Stride, const T* src2, int src2Stride, T #undef ADD_AVG_CORE_INC } +#if JVET_L0256_BIO +void addBIOAvgCore(const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, const Pel *pGradX0, const Pel *pGradX1, const Pel *pGradY0, const Pel*pGradY1, int gradStride, int width, int height, int tmpx, int tmpy, int shift, int offset, const ClpRng& clpRng) +{ + int b = 0; + + for (int y = 0; y < height; y++) + { + for (int x = 0; x < width; x += 4) + { + b = tmpx * (pGradX0[x] - pGradX1[x]) + tmpy * (pGradY0[x] - pGradY1[x]); + b = ((b + 1) >> 1); + dst[x] = ClipPel((int16_t)rightShift((src0[x] + src1[x] + b + offset), shift), clpRng); + + b = tmpx * (pGradX0[x + 1] - pGradX1[x + 1]) + tmpy * (pGradY0[x + 1] - pGradY1[x + 1]); + b = ((b + 1) >> 1); + dst[x + 1] = ClipPel((int16_t)rightShift((src0[x + 1] + src1[x + 1] + b + offset), shift), clpRng); + + b = tmpx * (pGradX0[x + 2] - pGradX1[x + 2]) + tmpy * (pGradY0[x + 2] - pGradY1[x + 2]); + b = ((b + 1) >> 1); + dst[x + 2] = ClipPel((int16_t)rightShift((src0[x + 2] + src1[x + 2] + b + offset), shift), clpRng); + + b = tmpx * (pGradX0[x + 3] - pGradX1[x + 3]) + tmpy * (pGradY0[x + 3] - pGradY1[x + 3]); + b = ((b + 1) >> 1); + dst[x + 3] = ClipPel((int16_t)rightShift((src0[x + 3] + src1[x + 3] + b + 
offset), shift), clpRng); + } + dst += dstStride; src0 += src0Stride; src1 += src1Stride; + pGradX0 += gradStride; pGradX1 += gradStride; pGradY0 += gradStride; pGradY1 += gradStride; + } +} + +void gradFilterCore(Pel* pSrc, int srcStride, int width, int height, int gradStride, Pel* pGradX, Pel* pGradY) +{ + Pel* piSrcTmp = pSrc + srcStride + 1; + Pel* piGradXTmp = pGradX + gradStride + 1; + Pel* piGradYTmp = pGradY + gradStride + 1; + + for (int y = 0; y < (height - 2 * JVET_L0256_BIO_EXTEND_SIZE); y++) + { + for (int x = 0; x < (width - 2 * JVET_L0256_BIO_EXTEND_SIZE); x++) + { + piGradYTmp[x] = (piSrcTmp[x + srcStride] - piSrcTmp[x - srcStride]) >> 4; + piGradXTmp[x] = (piSrcTmp[x + 1] - piSrcTmp[x - 1]) >> 4; + } + piGradXTmp += gradStride; + piGradYTmp += gradStride; + piSrcTmp += srcStride; + } + + piGradXTmp = pGradX + gradStride + 1; + piGradYTmp = pGradY + gradStride + 1; + for (int y = 0; y < (height - 2 * JVET_L0256_BIO_EXTEND_SIZE); y++) + { + piGradXTmp[-1] = piGradXTmp[0]; + piGradXTmp[width - 2 * JVET_L0256_BIO_EXTEND_SIZE] = piGradXTmp[width - 2 * JVET_L0256_BIO_EXTEND_SIZE - 1]; + piGradXTmp += gradStride; + + piGradYTmp[-1] = piGradYTmp[0]; + piGradYTmp[width - 2 * JVET_L0256_BIO_EXTEND_SIZE] = piGradYTmp[width - 2 * JVET_L0256_BIO_EXTEND_SIZE - 1]; + piGradYTmp += gradStride; + } + + piGradXTmp = pGradX + gradStride; + piGradYTmp = pGradY + gradStride; + ::memcpy(piGradXTmp - gradStride, piGradXTmp, sizeof(Pel)*(width)); + ::memcpy(piGradXTmp + (height - 2 * JVET_L0256_BIO_EXTEND_SIZE)*gradStride, piGradXTmp + (height - 2 * JVET_L0256_BIO_EXTEND_SIZE - 1)*gradStride, sizeof(Pel)*(width)); + ::memcpy(piGradYTmp - gradStride, piGradYTmp, sizeof(Pel)*(width)); + ::memcpy(piGradYTmp + (height - 2 * JVET_L0256_BIO_EXTEND_SIZE)*gradStride, piGradYTmp + (height - 2 * JVET_L0256_BIO_EXTEND_SIZE - 1)*gradStride, sizeof(Pel)*(width)); +} + +void calcBIOParCore(const Pel* pSrcY0Temp, const Pel* pSrcY1Temp, const Pel* pGradX0, const Pel* pGradX1, const Pel* 
pGradY0, const Pel* pGradY1, int* m_piDotProductTemp1, int* m_piDotProductTemp2, int* m_piDotProductTemp3, int* m_piDotProductTemp5, int* m_piDotProductTemp6, const int iSrc0Stride, const int iSrc1Stride, const int iGradStride, const int iWidthG, const int iHeightG) +{ + for (int y = 0; y < iHeightG; y++) + { + for (int x = 0; x < iWidthG; x++) + { + int temp = (pSrcY0Temp[x] >> 6) - (pSrcY1Temp[x] >> 6); + int tempX = (pGradX0[x] + pGradX1[x]) >> 3; + int tempY = (pGradY0[x] + pGradY1[x]) >> 3; + m_piDotProductTemp1[x] = tempX * tempX; + m_piDotProductTemp2[x] = tempX * tempY; + m_piDotProductTemp3[x] = -tempX * temp; + m_piDotProductTemp5[x] = tempY * tempY; + m_piDotProductTemp6[x] = -tempY * temp; + } + pSrcY0Temp += iSrc0Stride; + pSrcY1Temp += iSrc1Stride; + pGradX0 += iGradStride; + pGradX1 += iGradStride; + pGradY0 += iGradStride; + pGradY1 += iGradStride; + m_piDotProductTemp1 += iWidthG; + m_piDotProductTemp2 += iWidthG; + m_piDotProductTemp3 += iWidthG; + m_piDotProductTemp5 += iWidthG; + m_piDotProductTemp6 += iWidthG; + } +} + +void calcBlkGradientCore(int sx, int sy, int *arraysGx2, int *arraysGxGy, int *arraysGxdI, int *arraysGy2, int *arraysGydI, int &sGx2, int &sGy2, int &sGxGy, int &sGxdI, int &sGydI, int width, int height, int unitSize) +{ + int *pGx2 = arraysGx2; + int *pGy2 = arraysGy2; + int *pGxGy = arraysGxGy; + int *pGxdI = arraysGxdI; + int *pGydI = arraysGydI; + + // move up JVET_L0256_BIO_EXTEND_SIZE row(s) so the summation window starts that many rows above the unit + pGx2 -= (JVET_L0256_BIO_EXTEND_SIZE*width); + pGy2 -= (JVET_L0256_BIO_EXTEND_SIZE*width); + pGxGy -= (JVET_L0256_BIO_EXTEND_SIZE*width); + pGxdI -= (JVET_L0256_BIO_EXTEND_SIZE*width); + pGydI -= (JVET_L0256_BIO_EXTEND_SIZE*width); + + for (int y = -JVET_L0256_BIO_EXTEND_SIZE; y < unitSize + JVET_L0256_BIO_EXTEND_SIZE; y++) + { + for (int x = -JVET_L0256_BIO_EXTEND_SIZE; x < unitSize + JVET_L0256_BIO_EXTEND_SIZE; x++) + { + sGx2 += pGx2[x]; + sGy2 += pGy2[x]; + sGxGy += pGxGy[x]; + sGxdI += pGxdI[x]; + sGydI += pGydI[x]; 
+ } + pGx2 += width; + pGy2 += width; + pGxGy += width; + pGxdI += width; + pGydI += width; + } +} +#endif + #if ENABLE_SIMD_OPT_GBI && JVET_L0646_GBI void removeWeightHighFreq(int16_t* dst, int dstStride, const int16_t* src, int srcStride, int width, int height, int shift, int gbiWeight) { @@ -138,6 +270,13 @@ PelBufferOps::PelBufferOps() linTf4 = linTfCore<Pel>; linTf8 = linTfCore<Pel>; +#if JVET_L0256_BIO + addBIOAvg4 = addBIOAvgCore; + bioGradFilter = gradFilterCore; + calcBIOPar = calcBIOParCore; + calcBlkGradient = calcBlkGradientCore; +#endif + #if ENABLE_SIMD_OPT_GBI removeWeightHighFreq8 = removeWeightHighFreq; removeWeightHighFreq4 = removeWeightHighFreq; diff --git a/source/Lib/CommonLib/Buffer.h b/source/Lib/CommonLib/Buffer.h index fdf3b962f..b8e69315f 100644 --- a/source/Lib/CommonLib/Buffer.h +++ b/source/Lib/CommonLib/Buffer.h @@ -68,6 +68,12 @@ struct PelBufferOps void ( *reco8 ) ( const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, int width, int height, const ClpRng& clpRng ); void ( *linTf4 ) ( const Pel* src0, int src0Stride, Pel *dst, int dstStride, int width, int height, int scale, int shift, int offset, const ClpRng& clpRng, bool bClip ); void ( *linTf8 ) ( const Pel* src0, int src0Stride, Pel *dst, int dstStride, int width, int height, int scale, int shift, int offset, const ClpRng& clpRng, bool bClip ); +#if JVET_L0256_BIO + void(*addBIOAvg4) (const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, const Pel *pGradX0, const Pel *pGradX1, const Pel *pGradY0, const Pel*pGradY1, int gradStride, int width, int height, int tmpx, int tmpy, int shift, int offset, const ClpRng& clpRng); + void(*bioGradFilter) (Pel* pSrc, int srcStride, int width, int height, int gradStride, Pel* pGradX, Pel* pGradY); + void(*calcBIOPar) (const Pel* pSrcY0Temp, const Pel* pSrcY1Temp, const Pel* pGradX0, const Pel* pGradX1, const Pel* pGradY0, const Pel* pGradY1, int* m_piDotProductTemp1, int* 
m_piDotProductTemp2, int* m_piDotProductTemp3, int* m_piDotProductTemp5, int* m_piDotProductTemp6, const int iSrc0Stride, const int iSrc1Stride, const int iGradStride, const int iWidthG, const int iHeightG); + void(*calcBlkGradient)(int sx, int sy, int *arraysGx2, int *arraysGxGy, int *arraysGxdI, int *arraysGy2, int *arraysGydI, int &sGx2, int &sGy2, int &sGxGy, int &sGxdI, int &sGydI, int width, int height, int unitSize); +#endif #if ENABLE_SIMD_OPT_GBI void ( *removeWeightHighFreq8) ( Pel* src0, int src0Stride, const Pel* src1, int src1Stride, int width, int height, int shift, int gbiWeight); void ( *removeWeightHighFreq4) ( Pel* src0, int src0Stride, const Pel* src1, int src1Stride, int width, int height, int shift, int gbiWeight); diff --git a/source/Lib/CommonLib/CommonDef.h b/source/Lib/CommonLib/CommonDef.h index 6e853ecfc..98174dbeb 100644 --- a/source/Lib/CommonLib/CommonDef.h +++ b/source/Lib/CommonLib/CommonDef.h @@ -364,6 +364,10 @@ static const unsigned C806_ALF_TEMPPRED_NUM = 6; static const int NTAPS_LUMA = 8; ///< Number of taps for luma static const int NTAPS_CHROMA = 4; ///< Number of taps for chroma +#if JVET_L0256_BIO +static const int NTAPS_BILINEAR = 2; ///< Number of taps for bilinear filter +#endif + // ==================================================================================================================== // Macro functions // ==================================================================================================================== diff --git a/source/Lib/CommonLib/InterPrediction.cpp b/source/Lib/CommonLib/InterPrediction.cpp index c6ea5e914..4720c41ef 100644 --- a/source/Lib/CommonLib/InterPrediction.cpp +++ b/source/Lib/CommonLib/InterPrediction.cpp @@ -55,6 +55,13 @@ InterPrediction::InterPrediction() m_currChromaFormat( NUM_CHROMA_FORMAT ) , m_maxCompIDToPred ( MAX_NUM_COMPONENT ) , m_pcRdCost ( nullptr ) +#if JVET_L0256_BIO +, m_pGradX0(nullptr) +, m_pGradY0(nullptr) +, m_pGradX1(nullptr) +, m_pGradY1(nullptr) 
+, m_subPuMC(false) +#endif { for( uint32_t ch = 0; ch < MAX_NUM_COMPONENT; ch++ ) { @@ -109,6 +116,13 @@ void InterPrediction::destroy() m_filteredBlockTmp[i][c] = nullptr; } } + +#if JVET_L0256_BIO + xFree(m_pGradX0); m_pGradX0 = nullptr; + xFree(m_pGradY0); m_pGradY0 = nullptr; + xFree(m_pGradX1); m_pGradX1 = nullptr; + xFree(m_pGradY1); m_pGradY1 = nullptr; +#endif } void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chromaFormatIDC ) @@ -127,8 +141,13 @@ void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chromaFormatIDC ) { for( uint32_t c = 0; c < MAX_NUM_COMPONENT; c++ ) { +#if JVET_L0256_BIO + int extWidth = MAX_CU_SIZE + (2 * JVET_L0256_BIO_EXTEND_SIZE + 2) + 16; + int extHeight = MAX_CU_SIZE + (2 * JVET_L0256_BIO_EXTEND_SIZE + 2) + 1; +#else int extWidth = MAX_CU_SIZE + 16; int extHeight = MAX_CU_SIZE + 1; +#endif for( uint32_t i = 0; i < LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS; i++ ) { m_filteredBlockTmp[i][c] = ( Pel* ) xMalloc( Pel, ( extWidth + 4 ) * ( extHeight + 7 + 4 ) ); @@ -148,7 +167,13 @@ void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chromaFormatIDC ) m_iRefListIdx = -1; - + +#if JVET_L0256_BIO + m_pGradX0 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE); + m_pGradY0 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE); + m_pGradX1 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE); + m_pGradY1 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE); +#endif } #if !JVET_J0090_MEMORY_BANDWITH_MEASURE @@ -264,6 +289,10 @@ void InterPrediction::xSubPuMC( PredictionUnit& pu, PelUnitBuf& predBuf, const R int fstStep = (!verMC ? puHeight : puWidth); int secStep = (!verMC ? 
puWidth : puHeight); +#if JVET_L0256_BIO + m_subPuMC = true; +#endif + for (int fstDim = fstStart; fstDim < fstEnd; fstDim += fstStep) { for (int secDim = secStart; secDim < secEnd; secDim += secStep) @@ -299,10 +328,16 @@ void InterPrediction::xSubPuMC( PredictionUnit& pu, PelUnitBuf& predBuf, const R secDim = later - secStep; } } +#if JVET_L0256_BIO + m_subPuMC = false; +#endif } void InterPrediction::xPredInterUni(const PredictionUnit& pu, const RefPicList& eRefPicList, PelUnitBuf& pcYuvPred, const bool& bi +#if JVET_L0256_BIO + ,const bool& bBIOApplied /*=false*/ +#endif ) { const SPS &sps = *pu.cs->sps; @@ -332,12 +367,18 @@ void InterPrediction::xPredInterUni(const PredictionUnit& pu, const RefPicList& const ComponentID compID = ComponentID( comp ); if ( pu.cu->affine ) { +#if JVET_L0256_BIO + CHECK( bBIOApplied, "BIO is not allowed with affine" ); +#endif xPredAffineBlk( compID, pu, pu.cu->slice->getRefPic( eRefPicList, iRefIdx ), mv, pcYuvPred, bi, pu.cu->slice->clpRng( compID ) ); } else { xPredInterBlk( compID, pu, pu.cu->slice->getRefPic( eRefPicList, iRefIdx ), mv[0], pcYuvPred, bi, pu.cu->slice->clpRng( compID ) - ); +#if JVET_L0256_BIO + ,bBIOApplied +#endif + ); } } } @@ -347,6 +388,36 @@ void InterPrediction::xPredInterBi(PredictionUnit& pu, PelUnitBuf &pcYuvPred) const PPS &pps = *pu.cs->pps; const Slice &slice = *pu.cs->slice; +#if JVET_L0256_BIO + bool bBIOApplied = false; + if (pu.cs->sps->getSpsNext().getUseBIO()) + { + if (pu.cu->affine || m_subPuMC) + { + bBIOApplied = false; + } + else + { + const bool bBIOcheck0 = !(pps.getWPBiPred() && slice.getSliceType() == B_SLICE); + const bool bBIOcheck1 = !(pps.getUseWP() && slice.getSliceType() == P_SLICE); + if (bBIOcheck0 + && bBIOcheck1 + && PU::isBiPredFromDifferentDir(pu) + && !(pu.Y().height == 4 || (pu.Y().width == 4 && pu.Y().height == 8)) + ) + { + bBIOApplied = true; + } + } + +#if JVET_L0646_GBI + if (pu.cu->cs->sps->getSpsNext().getUseGBi() && bBIOApplied && pu.cu->GBiIdx != GBI_DEFAULT) 
+ { + bBIOApplied = false; + } +#endif + } +#endif for (uint32_t refList = 0; refList < NUM_REF_PIC_LIST_01; refList++) { @@ -367,6 +438,9 @@ void InterPrediction::xPredInterBi(PredictionUnit& pu, PelUnitBuf &pcYuvPred) if (pu.refIdx[0] >= 0 && pu.refIdx[1] >= 0) { xPredInterUni ( pu, eRefPicList, pcMbBuf, true +#if JVET_L0256_BIO + ,bBIOApplied +#endif ); } else @@ -399,13 +473,19 @@ void InterPrediction::xPredInterBi(PredictionUnit& pu, PelUnitBuf &pcYuvPred) } else { +#if JVET_L0256_BIO + xWeightedAverage( pu, srcPred0, srcPred1, pcYuvPred, slice.getSPS()->getBitDepths(), slice.clpRngs(), bBIOApplied ); +#else xWeightedAverage( pu, srcPred0, srcPred1, pcYuvPred, slice.getSPS()->getBitDepths(), slice.clpRngs() ); +#endif } } - void InterPrediction::xPredInterBlk ( const ComponentID& compID, const PredictionUnit& pu, const Picture* refPic, const Mv& _mv, PelUnitBuf& dstPic, const bool& bi, const ClpRng& clpRng - ) +#if JVET_L0256_BIO + ,const bool& bBIOApplied /*=false*/ +#endif +) { JVET_J0090_SET_REF_PICTURE( refPic, compID ); const ChromaFormat chFmt = pu.chromaFormat; @@ -446,24 +526,75 @@ void InterPrediction::xPredInterBlk ( const ComponentID& compID, const Predictio refBuf = refPic->getRecoBuf( CompArea( compID, chFmt, offset, pu.blocks[compID].size() ) ); } +#if JVET_L0256_BIO + // backup data + int backupWidth = width; + int backupHeight = height; + Pel *backupDstBufPtr = dstBuf.buf; + int backupDstBufStride = dstBuf.stride; + + if (bBIOApplied && compID == COMPONENT_Y) + { + width = width + 2 * JVET_L0256_BIO_EXTEND_SIZE + 2; + height = height + 2 * JVET_L0256_BIO_EXTEND_SIZE + 2; + + // change MC output + dstBuf.stride = width; + dstBuf.buf = m_filteredBlockTmp[2 + m_iRefListIdx][compID] + 2 * dstBuf.stride + 2; + } +#endif + if( yFrac == 0 ) { +#if JVET_L0256_BIO + m_if.filterHor(compID, (Pel*)refBuf.buf, refBuf.stride, dstBuf.buf, dstBuf.stride, backupWidth, backupHeight, xFrac, rndRes, chFmt, clpRng); +#else m_if.filterHor(compID, (Pel*) refBuf.buf, 
refBuf.stride, dstBuf.buf, dstBuf.stride, width, height, xFrac, rndRes, chFmt, clpRng); +#endif } else if( xFrac == 0 ) { +#if JVET_L0256_BIO + m_if.filterVer(compID, (Pel*)refBuf.buf, refBuf.stride, dstBuf.buf, dstBuf.stride, backupWidth, backupHeight, yFrac, true, rndRes, chFmt, clpRng); +#else m_if.filterVer(compID, (Pel*) refBuf.buf, refBuf.stride, dstBuf.buf, dstBuf.stride, width, height, yFrac, true, rndRes, chFmt, clpRng); +#endif } else { - PelBuf tmpBuf = PelBuf(m_filteredBlockTmp[0][compID], pu.blocks[compID]); + PelBuf tmpBuf = PelBuf(m_filteredBlockTmp[0][compID], pu.blocks[compID]); +#if JVET_L0256_BIO + tmpBuf.stride = dstBuf.stride; +#endif int vFilterSize = isLuma(compID) ? NTAPS_LUMA : NTAPS_CHROMA; +#if JVET_L0256_BIO + m_if.filterHor(compID, (Pel*)refBuf.buf - ((vFilterSize >> 1) - 1) * refBuf.stride, refBuf.stride, tmpBuf.buf, tmpBuf.stride, backupWidth, backupHeight + vFilterSize - 1, xFrac, false, chFmt, clpRng); +#else m_if.filterHor(compID, (Pel*) refBuf.buf - ((vFilterSize >> 1) - 1) * refBuf.stride, refBuf.stride, tmpBuf.buf, tmpBuf.stride, width, height + vFilterSize - 1, xFrac, false, chFmt, clpRng); +#endif JVET_J0090_SET_CACHE_ENABLE( false ); +#if JVET_L0256_BIO + m_if.filterVer(compID, (Pel*)tmpBuf.buf + ((vFilterSize >> 1) - 1) * tmpBuf.stride, tmpBuf.stride, dstBuf.buf, dstBuf.stride, backupWidth, backupHeight, yFrac, false, rndRes, chFmt, clpRng); +#else m_if.filterVer(compID, (Pel*) tmpBuf.buf + ((vFilterSize >> 1) - 1) * tmpBuf.stride, tmpBuf.stride, dstBuf.buf, dstBuf.stride, width, height, yFrac, false, rndRes, chFmt, clpRng); +#endif JVET_J0090_SET_CACHE_ENABLE( true ); } +#if JVET_L0256_BIO + if (bBIOApplied && compID == COMPONENT_Y) + { + refBuf.buf = refBuf.buf - refBuf.stride - 1; + dstBuf.buf = m_filteredBlockTmp[2 + m_iRefListIdx][compID] + dstBuf.stride + 1; + bioSampleExtendBilinearFilter(refBuf.buf, refBuf.stride, dstBuf.buf, dstBuf.stride, width - 2, height - 2, 1, xFrac, yFrac, rndRes, chFmt, clpRng); + + // 
restore data + width = backupWidth; + height = backupHeight; + dstBuf.buf = backupDstBufPtr; + dstBuf.stride = backupDstBufStride; + } +#endif } void InterPrediction::xPredAffineBlk( const ComponentID& compID, const PredictionUnit& pu, const Picture* refPic, const Mv* _mv, PelUnitBuf& dstPic, const bool& bi, const ClpRng& clpRng ) @@ -612,8 +743,224 @@ int getMSB( unsigned x ) return msb; } +#if JVET_L0256_BIO +void InterPrediction::applyBiOptFlow(const PredictionUnit &pu, const CPelUnitBuf &pcYuvSrc0, const CPelUnitBuf &pcYuvSrc1, const int &iRefIdx0, const int &iRefIdx1, PelUnitBuf &pcYuvDst, const BitDepths &clipBitDepths) +{ + const int iHeight = pcYuvDst.Y().height; + const int iWidth = pcYuvDst.Y().width; + int iHeightG = iHeight + 2 * JVET_L0256_BIO_EXTEND_SIZE; + int iWidthG = iWidth + 2 * JVET_L0256_BIO_EXTEND_SIZE; + int offsetPos = iWidthG*JVET_L0256_BIO_EXTEND_SIZE + JVET_L0256_BIO_EXTEND_SIZE; + + Pel* pGradX0 = m_pGradX0; + Pel* pGradX1 = m_pGradX1; + Pel* pGradY0 = m_pGradY0; + Pel* pGradY1 = m_pGradY1; + + int stridePredMC = iWidthG + 2; + const Pel* pSrcY0 = m_filteredBlockTmp[2][COMPONENT_Y] + stridePredMC + 1; + const Pel* pSrcY1 = m_filteredBlockTmp[3][COMPONENT_Y] + stridePredMC + 1; + const int iSrc0Stride = stridePredMC; + const int iSrc1Stride = stridePredMC; + + Pel* pDstY = pcYuvDst.Y().buf; + const int iDstStride = pcYuvDst.Y().stride; + const Pel* pSrcY0Temp = pSrcY0; + const Pel* pSrcY1Temp = pSrcY1; + + for (int refList = 0; refList < NUM_REF_PIC_LIST_01; refList++) + { + Pel* dstTempPtr = m_filteredBlockTmp[2 + refList][COMPONENT_Y] + stridePredMC + 1; + Pel* pGradY = (refList == 0) ? m_pGradY0 : m_pGradY1; + Pel* pGradX = (refList == 0) ? 
m_pGradX0 : m_pGradX1; + + g_pelBufOP.bioGradFilter(dstTempPtr, stridePredMC, iWidthG, iHeightG, iWidthG, pGradX, pGradY); + Pel* pcPadStr = m_filteredBlockTmp[2 + refList][COMPONENT_Y] + 2 * stridePredMC + 2; + for (int y = 0; y< iHeight; y++) + { + pcPadStr[-1] = pcPadStr[0]; + pcPadStr[iWidth] = pcPadStr[iWidth - 1]; + pcPadStr += stridePredMC; + } + + pcPadStr = m_filteredBlockTmp[2 + refList][COMPONENT_Y] + 2 * stridePredMC + 1; + ::memcpy(pcPadStr - stridePredMC, pcPadStr, sizeof(Pel)*(iWidthG)); + ::memcpy(pcPadStr + iHeight*stridePredMC, pcPadStr + (iHeight - 1)*stridePredMC, sizeof(Pel)*(iWidthG)); + } + + const ClpRng& clpRng = pu.cu->cs->slice->clpRng(COMPONENT_Y); + const int bitDepth = clipBitDepths.recon[toChannelType(COMPONENT_Y)]; + const int shiftNum = IF_INTERNAL_PREC + 1 - bitDepth; + const int offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS; + const int limit = ((int)1 << (4 + IF_INTERNAL_PREC - bitDepth - 5)); + + int* m_piDotProductTemp1 = m_piDotProduct1; + int* m_piDotProductTemp2 = m_piDotProduct2; + int* m_piDotProductTemp3 = m_piDotProduct3; + int* m_piDotProductTemp5 = m_piDotProduct5; + int* m_piDotProductTemp6 = m_piDotProduct6; + + g_pelBufOP.calcBIOPar(pSrcY0Temp, pSrcY1Temp, pGradX0, pGradX1, pGradY0, pGradY1, m_piDotProductTemp1, m_piDotProductTemp2, m_piDotProductTemp3, m_piDotProductTemp5, m_piDotProductTemp6, iSrc0Stride, iSrc1Stride, iWidthG, iWidthG, iHeightG); + + int xUnit = (iWidth >> 2); + int yUnit = (iHeight >> 2); + + Pel *pDstY0 = pDstY; + pGradX0 = m_pGradX0; pGradX1 = m_pGradX1; + pGradY0 = m_pGradY0; pGradY1 = m_pGradY1; + + for (int yu = 0; yu < yUnit; yu++) + { + for (int xu = 0; xu < xUnit; xu++) + { + if (m_bioPredSubBlkDist[yu*xUnit + xu] < m_bioSubBlkDistThres) + { + pSrcY0Temp = pSrcY0 + (stridePredMC + 1) + ((yu*iSrc0Stride + xu) << 2); + pSrcY1Temp = pSrcY1 + (stridePredMC + 1) + ((yu*iSrc1Stride + xu) << 2); + pDstY0 = pDstY + ((yu*iDstStride + xu) << 2); + g_pelBufOP.addAvg4(pSrcY0Temp, iSrc0Stride, 
pSrcY1Temp, iSrc1Stride, pDstY0, iDstStride, (1 << 2), (1 << 2), shiftNum, offset, clpRng); + continue; + } + + int sGxdI = 0, sGydI = 0, sGxGy = 0, sGx2 = 0, sGy2 = 0; + int tmpx = 0, tmpy = 0; + m_piDotProductTemp1 = m_piDotProduct1 + offsetPos + ((yu*iWidthG + xu) << 2); + m_piDotProductTemp2 = m_piDotProduct2 + offsetPos + ((yu*iWidthG + xu) << 2); + m_piDotProductTemp3 = m_piDotProduct3 + offsetPos + ((yu*iWidthG + xu) << 2); + m_piDotProductTemp5 = m_piDotProduct5 + offsetPos + ((yu*iWidthG + xu) << 2); + m_piDotProductTemp6 = m_piDotProduct6 + offsetPos + ((yu*iWidthG + xu) << 2); + + g_pelBufOP.calcBlkGradient(xu << 2, yu << 2, m_piDotProductTemp1, m_piDotProductTemp2, m_piDotProductTemp3, m_piDotProductTemp5, m_piDotProductTemp6, sGx2, sGy2, sGxGy, sGxdI, sGydI, iWidthG, iHeightG, (1 << 2)); + + if (sGx2 > 0) + { + tmpx = rightShiftMSB(sGxdI << 3, sGx2); + tmpx = Clip3(-limit, limit, tmpx); + } + if (sGy2 > 0) + { + int mainsGxGy = sGxGy >> 12; + int secsGxGy = sGxGy & ((1 << 12) - 1); + int tmpData = tmpx * mainsGxGy; + tmpData = ((tmpData << 12) + tmpx*secsGxGy) >> 1; + tmpy = rightShiftMSB(((sGydI << 3) - tmpData), sGy2); + tmpy = Clip3(-limit, limit, tmpy); + } + + pSrcY0Temp = pSrcY0 + (stridePredMC + 1) + ((yu*iSrc0Stride + xu) << 2); + pSrcY1Temp = pSrcY1 + (stridePredMC + 1) + ((yu*iSrc0Stride + xu) << 2); + pGradX0 = m_pGradX0 + offsetPos + ((yu*iWidthG + xu) << 2); + pGradX1 = m_pGradX1 + offsetPos + ((yu*iWidthG + xu) << 2); + pGradY0 = m_pGradY0 + offsetPos + ((yu*iWidthG + xu) << 2); + pGradY1 = m_pGradY1 + offsetPos + ((yu*iWidthG + xu) << 2); + + pDstY0 = pDstY + ((yu*iDstStride + xu) << 2); + g_pelBufOP.addBIOAvg4(pSrcY0Temp, iSrc0Stride, pSrcY1Temp, iSrc1Stride, pDstY0, iDstStride, pGradX0, pGradX1, pGradY0, pGradY1, iWidthG, (1 << 2), (1 << 2), (int)tmpx, (int)tmpy, shiftNum, offset, clpRng); + } // xu + } // yu +} + +void InterPrediction::bioSampleExtendBilinearFilter(Pel const* src, int srcStride, Pel *dst, int dstStride, int width, int 
height, int dim, int fracX, int fracY, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng) +{ + Pel const* pSrc = NULL; + Pel* pDst = NULL; + + int vFilterSize = NTAPS_LUMA; + int widthTmp = 0; + int heightTmp = 0; + + for (int cand = 0; cand < 4; cand++) // top, left, bottom and right + { + + if (cand == 0) // top + { + pSrc = src; + pDst = dst; + widthTmp = width; + heightTmp = dim; + } + else if (cand == 1) // left + { + pSrc = src + dim*srcStride; + pDst = dst + dim*dstStride; + widthTmp = dim; + heightTmp = height - 2 * dim; + } + else if (cand == 2) // bottom + { + pSrc = src + (height - dim)*srcStride; + pDst = dst + (height - dim)*dstStride; + widthTmp = width; + heightTmp = dim; + } + else if (cand == 3) // right + { + pSrc = src + dim*srcStride + width - dim; + pDst = dst + dim*dstStride + width - dim; + widthTmp = dim; + heightTmp = height - 2 * dim; + } + + if (fracY == 0) + { + m_if.filterHor(COMPONENT_Y, pSrc, srcStride, pDst, dstStride, widthTmp, heightTmp, fracX, isLast, fmt, clpRng, 1); + } + else if (fracX == 0) + { + m_if.filterVer(COMPONENT_Y, pSrc, srcStride, pDst, dstStride, widthTmp, heightTmp, fracY, true, isLast, fmt, clpRng, 1); + } + else + { + PelBuf tmpBuf = PelBuf(m_filteredBlockTmp[0][COMPONENT_Y], Size(width, height)); + tmpBuf.stride = width; + + m_if.filterHor(COMPONENT_Y, pSrc - ((vFilterSize >> 1) - 1) * srcStride, srcStride, tmpBuf.buf, tmpBuf.stride, widthTmp, heightTmp + vFilterSize - 1, fracX, false, fmt, clpRng, 1); + m_if.filterVer(COMPONENT_Y, tmpBuf.buf + ((vFilterSize >> 1) - 1) * tmpBuf.stride, tmpBuf.stride, pDst, dstStride, widthTmp, heightTmp, fracY, false, isLast, fmt, clpRng, 1); + } + } +} + +bool InterPrediction::xCalcBiPredSubBlkDist(const PredictionUnit &pu, const Pel* pYuvSrc0, const int src0Stride, const Pel* pYuvSrc1, const int src1Stride, const BitDepths &clipBitDepths) +{ + const int width = pu.lwidth(); + const int height = pu.lheight(); + const int clipbd = 
clipBitDepths.recon[toChannelType(COMPONENT_Y)]; + const uint32_t distortionShift = DISTORTION_PRECISION_ADJUSTMENT(clipbd); + const int shift = std::max<int>(2, (IF_INTERNAL_PREC - clipbd)); + const int xUnit = (width >> 2); + const int yUnit = (height >> 2); + + m_bioDistThres = (shift <= 5) ? (((32 << (clipbd - 8))*width*height) >> (5 - shift)) : (((32 << (clipbd - 8))*width*height) << (shift - 5)); + m_bioSubBlkDistThres = (shift <= 5) ? (((64 << (clipbd - 8)) << 4) >> (5 - shift)) : (((64 << (clipbd - 8)) << 4) << (shift - 5)); + + m_bioDistThres >>= distortionShift; + m_bioSubBlkDistThres >>= distortionShift; + + DistParam cDistParam; + Distortion dist = 0; + for (int yu = 0, blkIdx = 0; yu < yUnit; yu++) + { + for (int xu = 0; xu < xUnit; xu++, blkIdx++) + { + const Pel* pPred0 = pYuvSrc0 + ((yu*src0Stride + xu) << 2); + const Pel* pPred1 = pYuvSrc1 + ((yu*src1Stride + xu) << 2); + + m_pcRdCost->setDistParam(cDistParam, pPred0, pPred1, src0Stride, src1Stride, clipbd, COMPONENT_Y, (1 << 2), (1 << 2), 0, 1, false, true); + m_bioPredSubBlkDist[blkIdx] = cDistParam.distFunc(cDistParam); + dist += m_bioPredSubBlkDist[blkIdx]; + } + } + + return (dist >= m_bioDistThres); +} +#endif + +#if JVET_L0256_BIO +void InterPrediction::xWeightedAverage(const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths, const ClpRngs& clpRngs, const bool& bBIOApplied ) +#else void InterPrediction::xWeightedAverage( const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths, const ClpRngs& clpRngs ) +#endif { const int iRefIdx0 = pu.refIdx[0]; const int iRefIdx1 = pu.refIdx[1]; @@ -623,11 +970,35 @@ void InterPrediction::xWeightedAverage( const PredictionUnit& pu, const CPelUnit #if JVET_L0646_GBI if( pu.cu->GBiIdx != GBI_DEFAULT ) { +#if JVET_L0256_BIO + CHECK(bBIOApplied, "GBi is disallowed with BIO"); +#endif 
pcYuvDst.addWeightedAvg(pcYuvSrc0, pcYuvSrc1, clpRngs, pu.cu->GBiIdx); return; } #endif +#if JVET_L0256_BIO + if (bBIOApplied) + { + const int src0Stride = pu.lwidth() + 2 * JVET_L0256_BIO_EXTEND_SIZE + 2; + const int src1Stride = pu.lwidth() + 2 * JVET_L0256_BIO_EXTEND_SIZE + 2; + const Pel* pSrcY0 = m_filteredBlockTmp[2][COMPONENT_Y] + 2 * src0Stride + 2; + const Pel* pSrcY1 = m_filteredBlockTmp[3][COMPONENT_Y] + 2 * src1Stride + 2; + + bool bioEnabled = xCalcBiPredSubBlkDist(pu, pSrcY0, src0Stride, pSrcY1, src1Stride, clipBitDepths); + if (bioEnabled) + { + applyBiOptFlow(pu, pcYuvSrc0, pcYuvSrc1, iRefIdx0, iRefIdx1, pcYuvDst, clipBitDepths); + } + else + { + pcYuvDst.bufs[0].addAvg(CPelBuf(pSrcY0, src0Stride, pu.lumaSize()), CPelBuf(pSrcY1, src1Stride, pu.lumaSize()), clpRngs.comp[0]); + } + } + pcYuvDst.addAvg(pcYuvSrc0, pcYuvSrc1, clpRngs, bBIOApplied); +#else pcYuvDst.addAvg( pcYuvSrc0, pcYuvSrc1, clpRngs ); +#endif } else if( iRefIdx0 >= 0 && iRefIdx1 < 0 ) { @@ -694,8 +1065,25 @@ void InterPrediction::motionCompensation( PredictionUnit &pu, const RefPicList & ); } +#if JVET_L0256_BIO +int InterPrediction::rightShiftMSB(int numer, int denom) +{ + int d; + int msbIdx = 0; + for (msbIdx = 0; msbIdx<32; msbIdx++) + { + if (denom < ((int)1 << msbIdx)) + { + break; + } + } + int shiftIdx = msbIdx - 1; + d = (numer >> shiftIdx); + return d; +} +#endif #if JVET_J0090_MEMORY_BANDWITH_MEASURE void InterPrediction::cacheAssign( CacheModel *cache ) diff --git a/source/Lib/CommonLib/InterPrediction.h b/source/Lib/CommonLib/InterPrediction.h index c58fed664..01b13c84e 100644 --- a/source/Lib/CommonLib/InterPrediction.h +++ b/source/Lib/CommonLib/InterPrediction.h @@ -60,10 +60,25 @@ class Mv; // Class definition // ==================================================================================================================== +#if JVET_L0256_BIO +#define BIO_TEMP_BUFFER_SIZE ( MAX_CU_SIZE+2*JVET_L0256_BIO_EXTEND_SIZE ) * ( MAX_CU_SIZE+2*JVET_L0256_BIO_EXTEND_SIZE ) 
+#endif + class InterPrediction : public WeightPrediction { private: +#if JVET_L0256_BIO + Distortion m_bioDistThres; + Distortion m_bioSubBlkDistThres; + Distortion m_bioPredSubBlkDist[MAX_NUM_PARTS_IN_CTU]; + + int m_piDotProduct1[BIO_TEMP_BUFFER_SIZE]; + int m_piDotProduct2[BIO_TEMP_BUFFER_SIZE]; + int m_piDotProduct3[BIO_TEMP_BUFFER_SIZE]; + int m_piDotProduct5[BIO_TEMP_BUFFER_SIZE]; + int m_piDotProduct6[BIO_TEMP_BUFFER_SIZE]; +#endif protected: InterpolationFilter m_if; @@ -80,15 +95,37 @@ protected: RdCost* m_pcRdCost; int m_iRefListIdx; - + +#if JVET_L0256_BIO + Pel* m_pGradX0; + Pel* m_pGradY0; + Pel* m_pGradX1; + Pel* m_pGradY1; + bool m_subPuMC; + + int rightShiftMSB(int numer, int denom); + void applyBiOptFlow(const PredictionUnit &pu, const CPelUnitBuf &pcYuvSrc0, const CPelUnitBuf &pcYuvSrc1, const int &iRefIdx0, const int &iRefIdx1, PelUnitBuf &pcYuvDst, const BitDepths &clipBitDepths); + bool xCalcBiPredSubBlkDist(const PredictionUnit &pu, const Pel* pYuvSrc0, const int src0Stride, const Pel* pYuvSrc1, const int src1Stride, const BitDepths &clipBitDepths); + void bioSampleExtendBilinearFilter(Pel const* src, int srcStride, Pel *dst, int dstStride, int width, int height, int dim, int fracX, int fracY, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng); +#endif void xPredInterUni ( const PredictionUnit& pu, const RefPicList& eRefPicList, PelUnitBuf& pcYuvPred, const bool& bi +#if JVET_L0256_BIO + ,const bool& bBIOApplied = false +#endif ); void xPredInterBi ( PredictionUnit& pu, PelUnitBuf &pcYuvPred ); void xPredInterBlk ( const ComponentID& compID, const PredictionUnit& pu, const Picture* refPic, const Mv& _mv, PelUnitBuf& dstPic, const bool& bi, const ClpRng& clpRng +#if JVET_L0256_BIO + ,const bool& bBIOApplied = false +#endif ); - + +#if JVET_L0256_BIO + void xWeightedAverage ( const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths, const ClpRngs& clpRngs, 
const bool& bBIOApplied ); +#else void xWeightedAverage ( const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths, const ClpRngs& clpRngs ); +#endif void xPredAffineBlk( const ComponentID& compID, const PredictionUnit& pu, const Picture* refPic, const Mv* _mv, PelUnitBuf& dstPic, const bool& bi, const ClpRng& clpRng ); static bool xCheckIdenticalMotion( const PredictionUnit& pu ); diff --git a/source/Lib/CommonLib/InterpolationFilter.cpp b/source/Lib/CommonLib/InterpolationFilter.cpp index 32e4d9d75..abcef170f 100644 --- a/source/Lib/CommonLib/InterpolationFilter.cpp +++ b/source/Lib/CommonLib/InterpolationFilter.cpp @@ -111,6 +111,28 @@ const TFilterCoeff InterpolationFilter::m_chromaFilter[CHROMA_INTERPOLATION_FILT { 0, 2, 63, -1 }, }; +#if JVET_L0256_BIO +const TFilterCoeff InterpolationFilter::m_bilinearFilter[LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS << VCEG_AZ07_MV_ADD_PRECISION_BIT_FOR_STORE][NTAPS_BILINEAR] = +{ + { 64, 0, }, + { 60, 4, }, + { 56, 8, }, + { 52, 12, }, + { 48, 16, }, + { 44, 20, }, + { 40, 24, }, + { 36, 28, }, + { 32, 32, }, + { 28, 36, }, + { 24, 40, }, + { 20, 44, }, + { 16, 48, }, + { 12, 52, }, + { 8, 56, }, + { 4, 60, }, +}; +#endif + // ==================================================================================================================== // Private member functions // ==================================================================================================================== @@ -443,7 +465,11 @@ void InterpolationFilter::filterVer(const ClpRng& clpRng, Pel const *src, int sr * \param fmt Chroma format * \param bitDepth Bit depth */ +#if JVET_L0256_BIO +void InterpolationFilter::filterHor( const ComponentID compID, Pel const *src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng, int nFilterIdx ) +#else void InterpolationFilter::filterHor( const 
ComponentID compID, Pel const *src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng ) +#endif { if( frac == 0 ) { @@ -452,6 +478,13 @@ void InterpolationFilter::filterHor( const ComponentID compID, Pel const *src, i else if( isLuma( compID ) ) { CHECK( frac < 0 || frac >= ( LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS << VCEG_AZ07_MV_ADD_PRECISION_BIT_FOR_STORE ), "Invalid fraction" ); +#if JVET_L0256_BIO + if( nFilterIdx == 1 ) + { + filterHor<NTAPS_BILINEAR>(clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_bilinearFilter[frac]); + } + else +#endif { filterHor<NTAPS_LUMA>( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_lumaFilter[frac] ); } @@ -481,7 +514,11 @@ void InterpolationFilter::filterHor( const ComponentID compID, Pel const *src, i * \param fmt Chroma format * \param bitDepth Bit depth */ +#if JVET_L0256_BIO +void InterpolationFilter::filterVer( const ComponentID compID, Pel const *src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac, bool isFirst, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng, int nFilterIdx) +#else void InterpolationFilter::filterVer( const ComponentID compID, Pel const *src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac, bool isFirst, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng ) +#endif { if( frac == 0 ) { @@ -490,6 +527,13 @@ void InterpolationFilter::filterVer( const ComponentID compID, Pel const *src, i else if( isLuma( compID ) ) { CHECK( frac < 0 || frac >= ( LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS << VCEG_AZ07_MV_ADD_PRECISION_BIT_FOR_STORE ), "Invalid fraction" ); +#if JVET_L0256_BIO + if (nFilterIdx == 1) + { + filterVer<NTAPS_BILINEAR>(clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_bilinearFilter[frac]); + } + else +#endif { filterVer<NTAPS_LUMA>( clpRng, src, srcStride, dst, dstStride, width, height, 
isFirst, isLast, m_lumaFilter[frac] ); } diff --git a/source/Lib/CommonLib/InterpolationFilter.h b/source/Lib/CommonLib/InterpolationFilter.h index 4535b6bc5..4f246d9be 100644 --- a/source/Lib/CommonLib/InterpolationFilter.h +++ b/source/Lib/CommonLib/InterpolationFilter.h @@ -56,6 +56,9 @@ class InterpolationFilter { static const TFilterCoeff m_lumaFilter[LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS << VCEG_AZ07_MV_ADD_PRECISION_BIT_FOR_STORE][NTAPS_LUMA]; ///< Luma filter taps static const TFilterCoeff m_chromaFilter[CHROMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS << VCEG_AZ07_MV_ADD_PRECISION_BIT_FOR_STORE][NTAPS_CHROMA]; ///< Chroma filter taps +#if JVET_L0256_BIO + static const TFilterCoeff m_bilinearFilter[LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS << VCEG_AZ07_MV_ADD_PRECISION_BIT_FOR_STORE][NTAPS_BILINEAR]; ///< bilinear filter taps +#endif public: template<bool isFirst, bool isLast> static void filterCopy( const ClpRng& clpRng, const Pel *src, int srcStride, Pel *dst, int dstStride, int width, int height ); @@ -87,8 +90,13 @@ public: void _initInterpolationFilterX86(); #endif +#if JVET_L0256_BIO + void filterHor(const ComponentID compID, Pel const* src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng, int nFilterIdx = 0); + void filterVer(const ComponentID compID, Pel const* src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac, bool isFirst, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng, int nFilterIdx = 0); +#else void filterHor(const ComponentID compID, Pel const* src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng ); void filterVer(const ComponentID compID, Pel const* src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac, bool isFirst, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng ); +#endif #if 
JVET_J0090_MEMORY_BANDWITH_MEASURE void cacheAssign( CacheModel *cache ) { m_cacheModel = cache; } #endif diff --git a/source/Lib/CommonLib/RdCost.cpp b/source/Lib/CommonLib/RdCost.cpp index 20119d9a8..85306088a 100644 --- a/source/Lib/CommonLib/RdCost.cpp +++ b/source/Lib/CommonLib/RdCost.cpp @@ -164,6 +164,10 @@ void RdCost::init() m_afpDistortFunc[DF_SSE16N_WTD] = RdCost::xGetSSE16N_WTD; #endif +#if JVET_L0256_BIO + m_afpDistortFunc[DF_SAD_INTERMEDIATE_BITDEPTH] = RdCost::xGetSAD; +#endif + #if ENABLE_SIMD_OPT_DIST #ifdef TARGET_SIMD_X86 initRdCostX86(); @@ -318,7 +322,11 @@ void RdCost::setDistParam( DistParam &rcDP, const CPelBuf &org, const CPelBuf &c rcDP.maximumDistortionForEarlyExit = std::numeric_limits<Distortion>::max(); } +#if JVET_L0256_BIO +void RdCost::setDistParam( DistParam &rcDP, const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShiftMode, int step, bool useHadamard, bool bBIOApplied ) +#else void RdCost::setDistParam( DistParam &rcDP, const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShiftMode, int step, bool useHadamard ) +#endif { rcDP.bitDepth = bitDepth; rcDP.compID = compID; @@ -339,6 +347,14 @@ void RdCost::setDistParam( DistParam &rcDP, const Pel* pOrg, const Pel* piRefY, CHECK( useHadamard || rcDP.useMR || subShiftMode > 0, "only used in xDirectMCCost with these default parameters (so far...)" ); +#if JVET_L0256_BIO + if ( bBIOApplied ) + { + rcDP.distFunc = m_afpDistortFunc[ DF_SAD_INTERMEDIATE_BITDEPTH ]; + return; + } +#endif + if( width == 12 ) { rcDP.distFunc = m_afpDistortFunc[ DF_SAD12 ]; diff --git a/source/Lib/CommonLib/RdCost.h b/source/Lib/CommonLib/RdCost.h index 4e79040c2..07b9f6939 100644 --- a/source/Lib/CommonLib/RdCost.h +++ b/source/Lib/CommonLib/RdCost.h @@ -102,6 +102,9 @@ private: // for distortion static FpDistFunc m_afpDistortFunc[DF_TOTAL_FUNCTIONS]; // 
[eDFunc] +#if JVET_L0256_BIO + +#endif CostMode m_costMode; double m_distortionWeight[MAX_NUM_COMPONENT]; // only chroma values are used. double m_dLambda; @@ -154,7 +157,11 @@ public: void setDistParam( DistParam &rcDP, const CPelBuf &org, const Pel* piRefY , int iRefStride, int bitDepth, ComponentID compID, int subShiftMode = 0, int step = 1, bool useHadamard = false ); void setDistParam( DistParam &rcDP, const CPelBuf &org, const CPelBuf &cur, int bitDepth, ComponentID compID, bool useHadamard = false ); +#if JVET_L0256_BIO + void setDistParam( DistParam &rcDP, const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShiftMode = 0, int step = 1, bool useHadamard = false, bool bBIOApplied = false ); +#else void setDistParam( DistParam &rcDP, const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShiftMode = 0, int step = 1, bool useHadamard = false ); +#endif double getMotionLambda ( bool bIsTransquantBypass ) { return m_dLambdaMotionSAD[(bIsTransquantBypass && m_costMode==COST_MIXED_LOSSLESS_LOSSY_CODING)?1:0]; } void selectMotionLambda ( bool bIsTransquantBypass ) { m_motionLambda = getMotionLambda( bIsTransquantBypass ); } @@ -266,6 +273,10 @@ private: static Distortion xGetSAD_SIMD ( const DistParam& pcDtParam ); template< int iWidth, X86_VEXT vext > static Distortion xGetSAD_NxN_SIMD( const DistParam& pcDtParam ); +#if JVET_L0256_BIO + template< X86_VEXT vext > + static Distortion xGetSAD_IBD_SIMD(const DistParam& pcDtParam); +#endif template< typename Torg, typename Tcur, X86_VEXT vext > static Distortion xGetHADs_SIMD ( const DistParam& pcDtParam ); diff --git a/source/Lib/CommonLib/Slice.cpp b/source/Lib/CommonLib/Slice.cpp index 6023e01bc..4bd5fc727 100644 --- a/source/Lib/CommonLib/Slice.cpp +++ b/source/Lib/CommonLib/Slice.cpp @@ -1632,6 +1632,9 @@ SPSNext::SPSNext( SPS& sps ) , m_IMV ( false ) #if 
!REMOVE_MV_ADAPT_PREC , m_highPrecMv ( false ) +#endif +#if JVET_L0256_BIO + , m_BIO ( false ) #endif , m_DisableMotionCompression ( false ) , m_LMChroma ( false ) diff --git a/source/Lib/CommonLib/Slice.h b/source/Lib/CommonLib/Slice.h index d6ef610c7..f122ab69d 100644 --- a/source/Lib/CommonLib/Slice.h +++ b/source/Lib/CommonLib/Slice.h @@ -803,6 +803,9 @@ private: bool m_IMV; // 9 #if !REMOVE_MV_ADAPT_PREC bool m_highPrecMv; +#endif +#if JVET_L0256_BIO + bool m_BIO; #endif bool m_DisableMotionCompression; // 13 bool m_LMChroma; // 17 @@ -865,6 +868,10 @@ public: #if !REMOVE_MV_ADAPT_PREC void setUseHighPrecMv(bool b) { m_highPrecMv = b; } bool getUseHighPrecMv() const { return m_highPrecMv; } +#endif +#if JVET_L0256_BIO + void setUseBIO(bool b) { m_BIO = b; } + bool getUseBIO() const { return m_BIO; } #endif void setDisableMotCompress ( bool b ) { m_DisableMotionCompression = b; } bool getDisableMotCompress () const { return m_DisableMotionCompression; } diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h index d8e34d3bb..afc72b3b0 100644 --- a/source/Lib/CommonLib/TypeDef.h +++ b/source/Lib/CommonLib/TypeDef.h @@ -69,16 +69,10 @@ #define JVET_L0274 1 #define JVET_L0274_ENCODER_SPEED_UP ( 1 && JVET_L0274 ) // encoder speed-up by pre-calculating position dependent parameters - - - - - - - - - - +#define JVET_L0256_BIO 1 +#if JVET_L0256_BIO +#define JVET_L0256_BIO_EXTEND_SIZE 1 +#endif #define JVET_L0646_GBI 1 // Generalized bi-prediction (GBi) @@ -549,7 +543,13 @@ enum DFunc DF_DEFAULT_ORI = DF_SSE_WTD+8, #endif +#if JVET_L0256_BIO + DF_SAD_INTERMEDIATE_BITDEPTH = 63, + + DF_TOTAL_FUNCTIONS = 64 +#else DF_TOTAL_FUNCTIONS = 63 +#endif }; /// motion vector predictor direction used in AMVP diff --git a/source/Lib/CommonLib/x86/BufferX86.h b/source/Lib/CommonLib/x86/BufferX86.h index 34d2cb7b4..1add0996f 100644 --- a/source/Lib/CommonLib/x86/BufferX86.h +++ b/source/Lib/CommonLib/x86/BufferX86.h @@ -128,6 +128,308 @@ void addAvg_SSE( const 
int16_t* src0, int src0Stride, const int16_t* src1, int s } } +#if JVET_L0256_BIO +template< X86_VEXT vext > /* BIO bi-prediction average, four samples per step: the correction is tmpx * (gradX0 - gradX1) added to tmpy * (gradY0 - gradY1), rounded and halved; it is summed with src0, src1 and offset, shifted right by shift and clamped to [clpRng.min, clpRng.max]. */ +void addBIOAvg4_SSE(const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, const Pel *pGradX0, const Pel *pGradX1, const Pel *pGradY0, const Pel*pGradY1, int iGradStride, int width, int height, int tmpx, int tmpy, int shift, int offset, const ClpRng& clpRng) +{ + __m128i mm_tmpx = _mm_unpacklo_epi64(_mm_set1_epi16(tmpx), _mm_set1_epi16(tmpy)); /* 16-bit lanes 0-3 carry tmpx, lanes 4-7 carry tmpy */ + __m128i mm_boffset = _mm_set1_epi32(1); + __m128i mm_offset = _mm_set1_epi32(offset); + __m128i vibdimin = _mm_set1_epi16(clpRng.min); + __m128i vibdimax = _mm_set1_epi16(clpRng.max); + + for (int y = 0; y < height; y++) + { + for (int x = 0; x < width; x += 4) + { + __m128i mm_a = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(pGradX0 + x)), _mm_loadl_epi64((const __m128i *)(pGradY0 + x))); + __m128i mm_b = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(pGradX1 + x)), _mm_loadl_epi64((const __m128i *)(pGradY1 + x))); + mm_a = _mm_sub_epi16(mm_a, mm_b); /* lanes 0-3: gradX0 - gradX1, lanes 4-7: gradY0 - gradY1 (16-bit arithmetic) */ + mm_b = _mm_mulhi_epi16(mm_a, mm_tmpx); + mm_a = _mm_mullo_epi16(mm_a, mm_tmpx); + + __m128i mm_sum = _mm_add_epi32(_mm_unpacklo_epi16(mm_a, mm_b), _mm_unpackhi_epi16(mm_a, mm_b)); /* mulhi/mullo halves interleaved back into 32-bit products, then the x and y products of each sample are added */ + mm_sum = _mm_srai_epi32(_mm_add_epi32(mm_sum, mm_boffset), 1); /* round and halve */ + mm_a = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)(src0 + x))); + mm_b = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)(src1 + x))); + mm_sum = _mm_add_epi32(_mm_add_epi32(mm_sum, mm_a), _mm_add_epi32(mm_b, mm_offset)); + mm_sum = _mm_packs_epi32(_mm_srai_epi32(mm_sum, shift), mm_a); + mm_sum = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, mm_sum)); + _mm_storel_epi64((__m128i *)(dst + x), mm_sum); /* writes the low four 16-bit results */ + } + dst += dstStride; src0 += src0Stride; src1 += src1Stride; + pGradX0 += iGradStride; pGradX1 += iGradStride; pGradY0 += iGradStride; pGradY1 += iGradStride; + } +} + +template< X86_VEXT vext > /* Central-difference gradients for the inner region, which is the iWidth by iHeight area shrunk by JVET_L0256_BIO_EXTEND_SIZE on every side: gradX is (right - left) >> 4 and gradY is (bottom - top) >> 4. Afterwards the outermost inner column and row are copied into the surrounding one-sample border. */ +void gradFilter_SSE(Pel* piSrc, int iSrcStride, int iWidth, int
iHeight, int iGradStride, Pel* piGradX, Pel* piGradY) +{ + __m128i vzero = _mm_setzero_si128(); + Pel* piSrcTmp = piSrc + iSrcStride + 1; /* first inner sample: one row down, one column in */ + Pel* piGradXTmp = piGradX + iGradStride + 1; + Pel* piGradYTmp = piGradY + iGradStride + 1; + + int iWidthInside = iWidth - 2 * JVET_L0256_BIO_EXTEND_SIZE; + int iHeightInside = iHeight - 2 * JVET_L0256_BIO_EXTEND_SIZE; + + assert((iWidthInside & 3) == 0); /* the inner loop consumes four samples per step */ + + for (int y = 0; y < iHeightInside; y++) + { + int x = 0; + for (; x < iWidthInside; x += 4) + { + __m128i mmPixTop = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(piSrcTmp + x - iSrcStride))); + __m128i mmPixBottom = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(piSrcTmp + x + iSrcStride))); + __m128i mmPixLeft = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(piSrcTmp + x - 1))); + __m128i mmPixRight = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(piSrcTmp + x + 1))); + + __m128i mmGradVer = _mm_srai_epi32(_mm_sub_epi32(mmPixBottom, mmPixTop), 4); /* (bottom - top) >> 4 */ + __m128i mmGradHor = _mm_srai_epi32(_mm_sub_epi32(mmPixRight, mmPixLeft), 4); /* (right - left) >> 4 */ + mmGradVer = _mm_packs_epi32(mmGradVer, vzero); + mmGradHor = _mm_packs_epi32(mmGradHor, vzero); + + _mm_storel_epi64((__m128i *)(piGradYTmp + x), mmGradVer); + _mm_storel_epi64((__m128i *)(piGradXTmp + x), mmGradHor); + } + + piGradXTmp += iGradStride; + piGradYTmp += iGradStride; + piSrcTmp += iSrcStride; + } + + piGradXTmp = piGradX + iGradStride + 1; + piGradYTmp = piGradY + iGradStride + 1; + for (int y = 0; y < iHeightInside; y++) + { + piGradXTmp[-1] = piGradXTmp[0]; /* left border column copies the first inner column */ + piGradXTmp[iWidthInside] = piGradXTmp[iWidthInside - 1]; /* right border column copies the last inner column */ + piGradXTmp += iGradStride; + + piGradYTmp[-1] = piGradYTmp[0]; + piGradYTmp[iWidthInside] = piGradYTmp[iWidthInside - 1]; + piGradYTmp += iGradStride; + } + + piGradXTmp = piGradX + iGradStride; + piGradYTmp = piGradY + iGradStride; + ::memcpy(piGradXTmp - iGradStride, piGradXTmp, sizeof(Pel)*(iWidth)); /* top border row copies the first inner row */ + ::memcpy(piGradXTmp + iHeightInside*iGradStride, piGradXTmp + (iHeightInside - 1)*iGradStride, sizeof(Pel)*(iWidth)); /* bottom border row copies the last inner row */
+ ::memcpy(piGradYTmp - iGradStride, piGradYTmp, sizeof(Pel)*(iWidth)); + ::memcpy(piGradYTmp + iHeightInside*iGradStride, piGradYTmp + (iHeightInside - 1)*iGradStride, sizeof(Pel)*(iWidth)); +} + +template< X86_VEXT vext > /* Per-sample BIO products for an iWidthG by iHeightG area. With tX = (gradX0 plus gradX1) >> 3, tY = (gradY0 plus gradY1) >> 3 and d = (srcY1 >> 6) - (srcY0 >> 6), the outputs are Temp1 = tX * tX, Temp2 = tX * tY, Temp3 = tX * d, Temp5 = tY * tY and Temp6 = tY * d. Output rows are iWidthG ints apart. */ +void calcBIOPar_SSE(const Pel* pSrcY0Temp, const Pel* pSrcY1Temp, const Pel* pGradX0, const Pel* pGradX1, const Pel* pGradY0, const Pel* pGradY1, int* m_piDotProductTemp1, int* m_piDotProductTemp2, int* m_piDotProductTemp3, int* m_piDotProductTemp5, int* m_piDotProductTemp6, const int iSrc0Stride, const int iSrc1Stride, const int iGradStride, const int iWidthG, const int iHeightG) +{ + for (int y = 0; y < iHeightG; y++) + { + int x = 0; + for (; x < ((iWidthG >> 3) << 3); x += 8) /* eight samples per step */ + { + __m128i mmSrcY0Temp = _mm_srai_epi16(_mm_loadu_si128((__m128i*)(pSrcY0Temp + x)), 6); + __m128i mmSrcY1Temp = _mm_srai_epi16(_mm_loadu_si128((__m128i*)(pSrcY1Temp + x)), 6); + __m128i mmGradX0 = _mm_loadu_si128((__m128i*)(pGradX0 + x)); + __m128i mmGradX1 = _mm_loadu_si128((__m128i*)(pGradX1 + x)); + __m128i mmGradY0 = _mm_loadu_si128((__m128i*)(pGradY0 + x)); + __m128i mmGradY1 = _mm_loadu_si128((__m128i*)(pGradY1 + x)); + + __m128i mmTemp1 = _mm_sub_epi16(mmSrcY1Temp, mmSrcY0Temp); /* (srcY1 >> 6) - (srcY0 >> 6) */ + __m128i mmTempX = _mm_srai_epi16(_mm_add_epi16(mmGradX0, mmGradX1), 3); + __m128i mmTempY = _mm_srai_epi16(_mm_add_epi16(mmGradY0, mmGradY1), 3); + + // m_piDotProductTemp1 + __m128i mm_b = _mm_mulhi_epi16(mmTempX, mmTempX); + __m128i mm_a = _mm_mullo_epi16(mmTempX, mmTempX); + + __m128i mm_l = _mm_unpacklo_epi16(mm_a, mm_b); + __m128i mm_h = _mm_unpackhi_epi16(mm_a, mm_b); + + _mm_storeu_si128((__m128i *)(m_piDotProductTemp1 + x), mm_l); + _mm_storeu_si128((__m128i *)(m_piDotProductTemp1 + x + 4), mm_h); + + // m_piDotProductTemp2 + mm_b = _mm_mulhi_epi16(mmTempX, mmTempY); + mm_a = _mm_mullo_epi16(mmTempX, mmTempY); + + mm_l = _mm_unpacklo_epi16(mm_a, mm_b); + mm_h = _mm_unpackhi_epi16(mm_a, mm_b); + + _mm_storeu_si128((__m128i *)(m_piDotProductTemp2 + x), mm_l); +
_mm_storeu_si128((__m128i *)(m_piDotProductTemp2 + x + 4), mm_h); + + // m_piDotProductTemp3 + mm_b = _mm_mulhi_epi16(mmTempX, mmTemp1); + mm_a = _mm_mullo_epi16(mmTempX, mmTemp1); + + mm_l = _mm_unpacklo_epi16(mm_a, mm_b); + mm_h = _mm_unpackhi_epi16(mm_a, mm_b); + + _mm_storeu_si128((__m128i *)(m_piDotProductTemp3 + x), mm_l); + _mm_storeu_si128((__m128i *)(m_piDotProductTemp3 + x + 4), mm_h); + + // m_piDotProductTemp5 + mm_b = _mm_mulhi_epi16(mmTempY, mmTempY); + mm_a = _mm_mullo_epi16(mmTempY, mmTempY); + + mm_l = _mm_unpacklo_epi16(mm_a, mm_b); + mm_h = _mm_unpackhi_epi16(mm_a, mm_b); + + _mm_storeu_si128((__m128i *)(m_piDotProductTemp5 + x), mm_l); + _mm_storeu_si128((__m128i *)(m_piDotProductTemp5 + x + 4), mm_h); + + // m_piDotProductTemp6 + mm_b = _mm_mulhi_epi16(mmTempY, mmTemp1); + mm_a = _mm_mullo_epi16(mmTempY, mmTemp1); + + mm_l = _mm_unpacklo_epi16(mm_a, mm_b); + mm_h = _mm_unpackhi_epi16(mm_a, mm_b); + + _mm_storeu_si128((__m128i *)(m_piDotProductTemp6 + x), mm_l); + _mm_storeu_si128((__m128i *)(m_piDotProductTemp6 + x + 4), mm_h); + } + + for (; x < ((iWidthG >> 2) << 2); x += 4) /* four-sample step for what the eight-wide loop left over; only the low halves of the products are kept */ + { + __m128i mmSrcY0Temp = _mm_srai_epi16(_mm_loadl_epi64((__m128i*)(pSrcY0Temp + x)), 6); + __m128i mmSrcY1Temp = _mm_srai_epi16(_mm_loadl_epi64((__m128i*)(pSrcY1Temp + x)), 6); + __m128i mmGradX0 = _mm_loadl_epi64((__m128i*)(pGradX0 + x)); + __m128i mmGradX1 = _mm_loadl_epi64((__m128i*)(pGradX1 + x)); + __m128i mmGradY0 = _mm_loadl_epi64((__m128i*)(pGradY0 + x)); + __m128i mmGradY1 = _mm_loadl_epi64((__m128i*)(pGradY1 + x)); + + __m128i mmTemp1 = _mm_sub_epi16(mmSrcY1Temp, mmSrcY0Temp); + __m128i mmTempX = _mm_srai_epi16(_mm_add_epi16(mmGradX0, mmGradX1), 3); + __m128i mmTempY = _mm_srai_epi16(_mm_add_epi16(mmGradY0, mmGradY1), 3); + + // m_piDotProductTemp1 + __m128i mm_b = _mm_mulhi_epi16(mmTempX, mmTempX); + __m128i mm_a = _mm_mullo_epi16(mmTempX, mmTempX); + __m128i mm_l = _mm_unpacklo_epi16(mm_a, mm_b); + + _mm_storeu_si128((__m128i *)(m_piDotProductTemp1 + x),
mm_l); + + // m_piDotProductTemp2 + mm_b = _mm_mulhi_epi16(mmTempX, mmTempY); + mm_a = _mm_mullo_epi16(mmTempX, mmTempY); + mm_l = _mm_unpacklo_epi16(mm_a, mm_b); + + _mm_storeu_si128((__m128i *)(m_piDotProductTemp2 + x), mm_l); + + // m_piDotProductTemp3 + mm_b = _mm_mulhi_epi16(mmTempX, mmTemp1); + mm_a = _mm_mullo_epi16(mmTempX, mmTemp1); + mm_l = _mm_unpacklo_epi16(mm_a, mm_b); + + _mm_storeu_si128((__m128i *)(m_piDotProductTemp3 + x), mm_l); + + // m_piDotProductTemp5 + mm_b = _mm_mulhi_epi16(mmTempY, mmTempY); + mm_a = _mm_mullo_epi16(mmTempY, mmTempY); + mm_l = _mm_unpacklo_epi16(mm_a, mm_b); + + _mm_storeu_si128((__m128i *)(m_piDotProductTemp5 + x), mm_l); + + // m_piDotProductTemp6 + mm_b = _mm_mulhi_epi16(mmTempY, mmTemp1); + mm_a = _mm_mullo_epi16(mmTempY, mmTemp1); + mm_l = _mm_unpacklo_epi16(mm_a, mm_b); + + _mm_storeu_si128((__m128i *)(m_piDotProductTemp6 + x), mm_l); + } + + for (; x < iWidthG; x++) /* scalar remainder */ + { + int temp = (pSrcY0Temp[x] >> 6) - (pSrcY1Temp[x] >> 6); /* opposite sign to the SIMD difference, compensated by the negated products below */ + int tempX = (pGradX0[x] + pGradX1[x]) >> 3; + int tempY = (pGradY0[x] + pGradY1[x]) >> 3; + m_piDotProductTemp1[x] = tempX * tempX; + m_piDotProductTemp2[x] = tempX * tempY; + m_piDotProductTemp3[x] = -tempX * temp; + m_piDotProductTemp5[x] = tempY * tempY; + m_piDotProductTemp6[x] = -tempY * temp; + } + + pSrcY0Temp += iSrc0Stride; + pSrcY1Temp += iSrc1Stride; + pGradX0 += iGradStride; + pGradX1 += iGradStride; + pGradY0 += iGradStride; + pGradY1 += iGradStride; + m_piDotProductTemp1 += iWidthG; + m_piDotProductTemp2 += iWidthG; + m_piDotProductTemp3 += iWidthG; + m_piDotProductTemp5 += iWidthG; + m_piDotProductTemp6 += iWidthG; /* output rows are iWidthG ints apart */ + } +} + +template< X86_VEXT vext > /* Sums each of the five input arrays over a window and hands the totals back through sGx2, sGy2, sGxGy, sGxdI and sGydI. Each row contributes six consecutive ints (offsets -1 to 4 from the given pointer); the rows are the unitSize rows extended by JVET_L0256_BIO_EXTEND_SIZE rows above and below, with a row pitch of width ints. sx, sy and height are not read. */ +void calcBlkGradient_SSE(int sx, int sy, int *arraysGx2, int *arraysGxGy, int *arraysGxdI, int *arraysGy2, int *arraysGydI, int &sGx2, int &sGy2, int &sGxGy, int &sGxdI, int &sGydI, int width, int height, int unitSize) +{ + int *pGx2 = arraysGx2; + int *pGy2 = arraysGy2; + int *pGxGy = arraysGxGy; + int *pGxdI = arraysGxdI; + int
*pGydI = arraysGydI; + + // set to the above row due to JVET_L0256_BIO_EXTEND_SIZE + pGx2 -= (JVET_L0256_BIO_EXTEND_SIZE*width); + pGy2 -= (JVET_L0256_BIO_EXTEND_SIZE*width); + pGxGy -= (JVET_L0256_BIO_EXTEND_SIZE*width); + pGxdI -= (JVET_L0256_BIO_EXTEND_SIZE*width); + pGydI -= (JVET_L0256_BIO_EXTEND_SIZE*width); + + __m128i vzero = _mm_setzero_si128(); + __m128i mmGx2Total = _mm_setzero_si128(); + __m128i mmGy2Total = _mm_setzero_si128(); + __m128i mmGxGyTotal = _mm_setzero_si128(); + __m128i mmGxdITotal = _mm_setzero_si128(); + __m128i mmGydITotal = _mm_setzero_si128(); + + for (int y = -JVET_L0256_BIO_EXTEND_SIZE; y < unitSize + JVET_L0256_BIO_EXTEND_SIZE; y++) + { + __m128i mmsGx2 = _mm_loadu_si128((__m128i*)(pGx2 - 1)); __m128i mmsGx2Sec = _mm_loadl_epi64((__m128i*)(pGx2 + 3)); /* four ints at offsets -1 to 2, then two more at offsets 3 and 4: six per row */ + __m128i mmsGy2 = _mm_loadu_si128((__m128i*)(pGy2 - 1)); __m128i mmsGy2Sec = _mm_loadl_epi64((__m128i*)(pGy2 + 3)); + __m128i mmsGxGy = _mm_loadu_si128((__m128i*)(pGxGy - 1)); __m128i mmsGxGySec = _mm_loadl_epi64((__m128i*)(pGxGy + 3)); + __m128i mmsGxdI = _mm_loadu_si128((__m128i*)(pGxdI - 1)); __m128i mmsGxdISec = _mm_loadl_epi64((__m128i*)(pGxdI + 3)); + __m128i mmsGydI = _mm_loadu_si128((__m128i*)(pGydI - 1)); __m128i mmsGydISec = _mm_loadl_epi64((__m128i*)(pGydI + 3)); + + mmsGx2 = _mm_add_epi32(mmsGx2, mmsGx2Sec); + mmsGy2 = _mm_add_epi32(mmsGy2, mmsGy2Sec); + mmsGxGy = _mm_add_epi32(mmsGxGy, mmsGxGySec); + mmsGxdI = _mm_add_epi32(mmsGxdI, mmsGxdISec); + mmsGydI = _mm_add_epi32(mmsGydI, mmsGydISec); + + + mmGx2Total = _mm_add_epi32(mmGx2Total, mmsGx2); + mmGy2Total = _mm_add_epi32(mmGy2Total, mmsGy2); + mmGxGyTotal = _mm_add_epi32(mmGxGyTotal, mmsGxGy); + mmGxdITotal = _mm_add_epi32(mmGxdITotal, mmsGxdI); + mmGydITotal = _mm_add_epi32(mmGydITotal, mmsGydI); + + pGx2 += width; + pGy2 += width; + pGxGy += width; + pGxdI += width; + pGydI += width; + } + + mmGx2Total = _mm_hadd_epi32(_mm_hadd_epi32(mmGx2Total, vzero), vzero); + mmGy2Total = _mm_hadd_epi32(_mm_hadd_epi32(mmGy2Total,
vzero), vzero); + mmGxGyTotal = _mm_hadd_epi32(_mm_hadd_epi32(mmGxGyTotal, vzero), vzero); + mmGxdITotal = _mm_hadd_epi32(_mm_hadd_epi32(mmGxdITotal, vzero), vzero); + mmGydITotal = _mm_hadd_epi32(_mm_hadd_epi32(mmGydITotal, vzero), vzero); + + sGx2 = _mm_cvtsi128_si32(mmGx2Total); + sGy2 = _mm_cvtsi128_si32(mmGy2Total); + sGxGy = _mm_cvtsi128_si32(mmGxGyTotal); + sGxdI = _mm_cvtsi128_si32(mmGxdITotal); + sGydI = _mm_cvtsi128_si32(mmGydITotal); +} +#endif + template< X86_VEXT vext, int W > void reco_SSE( const int16_t* src0, int src0Stride, const int16_t* src1, int src1Stride, int16_t *dst, int dstStride, int width, int height, const ClpRng& clpRng ) { @@ -496,6 +798,13 @@ void PelBufferOps::_initPelBufOpsX86() addAvg8 = addAvg_SSE<vext, 8>; addAvg4 = addAvg_SSE<vext, 4>; +#if JVET_L0256_BIO + addBIOAvg4 = addBIOAvg4_SSE<vext>; + bioGradFilter = gradFilter_SSE<vext>; + calcBIOPar = calcBIOPar_SSE<vext>; + calcBlkGradient = calcBlkGradient_SSE<vext>; +#endif + reco8 = reco_SSE<vext, 8>; reco4 = reco_SSE<vext, 4>; diff --git a/source/Lib/CommonLib/x86/RdCostX86.h b/source/Lib/CommonLib/x86/RdCostX86.h index 95383045f..ab54737fa 100644 --- a/source/Lib/CommonLib/x86/RdCostX86.h +++ b/source/Lib/CommonLib/x86/RdCostX86.h @@ -297,6 +297,45 @@ Distortion RdCost::xGetSAD_SIMD( const DistParam &rcDtParam ) return uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth); } +#if JVET_L0256_BIO +template< X86_VEXT vext > /* SIMD kernel registered for DF_SAD_INTERMEDIATE_BITDEPTH: sum of absolute differences between org and cur over org.width columns and org.height rows, visiting every (1 << subShift)-th row. Four 16-bit samples are widened to 32 bit per step. Defers to RdCost::xGetSAD when the width is below 4, the bit depth is above 10 or weighting is enabled. */ +Distortion RdCost::xGetSAD_IBD_SIMD(const DistParam &rcDtParam) +{ + if (rcDtParam.org.width < 4 || rcDtParam.bitDepth > 10 || rcDtParam.applyWeight) + return RdCost::xGetSAD(rcDtParam); + + const short* src0 = (const short*)rcDtParam.org.buf; + const short* src1 = (const short*)rcDtParam.cur.buf; + int width = rcDtParam.org.width; /* samples per row; the guard above and the four-wide inner loop both count columns */ + int height = rcDtParam.org.height; /* number of rows walked by the outer loop */ + int iSubShift = rcDtParam.subShift; + int iSubStep = (1 << iSubShift); + const int src0Stride = rcDtParam.org.stride * iSubStep; /* strides pre-multiplied so one pointer bump skips the sub-sampled rows */ + const int src1Stride = rcDtParam.cur.stride * iSubStep;
+ + __m128i vtotalsum32 = _mm_setzero_si128(); + __m128i vzero = _mm_setzero_si128(); + for (int y = 0; y < height; y += iSubStep) + { + for (int x = 0; x < width; x += 4) + { + __m128i vsrc1 = _mm_loadl_epi64((const __m128i*)(src0 + x)); + __m128i vsrc2 = _mm_loadl_epi64((const __m128i*)(src1 + x)); + vsrc1 = _mm_cvtepi16_epi32(vsrc1); /* sign-extend so the difference cannot wrap in 16 bits */ + vsrc2 = _mm_cvtepi16_epi32(vsrc2); + vtotalsum32 = _mm_add_epi32(vtotalsum32, _mm_abs_epi32(_mm_sub_epi32(vsrc1, vsrc2))); + } + src0 += src0Stride; + src1 += src1Stride; + } + vtotalsum32 = _mm_hadd_epi32(vtotalsum32, vzero); + vtotalsum32 = _mm_hadd_epi32(vtotalsum32, vzero); /* two horizontal adds fold the four lane sums into lane 0 */ + Distortion uiSum = _mm_cvtsi128_si32(vtotalsum32); + + uiSum <<= iSubShift; /* scale up for the rows skipped by sub-sampling */ + return uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth); +} +#endif template< int iWidth, X86_VEXT vext > Distortion RdCost::xGetSAD_NxN_SIMD( const DistParam &rcDtParam ) @@ -2422,6 +2461,10 @@ void RdCost::_initRdCostX86() m_afpDistortFunc[DF_HAD32] = RdCost::xGetHADs_SIMD<Pel, Pel, vext>; m_afpDistortFunc[DF_HAD64] = RdCost::xGetHADs_SIMD<Pel, Pel, vext>; m_afpDistortFunc[DF_HAD16N] = RdCost::xGetHADs_SIMD<Pel, Pel, vext>; + +#if JVET_L0256_BIO + m_afpDistortFunc[DF_SAD_INTERMEDIATE_BITDEPTH] = RdCost::xGetSAD_IBD_SIMD<vext>; +#endif } template void RdCost::_initRdCostX86<SIMDX86>(); diff --git a/source/Lib/DecoderLib/VLCReader.cpp b/source/Lib/DecoderLib/VLCReader.cpp index 6411f4092..5802f9b34 100644 --- a/source/Lib/DecoderLib/VLCReader.cpp +++ b/source/Lib/DecoderLib/VLCReader.cpp @@ -794,6 +794,9 @@ void HLSyntaxReader::parseSPSNext( SPSNext& spsNext, const bool usePCM ) READ_FLAG( symbol, "imv_enable_flag" ); spsNext.setUseIMV ( symbol != 0 ); #if !REMOVE_MV_ADAPT_PREC READ_FLAG( symbol, "high_precision_motion_vectors" ); spsNext.setUseHighPrecMv(symbol != 0); +#endif +#if JVET_L0256_BIO + READ_FLAG( symbol, "bio_enable_flag" ); spsNext.setUseBIO ( symbol != 0 ); #endif READ_FLAG( symbol, "disable_motion_compression_flag" ); spsNext.setDisableMotCompress (
symbol != 0 ); READ_FLAG( symbol, "lm_chroma_enabled_flag" ); spsNext.setUseLMChroma ( symbol != 0 ); diff --git a/source/Lib/EncoderLib/EncCfg.h b/source/Lib/EncoderLib/EncCfg.h index 71f50a481..cbae17fd0 100644 --- a/source/Lib/EncoderLib/EncCfg.h +++ b/source/Lib/EncoderLib/EncCfg.h @@ -196,6 +196,9 @@ protected: bool m_AffineType; #if !REMOVE_MV_ADAPT_PREC bool m_highPrecMv; +#endif +#if JVET_L0256_BIO + bool m_BIO; #endif bool m_DisableMotionCompression; unsigned m_MTTMode; @@ -619,6 +622,10 @@ public: #if !REMOVE_MV_ADAPT_PREC void setHighPrecisionMv ( bool b ) { m_highPrecMv = b; } bool getHighPrecisionMv () { return m_highPrecMv; } +#endif +#if JVET_L0256_BIO + void setBIO(bool b) { m_BIO = b; } + bool getBIO() const { return m_BIO; } #endif void setDisableMotionCompression ( bool b ) { m_DisableMotionCompression = b; } bool getDisableMotionCompression () const { return m_DisableMotionCompression; } diff --git a/source/Lib/EncoderLib/EncLib.cpp b/source/Lib/EncoderLib/EncLib.cpp index 38661820c..a261753a7 100644 --- a/source/Lib/EncoderLib/EncLib.cpp +++ b/source/Lib/EncoderLib/EncLib.cpp @@ -840,6 +840,9 @@ void EncLib::xInitSPS(SPS &sps) sps.getSpsNext().setUseIMV ( m_ImvMode != IMV_OFF ); #if !REMOVE_MV_ADAPT_PREC sps.getSpsNext().setUseHighPrecMv ( m_highPrecMv ); +#endif +#if JVET_L0256_BIO + sps.getSpsNext().setUseBIO ( m_BIO ); #endif sps.getSpsNext().setUseAffine ( m_Affine ); sps.getSpsNext().setUseAffineType ( m_AffineType ); diff --git a/source/Lib/EncoderLib/VLCWriter.cpp b/source/Lib/EncoderLib/VLCWriter.cpp index 29f4b427f..b84bbf562 100644 --- a/source/Lib/EncoderLib/VLCWriter.cpp +++ b/source/Lib/EncoderLib/VLCWriter.cpp @@ -534,6 +534,9 @@ void HLSWriter::codeSPSNext( const SPSNext& spsNext, const bool usePCM ) WRITE_FLAG( spsNext.getUseIMV() ? 1 : 0, "imv_enable_flag" ); #if !REMOVE_MV_ADAPT_PREC WRITE_FLAG( spsNext.getUseHighPrecMv() ? 1 : 0, "high_precision_motion_vectors"); +#endif +#if JVET_L0256_BIO + WRITE_FLAG( spsNext.getUseBIO() ? 
1 : 0, "bio_enable_flag" ); #endif WRITE_FLAG( spsNext.getDisableMotCompress() ? 1 : 0, "disable_motion_compression_flag" ); WRITE_FLAG( spsNext.getUseLMChroma() ? 1 : 0, "lm_chroma_enabled_flag" ); -- GitLab