diff --git a/cfg/encoder_randomaccess_vtm.cfg b/cfg/encoder_randomaccess_vtm.cfg index 63ff58df9358f54e7567d150a6e8993a48ca04ea..29f7902779a5d2f22a7d36e60ec9171ae6bb1e1e 100644 --- a/cfg/encoder_randomaccess_vtm.cfg +++ b/cfg/encoder_randomaccess_vtm.cfg @@ -142,7 +142,8 @@ DepQuant : 1 IMV : 2 ALF : 1 GBi : 1 -GBiFast : 1 +GBiFast : 1 +BIO : 1 # Fast tools PBIntraFast : 1 diff --git a/source/App/EncoderApp/EncApp.cpp b/source/App/EncoderApp/EncApp.cpp index ab22f12e29a6ea1870bb89851025095380eb075e..ab7fcde8ec6df67dad3d9bf65d03174e30a64e04 100644 --- a/source/App/EncoderApp/EncApp.cpp +++ b/source/App/EncoderApp/EncApp.cpp @@ -230,6 +230,9 @@ void EncApp::xInitLibCfg() m_cEncLib.setAffineType ( m_AffineType ); #if !REMOVE_MV_ADAPT_PREC m_cEncLib.setHighPrecisionMv (m_highPrecisionMv); +#endif +#if JVET_L0256_BIO + m_cEncLib.setBIO (m_BIO); #endif m_cEncLib.setDisableMotionCompression ( m_DisableMotionCompression ); m_cEncLib.setMTTMode ( m_MTT ); diff --git a/source/App/EncoderApp/EncAppCfg.cpp b/source/App/EncoderApp/EncAppCfg.cpp index e6723233eeb7f91df4cf1170470b85452e25187e..fc12314b2aeedd2f4357acd99e0cba4fe8f0f06d 100644 --- a/source/App/EncoderApp/EncAppCfg.cpp +++ b/source/App/EncoderApp/EncAppCfg.cpp @@ -827,8 +827,11 @@ bool EncAppCfg::parseCfg( int argc, char* argv[] ) #if !REMOVE_MV_ADAPT_PREC ("HighPrecMv", m_highPrecisionMv, false, "High precision motion vectors for temporal merging (0:off, 1:on) [default: off]") #endif - ("Affine", m_Affine, false, "Enable affine prediction (0:off, 1:on) [default: off]") - ( "AffineType", m_AffineType, true, "Enable affine type prediction (0:off, 1:on) [default: on]" ) + ("Affine", m_Affine, false, "Enable affine prediction (0:off, 1:on) [default: off]") + ("AffineType", m_AffineType, true, "Enable affine type prediction (0:off, 1:on) [default: on]" ) +#if JVET_L0256_BIO + ("BIO", m_BIO, false, "Enable bi-directional optical flow") +#endif ("DisableMotCompression", m_DisableMotionCompression, false, "Disable motion 
data compression for all modes") ("IMV", m_ImvMode, 2, "Adaptive MV precision Mode (IMV)\n" "\t0: disabled IMV\n" @@ -1943,6 +1946,9 @@ bool EncAppCfg::xCheckParameter() #if !REMOVE_MV_ADAPT_PREC xConfirmPara( m_highPrecisionMv, "High precision MV for temporal merging can only be used with NEXT profile" ); xConfirmPara( m_Affine, "Affine is only allowed with NEXT profile" ); +#endif +#if JVET_L0256_BIO + xConfirmPara( m_BIO, "BIO only allowed with NEXT profile" ); #endif xConfirmPara( m_DisableMotionCompression, "Disable motion data compression only allowed with NEXT profile" ); xConfirmPara( m_MTT, "Multi type tree is only allowed with NEXT profile" ); @@ -3143,6 +3149,9 @@ void EncAppCfg::xPrintParameter() if( !m_QTBT ) msg( VERBOSE, "IMVMaxCand:%d ", m_ImvMaxCand ); #if !REMOVE_MV_ADAPT_PREC msg(VERBOSE, "HighPrecMv:%d ", m_highPrecisionMv); +#endif +#if JVET_L0256_BIO + msg( VERBOSE, "BIO:%d ", m_BIO ); #endif msg( VERBOSE, "DisMDC:%d ", m_DisableMotionCompression ); msg( VERBOSE, "MTT:%d ", m_MTT ); diff --git a/source/App/EncoderApp/EncAppCfg.h b/source/App/EncoderApp/EncAppCfg.h index cf5095f67d48a66364d4b438d0a1447d78fca63d..712a0e1276075c1389a703d31eba94034b292c90 100644 --- a/source/App/EncoderApp/EncAppCfg.h +++ b/source/App/EncoderApp/EncAppCfg.h @@ -213,6 +213,9 @@ protected: bool m_AffineType; #if !REMOVE_MV_ADAPT_PREC bool m_highPrecisionMv; +#endif +#if JVET_L0256_BIO + bool m_BIO; #endif bool m_DisableMotionCompression; unsigned m_MTT; diff --git a/source/Lib/CommonLib/Buffer.cpp b/source/Lib/CommonLib/Buffer.cpp index f31a22044ec3f8d59704b0ccf47387f1f513f70d..2454c501ae976e3885395f6efcc3169531db8c9b 100644 --- a/source/Lib/CommonLib/Buffer.cpp +++ b/source/Lib/CommonLib/Buffer.cpp @@ -62,6 +62,138 @@ void addAvgCore( const T* src1, int src1Stride, const T* src2, int src2Stride, T #undef ADD_AVG_CORE_INC } +#if JVET_L0256_BIO +void addBIOAvgCore(const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, const Pel 
*gradX0, const Pel *gradX1, const Pel *gradY0, const Pel*gradY1, int gradStride, int width, int height, int tmpx, int tmpy, int shift, int offset, const ClpRng& clpRng) +{ + int b = 0; + + for (int y = 0; y < height; y++) + { + for (int x = 0; x < width; x += 4) + { + b = tmpx * (gradX0[x] - gradX1[x]) + tmpy * (gradY0[x] - gradY1[x]); + b = ((b + 1) >> 1); + dst[x] = ClipPel((int16_t)rightShift((src0[x] + src1[x] + b + offset), shift), clpRng); + + b = tmpx * (gradX0[x + 1] - gradX1[x + 1]) + tmpy * (gradY0[x + 1] - gradY1[x + 1]); + b = ((b + 1) >> 1); + dst[x + 1] = ClipPel((int16_t)rightShift((src0[x + 1] + src1[x + 1] + b + offset), shift), clpRng); + + b = tmpx * (gradX0[x + 2] - gradX1[x + 2]) + tmpy * (gradY0[x + 2] - gradY1[x + 2]); + b = ((b + 1) >> 1); + dst[x + 2] = ClipPel((int16_t)rightShift((src0[x + 2] + src1[x + 2] + b + offset), shift), clpRng); + + b = tmpx * (gradX0[x + 3] - gradX1[x + 3]) + tmpy * (gradY0[x + 3] - gradY1[x + 3]); + b = ((b + 1) >> 1); + dst[x + 3] = ClipPel((int16_t)rightShift((src0[x + 3] + src1[x + 3] + b + offset), shift), clpRng); + } + dst += dstStride; src0 += src0Stride; src1 += src1Stride; + gradX0 += gradStride; gradX1 += gradStride; gradY0 += gradStride; gradY1 += gradStride; + } +} + +void gradFilterCore(Pel* pSrc, int srcStride, int width, int height, int gradStride, Pel* gradX, Pel* gradY) +{ + Pel* srcTmp = pSrc + srcStride + 1; + Pel* gradXTmp = gradX + gradStride + 1; + Pel* gradYTmp = gradY + gradStride + 1; + + for (int y = 0; y < (height - 2 * BIO_EXTEND_SIZE); y++) + { + for (int x = 0; x < (width - 2 * BIO_EXTEND_SIZE); x++) + { + gradYTmp[x] = (srcTmp[x + srcStride] - srcTmp[x - srcStride]) >> 4; + gradXTmp[x] = (srcTmp[x + 1] - srcTmp[x - 1]) >> 4; + } + gradXTmp += gradStride; + gradYTmp += gradStride; + srcTmp += srcStride; + } + + gradXTmp = gradX + gradStride + 1; + gradYTmp = gradY + gradStride + 1; + for (int y = 0; y < (height - 2 * BIO_EXTEND_SIZE); y++) + { + gradXTmp[-1] = gradXTmp[0]; + 
gradXTmp[width - 2 * BIO_EXTEND_SIZE] = gradXTmp[width - 2 * BIO_EXTEND_SIZE - 1]; + gradXTmp += gradStride; + + gradYTmp[-1] = gradYTmp[0]; + gradYTmp[width - 2 * BIO_EXTEND_SIZE] = gradYTmp[width - 2 * BIO_EXTEND_SIZE - 1]; + gradYTmp += gradStride; + } + + gradXTmp = gradX + gradStride; + gradYTmp = gradY + gradStride; + ::memcpy(gradXTmp - gradStride, gradXTmp, sizeof(Pel)*(width)); + ::memcpy(gradXTmp + (height - 2 * BIO_EXTEND_SIZE)*gradStride, gradXTmp + (height - 2 * BIO_EXTEND_SIZE - 1)*gradStride, sizeof(Pel)*(width)); + ::memcpy(gradYTmp - gradStride, gradYTmp, sizeof(Pel)*(width)); + ::memcpy(gradYTmp + (height - 2 * BIO_EXTEND_SIZE)*gradStride, gradYTmp + (height - 2 * BIO_EXTEND_SIZE - 1)*gradStride, sizeof(Pel)*(width)); +} + +void calcBIOParCore(const Pel* srcY0Temp, const Pel* srcY1Temp, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0, const Pel* gradY1, int* dotProductTemp1, int* dotProductTemp2, int* dotProductTemp3, int* dotProductTemp5, int* dotProductTemp6, const int src0Stride, const int src1Stride, const int gradStride, const int widthG, const int heightG) +{ + for (int y = 0; y < heightG; y++) + { + for (int x = 0; x < widthG; x++) + { + int temp = (srcY0Temp[x] >> 6) - (srcY1Temp[x] >> 6); + int tempX = (gradX0[x] + gradX1[x]) >> 3; + int tempY = (gradY0[x] + gradY1[x]) >> 3; + dotProductTemp1[x] = tempX * tempX; + dotProductTemp2[x] = tempX * tempY; + dotProductTemp3[x] = -tempX * temp; + dotProductTemp5[x] = tempY * tempY; + dotProductTemp6[x] = -tempY * temp; + } + srcY0Temp += src0Stride; + srcY1Temp += src1Stride; + gradX0 += gradStride; + gradX1 += gradStride; + gradY0 += gradStride; + gradY1 += gradStride; + dotProductTemp1 += widthG; + dotProductTemp2 += widthG; + dotProductTemp3 += widthG; + dotProductTemp5 += widthG; + dotProductTemp6 += widthG; + } +} + +void calcBlkGradientCore(int sx, int sy, int *arraysGx2, int *arraysGxGy, int *arraysGxdI, int *arraysGy2, int *arraysGydI, int &sGx2, int &sGy2, int &sGxGy, int &sGxdI, 
int &sGydI, int width, int height, int unitSize) +{ + int *Gx2 = arraysGx2; + int *Gy2 = arraysGy2; + int *GxGy = arraysGxGy; + int *GxdI = arraysGxdI; + int *GydI = arraysGydI; + + // set to the above row due to JVET_K0485_BIO_EXTEND_SIZE + Gx2 -= (BIO_EXTEND_SIZE*width); + Gy2 -= (BIO_EXTEND_SIZE*width); + GxGy -= (BIO_EXTEND_SIZE*width); + GxdI -= (BIO_EXTEND_SIZE*width); + GydI -= (BIO_EXTEND_SIZE*width); + + for (int y = -BIO_EXTEND_SIZE; y < unitSize + BIO_EXTEND_SIZE; y++) + { + for (int x = -BIO_EXTEND_SIZE; x < unitSize + BIO_EXTEND_SIZE; x++) + { + sGx2 += Gx2[x]; + sGy2 += Gy2[x]; + sGxGy += GxGy[x]; + sGxdI += GxdI[x]; + sGydI += GydI[x]; + } + Gx2 += width; + Gy2 += width; + GxGy += width; + GxdI += width; + GydI += width; + } +} +#endif + #if ENABLE_SIMD_OPT_GBI && JVET_L0646_GBI void removeWeightHighFreq(int16_t* dst, int dstStride, const int16_t* src, int srcStride, int width, int height, int shift, int gbiWeight) { @@ -138,6 +270,13 @@ PelBufferOps::PelBufferOps() linTf4 = linTfCore<Pel>; linTf8 = linTfCore<Pel>; +#if JVET_L0256_BIO + addBIOAvg4 = addBIOAvgCore; + bioGradFilter = gradFilterCore; + calcBIOPar = calcBIOParCore; + calcBlkGradient = calcBlkGradientCore; +#endif + #if ENABLE_SIMD_OPT_GBI removeWeightHighFreq8 = removeWeightHighFreq; removeWeightHighFreq4 = removeWeightHighFreq; diff --git a/source/Lib/CommonLib/Buffer.h b/source/Lib/CommonLib/Buffer.h index fdf3b962f774c18efc1c9433d496415ffcc74cf2..a0142743f3efd1b287efa3c039b80e038aeb6db6 100644 --- a/source/Lib/CommonLib/Buffer.h +++ b/source/Lib/CommonLib/Buffer.h @@ -68,6 +68,12 @@ struct PelBufferOps void ( *reco8 ) ( const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, int width, int height, const ClpRng& clpRng ); void ( *linTf4 ) ( const Pel* src0, int src0Stride, Pel *dst, int dstStride, int width, int height, int scale, int shift, int offset, const ClpRng& clpRng, bool bClip ); void ( *linTf8 ) ( const Pel* src0, int src0Stride, Pel *dst, 
int dstStride, int width, int height, int scale, int shift, int offset, const ClpRng& clpRng, bool bClip ); +#if JVET_L0256_BIO + void(*addBIOAvg4) (const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, const Pel *gradX0, const Pel *gradX1, const Pel *gradY0, const Pel*gradY1, int gradStride, int width, int height, int tmpx, int tmpy, int shift, int offset, const ClpRng& clpRng); + void(*bioGradFilter) (Pel* pSrc, int srcStride, int width, int height, int gradStride, Pel* gradX, Pel* gradY); + void(*calcBIOPar) (const Pel* srcY0Temp, const Pel* srcY1Temp, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0, const Pel* gradY1, int* dotProductTemp1, int* dotProductTemp2, int* dotProductTemp3, int* dotProductTemp5, int* dotProductTemp6, const int src0Stride, const int src1Stride, const int gradStride, const int widthG, const int heightG); + void(*calcBlkGradient)(int sx, int sy, int *arraysGx2, int *arraysGxGy, int *arraysGxdI, int *arraysGy2, int *arraysGydI, int &sGx2, int &sGy2, int &sGxGy, int &sGxdI, int &sGydI, int width, int height, int unitSize); +#endif #if ENABLE_SIMD_OPT_GBI void ( *removeWeightHighFreq8) ( Pel* src0, int src0Stride, const Pel* src1, int src1Stride, int width, int height, int shift, int gbiWeight); void ( *removeWeightHighFreq4) ( Pel* src0, int src0Stride, const Pel* src1, int src1Stride, int width, int height, int shift, int gbiWeight); diff --git a/source/Lib/CommonLib/CommonDef.h b/source/Lib/CommonLib/CommonDef.h index 19f2cd861515f2c9d76a6f26f59a10f8c93140d6..02bbfa14e463147ba051ba1c709dec33cc5ef0bb 100644 --- a/source/Lib/CommonLib/CommonDef.h +++ b/source/Lib/CommonLib/CommonDef.h @@ -318,6 +318,12 @@ static const int MAX_NUM_GT2_BINS_4x4SUBBLOCK = 4; ///< max static const int MAX_NUM_REG_BINS_2x2SUBBLOCK = 8; ///< max number of context-coded bins (incl. 
gt2 bins) per 2x2 subblock (chroma) static const int MAX_NUM_GT2_BINS_2x2SUBBLOCK = 2; ///< max number of gt2 bins per 2x2 subblock (chroma) #endif + +#if JVET_L0256_BIO +static const int BIO_EXTEND_SIZE = 1; +static const int BIO_TEMP_BUFFER_SIZE = (MAX_CU_SIZE + 2 * BIO_EXTEND_SIZE) * (MAX_CU_SIZE + 2 * BIO_EXTEND_SIZE); +#endif + #if JVET_L0646_GBI static const int GBI_NUM = 5; ///< the number of weight options static const int GBI_DEFAULT = ((uint8_t)(GBI_NUM >> 1)); ///< Default weighting index representing for w=0.5 @@ -397,6 +403,10 @@ static const int NTAPS_CHROMA = 4; ///< Numb static const int MAX_LADF_INTERVALS = 5; /// max number of luma adaptive deblocking filter qp offset intervals #endif +#if JVET_L0256_BIO +static const int NTAPS_BILINEAR = 2; ///< Number of taps for bilinear filter +#endif + // ==================================================================================================================== // Macro functions // ==================================================================================================================== diff --git a/source/Lib/CommonLib/InterPrediction.cpp b/source/Lib/CommonLib/InterPrediction.cpp index c6ea5e914dc34da5d0886b1afb10108142b98bf4..d980b24cc05c874241e46be5acb47342d03830ad 100644 --- a/source/Lib/CommonLib/InterPrediction.cpp +++ b/source/Lib/CommonLib/InterPrediction.cpp @@ -55,6 +55,13 @@ InterPrediction::InterPrediction() m_currChromaFormat( NUM_CHROMA_FORMAT ) , m_maxCompIDToPred ( MAX_NUM_COMPONENT ) , m_pcRdCost ( nullptr ) +#if JVET_L0256_BIO +, m_gradX0(nullptr) +, m_gradY0(nullptr) +, m_gradX1(nullptr) +, m_gradY1(nullptr) +, m_subPuMC(false) +#endif { for( uint32_t ch = 0; ch < MAX_NUM_COMPONENT; ch++ ) { @@ -109,6 +116,13 @@ void InterPrediction::destroy() m_filteredBlockTmp[i][c] = nullptr; } } + +#if JVET_L0256_BIO + xFree(m_gradX0); m_gradX0 = nullptr; + xFree(m_gradY0); m_gradY0 = nullptr; + xFree(m_gradX1); m_gradX1 = nullptr; + xFree(m_gradY1); m_gradY1 = nullptr; +#endif } 
void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chromaFormatIDC ) @@ -127,8 +141,13 @@ void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chromaFormatIDC ) { for( uint32_t c = 0; c < MAX_NUM_COMPONENT; c++ ) { +#if JVET_L0256_BIO + int extWidth = MAX_CU_SIZE + (2 * BIO_EXTEND_SIZE + 2) + 16; + int extHeight = MAX_CU_SIZE + (2 * BIO_EXTEND_SIZE + 2) + 1; +#else int extWidth = MAX_CU_SIZE + 16; int extHeight = MAX_CU_SIZE + 1; +#endif for( uint32_t i = 0; i < LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS; i++ ) { m_filteredBlockTmp[i][c] = ( Pel* ) xMalloc( Pel, ( extWidth + 4 ) * ( extHeight + 7 + 4 ) ); @@ -148,7 +167,13 @@ void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chromaFormatIDC ) m_iRefListIdx = -1; - + +#if JVET_L0256_BIO + m_gradX0 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE); + m_gradY0 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE); + m_gradX1 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE); + m_gradY1 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE); +#endif } #if !JVET_J0090_MEMORY_BANDWITH_MEASURE @@ -264,6 +289,10 @@ void InterPrediction::xSubPuMC( PredictionUnit& pu, PelUnitBuf& predBuf, const R int fstStep = (!verMC ? puHeight : puWidth); int secStep = (!verMC ? 
puWidth : puHeight); +#if JVET_L0256_BIO + m_subPuMC = true; +#endif + for (int fstDim = fstStart; fstDim < fstEnd; fstDim += fstStep) { for (int secDim = secStart; secDim < secEnd; secDim += secStep) @@ -299,10 +328,16 @@ void InterPrediction::xSubPuMC( PredictionUnit& pu, PelUnitBuf& predBuf, const R secDim = later - secStep; } } +#if JVET_L0256_BIO + m_subPuMC = false; +#endif } void InterPrediction::xPredInterUni(const PredictionUnit& pu, const RefPicList& eRefPicList, PelUnitBuf& pcYuvPred, const bool& bi +#if JVET_L0256_BIO + ,const bool& bioApplied /*=false*/ +#endif ) { const SPS &sps = *pu.cs->sps; @@ -332,12 +367,18 @@ void InterPrediction::xPredInterUni(const PredictionUnit& pu, const RefPicList& const ComponentID compID = ComponentID( comp ); if ( pu.cu->affine ) { +#if JVET_L0256_BIO + CHECK( bioApplied, "BIO is not allowed with affine" ); +#endif xPredAffineBlk( compID, pu, pu.cu->slice->getRefPic( eRefPicList, iRefIdx ), mv, pcYuvPred, bi, pu.cu->slice->clpRng( compID ) ); } else { xPredInterBlk( compID, pu, pu.cu->slice->getRefPic( eRefPicList, iRefIdx ), mv[0], pcYuvPred, bi, pu.cu->slice->clpRng( compID ) - ); +#if JVET_L0256_BIO + ,bioApplied +#endif + ); } } } @@ -347,6 +388,36 @@ void InterPrediction::xPredInterBi(PredictionUnit& pu, PelUnitBuf &pcYuvPred) const PPS &pps = *pu.cs->pps; const Slice &slice = *pu.cs->slice; +#if JVET_L0256_BIO + bool bioApplied = false; + if (pu.cs->sps->getSpsNext().getUseBIO()) + { + if (pu.cu->affine || m_subPuMC) + { + bioApplied = false; + } + else + { + const bool biocheck0 = !(pps.getWPBiPred() && slice.getSliceType() == B_SLICE); + const bool biocheck1 = !(pps.getUseWP() && slice.getSliceType() == P_SLICE); + if (biocheck0 + && biocheck1 + && PU::isBiPredFromDifferentDir(pu) + && !(pu.Y().height == 4 || (pu.Y().width == 4 && pu.Y().height == 8)) + ) + { + bioApplied = true; + } + } + +#if JVET_L0646_GBI + if (pu.cu->cs->sps->getSpsNext().getUseGBi() && bioApplied && pu.cu->GBiIdx != GBI_DEFAULT) + { + 
bioApplied = false; + } +#endif + } +#endif for (uint32_t refList = 0; refList < NUM_REF_PIC_LIST_01; refList++) { @@ -367,6 +438,9 @@ void InterPrediction::xPredInterBi(PredictionUnit& pu, PelUnitBuf &pcYuvPred) if (pu.refIdx[0] >= 0 && pu.refIdx[1] >= 0) { xPredInterUni ( pu, eRefPicList, pcMbBuf, true +#if JVET_L0256_BIO + ,bioApplied +#endif ); } else @@ -399,13 +473,19 @@ void InterPrediction::xPredInterBi(PredictionUnit& pu, PelUnitBuf &pcYuvPred) } else { +#if JVET_L0256_BIO + xWeightedAverage( pu, srcPred0, srcPred1, pcYuvPred, slice.getSPS()->getBitDepths(), slice.clpRngs(), bioApplied ); +#else xWeightedAverage( pu, srcPred0, srcPred1, pcYuvPred, slice.getSPS()->getBitDepths(), slice.clpRngs() ); +#endif } } - void InterPrediction::xPredInterBlk ( const ComponentID& compID, const PredictionUnit& pu, const Picture* refPic, const Mv& _mv, PelUnitBuf& dstPic, const bool& bi, const ClpRng& clpRng - ) +#if JVET_L0256_BIO + ,const bool& bioApplied /*=false*/ +#endif +) { JVET_J0090_SET_REF_PICTURE( refPic, compID ); const ChromaFormat chFmt = pu.chromaFormat; @@ -446,24 +526,75 @@ void InterPrediction::xPredInterBlk ( const ComponentID& compID, const Predictio refBuf = refPic->getRecoBuf( CompArea( compID, chFmt, offset, pu.blocks[compID].size() ) ); } +#if JVET_L0256_BIO + // backup data + int backupWidth = width; + int backupHeight = height; + Pel *backupDstBufPtr = dstBuf.buf; + int backupDstBufStride = dstBuf.stride; + + if (bioApplied && compID == COMPONENT_Y) + { + width = width + 2 * BIO_EXTEND_SIZE + 2; + height = height + 2 * BIO_EXTEND_SIZE + 2; + + // change MC output + dstBuf.stride = width; + dstBuf.buf = m_filteredBlockTmp[2 + m_iRefListIdx][compID] + 2 * dstBuf.stride + 2; + } +#endif + if( yFrac == 0 ) { +#if JVET_L0256_BIO + m_if.filterHor(compID, (Pel*)refBuf.buf, refBuf.stride, dstBuf.buf, dstBuf.stride, backupWidth, backupHeight, xFrac, rndRes, chFmt, clpRng); +#else m_if.filterHor(compID, (Pel*) refBuf.buf, refBuf.stride, dstBuf.buf, 
dstBuf.stride, width, height, xFrac, rndRes, chFmt, clpRng); +#endif } else if( xFrac == 0 ) { +#if JVET_L0256_BIO + m_if.filterVer(compID, (Pel*)refBuf.buf, refBuf.stride, dstBuf.buf, dstBuf.stride, backupWidth, backupHeight, yFrac, true, rndRes, chFmt, clpRng); +#else m_if.filterVer(compID, (Pel*) refBuf.buf, refBuf.stride, dstBuf.buf, dstBuf.stride, width, height, yFrac, true, rndRes, chFmt, clpRng); +#endif } else { - PelBuf tmpBuf = PelBuf(m_filteredBlockTmp[0][compID], pu.blocks[compID]); + PelBuf tmpBuf = PelBuf(m_filteredBlockTmp[0][compID], pu.blocks[compID]); +#if JVET_L0256_BIO + tmpBuf.stride = dstBuf.stride; +#endif int vFilterSize = isLuma(compID) ? NTAPS_LUMA : NTAPS_CHROMA; +#if JVET_L0256_BIO + m_if.filterHor(compID, (Pel*)refBuf.buf - ((vFilterSize >> 1) - 1) * refBuf.stride, refBuf.stride, tmpBuf.buf, tmpBuf.stride, backupWidth, backupHeight + vFilterSize - 1, xFrac, false, chFmt, clpRng); +#else m_if.filterHor(compID, (Pel*) refBuf.buf - ((vFilterSize >> 1) - 1) * refBuf.stride, refBuf.stride, tmpBuf.buf, tmpBuf.stride, width, height + vFilterSize - 1, xFrac, false, chFmt, clpRng); +#endif JVET_J0090_SET_CACHE_ENABLE( false ); +#if JVET_L0256_BIO + m_if.filterVer(compID, (Pel*)tmpBuf.buf + ((vFilterSize >> 1) - 1) * tmpBuf.stride, tmpBuf.stride, dstBuf.buf, dstBuf.stride, backupWidth, backupHeight, yFrac, false, rndRes, chFmt, clpRng); +#else m_if.filterVer(compID, (Pel*) tmpBuf.buf + ((vFilterSize >> 1) - 1) * tmpBuf.stride, tmpBuf.stride, dstBuf.buf, dstBuf.stride, width, height, yFrac, false, rndRes, chFmt, clpRng); +#endif JVET_J0090_SET_CACHE_ENABLE( true ); } +#if JVET_L0256_BIO + if (bioApplied && compID == COMPONENT_Y) + { + refBuf.buf = refBuf.buf - refBuf.stride - 1; + dstBuf.buf = m_filteredBlockTmp[2 + m_iRefListIdx][compID] + dstBuf.stride + 1; + bioSampleExtendBilinearFilter(refBuf.buf, refBuf.stride, dstBuf.buf, dstBuf.stride, width - 2, height - 2, 1, xFrac, yFrac, rndRes, chFmt, clpRng); + + // restore data + width = 
backupWidth; + height = backupHeight; + dstBuf.buf = backupDstBufPtr; + dstBuf.stride = backupDstBufStride; + } +#endif } void InterPrediction::xPredAffineBlk( const ComponentID& compID, const PredictionUnit& pu, const Picture* refPic, const Mv* _mv, PelUnitBuf& dstPic, const bool& bi, const ClpRng& clpRng ) @@ -612,8 +743,224 @@ int getMSB( unsigned x ) return msb; } +#if JVET_L0256_BIO +void InterPrediction::applyBiOptFlow(const PredictionUnit &pu, const CPelUnitBuf &yuvSrc0, const CPelUnitBuf &yuvSrc1, const int &refIdx0, const int &refIdx1, PelUnitBuf &yuvDst, const BitDepths &clipBitDepths) +{ + const int height = yuvDst.Y().height; + const int width = yuvDst.Y().width; + int heightG = height + 2 * BIO_EXTEND_SIZE; + int widthG = width + 2 * BIO_EXTEND_SIZE; + int offsetPos = widthG*BIO_EXTEND_SIZE + BIO_EXTEND_SIZE; + + Pel* gradX0 = m_gradX0; + Pel* gradX1 = m_gradX1; + Pel* gradY0 = m_gradY0; + Pel* gradY1 = m_gradY1; + + int stridePredMC = widthG + 2; + const Pel* srcY0 = m_filteredBlockTmp[2][COMPONENT_Y] + stridePredMC + 1; + const Pel* srcY1 = m_filteredBlockTmp[3][COMPONENT_Y] + stridePredMC + 1; + const int src0Stride = stridePredMC; + const int src1Stride = stridePredMC; + + Pel* dstY = yuvDst.Y().buf; + const int dstStride = yuvDst.Y().stride; + const Pel* srcY0Temp = srcY0; + const Pel* srcY1Temp = srcY1; + + for (int refList = 0; refList < NUM_REF_PIC_LIST_01; refList++) + { + Pel* dstTempPtr = m_filteredBlockTmp[2 + refList][COMPONENT_Y] + stridePredMC + 1; + Pel* gradY = (refList == 0) ? m_gradY0 : m_gradY1; + Pel* gradX = (refList == 0) ? 
m_gradX0 : m_gradX1; + + g_pelBufOP.bioGradFilter(dstTempPtr, stridePredMC, widthG, heightG, widthG, gradX, gradY); + Pel* padStr = m_filteredBlockTmp[2 + refList][COMPONENT_Y] + 2 * stridePredMC + 2; + for (int y = 0; y< height; y++) + { + padStr[-1] = padStr[0]; + padStr[width] = padStr[width - 1]; + padStr += stridePredMC; + } + + padStr = m_filteredBlockTmp[2 + refList][COMPONENT_Y] + 2 * stridePredMC + 1; + ::memcpy(padStr - stridePredMC, padStr, sizeof(Pel)*(widthG)); + ::memcpy(padStr + height*stridePredMC, padStr + (height - 1)*stridePredMC, sizeof(Pel)*(widthG)); + } + + const ClpRng& clpRng = pu.cu->cs->slice->clpRng(COMPONENT_Y); + const int bitDepth = clipBitDepths.recon[toChannelType(COMPONENT_Y)]; + const int shiftNum = IF_INTERNAL_PREC + 1 - bitDepth; + const int offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS; + const int limit = ((int)1 << (4 + IF_INTERNAL_PREC - bitDepth - 5)); + + int* dotProductTemp1 = m_dotProduct1; + int* dotProductTemp2 = m_dotProduct2; + int* dotProductTemp3 = m_dotProduct3; + int* dotProductTemp5 = m_dotProduct5; + int* dotProductTemp6 = m_dotProduct6; + + g_pelBufOP.calcBIOPar(srcY0Temp, srcY1Temp, gradX0, gradX1, gradY0, gradY1, dotProductTemp1, dotProductTemp2, dotProductTemp3, dotProductTemp5, dotProductTemp6, src0Stride, src1Stride, widthG, widthG, heightG); + + int xUnit = (width >> 2); + int yUnit = (height >> 2); + + Pel *dstY0 = dstY; + gradX0 = m_gradX0; gradX1 = m_gradX1; + gradY0 = m_gradY0; gradY1 = m_gradY1; + + for (int yu = 0; yu < yUnit; yu++) + { + for (int xu = 0; xu < xUnit; xu++) + { + if (m_bioPredSubBlkDist[yu*xUnit + xu] < m_bioSubBlkDistThres) + { + srcY0Temp = srcY0 + (stridePredMC + 1) + ((yu*src0Stride + xu) << 2); + srcY1Temp = srcY1 + (stridePredMC + 1) + ((yu*src1Stride + xu) << 2); + dstY0 = dstY + ((yu*dstStride + xu) << 2); + g_pelBufOP.addAvg4(srcY0Temp, src0Stride, srcY1Temp, src1Stride, dstY0, dstStride, (1 << 2), (1 << 2), shiftNum, offset, clpRng); + continue; + } + + int sGxdI = 
0, sGydI = 0, sGxGy = 0, sGx2 = 0, sGy2 = 0; + int tmpx = 0, tmpy = 0; + dotProductTemp1 = m_dotProduct1 + offsetPos + ((yu*widthG + xu) << 2); + dotProductTemp2 = m_dotProduct2 + offsetPos + ((yu*widthG + xu) << 2); + dotProductTemp3 = m_dotProduct3 + offsetPos + ((yu*widthG + xu) << 2); + dotProductTemp5 = m_dotProduct5 + offsetPos + ((yu*widthG + xu) << 2); + dotProductTemp6 = m_dotProduct6 + offsetPos + ((yu*widthG + xu) << 2); + + g_pelBufOP.calcBlkGradient(xu << 2, yu << 2, dotProductTemp1, dotProductTemp2, dotProductTemp3, dotProductTemp5, dotProductTemp6, sGx2, sGy2, sGxGy, sGxdI, sGydI, widthG, heightG, (1 << 2)); + + if (sGx2 > 0) + { + tmpx = rightShiftMSB(sGxdI << 3, sGx2); + tmpx = Clip3(-limit, limit, tmpx); + } + if (sGy2 > 0) + { + int mainsGxGy = sGxGy >> 12; + int secsGxGy = sGxGy & ((1 << 12) - 1); + int tmpData = tmpx * mainsGxGy; + tmpData = ((tmpData << 12) + tmpx*secsGxGy) >> 1; + tmpy = rightShiftMSB(((sGydI << 3) - tmpData), sGy2); + tmpy = Clip3(-limit, limit, tmpy); + } + + srcY0Temp = srcY0 + (stridePredMC + 1) + ((yu*src0Stride + xu) << 2); + srcY1Temp = srcY1 + (stridePredMC + 1) + ((yu*src1Stride + xu) << 2); + gradX0 = m_gradX0 + offsetPos + ((yu*widthG + xu) << 2); + gradX1 = m_gradX1 + offsetPos + ((yu*widthG + xu) << 2); + gradY0 = m_gradY0 + offsetPos + ((yu*widthG + xu) << 2); + gradY1 = m_gradY1 + offsetPos + ((yu*widthG + xu) << 2); + + dstY0 = dstY + ((yu*dstStride + xu) << 2); + g_pelBufOP.addBIOAvg4(srcY0Temp, src0Stride, srcY1Temp, src1Stride, dstY0, dstStride, gradX0, gradX1, gradY0, gradY1, widthG, (1 << 2), (1 << 2), (int)tmpx, (int)tmpy, shiftNum, offset, clpRng); + } // xu + } // yu +} + +void InterPrediction::bioSampleExtendBilinearFilter(Pel const* src, int srcStride, Pel *dst, int dstStride, int width, int height, int dim, int fracX, int fracY, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng) +{ + Pel const* pSrc = nullptr; + Pel* pDst = nullptr; + + int vFilterSize = NTAPS_LUMA; + int widthTmp = 0; + int
heightTmp = 0; + + for (int cand = 0; cand < 4; cand++) // top, left, bottom and right + { + + if (cand == 0) // top + { + pSrc = src; + pDst = dst; + widthTmp = width; + heightTmp = dim; + } + else if (cand == 1) // left + { + pSrc = src + dim*srcStride; + pDst = dst + dim*dstStride; + widthTmp = dim; + heightTmp = height - 2 * dim; + } + else if (cand == 2) // bottom + { + pSrc = src + (height - dim)*srcStride; + pDst = dst + (height - dim)*dstStride; + widthTmp = width; + heightTmp = dim; + } + else if (cand == 3) // right + { + pSrc = src + dim*srcStride + width - dim; + pDst = dst + dim*dstStride + width - dim; + widthTmp = dim; + heightTmp = height - 2 * dim; + } + + if (fracY == 0) + { + m_if.filterHor(COMPONENT_Y, pSrc, srcStride, pDst, dstStride, widthTmp, heightTmp, fracX, isLast, fmt, clpRng, 1); + } + else if (fracX == 0) + { + m_if.filterVer(COMPONENT_Y, pSrc, srcStride, pDst, dstStride, widthTmp, heightTmp, fracY, true, isLast, fmt, clpRng, 1); + } + else + { + PelBuf tmpBuf = PelBuf(m_filteredBlockTmp[0][COMPONENT_Y], Size(width, height)); + tmpBuf.stride = width; + + m_if.filterHor(COMPONENT_Y, pSrc - ((vFilterSize >> 1) - 1) * srcStride, srcStride, tmpBuf.buf, tmpBuf.stride, widthTmp, heightTmp + vFilterSize - 1, fracX, false, fmt, clpRng, 1); + m_if.filterVer(COMPONENT_Y, tmpBuf.buf + ((vFilterSize >> 1) - 1) * tmpBuf.stride, tmpBuf.stride, pDst, dstStride, widthTmp, heightTmp, fracY, false, isLast, fmt, clpRng, 1); + } + } +} + +bool InterPrediction::xCalcBiPredSubBlkDist(const PredictionUnit &pu, const Pel* pYuvSrc0, const int src0Stride, const Pel* pYuvSrc1, const int src1Stride, const BitDepths &clipBitDepths) +{ + const int width = pu.lwidth(); + const int height = pu.lheight(); + const int clipbd = clipBitDepths.recon[toChannelType(COMPONENT_Y)]; + const uint32_t distortionShift = DISTORTION_PRECISION_ADJUSTMENT(clipbd); + const int shift = std::max<int>(2, (IF_INTERNAL_PREC - clipbd)); + const int xUnit = (width >> 2); + const int yUnit = 
(height >> 2); + + m_bioDistThres = (shift <= 5) ? (((32 << (clipbd - 8))*width*height) >> (5 - shift)) : (((32 << (clipbd - 8))*width*height) << (shift - 5)); + m_bioSubBlkDistThres = (shift <= 5) ? (((64 << (clipbd - 8)) << 4) >> (5 - shift)) : (((64 << (clipbd - 8)) << 4) << (shift - 5)); + + m_bioDistThres >>= distortionShift; + m_bioSubBlkDistThres >>= distortionShift; + + DistParam cDistParam; + Distortion dist = 0; + for (int yu = 0, blkIdx = 0; yu < yUnit; yu++) + { + for (int xu = 0; xu < xUnit; xu++, blkIdx++) + { + const Pel* pPred0 = pYuvSrc0 + ((yu*src0Stride + xu) << 2); + const Pel* pPred1 = pYuvSrc1 + ((yu*src1Stride + xu) << 2); + + m_pcRdCost->setDistParam(cDistParam, pPred0, pPred1, src0Stride, src1Stride, clipbd, COMPONENT_Y, (1 << 2), (1 << 2), 0, 1, false, true); + m_bioPredSubBlkDist[blkIdx] = cDistParam.distFunc(cDistParam); + dist += m_bioPredSubBlkDist[blkIdx]; + } + } + + return (dist >= m_bioDistThres); +} +#endif + +#if JVET_L0256_BIO +void InterPrediction::xWeightedAverage(const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths, const ClpRngs& clpRngs, const bool& bioApplied ) +#else void InterPrediction::xWeightedAverage( const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths, const ClpRngs& clpRngs ) +#endif { const int iRefIdx0 = pu.refIdx[0]; const int iRefIdx1 = pu.refIdx[1]; @@ -623,11 +970,35 @@ void InterPrediction::xWeightedAverage( const PredictionUnit& pu, const CPelUnit #if JVET_L0646_GBI if( pu.cu->GBiIdx != GBI_DEFAULT ) { +#if JVET_L0256_BIO + CHECK(bioApplied, "GBi is disallowed with BIO"); +#endif pcYuvDst.addWeightedAvg(pcYuvSrc0, pcYuvSrc1, clpRngs, pu.cu->GBiIdx); return; } #endif +#if JVET_L0256_BIO + if (bioApplied) + { + const int src0Stride = pu.lwidth() + 2 * BIO_EXTEND_SIZE + 2; + const int src1Stride = pu.lwidth() + 2 * BIO_EXTEND_SIZE + 2; + 
#if JVET_L0256_BIO
/**
 * Cheap approximation of numer / denom: numer is arithmetically shifted right by
 * floor(log2(denom)), i.e. by the index of the most significant set bit of denom.
 *
 * \param numer  value to be scaled down (may be negative)
 * \param denom  divisor; must be strictly positive
 * \return numer >> floor(log2(denom))
 *
 * A non-positive denom has no most significant bit to shift by. It is reported through
 * CHECK instead of being turned into a shift by a negative amount (undefined behaviour).
 */
int InterPrediction::rightShiftMSB(int numer, int denom)
{
  CHECK(denom <= 0, "rightShiftMSB requires a positive denominator");

  // Count how often denom can be halved before it reaches zero. Working on the unsigned
  // value avoids comparing against (1 << 31), which does not fit into an int.
  int shiftIdx = 0;
  for (unsigned int rest = static_cast<unsigned int>(denom) >> 1; rest != 0; rest >>= 1)
  {
    ++shiftIdx;
  }

  return (numer >> shiftIdx);
}
#endif
m_iRefListIdx; - + +#if JVET_L0256_BIO + Pel* m_gradX0; + Pel* m_gradY0; + Pel* m_gradX1; + Pel* m_gradY1; + bool m_subPuMC; + + int rightShiftMSB(int numer, int denom); + void applyBiOptFlow(const PredictionUnit &pu, const CPelUnitBuf &yuvSrc0, const CPelUnitBuf &yuvSrc1, const int &refIdx0, const int &refIdx1, PelUnitBuf &yuvDst, const BitDepths &clipBitDepths); + bool xCalcBiPredSubBlkDist(const PredictionUnit &pu, const Pel* yuvSrc0, const int src0Stride, const Pel* yuvSrc1, const int src1Stride, const BitDepths &clipBitDepths); + void bioSampleExtendBilinearFilter(Pel const* src, int srcStride, Pel *dst, int dstStride, int width, int height, int dim, int fracX, int fracY, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng); +#endif void xPredInterUni ( const PredictionUnit& pu, const RefPicList& eRefPicList, PelUnitBuf& pcYuvPred, const bool& bi +#if JVET_L0256_BIO + ,const bool& bioApplied = false +#endif ); void xPredInterBi ( PredictionUnit& pu, PelUnitBuf &pcYuvPred ); void xPredInterBlk ( const ComponentID& compID, const PredictionUnit& pu, const Picture* refPic, const Mv& _mv, PelUnitBuf& dstPic, const bool& bi, const ClpRng& clpRng +#if JVET_L0256_BIO + ,const bool& bioApplied = false +#endif ); - + +#if JVET_L0256_BIO + void xWeightedAverage ( const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths, const ClpRngs& clpRngs, const bool& bioApplied ); +#else void xWeightedAverage ( const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths, const ClpRngs& clpRngs ); +#endif void xPredAffineBlk( const ComponentID& compID, const PredictionUnit& pu, const Picture* refPic, const Mv* _mv, PelUnitBuf& dstPic, const bool& bi, const ClpRng& clpRng ); static bool xCheckIdenticalMotion( const PredictionUnit& pu ); diff --git a/source/Lib/CommonLib/InterpolationFilter.cpp 
b/source/Lib/CommonLib/InterpolationFilter.cpp index 32e4d9d755c7a110f8d5abefa3a9d38f5ba2ff39..abcef170f7a7675d4930ceaf77218519540b3607 100644 --- a/source/Lib/CommonLib/InterpolationFilter.cpp +++ b/source/Lib/CommonLib/InterpolationFilter.cpp @@ -111,6 +111,28 @@ const TFilterCoeff InterpolationFilter::m_chromaFilter[CHROMA_INTERPOLATION_FILT { 0, 2, 63, -1 }, }; +#if JVET_L0256_BIO +const TFilterCoeff InterpolationFilter::m_bilinearFilter[LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS << VCEG_AZ07_MV_ADD_PRECISION_BIT_FOR_STORE][NTAPS_BILINEAR] = +{ + { 64, 0, }, + { 60, 4, }, + { 56, 8, }, + { 52, 12, }, + { 48, 16, }, + { 44, 20, }, + { 40, 24, }, + { 36, 28, }, + { 32, 32, }, + { 28, 36, }, + { 24, 40, }, + { 20, 44, }, + { 16, 48, }, + { 12, 52, }, + { 8, 56, }, + { 4, 60, }, +}; +#endif + // ==================================================================================================================== // Private member functions // ==================================================================================================================== @@ -443,7 +465,11 @@ void InterpolationFilter::filterVer(const ClpRng& clpRng, Pel const *src, int sr * \param fmt Chroma format * \param bitDepth Bit depth */ +#if JVET_L0256_BIO +void InterpolationFilter::filterHor( const ComponentID compID, Pel const *src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng, int nFilterIdx ) +#else void InterpolationFilter::filterHor( const ComponentID compID, Pel const *src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng ) +#endif { if( frac == 0 ) { @@ -452,6 +478,13 @@ void InterpolationFilter::filterHor( const ComponentID compID, Pel const *src, i else if( isLuma( compID ) ) { CHECK( frac < 0 || frac >= ( LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS << VCEG_AZ07_MV_ADD_PRECISION_BIT_FOR_STORE ), "Invalid 
fraction" ); +#if JVET_L0256_BIO + if( nFilterIdx == 1 ) + { + filterHor<NTAPS_BILINEAR>(clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_bilinearFilter[frac]); + } + else +#endif { filterHor<NTAPS_LUMA>( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_lumaFilter[frac] ); } @@ -481,7 +514,11 @@ void InterpolationFilter::filterHor( const ComponentID compID, Pel const *src, i * \param fmt Chroma format * \param bitDepth Bit depth */ +#if JVET_L0256_BIO +void InterpolationFilter::filterVer( const ComponentID compID, Pel const *src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac, bool isFirst, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng, int nFilterIdx) +#else void InterpolationFilter::filterVer( const ComponentID compID, Pel const *src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac, bool isFirst, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng ) +#endif { if( frac == 0 ) { @@ -490,6 +527,13 @@ void InterpolationFilter::filterVer( const ComponentID compID, Pel const *src, i else if( isLuma( compID ) ) { CHECK( frac < 0 || frac >= ( LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS << VCEG_AZ07_MV_ADD_PRECISION_BIT_FOR_STORE ), "Invalid fraction" ); +#if JVET_L0256_BIO + if (nFilterIdx == 1) + { + filterVer<NTAPS_BILINEAR>(clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_bilinearFilter[frac]); + } + else +#endif { filterVer<NTAPS_LUMA>( clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_lumaFilter[frac] ); } diff --git a/source/Lib/CommonLib/InterpolationFilter.h b/source/Lib/CommonLib/InterpolationFilter.h index 4535b6bc56ba52fd6130e4f7e8a5443fac1a7770..4f246d9bed3ff90076ed125212a79a9da8761112 100644 --- a/source/Lib/CommonLib/InterpolationFilter.h +++ b/source/Lib/CommonLib/InterpolationFilter.h @@ -56,6 +56,9 @@ class InterpolationFilter { static const TFilterCoeff 
m_lumaFilter[LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS << VCEG_AZ07_MV_ADD_PRECISION_BIT_FOR_STORE][NTAPS_LUMA]; ///< Luma filter taps static const TFilterCoeff m_chromaFilter[CHROMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS << VCEG_AZ07_MV_ADD_PRECISION_BIT_FOR_STORE][NTAPS_CHROMA]; ///< Chroma filter taps +#if JVET_L0256_BIO + static const TFilterCoeff m_bilinearFilter[LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS << VCEG_AZ07_MV_ADD_PRECISION_BIT_FOR_STORE][NTAPS_BILINEAR]; ///< bilinear filter taps +#endif public: template<bool isFirst, bool isLast> static void filterCopy( const ClpRng& clpRng, const Pel *src, int srcStride, Pel *dst, int dstStride, int width, int height ); @@ -87,8 +90,13 @@ public: void _initInterpolationFilterX86(); #endif +#if JVET_L0256_BIO + void filterHor(const ComponentID compID, Pel const* src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng, int nFilterIdx = 0); + void filterVer(const ComponentID compID, Pel const* src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac, bool isFirst, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng, int nFilterIdx = 0); +#else void filterHor(const ComponentID compID, Pel const* src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng ); void filterVer(const ComponentID compID, Pel const* src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac, bool isFirst, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng ); +#endif #if JVET_J0090_MEMORY_BANDWITH_MEASURE void cacheAssign( CacheModel *cache ) { m_cacheModel = cache; } #endif diff --git a/source/Lib/CommonLib/RdCost.cpp b/source/Lib/CommonLib/RdCost.cpp index 20119d9a8feaeaef1bf16ee72a81714e60521571..572eacb8ba30219900a0137a0d55f05157511430 100644 --- a/source/Lib/CommonLib/RdCost.cpp +++ b/source/Lib/CommonLib/RdCost.cpp @@ 
-164,6 +164,10 @@ void RdCost::init() m_afpDistortFunc[DF_SSE16N_WTD] = RdCost::xGetSSE16N_WTD; #endif +#if JVET_L0256_BIO + m_afpDistortFunc[DF_SAD_INTERMEDIATE_BITDEPTH] = RdCost::xGetSAD; +#endif + #if ENABLE_SIMD_OPT_DIST #ifdef TARGET_SIMD_X86 initRdCostX86(); @@ -318,7 +322,11 @@ void RdCost::setDistParam( DistParam &rcDP, const CPelBuf &org, const CPelBuf &c rcDP.maximumDistortionForEarlyExit = std::numeric_limits<Distortion>::max(); } +#if JVET_L0256_BIO +void RdCost::setDistParam( DistParam &rcDP, const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShiftMode, int step, bool useHadamard, bool bioApplied ) +#else void RdCost::setDistParam( DistParam &rcDP, const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShiftMode, int step, bool useHadamard ) +#endif { rcDP.bitDepth = bitDepth; rcDP.compID = compID; @@ -339,6 +347,14 @@ void RdCost::setDistParam( DistParam &rcDP, const Pel* pOrg, const Pel* piRefY, CHECK( useHadamard || rcDP.useMR || subShiftMode > 0, "only used in xDirectMCCost with these default parameters (so far...)" ); +#if JVET_L0256_BIO + if ( bioApplied ) + { + rcDP.distFunc = m_afpDistortFunc[ DF_SAD_INTERMEDIATE_BITDEPTH ]; + return; + } +#endif + if( width == 12 ) { rcDP.distFunc = m_afpDistortFunc[ DF_SAD12 ]; diff --git a/source/Lib/CommonLib/RdCost.h b/source/Lib/CommonLib/RdCost.h index 4e79040c21fb739fc75a13b0643d4798989099d7..3c95f90adf54158c5cfce66572b7dedc82613f7d 100644 --- a/source/Lib/CommonLib/RdCost.h +++ b/source/Lib/CommonLib/RdCost.h @@ -154,7 +154,11 @@ public: void setDistParam( DistParam &rcDP, const CPelBuf &org, const Pel* piRefY , int iRefStride, int bitDepth, ComponentID compID, int subShiftMode = 0, int step = 1, bool useHadamard = false ); void setDistParam( DistParam &rcDP, const CPelBuf &org, const CPelBuf &cur, int bitDepth, ComponentID compID, bool 
useHadamard = false ); +#if JVET_L0256_BIO + void setDistParam( DistParam &rcDP, const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShiftMode = 0, int step = 1, bool useHadamard = false, bool bioApplied = false ); +#else void setDistParam( DistParam &rcDP, const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShiftMode = 0, int step = 1, bool useHadamard = false ); +#endif double getMotionLambda ( bool bIsTransquantBypass ) { return m_dLambdaMotionSAD[(bIsTransquantBypass && m_costMode==COST_MIXED_LOSSLESS_LOSSY_CODING)?1:0]; } void selectMotionLambda ( bool bIsTransquantBypass ) { m_motionLambda = getMotionLambda( bIsTransquantBypass ); } @@ -266,6 +270,10 @@ private: static Distortion xGetSAD_SIMD ( const DistParam& pcDtParam ); template< int iWidth, X86_VEXT vext > static Distortion xGetSAD_NxN_SIMD( const DistParam& pcDtParam ); +#if ENABLE_SIMD_OPT_BIO + template< X86_VEXT vext > + static Distortion xGetSAD_IBD_SIMD(const DistParam& pcDtParam); +#endif template< typename Torg, typename Tcur, X86_VEXT vext > static Distortion xGetHADs_SIMD ( const DistParam& pcDtParam ); diff --git a/source/Lib/CommonLib/Slice.cpp b/source/Lib/CommonLib/Slice.cpp index 356ec1c2618aea010a1cbb289fcea99c2f71a929..61e3c8fd94c8b40ed4dd23ce7710a32ca06e2c92 100644 --- a/source/Lib/CommonLib/Slice.cpp +++ b/source/Lib/CommonLib/Slice.cpp @@ -1734,6 +1734,9 @@ SPSNext::SPSNext( SPS& sps ) , m_IMV ( false ) #if !REMOVE_MV_ADAPT_PREC , m_highPrecMv ( false ) +#endif +#if JVET_L0256_BIO + , m_BIO ( false ) #endif , m_DisableMotionCompression ( false ) , m_LMChroma ( false ) diff --git a/source/Lib/CommonLib/Slice.h b/source/Lib/CommonLib/Slice.h index 7dbbc4611e5ff80854c32ebfc7e1ad16db98c1bc..8cbcfee8795e65767309db01ed782f10b68b45ef 100644 --- a/source/Lib/CommonLib/Slice.h +++ b/source/Lib/CommonLib/Slice.h @@ -806,6 +806,9 @@ 
private: bool m_IMV; // 9 #if !REMOVE_MV_ADAPT_PREC bool m_highPrecMv; +#endif +#if JVET_L0256_BIO + bool m_BIO; #endif bool m_DisableMotionCompression; // 13 bool m_LMChroma; // 17 @@ -880,6 +883,10 @@ public: #if !REMOVE_MV_ADAPT_PREC void setUseHighPrecMv(bool b) { m_highPrecMv = b; } bool getUseHighPrecMv() const { return m_highPrecMv; } +#endif +#if JVET_L0256_BIO + void setUseBIO(bool b) { m_BIO = b; } + bool getUseBIO() const { return m_BIO; } #endif void setDisableMotCompress ( bool b ) { m_DisableMotionCompression = b; } bool getDisableMotCompress () const { return m_DisableMotionCompression; } diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h index 347ae35904ff9d6e1b86fafb1aabdd833015081a..472b3428168d2b7d107d8418d1b0042f6f45c2c0 100644 --- a/source/Lib/CommonLib/TypeDef.h +++ b/source/Lib/CommonLib/TypeDef.h @@ -85,6 +85,8 @@ #define L0074_SUBBLOCK_DEBLOCKING 1 +#define JVET_L0256_BIO 1 + #define JVET_L0646_GBI 1 // Generalized bi-prediction (GBi) #define JVET_L0628_4TAP_INTRA 1 // 4-tap intra-interpolation filtering with switching between Gaussian and DCT-IF filters for luma component @@ -264,6 +266,7 @@ #if ENABLE_SIMD_OPT_BUFFER && JVET_L0646_GBI #define ENABLE_SIMD_OPT_GBI 1 ///< SIMD optimization for GBi #endif +#define ENABLE_SIMD_OPT_BIO ( JVET_L0256_BIO && ENABLE_SIMD_OPT ) ///< SIMD optimization for BIO // End of SIMD optimizations @@ -558,7 +561,13 @@ enum DFunc DF_DEFAULT_ORI = DF_SSE_WTD+8, #endif +#if JVET_L0256_BIO + DF_SAD_INTERMEDIATE_BITDEPTH = 63, + + DF_TOTAL_FUNCTIONS = 64 +#else DF_TOTAL_FUNCTIONS = 63 +#endif }; /// motion vector predictor direction used in AMVP diff --git a/source/Lib/CommonLib/x86/BufferX86.h b/source/Lib/CommonLib/x86/BufferX86.h index 34d2cb7b417ef2278b0c9fe55277eeec2dd068b1..92f52655a47940016d708dadc68a152a00a1315b 100644 --- a/source/Lib/CommonLib/x86/BufferX86.h +++ b/source/Lib/CommonLib/x86/BufferX86.h @@ -128,6 +128,308 @@ void addAvg_SSE( const int16_t* src0, int src0Stride, 
#if ENABLE_SIMD_OPT_BIO
/**
 * SSE kernel for the BIO-corrected bi-prediction average; handles four samples per step.
 *
 * Per sample this evaluates
 *   b   = tmpx * (gradX0 - gradX1) + tmpy * (gradY0 - gradY1)
 *   dst = clip( (src0 + src1 + ((b + 1) >> 1) + offset) >> shift, clpRng.min, clpRng.max )
 *
 * Notes that follow from the intrinsics used:
 *  - tmpx / tmpy are broadcast with _mm_set1_epi16, so only their low 16 bits take part;
 *  - the gradient differences are formed in 16-bit lanes;
 *  - columns are consumed in groups of four, so the last group reaches past 'width'
 *    when width is not a multiple of four.
 *
 * \param src0, src1              the two prediction signals and their strides
 * \param dst                     output block and its stride
 * \param gradX0/gradX1/gradY0/gradY1  horizontal / vertical gradients of both predictions (common stride gradStride)
 * \param width, height           size of the processed area in samples
 * \param tmpx, tmpy              motion refinement applied to the gradient differences
 * \param shift, offset           normalisation of the final average
 * \param clpRng                  clipping range of the output samples
 */
template< X86_VEXT vext >
void addBIOAvg4_SSE(const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, const Pel *gradX0, const Pel *gradX1, const Pel *gradY0, const Pel*gradY1, int gradStride, int width, int height, int tmpx, int tmpy, int shift, int offset, const ClpRng& clpRng)
{
  // Lanes 0..3 hold tmpx, lanes 4..7 hold tmpy: the same [X | Y] layout as the packed gradients below.
  const __m128i motionXY  = _mm_unpacklo_epi64(_mm_set1_epi16(tmpx), _mm_set1_epi16(tmpy));
  const __m128i halfRound = _mm_set1_epi32(1);
  const __m128i avgOffset = _mm_set1_epi32(offset);
  const __m128i clipLow   = _mm_set1_epi16(clpRng.min);
  const __m128i clipHigh  = _mm_set1_epi16(clpRng.max);

  for (int row = 0; row < height; row++)
  {
    for (int col = 0; col < width; col += 4)
    {
      // [gradX | gradY] of each prediction, then their 16-bit difference
      const __m128i grad0    = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(gradX0 + col)), _mm_loadl_epi64((const __m128i *)(gradY0 + col)));
      const __m128i grad1    = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(gradX1 + col)), _mm_loadl_epi64((const __m128i *)(gradY1 + col)));
      const __m128i gradDiff = _mm_sub_epi16(grad0, grad1);

      // full 32-bit products: low lanes are the X terms, high lanes the Y terms
      const __m128i prodHi = _mm_mulhi_epi16(gradDiff, motionXY);
      const __m128i prodLo = _mm_mullo_epi16(gradDiff, motionXY);
      const __m128i termX  = _mm_unpacklo_epi16(prodLo, prodHi);
      const __m128i termY  = _mm_unpackhi_epi16(prodLo, prodHi);

      __m128i acc = _mm_add_epi32(termX, termY);
      acc = _mm_srai_epi32(_mm_add_epi32(acc, halfRound), 1);

      const __m128i pred0 = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)(src0 + col)));
      const __m128i pred1 = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)(src1 + col)));
      acc = _mm_add_epi32(_mm_add_epi32(acc, pred0), _mm_add_epi32(pred1, avgOffset));

      // only the low four 16-bit lanes of the packed value are stored
      __m128i packed = _mm_packs_epi32(_mm_srai_epi32(acc, shift), pred0);
      packed = _mm_min_epi16(clipHigh, _mm_max_epi16(clipLow, packed));
      _mm_storel_epi64((__m128i *)(dst + col), packed);
    }

    dst    += dstStride;
    src0   += src0Stride;
    src1   += src1Stride;
    gradX0 += gradStride;
    gradX1 += gradStride;
    gradY0 += gradStride;
    gradY1 += gradStride;
  }
}

/**
 * SSE kernel computing the horizontal and vertical gradients of a padded prediction block.
 *
 * Inside the block (everything except a border of BIO_EXTEND_SIZE samples) the gradients are
 *   gradX = (right - left) >> 4,   gradY = (bottom - top) >> 4.
 * Afterwards the outermost gradient columns and rows are filled by replicating their
 * inner neighbours.
 *
 * The interior is addressed with a fixed offset of one row and one column.
 * NOTE(review): this looks like it assumes BIO_EXTEND_SIZE == 1 - confirm against the definition.
 *
 * \param src         padded prediction block (width x height samples) with stride srcStride
 * \param width       block width including the border; the interior width must be a multiple of 4
 * \param height      block height including the border
 * \param gradStride  common stride of gradX and gradY
 * \param gradX       output: horizontal gradients
 * \param gradY       output: vertical gradients
 */
template< X86_VEXT vext >
void gradFilter_SSE(Pel* src, int srcStride, int width, int height, int gradStride, Pel* gradX, Pel* gradY)
{
  const int innerWidth  = width  - 2 * BIO_EXTEND_SIZE;
  const int innerHeight = height - 2 * BIO_EXTEND_SIZE;

  assert((innerWidth & 3) == 0);

  const __m128i zero = _mm_setzero_si128();

  // 1) gradients of the interior samples, four at a time
  for (int row = 0; row < innerHeight; row++)
  {
    const Pel* srcRow = src   + (row + 1) * srcStride  + 1;
    Pel*       gxRow  = gradX + (row + 1) * gradStride + 1;
    Pel*       gyRow  = gradY + (row + 1) * gradStride + 1;

    for (int col = 0; col < innerWidth; col += 4)
    {
      const Pel* centre = srcRow + col;

      const __m128i top    = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)(centre - srcStride)));
      const __m128i bottom = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)(centre + srcStride)));
      const __m128i left   = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)(centre - 1)));
      const __m128i right  = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)(centre + 1)));

      const __m128i ver = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(bottom, top), 4), zero);
      const __m128i hor = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(right, left), 4), zero);

      _mm_storel_epi64((__m128i *)(gyRow + col), ver);
      _mm_storel_epi64((__m128i *)(gxRow + col), hor);
    }
  }

  // 2) replicate the first / last interior column into the border columns
  for (int row = 0; row < innerHeight; row++)
  {
    Pel* gxRow = gradX + (row + 1) * gradStride + 1;
    gxRow[-1]         = gxRow[0];
    gxRow[innerWidth] = gxRow[innerWidth - 1];

    Pel* gyRow = gradY + (row + 1) * gradStride + 1;
    gyRow[-1]         = gyRow[0];
    gyRow[innerWidth] = gyRow[innerWidth - 1];
  }

  // 3) replicate the first / last interior row (border columns included) into the border rows
  const size_t rowBytes = sizeof(Pel)*(width);
  Pel* const   gxFirst  = gradX + gradStride;
  Pel* const   gyFirst  = gradY + gradStride;

  ::memcpy(gxFirst - gradStride,             gxFirst,                                rowBytes);
  ::memcpy(gxFirst + innerHeight*gradStride, gxFirst + (innerHeight - 1)*gradStride, rowBytes);
  ::memcpy(gyFirst - gradStride,             gyFirst,                                rowBytes);
  ::memcpy(gyFirst + innerHeight*gradStride, gyFirst + (innerHeight - 1)*gradStride, rowBytes);
}
#endif
#if ENABLE_SIMD_OPT_BIO
/**
 * SSE kernel that fills the five per-sample product planes used by BIO.
 *
 * With  d  = (srcY1 >> 6) - (srcY0 >> 6),
 *       gx = (gradX0 + gradX1) >> 3,
 *       gy = (gradY0 + gradY1) >> 3
 * the outputs are
 *   dotProductTemp1 = gx * gx      dotProductTemp2 = gx * gy      dotProductTemp3 = gx * d
 *   dotProductTemp5 = gy * gy      dotProductTemp6 = gy * d
 *
 * Each row is processed eight samples at a time, then four, then sample by sample.
 * The vector paths form d, gx and gy in 16-bit lanes; the scalar tail uses int.
 * The output planes are written densely with a row pitch of widthG.
 *
 * \param srcY0Temp, srcY1Temp  the two luma predictions (strides src0Stride / src1Stride)
 * \param gradX0 .. gradY1      gradients of both predictions (common stride gradStride)
 * \param dotProductTemp1..6    output planes, widthG x heightG ints each
 * \param widthG, heightG       size of the processed area
 */
template< X86_VEXT vext >
void calcBIOPar_SSE(const Pel* srcY0Temp, const Pel* srcY1Temp, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0, const Pel* gradY1, int* dotProductTemp1, int* dotProductTemp2, int* dotProductTemp3, int* dotProductTemp5, int* dotProductTemp6, const int src0Stride, const int src1Stride, const int gradStride, const int widthG, const int heightG)
{
  // 8 lane-wise 16x16 -> 32 bit products, written to out[0..7]
  const auto storeProducts8 = [](const __m128i a, const __m128i b, int* out)
  {
    const __m128i hi = _mm_mulhi_epi16(a, b);
    const __m128i lo = _mm_mullo_epi16(a, b);
    _mm_storeu_si128((__m128i *)(out),     _mm_unpacklo_epi16(lo, hi));
    _mm_storeu_si128((__m128i *)(out + 4), _mm_unpackhi_epi16(lo, hi));
  };

  // same for the low 4 lanes only, written to out[0..3]
  const auto storeProducts4 = [](const __m128i a, const __m128i b, int* out)
  {
    const __m128i hi = _mm_mulhi_epi16(a, b);
    const __m128i lo = _mm_mullo_epi16(a, b);
    _mm_storeu_si128((__m128i *)(out), _mm_unpacklo_epi16(lo, hi));
  };

  const int endOf8 = ((widthG >> 3) << 3);
  const int endOf4 = ((widthG >> 2) << 2);

  for (int row = 0; row < heightG; row++)
  {
    int col = 0;

    for (; col < endOf8; col += 8)
    {
      const __m128i pred0 = _mm_srai_epi16(_mm_loadu_si128((const __m128i*)(srcY0Temp + col)), 6);
      const __m128i pred1 = _mm_srai_epi16(_mm_loadu_si128((const __m128i*)(srcY1Temp + col)), 6);
      const __m128i gx0   = _mm_loadu_si128((const __m128i*)(gradX0 + col));
      const __m128i gx1   = _mm_loadu_si128((const __m128i*)(gradX1 + col));
      const __m128i gy0   = _mm_loadu_si128((const __m128i*)(gradY0 + col));
      const __m128i gy1   = _mm_loadu_si128((const __m128i*)(gradY1 + col));

      const __m128i predDiff = _mm_sub_epi16(pred1, pred0);
      const __m128i gradSumX = _mm_srai_epi16(_mm_add_epi16(gx0, gx1), 3);
      const __m128i gradSumY = _mm_srai_epi16(_mm_add_epi16(gy0, gy1), 3);

      storeProducts8(gradSumX, gradSumX, dotProductTemp1 + col);
      storeProducts8(gradSumX, gradSumY, dotProductTemp2 + col);
      storeProducts8(gradSumX, predDiff, dotProductTemp3 + col);
      storeProducts8(gradSumY, gradSumY, dotProductTemp5 + col);
      storeProducts8(gradSumY, predDiff, dotProductTemp6 + col);
    }

    for (; col < endOf4; col += 4)
    {
      const __m128i pred0 = _mm_srai_epi16(_mm_loadl_epi64((const __m128i*)(srcY0Temp + col)), 6);
      const __m128i pred1 = _mm_srai_epi16(_mm_loadl_epi64((const __m128i*)(srcY1Temp + col)), 6);
      const __m128i gx0   = _mm_loadl_epi64((const __m128i*)(gradX0 + col));
      const __m128i gx1   = _mm_loadl_epi64((const __m128i*)(gradX1 + col));
      const __m128i gy0   = _mm_loadl_epi64((const __m128i*)(gradY0 + col));
      const __m128i gy1   = _mm_loadl_epi64((const __m128i*)(gradY1 + col));

      const __m128i predDiff = _mm_sub_epi16(pred1, pred0);
      const __m128i gradSumX = _mm_srai_epi16(_mm_add_epi16(gx0, gx1), 3);
      const __m128i gradSumY = _mm_srai_epi16(_mm_add_epi16(gy0, gy1), 3);

      storeProducts4(gradSumX, gradSumX, dotProductTemp1 + col);
      storeProducts4(gradSumX, gradSumY, dotProductTemp2 + col);
      storeProducts4(gradSumX, predDiff, dotProductTemp3 + col);
      storeProducts4(gradSumY, gradSumY, dotProductTemp5 + col);
      storeProducts4(gradSumY, predDiff, dotProductTemp6 + col);
    }

    for (; col < widthG; col++)
    {
      // scalar tail: the difference is taken as (pred0 - pred1) and negated in the products
      const int diff     = (srcY0Temp[col] >> 6) - (srcY1Temp[col] >> 6);
      const int gradSumX = (gradX0[col] + gradX1[col]) >> 3;
      const int gradSumY = (gradY0[col] + gradY1[col]) >> 3;

      dotProductTemp1[col] = gradSumX * gradSumX;
      dotProductTemp2[col] = gradSumX * gradSumY;
      dotProductTemp3[col] = -gradSumX * diff;
      dotProductTemp5[col] = gradSumY * gradSumY;
      dotProductTemp6[col] = -gradSumY * diff;
    }

    srcY0Temp       += src0Stride;
    srcY1Temp       += src1Stride;
    gradX0          += gradStride;
    gradX1          += gradStride;
    gradY0          += gradStride;
    gradY1          += gradStride;
    dotProductTemp1 += widthG;
    dotProductTemp2 += widthG;
    dotProductTemp3 += widthG;
    dotProductTemp5 += widthG;
    dotProductTemp6 += widthG;
  }
}

/**
 * SSE kernel summing each of the five product planes over the window around one sub-block.
 *
 * The window starts BIO_EXTEND_SIZE rows above the given position and covers
 * unitSize + 2 * BIO_EXTEND_SIZE rows. In every row exactly six values are added,
 * taken from columns -1 .. 4 relative to the given position.
 * NOTE(review): the fixed six-column span looks like it assumes unitSize == 4 and
 * BIO_EXTEND_SIZE == 1 - confirm against the callers.
 *
 * sx, sy and height are not used by this implementation.
 *
 * \param arraysGx2 .. arraysGydI  pointers to the sub-block's top-left entry in each product plane
 * \param sGx2 .. sGydI            output: window sums of the corresponding planes
 * \param width                    row pitch of the product planes
 * \param unitSize                 sub-block height used for the row range
 */
template< X86_VEXT vext >
void calcBlkGradient_SSE(int sx, int sy, int *arraysGx2, int *arraysGxGy, int *arraysGxdI, int *arraysGy2, int *arraysGydI, int &sGx2, int &sGy2, int &sGxGy, int &sGxdI, int &sGydI, int width, int height, unsigned int) = delete;
#endif
#if ENABLE_SIMD_OPT_BIO
/**
 * SSE implementation of the SAD between two blocks of intermediate-bit-depth samples.
 *
 * Samples are widened to 32 bit before the subtraction, four columns per step.
 * Rows are visited with a step of (1 << subShift) and the sum is scaled back by the
 * same factor, then reduced by DISTORTION_PRECISION_ADJUSTMENT(bitDepth).
 *
 * The scalar RdCost::xGetSAD is used instead when the width is below four or not a
 * multiple of four (the vector loop would otherwise read past the end of each row),
 * when the bit depth exceeds 10, or when weighting is requested.
 *
 * \param rcDtParam  distortion parameters; org / cur describe the two blocks
 * \return the SAD after sub-sampling compensation and precision adjustment
 */
template< X86_VEXT vext >
Distortion RdCost::xGetSAD_IBD_SIMD(const DistParam &rcDtParam)
{
  // columns come from the buffer width, rows from its height
  const int width  = rcDtParam.org.width;
  const int height = rcDtParam.org.height;

  if (width < 4 || (width & 3) != 0 || rcDtParam.bitDepth > 10 || rcDtParam.applyWeight)
  {
    return RdCost::xGetSAD(rcDtParam);
  }

  const short* src0       = reinterpret_cast<const short*>(rcDtParam.org.buf);
  const short* src1       = reinterpret_cast<const short*>(rcDtParam.cur.buf);
  const int    subShift   = rcDtParam.subShift;
  const int    subStep    = (1 << subShift);
  const int    src0Stride = rcDtParam.org.stride * subStep;
  const int    src1Stride = rcDtParam.cur.stride * subStep;

  const __m128i vzero       = _mm_setzero_si128();
  __m128i       vtotalsum32 = _mm_setzero_si128();

  for (int y = 0; y < height; y += subStep)
  {
    for (int x = 0; x < width; x += 4)
    {
      const __m128i vsrc0 = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)(src0 + x)));
      const __m128i vsrc1 = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)(src1 + x)));
      vtotalsum32 = _mm_add_epi32(vtotalsum32, _mm_abs_epi32(_mm_sub_epi32(vsrc0, vsrc1)));
    }
    src0 += src0Stride;
    src1 += src1Stride;
  }

  // horizontal reduction of the four 32-bit partial sums into lane 0
  vtotalsum32 = _mm_hadd_epi32(vtotalsum32, vzero);
  vtotalsum32 = _mm_hadd_epi32(vtotalsum32, vzero);
  Distortion uiSum = _mm_cvtsi128_si32(vtotalsum32);

  uiSum <<= subShift;
  return uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
}
#endif
b/source/Lib/EncoderLib/EncCfg.h @@ -199,6 +199,9 @@ protected: bool m_AffineType; #if !REMOVE_MV_ADAPT_PREC bool m_highPrecMv; +#endif +#if JVET_L0256_BIO + bool m_BIO; #endif bool m_DisableMotionCompression; unsigned m_MTTMode; @@ -632,6 +635,10 @@ public: #if !REMOVE_MV_ADAPT_PREC void setHighPrecisionMv ( bool b ) { m_highPrecMv = b; } bool getHighPrecisionMv () { return m_highPrecMv; } +#endif +#if JVET_L0256_BIO + void setBIO(bool b) { m_BIO = b; } + bool getBIO() const { return m_BIO; } #endif void setDisableMotionCompression ( bool b ) { m_DisableMotionCompression = b; } bool getDisableMotionCompression () const { return m_DisableMotionCompression; } diff --git a/source/Lib/EncoderLib/EncLib.cpp b/source/Lib/EncoderLib/EncLib.cpp index 3e4b088bdf4739f18998a6aede40021e8839bddb..5f0cdc5613c9418498a623bf9db84c66a1ce12d3 100644 --- a/source/Lib/EncoderLib/EncLib.cpp +++ b/source/Lib/EncoderLib/EncLib.cpp @@ -843,6 +843,9 @@ void EncLib::xInitSPS(SPS &sps) sps.getSpsNext().setUseIMV ( m_ImvMode != IMV_OFF ); #if !REMOVE_MV_ADAPT_PREC sps.getSpsNext().setUseHighPrecMv ( m_highPrecMv ); +#endif +#if JVET_L0256_BIO + sps.getSpsNext().setUseBIO ( m_BIO ); #endif sps.getSpsNext().setUseAffine ( m_Affine ); sps.getSpsNext().setUseAffineType ( m_AffineType ); diff --git a/source/Lib/EncoderLib/VLCWriter.cpp b/source/Lib/EncoderLib/VLCWriter.cpp index a7738748090c9813d940f9e5cc5c9726df36f990..7ad505f512d60bc9a81a7895d096f1ceb4ddee52 100644 --- a/source/Lib/EncoderLib/VLCWriter.cpp +++ b/source/Lib/EncoderLib/VLCWriter.cpp @@ -534,6 +534,9 @@ void HLSWriter::codeSPSNext( const SPSNext& spsNext, const bool usePCM ) WRITE_FLAG( spsNext.getUseIMV() ? 1 : 0, "imv_enable_flag" ); #if !REMOVE_MV_ADAPT_PREC WRITE_FLAG( spsNext.getUseHighPrecMv() ? 1 : 0, "high_precision_motion_vectors"); +#endif +#if JVET_L0256_BIO + WRITE_FLAG( spsNext.getUseBIO() ? 1 : 0, "bio_enable_flag" ); #endif WRITE_FLAG( spsNext.getDisableMotCompress() ? 
1 : 0, "disable_motion_compression_flag" ); WRITE_FLAG( spsNext.getUseLMChroma() ? 1 : 0, "lm_chroma_enabled_flag" );