diff --git a/cfg/encoder_lowdelay_P_vtm.cfg b/cfg/encoder_lowdelay_P_vtm.cfg index 2d24145ce7acb6aabf6cc9e5b58d88f1aa0d3db0..9b3089579746d5ef9d60de6861d0896e9a4125c0 100644 --- a/cfg/encoder_lowdelay_P_vtm.cfg +++ b/cfg/encoder_lowdelay_P_vtm.cfg @@ -130,6 +130,7 @@ LMCSEnable : 1 # LMCS: 0: disable, 1:enable LMCSSignalType : 0 # Input signal type: 0:SDR, 1:HDR-PQ, 2:HDR-HLG LMCSUpdateCtrl : 2 # LMCS model update control: 0:RA, 1:AI, 2:LDB/LDP MIP : 1 +PROF : 1 # Fast tools PBIntraFast : 1 diff --git a/cfg/encoder_lowdelay_vtm.cfg b/cfg/encoder_lowdelay_vtm.cfg index 1c7925030439df8d5d21787f90c455d2d77d3a1c..148f03230bde4390f01110b46104acdc695b9252 100644 --- a/cfg/encoder_lowdelay_vtm.cfg +++ b/cfg/encoder_lowdelay_vtm.cfg @@ -134,6 +134,7 @@ LMCSEnable : 1 # LMCS: 0: disable, 1:enable LMCSSignalType : 0 # Input signal type: 0:SDR, 1:HDR-PQ, 2:HDR-HLG LMCSUpdateCtrl : 2 # LMCS model update control: 0:RA, 1:AI, 2:LDB/LDP MIP : 1 +PROF : 1 # Fast tools PBIntraFast : 1 diff --git a/cfg/encoder_randomaccess_vtm.cfg b/cfg/encoder_randomaccess_vtm.cfg index 7938ee72a22e3e306fe3f352ed1508229168130f..2e4c7a0332fbd01f2bda17df62aded26c0a047bd 100644 --- a/cfg/encoder_randomaccess_vtm.cfg +++ b/cfg/encoder_randomaccess_vtm.cfg @@ -152,6 +152,7 @@ LMCSUpdateCtrl : 0 # LMCS model update control: 0:RA, 1:AI, 2 MIP : 1 DMVR : 1 SMVD : 1 +PROF : 1 # Fast tools PBIntraFast : 1 diff --git a/source/App/EncoderApp/EncApp.cpp b/source/App/EncoderApp/EncApp.cpp index 9d54934350bdf8d3f7fda545951cb022b7f26cc3..cb741fa353b14f8e95c114d3e9dff9240737067a 100644 --- a/source/App/EncoderApp/EncApp.cpp +++ b/source/App/EncoderApp/EncApp.cpp @@ -254,6 +254,9 @@ void EncApp::xInitLibCfg() m_cEncLib.setSubPuMvpMode ( m_SubPuMvpMode ); m_cEncLib.setAffine ( m_Affine ); m_cEncLib.setAffineType ( m_AffineType ); +#if JVET_O0070_PROF + m_cEncLib.setPROF ( m_PROF ); +#endif m_cEncLib.setBIO (m_BIO); m_cEncLib.setUseLMChroma ( m_LMChroma ); m_cEncLib.setCclmCollocatedChromaFlag ( 
m_cclmCollocatedChromaFlag ); diff --git a/source/App/EncoderApp/EncAppCfg.cpp b/source/App/EncoderApp/EncAppCfg.cpp index 053b1c4b73ae7837e5fece3ab9e136438a3e666f..8e2167f99801d0a549f39842786d328e937c1b78 100644 --- a/source/App/EncoderApp/EncAppCfg.cpp +++ b/source/App/EncoderApp/EncAppCfg.cpp @@ -860,6 +860,9 @@ bool EncAppCfg::parseCfg( int argc, char* argv[] ) ("MMVD", m_MMVD, true, "Enable Merge mode with Motion Vector Difference (0:off, 1:on) [default: 1]") ("Affine", m_Affine, false, "Enable affine prediction (0:off, 1:on) [default: off]") ("AffineType", m_AffineType, true, "Enable affine type prediction (0:off, 1:on) [default: on]" ) +#if JVET_O0070_PROF + ("PROF", m_PROF, false, "Enable Prediction refinement with optical flow for affine mode (0:off, 1:on) [default: off]") +#endif ("BIO", m_BIO, false, "Enable bi-directional optical flow") ("IMV", m_ImvMode, 1, "Adaptive MV precision Mode (IMV)\n" "\t0: disabled\n" @@ -2515,20 +2518,26 @@ bool EncAppCfg::xCheckParameter() xConfirmPara( m_uiMinQT[1] < 1<<MIN_CU_LOG2, "Minimum QT size should be larger than or equal to 4"); xConfirmPara( m_uiCTUSize < 16, "Maximum partition width size should be larger than or equal to 16"); xConfirmPara( m_uiCTUSize < 16, "Maximum partition height size should be larger than or equal to 16"); +#if !JVET_O0640_PICTURE_SIZE_CONSTRAINT xConfirmPara( (m_iSourceWidth % (1<<MIN_CU_LOG2))!=0, "Resulting coded frame width must be a multiple of the minimum unit size"); xConfirmPara( (m_iSourceHeight % (1<<MIN_CU_LOG2))!=0, "Resulting coded frame height must be a multiple of the minimum unit size"); xConfirmPara( (m_iSourceWidth % (1<<MIN_CU_LOG2))!=0, "Resulting coded frame width must be a multiple of the minimum unit size"); xConfirmPara( (m_iSourceHeight % (1<<MIN_CU_LOG2))!=0, "Resulting coded frame height must be a multiple of the minimum unit size"); xConfirmPara( (m_iSourceWidth % (1<<MIN_CU_LOG2))!=0, "Resulting coded frame width must be a multiple of the minimum unit size"); 
xConfirmPara( (m_iSourceHeight % (1<<MIN_CU_LOG2))!=0, "Resulting coded frame height must be a multiple of the minimum unit size"); +#endif xConfirmPara( m_uiMaxCUDepth < 1, "MaxPartitionDepth must be greater than zero"); xConfirmPara( (m_uiMaxCUWidth >> m_uiMaxCUDepth) < 4, "Minimum partition width size should be larger than or equal to 8"); xConfirmPara( (m_uiMaxCUHeight >> m_uiMaxCUDepth) < 4, "Minimum partition height size should be larger than or equal to 8"); xConfirmPara( m_uiMaxCUWidth < 16, "Maximum partition width size should be larger than or equal to 16"); xConfirmPara( m_uiMaxCUHeight < 16, "Maximum partition height size should be larger than or equal to 16"); +#if JVET_O0640_PICTURE_SIZE_CONSTRAINT + xConfirmPara( (m_iSourceWidth % (std::max(8, int(m_uiMaxCUWidth >> (m_uiMaxCUDepth - 1))))) != 0, "Resulting coded frame width must be a multiple of Max(8, the minimum CU size)"); + xConfirmPara( (m_iSourceHeight % (std::max(8, int(m_uiMaxCUHeight >> (m_uiMaxCUDepth - 1))))) != 0, "Resulting coded frame height must be a multiple of Max(8, the minimum CU size)"); +#else xConfirmPara( (m_iSourceWidth % (m_uiMaxCUWidth >> (m_uiMaxCUDepth-1)))!=0, "Resulting coded frame width must be a multiple of the minimum CU size"); xConfirmPara( (m_iSourceHeight % (m_uiMaxCUHeight >> (m_uiMaxCUDepth-1)))!=0, "Resulting coded frame height must be a multiple of the minimum CU size"); - +#endif #if MAX_TB_SIZE_SIGNALLING xConfirmPara( m_log2MaxTbSize > 6, "Log2MaxTbSize must be 6 or smaller." 
); #endif @@ -2549,6 +2558,10 @@ bool EncAppCfg::xCheckParameter() if ( m_Affine == 0 ) { m_maxNumAffineMergeCand = m_SubPuMvpMode; +#if JVET_O0070_PROF + if (m_PROF) msg(WARNING, "PROF is forcefully disabled when Affine is off \n"); + m_PROF = false; +#endif } xConfirmPara( m_MTS < 0 || m_MTS > 3, "MTS must be greater than 0 smaller than 4" ); @@ -3359,6 +3372,9 @@ void EncAppCfg::xPrintParameter() { msg( VERBOSE, "AffineType:%d ", m_AffineType ); } +#if JVET_O0070_PROF + msg(VERBOSE, "PROF:%d ", m_PROF); +#endif msg(VERBOSE, "SubPuMvp:%d+%d ", m_SubPuMvpMode & 1, (m_SubPuMvpMode & 2) == 2); msg( VERBOSE, "DualITree:%d ", m_dualTree ); msg( VERBOSE, "IMV:%d ", m_ImvMode ); diff --git a/source/App/EncoderApp/EncAppCfg.h b/source/App/EncoderApp/EncAppCfg.h index 6a0f421b677cc2204b621467b9466285ab4e3323..afb9f1024b4376a5e85781d5fe5be468d656453b 100644 --- a/source/App/EncoderApp/EncAppCfg.h +++ b/source/App/EncoderApp/EncAppCfg.h @@ -253,6 +253,9 @@ protected: int m_SubPuMvpMode; bool m_Affine; bool m_AffineType; +#if JVET_O0070_PROF + bool m_PROF; +#endif bool m_BIO; int m_LMChroma; bool m_cclmCollocatedChromaFlag; diff --git a/source/Lib/CommonLib/AdaptiveLoopFilter.h b/source/Lib/CommonLib/AdaptiveLoopFilter.h index 42be89b58354401bf9e554e2edc844d0e997d5b8..da2c5e064614d95f4cd2e3ba49fe81a2eadc6982 100644 --- a/source/Lib/CommonLib/AdaptiveLoopFilter.h +++ b/source/Lib/CommonLib/AdaptiveLoopFilter.h @@ -115,7 +115,7 @@ protected: static const int m_classToFilterMapping[NUM_FIXED_FILTER_SETS][MAX_NUM_ALF_CLASSES]; static const int m_fixedFilterSetCoeff[ALF_FIXED_FILTER_NUM][MAX_NUM_ALF_LUMA_COEFF]; short m_fixedFilterSetCoeffDec[NUM_FIXED_FILTER_SETS][MAX_NUM_ALF_CLASSES * MAX_NUM_ALF_LUMA_COEFF]; -#if JVET_O0090_ALF_CHROMA_FILTER_ALTERNATIVES_CTB +#if JVET_O0090_ALF_CHROMA_FILTER_ALTERNATIVES_CTB || JVET_O_MAX_NUM_ALF_APS_8 short m_coeffApsLuma[ALF_CTB_MAX_NUM_APS][MAX_NUM_ALF_LUMA_COEFF * MAX_NUM_ALF_CLASSES]; short 
m_clippApsLuma[ALF_CTB_MAX_NUM_APS][MAX_NUM_ALF_LUMA_COEFF * MAX_NUM_ALF_CLASSES]; #else diff --git a/source/Lib/CommonLib/Buffer.cpp b/source/Lib/CommonLib/Buffer.cpp index 663d34dd00339d5b7f8c8c109f8feb41c280ec22..7af9b1fdf06e828e665bca22880b3f2c5230ff8f 100644 --- a/source/Lib/CommonLib/Buffer.cpp +++ b/source/Lib/CommonLib/Buffer.cpp @@ -42,6 +42,83 @@ #include "Buffer.h" #include "InterpolationFilter.h" +#if JVET_O0070_PROF +void applyPROFCore(Pel* dst, int dstStride, const Pel* src, int srcStride, int width, int height, const Pel* gradX, const Pel* gradY, int gradStride, const int* dMvX, const int* dMvY, int dMvStride, int shiftNum, Pel offset, const ClpRng& clpRng) +{ /* Single-source PROF kernel: for every sample of a width x height block, computes dI = dMvX * gradX + dMvY * gradY, halves it with rounding, adds it to src together with offset, shifts right by shiftNum, clips with ClipPel(., clpRng) and stores the result in dst. dst, src and the gradient pointers advance by their own strides once per row. */ + int idx = 0; /* running index into dMvX / dMvY: incremented once per sample and never reset at a row boundary, so the two arrays are read as width * height consecutive entries; dMvStride is not read in this function */ + const int dIshift = 1; + const int dIoffset = 1 << (dIshift - 1); + + for (int h = 0; h < height; h++) + { + for (int w = 0; w < width; w++) + { + int32_t dI = dMvX[idx] * gradX[w] + dMvY[idx] * gradY[w]; + dI = (dI + dIoffset) >> dIshift; /* add dIoffset (= 1), then shift right by dIshift (= 1) */ + + dI = (src[w] + dI + offset) >> shiftNum; /* dI is reused to hold the refined sample before clipping */ + dst[w] = (Pel)ClipPel(dI, clpRng); + + idx++; + } + gradX += gradStride; + gradY += gradStride; + dst += dstStride; + src += srcStride; + } +} + +template<bool l1PROFEnabled = true> +void applyBiPROFCore (Pel* dst, int dstStride, const Pel* src0, const Pel* src1, int srcStride, int width, int height, const Pel* gradX0, const Pel* gradY0, const Pel* gradX1, const Pel* gradY1, int gradStride, const int* dMvX0, const int* dMvY0, const int* dMvX1, const int* dMvY1, int dMvStride, const int8_t w0, const ClpRng& clpRng) +{ /* Two-source PROF kernel: src0 always receives the refinement term dI0 (from dMvX0/dMvY0 and gradX0/gradY0); src1 receives dI1 only when the template argument l1PROFEnabled is true, otherwise gradX1/gradY1/dMvX1/dMvY1 are never dereferenced. The two values are weighted with w0 and w1 = g_GbiWeightBase - w0, offset is added, and the sum goes through rightShift(., shiftNum) and ClipPel(., clpRng) into dst. src0 and src1 share srcStride; dMvStride is not read in this function. */ + int idx = 16; /* starts at 16 so that the 'idx -= 16' executed for h == 0 brings it to 0 */ + int32_t dI0 = 0; + int32_t dI1 = 0; + const int dIshift = 1; + const int dIoffset = 1 << (dIshift - 1); + + const int clipbd = clpRng.bd; + const int shiftNum = std::max<int>(2, (IF_INTERNAL_PREC - clipbd)) + g_GbiLog2WeightBase; + const int offset = (1 << (shiftNum - 1)) + (IF_INTERNAL_OFFS << g_GbiLog2WeightBase); + + const int8_t w1 = g_GbiWeightBase - w0; + + for (int h = 0; h < height; h++) + { + if (!(h & 3)) idx -= 16; + idx += 4; /* combined with the 'idx -= 4' at every 4th column in the inner loop: when width is a multiple of 4, sample (w, h) reads entry 4 * (h & 3) + (w & 3), i.e. the same 16 dMv entries are reused for every 4x4 group of samples */ + + for (int w = 0; w < 
width; w++) + { + if (!(w & 3)) idx -= 4; /* step back 4 entries at every 4th column so the same 4 entries are reused along the row */ + dI0 = dMvX0[idx] * gradX0[w] + dMvY0[idx] * gradY0[w]; + dI0 = (dI0 + dIoffset) >> dIshift; /* add dIoffset (= 1), then shift right by dIshift (= 1) */ + if (l1PROFEnabled) + { + dI1 = dMvX1[idx] * gradX1[w] + dMvY1[idx] * gradY1[w]; + dI1 = (dI1 + dIoffset) >> dIshift; + dst[w] = (Pel)ClipPel(rightShift(((src0[w] + dI0) * w0 + (src1[w] + dI1) * w1 + offset), shiftNum), clpRng); + } + else /* l1PROFEnabled is false: src1 enters the weighted sum without a dI1 term */ + dst[w] = (Pel)ClipPel(rightShift(((src0[w] + dI0) * w0 + src1[w] * w1 + offset), shiftNum), clpRng); + + idx++; + } + + gradX0 += gradStride; + gradY0 += gradStride; + if (l1PROFEnabled) + { /* the second gradient pair is only advanced when it is actually read above */ + gradX1 += gradStride; + gradY1 += gradStride; + } + dst += dstStride; + src0 += srcStride; + src1 += srcStride; + } +} +#endif + template< typename T > void addAvgCore( const T* src1, int src1Stride, const T* src2, int src2Stride, T* dest, int dstStride, int width, int height, int rshift, int offset, const ClpRng& clpRng ) { @@ -86,6 +163,9 @@ void addBIOAvgCore(const Pel* src0, int src0Stride, const Pel* src1, int src1Str } } +#if JVET_O0070_PROF +template<bool PAD = true> +#endif void gradFilterCore(Pel* pSrc, int srcStride, int width, int height, int gradStride, Pel* gradX, Pel* gradY, const int bitDepth) { Pel* srcTmp = pSrc + srcStride + 1; @@ -97,14 +177,23 @@ void gradFilterCore(Pel* pSrc, int srcStride, int width, int height, int gradStr { for (int x = 0; x < (width - 2 * BIO_EXTEND_SIZE); x++) { +#if JVET_O0570_GRAD_SIMP + gradYTmp[x] = ( srcTmp[x + srcStride] >> shift1 ) - ( srcTmp[x - srcStride] >> shift1 ); + gradXTmp[x] = ( srcTmp[x + 1] >> shift1 ) - ( srcTmp[x - 1] >> shift1 ); +#else gradYTmp[x] = (srcTmp[x + srcStride] - srcTmp[x - srcStride]) >> shift1; gradXTmp[x] = (srcTmp[x + 1] - srcTmp[x - 1]) >> shift1; +#endif } gradXTmp += gradStride; gradYTmp += gradStride; srcTmp += srcStride; } +#if JVET_O0070_PROF + if (PAD) + { +#endif gradXTmp = gradX + gradStride + 1; gradYTmp = gradY + gradStride + 1; for (int y = 0; y < (height - 2 * BIO_EXTEND_SIZE); y++) @@ -124,6 +213,9 @@ void 
gradFilterCore(Pel* pSrc, int srcStride, int width, int height, int gradStr ::memcpy(gradXTmp + (height - 2 * BIO_EXTEND_SIZE)*gradStride, gradXTmp + (height - 2 * BIO_EXTEND_SIZE - 1)*gradStride, sizeof(Pel)*(width)); ::memcpy(gradYTmp - gradStride, gradYTmp, sizeof(Pel)*(width)); ::memcpy(gradYTmp + (height - 2 * BIO_EXTEND_SIZE)*gradStride, gradYTmp + (height - 2 * BIO_EXTEND_SIZE - 1)*gradStride, sizeof(Pel)*(width)); +#if JVET_O0070_PROF + } +#endif } void calcBIOParCore(const Pel* srcY0Temp, const Pel* srcY1Temp, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0, const Pel* gradY1, int* dotProductTemp1, int* dotProductTemp2, int* dotProductTemp3, int* dotProductTemp5, int* dotProductTemp6, const int src0Stride, const int src1Stride, const int gradStride, const int widthG, const int heightG, const int bitDepth) @@ -280,6 +372,13 @@ PelBufferOps::PelBufferOps() removeHighFreq4 = removeHighFreq; #endif +#if JVET_O0070_PROF + profGradFilter = gradFilterCore <false>; + applyPROF = applyPROFCore; + applyBiPROF[1] = applyBiPROFCore; + applyBiPROF[0] = applyBiPROFCore <false>; + roundIntVector = nullptr; +#endif } PelBufferOps g_pelBufOP = PelBufferOps(); diff --git a/source/Lib/CommonLib/Buffer.h b/source/Lib/CommonLib/Buffer.h index 5c287d84f7402454d793450f58b932a65c7a0d36..b595ee53445ef1336d4684dc9090d74a1dad354f 100644 --- a/source/Lib/CommonLib/Buffer.h +++ b/source/Lib/CommonLib/Buffer.h @@ -79,6 +79,12 @@ struct PelBufferOps void ( *removeHighFreq8) ( Pel* src0, int src0Stride, const Pel* src1, int src1Stride, int width, int height); void ( *removeHighFreq4) ( Pel* src0, int src0Stride, const Pel* src1, int src1Stride, int width, int height); #endif +#if JVET_O0070_PROF + void (*profGradFilter) (Pel* pSrc, int srcStride, int width, int height, int gradStride, Pel* gradX, Pel* gradY, const int bitDepth); + void (*applyPROF) (Pel* dst, int dstStride, const Pel* src, int srcStride, int width, int height, const Pel* gradX, const Pel* gradY, int gradStride, 
const int* dMvX, const int* dMvY, int dMvStride, int shiftNum, Pel offset, const ClpRng& clpRng); + void (*applyBiPROF[2]) (Pel* dst, int dstStride, const Pel* src0, const Pel* src1, int srcStride, int width, int height, const Pel* gradX0, const Pel* gradY0, const Pel* gradX1, const Pel* gradY1, int gradStride, const int* dMvX0, const int* dMvY0, const int* dMvX1, const int* dMvY1, int dMvStride, const int8_t gbiWeightL0, const ClpRng& clpRng); + void (*roundIntVector) (int* v, int size, unsigned int nShift, const int dmvLimit); +#endif }; extern PelBufferOps g_pelBufOP; diff --git a/source/Lib/CommonLib/CodingStructure.cpp b/source/Lib/CommonLib/CodingStructure.cpp index 36d3109af203229fbbe75f7a39754ad409055632..71b0c4ce17c710cd2907479ca59f0995b40815ae 100644 --- a/source/Lib/CommonLib/CodingStructure.cpp +++ b/source/Lib/CommonLib/CodingStructure.cpp @@ -67,6 +67,12 @@ CodingStructure::CodingStructure(CUCache& cuCache, PUCache& puCache, TUCache& tu , m_cuCache ( cuCache ) , m_puCache ( puCache ) , m_tuCache ( tuCache ) +#if JVET_O0070_PROF + , bestParent ( nullptr ) +#endif +#if JVET_O1170_CHECK_BV_AT_DECODER + , resetIBCBuffer (false) +#endif { for( uint32_t i = 0; i < MAX_NUM_COMPONENT; i++ ) { @@ -1439,4 +1445,4 @@ IbcLumaCoverage CodingStructure::getIbcLumaCoverage(const CompArea& chromaArea) return coverage; } -#endif \ No newline at end of file +#endif diff --git a/source/Lib/CommonLib/CodingStructure.h b/source/Lib/CommonLib/CodingStructure.h index db56903aa1a01590686b33e8729793912e17a96e..8589a75a78e47024d8e7fdadc2982458059a6700 100644 --- a/source/Lib/CommonLib/CodingStructure.h +++ b/source/Lib/CommonLib/CodingStructure.h @@ -97,7 +97,11 @@ public: bool isLossless; const SPS *sps; const PPS *pps; +#if JVET_O_MAX_NUM_ALF_APS_8 + APS* alfApss[ALF_CTB_MAX_NUM_APS]; +#else APS* alfApss[MAX_NUM_APS]; +#endif APS * lmcsAps; const VPS *vps; const PreCalcValues* pcv; @@ -195,10 +199,6 @@ public: LutMotionCand motionLut; -#if JVET_O1170_CHECK_BV_AT_DECODER - 
bool resetIBCBuffer; -#endif - void addMiToLut(static_vector<MotionInfo, MAX_NUM_HMVP_CANDS>& lut, const MotionInfo &mi); private: @@ -234,7 +234,13 @@ private: MotionInfo *m_motionBuf; public: - +#if JVET_O0070_PROF + CodingStructure *bestParent; +#endif +#if JVET_O1170_CHECK_BV_AT_DECODER + bool resetIBCBuffer; +#endif + MotionBuf getMotionBuf( const Area& _area ); MotionBuf getMotionBuf( const UnitArea& _area ) { return getMotionBuf( _area.Y() ); } MotionBuf getMotionBuf() { return getMotionBuf( area.Y() ); } diff --git a/source/Lib/CommonLib/CommonDef.h b/source/Lib/CommonLib/CommonDef.h index d7347880934615c86d8d58ed3bdaaa8c9f2a0cb8..15f3f0650b406e841b62af05b841fec9d09baee7 100644 --- a/source/Lib/CommonLib/CommonDef.h +++ b/source/Lib/CommonLib/CommonDef.h @@ -190,7 +190,11 @@ static const int MAX_NUM_ALF_COEFF = MAX_ALF_FILTE static const int MAX_ALF_PADDING_SIZE = 4; static const int ALF_FIXED_FILTER_NUM = 64; +#if JVET_O_MAX_NUM_ALF_APS_8 +static const int ALF_CTB_MAX_NUM_APS = 8; +#else static const int ALF_CTB_MAX_NUM_APS = 6; +#endif static const int NUM_FIXED_FILTER_SETS = 16; static const int NUM_TOTAL_FILTER_SETS = NUM_FIXED_FILTER_SETS + ALF_CTB_MAX_NUM_APS; @@ -371,6 +375,10 @@ static const int MAX_NUM_GT2_BINS_2x2SUBBLOCK = 2; ///< max static const int BIO_EXTEND_SIZE = 1; static const int BIO_TEMP_BUFFER_SIZE = (MAX_CU_SIZE + 2 * BIO_EXTEND_SIZE) * (MAX_CU_SIZE + 2 * BIO_EXTEND_SIZE); +#if JVET_O0070_PROF +static const int PROF_BORDER_EXT_W = 1; +static const int PROF_BORDER_EXT_H = 1; +#endif static const int GBI_NUM = 5; ///< the number of weight options static const int GBI_DEFAULT = ((uint8_t)(GBI_NUM >> 1)); ///< Default weighting index representing for w=0.5 static const int GBI_SIZE_CONSTRAINT = 256; ///< disabling GBi if cu size is smaller than 256 diff --git a/source/Lib/CommonLib/ContextModelling.h b/source/Lib/CommonLib/ContextModelling.h index ba1335b4bac1eb3d09ad3ff929f6bff71dfcb79b..af67152c7a99756347c8692c9679de86f34f3946 100644 
--- a/source/Lib/CommonLib/ContextModelling.h +++ b/source/Lib/CommonLib/ContextModelling.h @@ -129,11 +129,23 @@ public: } } #undef UPDATE + + +#if JVET_O0617_SIG_FLAG_CONTEXT_REDUCTION + int ctxOfs = std::min((sumAbs+1)>>1, 3) + ( diag < 2 ? 4 : 0 ); +#else int ctxOfs = std::min( sumAbs, 5 ) + ( diag < 2 ? 6 : 0 ); +#endif + if( m_chType == CHANNEL_TYPE_LUMA ) { +#if JVET_O0617_SIG_FLAG_CONTEXT_REDUCTION + ctxOfs += diag < 5 ? 4 : 0; +#else ctxOfs += diag < 5 ? 6 : 0; +#endif } + m_tmplCpDiag = diag; m_tmplCpSum1 = sumAbs - numPos; return m_sigFlagCtxSet[std::max( 0, state-1 )]( ctxOfs ); diff --git a/source/Lib/CommonLib/Contexts.cpp b/source/Lib/CommonLib/Contexts.cpp index d1d3c9cff6db554e56b8cd59c9f550aac8c7dea5..8903da8fae4c4653fdab9fbff1ff9bb953a7b9ad 100755 --- a/source/Lib/CommonLib/Contexts.cpp +++ b/source/Lib/CommonLib/Contexts.cpp @@ -538,6 +538,50 @@ const CtxSet ContextSetCfg::SigCoeffGroup[] = const CtxSet ContextSetCfg::SigFlag[] = { +#if JVET_O0617_SIG_FLAG_CONTEXT_REDUCTION + ContextSetCfg::addCtxSet + ({ + { 88, 166, 182, 169, 101, 167, 168, 155, 194, 213, 183, 156, }, + { 132, 152, 168, 140, 177, 182, 154, 155, 151, 213, 169, 156, }, + { 89, 138, 139, 140, 150, 139, 140, 141, 138, 185, 141, 157, }, + { 12, 9, 9, 10, 9, 9, 9, 9, 8, 8, 8, 10, }, + }), + ContextSetCfg::addCtxSet + ({ + { 27, 167, 168, 140, 180, 199, 214, 186, }, + { 133, 138, 139, 140, 181, 214, 200, 157, }, + { 134, 153, 154, 155, 167, 186, 186, 143, }, + { 9, 9, 9, 13, 5, 5, 8, 9, }, + }), + ContextSetCfg::addCtxSet + ({ + { 152, 156, 186, 202, 182, 249, 247, 207, 182, 223, 223, 223, }, + { 123, 142, 172, 218, 138, 250, 248, 223, 125, 223, 223, 223, }, + { 93, 142, 143, 175, 153, 223, 223, 238, 154, 223, 223, 223, }, + { 9, 12, 8, 8, 8, 8, 8, 5, 8, 0, 0, 0, }, + }), + ContextSetCfg::addCtxSet + ({ + { 182, 171, 143, 190, 183, 223, 223, 223, }, + { 168, 156, 216, 249, 169, 223, 223, 223, }, + { 138, 173, 157, 223, 170, 223, 223, 223, }, + { 8, 12, 8, 8, 4, 0, 0, 0, }, + }), + 
ContextSetCfg::addCtxSet + ({ + { 123, 175, 223, 223, 212, 223, 223, 223, 0, 223, 223, 223, }, + { 123, 223, 205, 223, 138, 223, 223, 223, 196, 223, 223, 223, }, + { 107, 206, 223, 223, 93, 223, 223, 238, 55, 223, 223, 223, }, + { 8, 8, 8, 8, 8, 0, 4, 4, 0, 0, 0, 0, }, + }), + ContextSetCfg::addCtxSet + ({ + { 167, 187, 249, 207, 181, 223, 223, 223, }, + { 167, 157, 191, 223, 152, 223, 223, 223, }, + { 152, 236, 223, 223, 123, 223, 223, 223, }, + { 8, 8, 8, 8, 4, 0, 0, 0, }, + }), +#else ContextSetCfg::addCtxSet ({ { 88, 166, 152, 182, 168, 154, 116, 167, 182, 168, 183, 155, 208, 213, 183, 183, 169, 185, }, @@ -580,6 +624,7 @@ const CtxSet ContextSetCfg::SigFlag[] = { 137, 250, 223, 237, 234, 223, 123, 223, 223, 223, 223, 223, }, { 8, 8, 1, 8, 8, 8, 4, 0, 0, 0, 0, 0, }, }) +#endif }; const CtxSet ContextSetCfg::ParFlag[] = diff --git a/source/Lib/CommonLib/DepQuant.cpp b/source/Lib/CommonLib/DepQuant.cpp index 152fe036d39b71d6d30e1c68b7ba0a52d2e9cd90..0e97c92f19bd72f750b604bd0b7ecc65f1676f1d 100644 --- a/source/Lib/CommonLib/DepQuant.cpp +++ b/source/Lib/CommonLib/DepQuant.cpp @@ -401,12 +401,20 @@ namespace DQIntern const int diag = m_scanId2BlkPos[nextScanIdx].x + m_scanId2BlkPos[nextScanIdx].y; if( m_chType == CHANNEL_TYPE_LUMA ) { +#if JVET_O0617_SIG_FLAG_CONTEXT_REDUCTION + scanInfo.sigCtxOffsetNext = ( diag < 2 ? 8 : diag < 5 ? 4 : 0 ); +#else scanInfo.sigCtxOffsetNext = ( diag < 2 ? 12 : diag < 5 ? 6 : 0 ); +#endif scanInfo.gtxCtxOffsetNext = ( diag < 1 ? 16 : diag < 3 ? 11 : diag < 10 ? 6 : 1 ); } else { +#if JVET_O0617_SIG_FLAG_CONTEXT_REDUCTION + scanInfo.sigCtxOffsetNext = ( diag < 2 ? 4 : 0 ); +#else scanInfo.sigCtxOffsetNext = ( diag < 2 ? 6 : 0 ); +#endif scanInfo.gtxCtxOffsetNext = ( diag < 1 ? 
6 : 1 ); } scanInfo.nextInsidePos = nextScanIdx & m_sbbMask; @@ -452,7 +460,11 @@ namespace DQIntern static const unsigned sm_numCtxSetsSig = 3; static const unsigned sm_numCtxSetsGtx = 2; static const unsigned sm_maxNumSigSbbCtx = 2; +#if JVET_O0617_SIG_FLAG_CONTEXT_REDUCTION + static const unsigned sm_maxNumSigCtx = 12; +#else static const unsigned sm_maxNumSigCtx = 18; +#endif static const unsigned sm_maxNumGtxCtx = 21; private: @@ -570,7 +582,11 @@ namespace DQIntern { BinFracBits* bits = m_sigFracBits [ ctxSetId ]; const CtxSet& ctxSet = Ctx::SigFlag [ chType + 2*ctxSetId ]; +#if JVET_O0617_SIG_FLAG_CONTEXT_REDUCTION + const unsigned numCtx = ( chType == CHANNEL_TYPE_LUMA ? 12 : 8 ); +#else const unsigned numCtx = ( chType == CHANNEL_TYPE_LUMA ? 18 : 12 ); +#endif for( unsigned ctxId = 0; ctxId < numCtx; ctxId++ ) { bits[ ctxId ] = fracBitsAccess.getFracBitsArray( ctxSet( ctxId ) ); @@ -679,7 +695,11 @@ namespace DQIntern { CHECKD( lambda <= 0.0, "Lambda must be greater than 0" ); +#if JVET_O0919_TS_MIN_QP + const int qpDQ = cQP.Qp(tu.mtsIdx==MTS_SKIP && isLuma(compID)) + 1; +#else const int qpDQ = cQP.Qp + 1; +#endif const int qpPer = qpDQ / 6; const int qpRem = qpDQ - 6 * qpPer; const SPS& sps = *tu.cs->sps; @@ -748,7 +768,11 @@ namespace DQIntern } //----- set dequant parameters ----- +#if JVET_O0919_TS_MIN_QP + const int qpDQ = cQP.Qp(tu.mtsIdx==MTS_SKIP && isLuma(compID)) + 1; +#else const int qpDQ = cQP.Qp + 1; +#endif const int qpPer = qpDQ / 6; const int qpRem = qpDQ - 6 * qpPer; const SPS& sps = *tu.cs->sps; @@ -1163,7 +1187,11 @@ namespace DQIntern } #undef UPDATE TCoeff sumGt1 = sumAbs1 - sumNum; +#if JVET_O0617_SIG_FLAG_CONTEXT_REDUCTION + m_sigFracBits = m_sigFracBitsArray[scanInfo.sigCtxOffsetNext + std::min( (sumAbs1+1)>>1, 3 )]; +#else m_sigFracBits = m_sigFracBitsArray[scanInfo.sigCtxOffsetNext + (sumAbs1 < 5 ? sumAbs1 : 5)]; +#endif m_coeffFracBits = m_gtxFracBitsArray[scanInfo.gtxCtxOffsetNext + (sumGt1 < 4 ? 
sumGt1 : 4)]; TCoeff sumAbs = m_absLevelsAndCtxInit[8 + scanInfo.nextInsidePos] >> 8; @@ -1277,7 +1305,11 @@ namespace DQIntern TCoeff sumNum = tinit & 7; TCoeff sumAbs1 = ( tinit >> 3 ) & 31; TCoeff sumGt1 = sumAbs1 - sumNum; +#if JVET_O0617_SIG_FLAG_CONTEXT_REDUCTION + m_sigFracBits = m_sigFracBitsArray[ scanInfo.sigCtxOffsetNext + std::min( (sumAbs1+1)>>1, 3 ) ]; +#else m_sigFracBits = m_sigFracBitsArray[ scanInfo.sigCtxOffsetNext + ( sumAbs1 < 5 ? sumAbs1 : 5 ) ]; +#endif m_coeffFracBits = m_gtxFracBitsArray[ scanInfo.gtxCtxOffsetNext + ( sumGt1 < 4 ? sumGt1 : 4 ) ]; } } @@ -1721,7 +1753,11 @@ void DepQuant::quant( TransformUnit &tu, const ComponentID &compID, const CCoeff if( tu.cs->slice->getDepQuantEnabledFlag() && (tu.mtsIdx != MTS_SKIP || !isLuma(compID)) ) { //===== scaling matrix ==== +#if JVET_O0919_TS_MIN_QP + const int qpDQ = cQP.Qp(tu.mtsIdx==MTS_SKIP && isLuma(compID)) + 1; +#else const int qpDQ = cQP.Qp + 1; +#endif const int qpPer = qpDQ / 6; const int qpRem = qpDQ - 6 * qpPer; const CompArea &rect = tu.blocks[compID]; @@ -1744,7 +1780,11 @@ void DepQuant::dequant( const TransformUnit &tu, CoeffBuf &dstCoeff, const Compo { if( tu.cs->slice->getDepQuantEnabledFlag() && (tu.mtsIdx != MTS_SKIP || !isLuma(compID)) ) { +#if JVET_O0919_TS_MIN_QP + const int qpDQ = cQP.Qp(tu.mtsIdx==MTS_SKIP && isLuma(compID)) + 1; +#else const int qpDQ = cQP.Qp + 1; +#endif const int qpPer = qpDQ / 6; const int qpRem = qpDQ - 6 * qpPer; const CompArea &rect = tu.blocks[compID]; diff --git a/source/Lib/CommonLib/InterPrediction.cpp b/source/Lib/CommonLib/InterPrediction.cpp index 0a9d1d834c67eb7174508a52d17c33343864e620..ae31a9a74c9bc98dc72441fd5c763ff6798cace7 100644 --- a/source/Lib/CommonLib/InterPrediction.cpp +++ b/source/Lib/CommonLib/InterPrediction.cpp @@ -57,6 +57,11 @@ InterPrediction::InterPrediction() , m_maxCompIDToPred ( MAX_NUM_COMPONENT ) , m_pcRdCost ( nullptr ) , m_storedMv ( nullptr ) +#if JVET_O0070_PROF +, m_skipPROF (false) +, m_encOnly (false) +, 
m_isBi (false) +#endif , m_gradX0(nullptr) , m_gradY0(nullptr) , m_gradX1(nullptr) @@ -471,6 +476,9 @@ void InterPrediction::xPredInterUni(const PredictionUnit& pu, const RefPicList& if ( pu.cu->affine ) { CHECK( bioApplied, "BIO is not allowed with affine" ); +#if JVET_O0070_PROF + m_iRefListIdx = eRefPicList; +#endif xPredAffineBlk( compID, pu, pu.cu->slice->getRefPic( eRefPicList, iRefIdx ), mv, pcYuvPred, bi, pu.cu->slice->clpRng( compID ) ); } else @@ -527,7 +535,13 @@ void InterPrediction::xPredInterBi(PredictionUnit& pu, PelUnitBuf &pcYuvPred) if (biocheck0 && biocheck1 && PU::isBiPredFromDifferentDir(pu) +#if JVET_O0634_BDOF_SIZE_CONSTRAINT + && (pu.Y().height >= 8) + && (pu.Y().width >= 8) + && ((pu.Y().height * pu.Y().width) >= 128) +#else && pu.Y().height != 4 +#endif ) { bioApplied = true; @@ -915,13 +929,99 @@ void InterPrediction::xPredAffineBlk( const ComponentID& compID, const Predictio const int iVerMax = ( sps.getPicHeightInLumaSamples() + iOffset - pu.Y().y - 1 ) << iMvShift; const int iVerMin = ( -(int)pu.cs->pcv->maxCUHeight - iOffset - (int)pu.Y().y + 1 ) << iMvShift; +#if !JVET_O0070_PROF PelBuf tmpBuf = PelBuf(m_filteredBlockTmp[0][compID], pu.blocks[compID]); +#endif const int vFilterSize = isLuma(compID) ? NTAPS_LUMA : NTAPS_CHROMA; const int shift = iBit - 4 + MV_FRACTIONAL_BITS_INTERNAL; bool wrapRef = false; const bool subblkMVSpreadOverLimit = isSubblockVectorSpreadOverLimit( iDMvHorX, iDMvHorY, iDMvVerX, iDMvVerY, pu.interDir ); +#if JVET_O0070_PROF + bool enablePROF = (sps.getUsePROF()) && (!m_skipPROF) && (compID == COMPONENT_Y); + enablePROF &= !((pu.cu->affineType == AFFINEMODEL_6PARAM && _mv[0] == _mv[1] && _mv[0] == _mv[2]) || (pu.cu->affineType == AFFINEMODEL_4PARAM && _mv[0] == _mv[1])); + enablePROF &= !subblkMVSpreadOverLimit; + const int profThres = 1 << (iBit + (m_isBi ? 
1 : 0)); + enablePROF &= !m_encOnly || pu.cu->slice->getCheckLDC() || iDMvHorX > profThres || iDMvHorY > profThres || iDMvVerX > profThres || iDMvVerY > profThres || iDMvHorX < -profThres || iDMvHorY < -profThres || iDMvVerX < -profThres || iDMvVerY < -profThres; + + if (compID == COMPONENT_Y) + { + m_applyPROF[m_iRefListIdx] = enablePROF; + } + + bool isLast = enablePROF ? false : !bi; + + const int cuExtW = pu.blocks[compID].width + PROF_BORDER_EXT_W * 2; + const int cuExtH = pu.blocks[compID].height + PROF_BORDER_EXT_H * 2; + + PelBuf gradXExt(m_gradBuf[m_iRefListIdx][0], cuExtW, cuExtH); + PelBuf gradYExt(m_gradBuf[m_iRefListIdx][1], cuExtW, cuExtH); + + const int MAX_FILTER_SIZE = std::max<int>(NTAPS_LUMA, NTAPS_CHROMA); + const int dstExtW = ((blockWidth + PROF_BORDER_EXT_W * 2 + 7) >> 3) << 3; + const int dstExtH = blockHeight + PROF_BORDER_EXT_H * 2; + PelBuf dstExtBuf(m_filteredBlockTmp[1][compID], dstExtW, dstExtH); + + const int refExtH = dstExtH + MAX_FILTER_SIZE - 1; + PelBuf tmpBuf = PelBuf(m_filteredBlockTmp[0][compID], dstExtW, refExtH); + + PelBuf &dstBuf = dstPic.bufs[compID]; + + int *dMvScaleHor = m_dMvBuf[m_iRefListIdx]; + int *dMvScaleVer = m_dMvBuf[m_iRefListIdx] + 16; + + if (enablePROF && !bi) + { + int* dMvH = dMvScaleHor; + int* dMvV = dMvScaleVer; + int quadHorX = iDMvHorX << 2; + int quadHorY = iDMvHorY << 2; + int quadVerX = iDMvVerX << 2; + int quadVerY = iDMvVerY << 2; + + dMvH[0] = ((iDMvHorX + iDMvVerX) << 1) - ((quadHorX + quadVerX) << 1); + dMvV[0] = ((iDMvHorY + iDMvVerY) << 1) - ((quadHorY + quadVerY) << 1); + + for (int w = 1; w < blockWidth; w++) + { + dMvH[w] = dMvH[w - 1] + quadHorX; + dMvV[w] = dMvV[w - 1] + quadHorY; + } + + dMvH += blockWidth; + dMvV += blockWidth; + for (int h = 1; h < blockHeight; h++) + { + for (int w = 0; w < blockWidth; w++) + { + dMvH[w] = dMvH[w - blockWidth] + quadVerX; + dMvV[w] = dMvV[w - blockWidth] + quadVerY; + } + dMvH += blockWidth; + dMvV += blockWidth; + } + + const int bdlimit = 
std::max<int>(6, clpRng.bd - 6); + const int dmvLimit = 1 << bdlimit; + + if (!g_pelBufOP.roundIntVector) + { + for (int idx = 0; idx < blockWidth * blockHeight; idx++) + { + roundAffineMv(dMvScaleHor[idx], dMvScaleVer[idx], shift); + dMvScaleHor[idx] = Clip3(-dmvLimit, dmvLimit - 1, dMvScaleHor[idx]); + dMvScaleVer[idx] = Clip3(-dmvLimit, dmvLimit - 1, dMvScaleVer[idx]); + } + } + else + { + int sz = blockWidth * blockHeight; + g_pelBufOP.roundIntVector(dMvScaleHor, sz, shift, dmvLimit); + g_pelBufOP.roundIntVector(dMvScaleVer, sz, shift, dmvLimit); + } + } +#endif // get prediction block by block for ( int h = 0; h < cxHeight; h += blockHeight ) { @@ -1007,23 +1107,108 @@ void InterPrediction::xPredAffineBlk( const ComponentID& compID, const Predictio } const CPelBuf refBuf = refPic->getRecoBuf( CompArea( compID, chFmt, pu.blocks[compID].offset(xInt + w, yInt + h), pu.blocks[compID] ), wrapRef ); +#if !JVET_O0070_PROF PelBuf &dstBuf = dstPic.bufs[compID]; +#endif + +#if JVET_O0070_PROF + Pel* ref = (Pel*) refBuf.buf; + Pel* dst = dstBuf.buf + w + h * dstBuf.stride; + + int refStride = refBuf.stride; + int dstStride = dstBuf.stride; + + int bw = blockWidth; + int bh = blockHeight; + + if (enablePROF) + { + dst = dstExtBuf.bufAt(PROF_BORDER_EXT_W, PROF_BORDER_EXT_H); + dstStride = dstExtBuf.stride; + } +#endif if ( yFrac == 0 ) { +#if JVET_O0070_PROF + m_if.filterHor( compID, (Pel*) ref, refStride, dst, dstStride, bw, bh, xFrac, isLast, chFmt, clpRng); +#else m_if.filterHor( compID, (Pel*) refBuf.buf, refBuf.stride, dstBuf.buf + w + h * dstBuf.stride, dstBuf.stride, blockWidth, blockHeight, xFrac, !bi, chFmt, clpRng ); +#endif } else if ( xFrac == 0 ) { +#if JVET_O0070_PROF + m_if.filterVer( compID, (Pel*) ref, refStride, dst, dstStride, bw, bh, yFrac, true, isLast, chFmt, clpRng); +#else m_if.filterVer( compID, (Pel*) refBuf.buf, refBuf.stride, dstBuf.buf + w + h * dstBuf.stride, dstBuf.stride, blockWidth, blockHeight, yFrac, true, !bi, chFmt, clpRng ); +#endif } 
else { +#if JVET_O0070_PROF + m_if.filterHor( compID, (Pel*)ref - ((vFilterSize>>1) -1)*refStride, refStride, tmpBuf.buf, tmpBuf.stride, bw, bh+vFilterSize-1, xFrac, false, chFmt, clpRng); +#else m_if.filterHor( compID, (Pel*) refBuf.buf - ((vFilterSize>>1) -1)*refBuf.stride, refBuf.stride, tmpBuf.buf, tmpBuf.stride, blockWidth, blockHeight+vFilterSize-1, xFrac, false, chFmt, clpRng); +#endif JVET_J0090_SET_CACHE_ENABLE( false ); +#if JVET_O0070_PROF + m_if.filterVer( compID, tmpBuf.buf + ((vFilterSize>>1) -1)*tmpBuf.stride, tmpBuf.stride, dst, dstStride, bw, bh, yFrac, false, isLast, chFmt, clpRng); +#else m_if.filterVer( compID, tmpBuf.buf + ((vFilterSize>>1) -1)*tmpBuf.stride, tmpBuf.stride, dstBuf.buf + w + h * dstBuf.stride, dstBuf.stride, blockWidth, blockHeight, yFrac, false, !bi, chFmt, clpRng); +#endif JVET_J0090_SET_CACHE_ENABLE( true ); } +#if JVET_O0070_PROF + if (enablePROF) + { + const int shift = std::max<int>(2, (IF_INTERNAL_PREC - clpRng.bd)); + const int xOffset = xFrac >> 3; + const int yOffset = yFrac >> 3; + + const int refOffset = (blockHeight + 1) * refStride; + const int dstOffset = (blockHeight + 1)* dstStride; + + const Pel* refPel = ref - (1 - yOffset) * refStride + xOffset - 1; + Pel* dstPel = dst - dstStride - 1; + for (int pw = 0; pw < blockWidth + 2; pw++) + { + dstPel[pw] = leftShift_round(refPel[pw], shift) - (Pel)IF_INTERNAL_OFFS; + dstPel[pw+dstOffset] = leftShift_round(refPel[pw+refOffset], shift) - (Pel)IF_INTERNAL_OFFS; + } + + refPel = ref + yOffset * refBuf.stride + xOffset; + dstPel = dst; + for (int ph = 0; ph < blockHeight; ph++, refPel += refStride, dstPel += dstStride) + { + dstPel[-1] = leftShift_round(refPel[-1], shift) - (Pel)IF_INTERNAL_OFFS; + dstPel[blockWidth] = leftShift_round(refPel[blockWidth], shift) - (Pel)IF_INTERNAL_OFFS; + } + + PelBuf gradXBuf = gradXExt.subBuf(w, h, blockWidth + 2, blockHeight + 2); + PelBuf gradYBuf = gradYExt.subBuf(w, h, blockWidth + 2, blockHeight + 2); + 
g_pelBufOP.profGradFilter(dstExtBuf.buf, dstExtBuf.stride, blockWidth + 2, blockHeight + 2, gradXBuf.stride, gradXBuf.buf, gradYBuf.buf, clpRng.bd); + + const int shiftNum = std::max<int>(2, (IF_INTERNAL_PREC - clpRng.bd)); + const Pel offset = (1 << (shiftNum - 1)) + IF_INTERNAL_OFFS; + Pel* src = dstExtBuf.bufAt(PROF_BORDER_EXT_W, PROF_BORDER_EXT_H); + Pel* gX = gradXBuf.bufAt(PROF_BORDER_EXT_W, PROF_BORDER_EXT_H); + Pel* gY = gradYBuf.bufAt(PROF_BORDER_EXT_W, PROF_BORDER_EXT_H); + + Pel * dstY = dstBuf.bufAt(w, h); + + if (!bi) + { + g_pelBufOP.applyPROF(dstY, dstBuf.stride, src, dstExtBuf.stride, blockWidth, blockHeight, gX, gY, gradXBuf.stride, dMvScaleHor, dMvScaleVer, blockWidth, shiftNum, offset, clpRng); + } + else + { + PelBuf srcExtBuf(src, dstExtBuf.stride, Size(blockWidth, blockHeight)); + PelBuf destBuf(dstY, dstBuf.stride, Size(blockWidth, blockHeight)); + destBuf.copyFrom(srcExtBuf); + } + } +#endif } } } @@ -1231,6 +1416,17 @@ void InterPrediction::xWeightedAverage(const PredictionUnit& pu, const CPelUnitB if( iRefIdx0 >= 0 && iRefIdx1 >= 0 ) { +#if JVET_O0070_PROF + if (pu.cu->affine && (m_applyPROF[0] || m_applyPROF[1])) + { + xApplyBiPROF(pu, pcYuvSrc0.bufs[COMPONENT_Y], pcYuvSrc1.bufs[COMPONENT_Y], pcYuvDst.bufs[COMPONENT_Y], clpRngs.comp[COMPONENT_Y]); + pcYuvDst.addWeightedAvg(pcYuvSrc0, pcYuvSrc1, clpRngs, pu.cu->GBiIdx, true); +#if JVET_O0108_DIS_DMVR_BDOF_CIIP + CHECK(yuvDstTmp, "yuvDstTmp is disallowed with PROF"); +#endif + return; + } +#endif #if JVET_O0681_DIS_BPWA_CIIP if( pu.cu->GBiIdx != GBI_DEFAULT && (yuvDstTmp || !pu.mhIntraFlag) ) #else @@ -1337,6 +1533,124 @@ void InterPrediction::xWeightedAverage(const PredictionUnit& pu, const CPelUnitB } } +#if JVET_O0070_PROF +void InterPrediction::xApplyBiPROF(const PredictionUnit &pu, const CPelBuf& pcYuvSrc0, const CPelBuf& pcYuvSrc1, PelBuf& pcYuvDst, const ClpRng& clpRng) +{ + int blockWidth = AFFINE_MIN_BLOCK_SIZE; + int blockHeight = AFFINE_MIN_BLOCK_SIZE; + + CHECK(!m_applyPROF[0] 
&& !m_applyPROF[1], "xApplyBiPROF() applies PROF for at least one list."); + const int width = pu.Y().width; + const int height = pu.Y().height; + + const int bit = MAX_CU_DEPTH; + const int shift = bit - 4 + MV_FRACTIONAL_BITS_INTERNAL; + const int bdlimit = std::max<int>(6, clpRng.bd - 6); + const int dmvLimit = 1 << bdlimit; + + for (int list = 0; list < 2; list++) + { + if (m_applyPROF[list]) + { + Mv mvLT = pu.mvAffi[list][0]; + Mv mvRT = pu.mvAffi[list][1]; + Mv mvLB = pu.mvAffi[list][2]; + + int dMvHorX, dMvHorY, dMvVerX, dMvVerY; + dMvHorX = (mvRT - mvLT).getHor() << (bit - g_aucLog2[width]); + dMvHorY = (mvRT - mvLT).getVer() << (bit - g_aucLog2[width]); + if (pu.cu->affineType == AFFINEMODEL_6PARAM) + { + dMvVerX = (mvLB - mvLT).getHor() << (bit - g_aucLog2[height]); + dMvVerY = (mvLB - mvLT).getVer() << (bit - g_aucLog2[height]); + } + else + { + dMvVerX = -dMvHorY; + dMvVerY = dMvHorX; + } + + int *dMvScaleHor = m_dMvBuf[list]; + int *dMvScaleVer = m_dMvBuf[list] + 16; + + int* dMvH = dMvScaleHor; + int* dMvV = dMvScaleVer; + int quadHorX = dMvHorX << 2; + int quadHorY = dMvHorY << 2; + int quadVerX = dMvVerX << 2; + int quadVerY = dMvVerY << 2; + + dMvH[0] = ((dMvHorX + dMvVerX) << 1) - ((quadHorX + quadVerX) << 1); + dMvV[0] = ((dMvHorY + dMvVerY) << 1) - ((quadHorY + quadVerY) << 1); + + for (int w = 1; w < blockWidth; w++) + { + dMvH[w] = dMvH[w - 1] + quadHorX; + dMvV[w] = dMvV[w - 1] + quadHorY; + } + + dMvH += blockWidth; + dMvV += blockWidth; + for (int h = 1; h < blockHeight; h++) + { + for (int w = 0; w < blockWidth; w++) + { + dMvH[w] = dMvH[w - blockWidth] + quadVerX; + dMvV[w] = dMvV[w - blockWidth] + quadVerY; + } + dMvH += blockWidth; + dMvV += blockWidth; + } + + if (!g_pelBufOP.roundIntVector) + { + for (int idx = 0; idx < blockWidth * blockHeight; idx++) + { + roundAffineMv(dMvScaleHor[idx], dMvScaleVer[idx], shift); + dMvScaleHor[idx] = Clip3(-dmvLimit, dmvLimit - 1, dMvScaleHor[idx]); + dMvScaleVer[idx] = Clip3(-dmvLimit, dmvLimit - 
1, dMvScaleVer[idx]); + } + } + else + { + int sz = blockWidth * blockHeight; + g_pelBufOP.roundIntVector(dMvScaleHor, sz, shift, dmvLimit); + g_pelBufOP.roundIntVector(dMvScaleVer, sz, shift, dmvLimit); + } + } + } + + const int cuExtW = width + PROF_BORDER_EXT_W * 2; + const int cuExtH = height + PROF_BORDER_EXT_H * 2; + + PelBuf gradXExt0 = PelBuf(m_gradBuf[REF_PIC_LIST_0][0], cuExtW, cuExtH); + PelBuf gradYExt0 = PelBuf(m_gradBuf[REF_PIC_LIST_0][1], cuExtW, cuExtH); + PelBuf gradXExt1 = PelBuf(m_gradBuf[REF_PIC_LIST_1][0], cuExtW, cuExtH); + PelBuf gradYExt1 = PelBuf(m_gradBuf[REF_PIC_LIST_1][1], cuExtW, cuExtH); + + Pel* gX0 = gradXExt0.bufAt(PROF_BORDER_EXT_W, PROF_BORDER_EXT_H); + Pel* gY0 = gradYExt0.bufAt(PROF_BORDER_EXT_W, PROF_BORDER_EXT_H); + Pel* gX1 = gradXExt1.bufAt(PROF_BORDER_EXT_W, PROF_BORDER_EXT_H); + Pel* gY1 = gradYExt1.bufAt(PROF_BORDER_EXT_W, PROF_BORDER_EXT_H); + + int *dMvX0 = m_dMvBuf[REF_PIC_LIST_0]; + int *dMvY0 = m_dMvBuf[REF_PIC_LIST_0] + 16; + int *dMvX1 = m_dMvBuf[REF_PIC_LIST_1]; + int *dMvY1 = m_dMvBuf[REF_PIC_LIST_1] + 16; + + const Pel* srcY0 = pcYuvSrc0.bufAt(0, 0); + const Pel* srcY1 = pcYuvSrc1.bufAt(0, 0); + Pel* dstY = pcYuvDst.bufAt(0, 0); + + if(m_applyPROF[0] && m_applyPROF[1]) + g_pelBufOP.applyBiPROF[1](dstY, pcYuvDst.stride, srcY0, srcY1, pcYuvSrc0.stride, width, height, gX0, gY0, gX1, gY1, gradXExt0.stride, dMvX0, dMvY0, dMvX1, dMvY1, blockWidth, getGbiWeight(pu.cu->GBiIdx, REF_PIC_LIST_0), clpRng); + else if (m_applyPROF[0]) + g_pelBufOP.applyBiPROF[0](dstY, pcYuvDst.stride, srcY0, srcY1, pcYuvSrc0.stride, width, height, gX0, gY0, gX1, gY1, gradXExt0.stride, dMvX0, dMvY0, dMvX1, dMvY1, blockWidth, getGbiWeight(pu.cu->GBiIdx, REF_PIC_LIST_0), clpRng); + else + g_pelBufOP.applyBiPROF[0](dstY, pcYuvDst.stride, srcY1, srcY0, pcYuvSrc0.stride, width, height, gX1, gY1, gX0, gY0, gradXExt0.stride, dMvX1, dMvY1, dMvX0, dMvY0, blockWidth, getGbiWeight(pu.cu->GBiIdx, REF_PIC_LIST_1), clpRng); +} +#endif + void 
InterPrediction::motionCompensation( PredictionUnit &pu, PelUnitBuf &predBuf, const RefPicList &eRefPicList , const bool luma, const bool chroma #if JVET_O0108_DIS_DMVR_BDOF_CIIP @@ -1440,7 +1754,13 @@ void InterPrediction::motionCompensation( PredictionUnit &pu, PelUnitBuf &predBu if (biocheck0 && biocheck1 && PU::isBiPredFromDifferentDir(pu) +#if JVET_O0634_BDOF_SIZE_CONSTRAINT + && (pu.Y().height >= 8) + && (pu.Y().width >= 8) + && ((pu.Y().height * pu.Y().width) >= 128) +#else && pu.Y().height != 4 +#endif ) { bioApplied = true; diff --git a/source/Lib/CommonLib/InterPrediction.h b/source/Lib/CommonLib/InterPrediction.h index a6bcd6939dbfded81abc982e5b2036e825746148..fc9703d303aa1dae42bdc7ec7cddd1736aac2594 100644 --- a/source/Lib/CommonLib/InterPrediction.h +++ b/source/Lib/CommonLib/InterPrediction.h @@ -108,6 +108,15 @@ protected: Mv(-2, 2), Mv(-1, 2), Mv(0, 2), Mv(1, 2), Mv(2, 2) }; uint64_t m_SADsArray[((2 * DMVR_NUM_ITERATION) + 1) * ((2 * DMVR_NUM_ITERATION) + 1)]; +#if JVET_O0070_PROF + Pel m_gradBuf[2][2][(MAX_CU_SIZE + 2) * (MAX_CU_SIZE + 2)]; + int m_dMvBuf[2][16 * 2]; + bool m_applyPROF[2]; + bool m_skipPROF; + bool m_encOnly; + bool m_isBi; +#endif + Pel* m_gradX0; Pel* m_gradY0; Pel* m_gradX1; @@ -149,6 +158,9 @@ protected: void xWeightedAverage ( const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths, const ClpRngs& clpRngs, const bool& bioApplied, PelUnitBuf* yuvDstTmp = NULL ); #else void xWeightedAverage ( const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths, const ClpRngs& clpRngs, const bool& bioApplied ); +#endif +#if JVET_O0070_PROF + void xApplyBiPROF (const PredictionUnit& pu, const CPelBuf& pcYuvSrc0, const CPelBuf& pcYuvSrc1, PelBuf& pcYuvDst, const ClpRng& clpRng); #endif void xPredAffineBlk( const ComponentID& compID, const PredictionUnit& pu, const Picture* refPic, 
const Mv* _mv, PelUnitBuf& dstPic, const bool& bi, const ClpRng& clpRng ); diff --git a/source/Lib/CommonLib/Picture.cpp b/source/Lib/CommonLib/Picture.cpp index b3e3d9f9b588d2d0366592c1a5dc553c4b3977ae..a32727d252c738d8f5f1874d7b631bc489735548 100644 --- a/source/Lib/CommonLib/Picture.cpp +++ b/source/Lib/CommonLib/Picture.cpp @@ -961,7 +961,11 @@ Slice *Picture::swapSliceObject(Slice * p, uint32_t i) slices[i] = p; pTmp->setSPS(0); pTmp->setPPS(0); +#if JVET_O_MAX_NUM_ALF_APS_8 + memset(pTmp->getAlfAPSs(), 0, sizeof(*pTmp->getAlfAPSs())*ALF_CTB_MAX_NUM_APS); +#else memset(pTmp->getAlfAPSs(), 0, sizeof(*pTmp->getAlfAPSs())*MAX_NUM_APS); +#endif pTmp->setLmcsAPS(0); return pTmp; diff --git a/source/Lib/CommonLib/Quant.cpp b/source/Lib/CommonLib/Quant.cpp index 94ca3df437c07fc540e513e4b95ad6d5e498382d..71ca554f0148e2ad52516b588d51bfb7591be9d3 100644 --- a/source/Lib/CommonLib/Quant.cpp +++ b/source/Lib/CommonLib/Quant.cpp @@ -65,6 +65,9 @@ QpParam::QpParam(const int qpy, const ChannelType chType, const int qpBdOffset, +#if JVET_O0919_TS_MIN_QP + const int minQpPrimeTsMinus4, +#endif const int chromaQPOffset, const ChromaFormat chFmt, const int dqp ) @@ -91,9 +94,26 @@ QpParam::QpParam(const int qpy, baseQp = Clip3( 0, MAX_QP+qpBdOffset, baseQp + dqp ); +#if JVET_O0919_TS_MIN_QP + Qps[0] =baseQp; + pers[0]=baseQp/6; + rems[0]=baseQp%6; + + int baseQpTS = baseQp; + + if( isLuma( chType ) ) + { + baseQpTS = std::max(baseQpTS , 4 + minQpPrimeTsMinus4); + } + + Qps[1] = baseQpTS; + pers[1] = baseQpTS / 6; + rems[1] = baseQpTS % 6; +#else Qp =baseQp; per=baseQp/6; rem=baseQp%6; +#endif } QpParam::QpParam(const TransformUnit& tu, const ComponentID &compIDX, const int QP /*= -MAX_INT*/) @@ -126,7 +146,11 @@ QpParam::QpParam(const TransformUnit& tu, const ComponentID &compIDX, const int int dqp = 0; +#if JVET_O0919_TS_MIN_QP + *this = QpParam(QP <= -MAX_INT ? 
tu.cu->qp : QP, toChannelType(compID), tu.cs->sps->getQpBDOffset(toChannelType(compID)), tu.cs->sps->getMinQpPrimeTsMinus4(toChannelType(compID)), chromaQpOffset, tu.chromaFormat, dqp); +#else *this = QpParam(QP <= -MAX_INT ? tu.cu->qp : QP, toChannelType(compID), tu.cs->sps->getQpBDOffset(toChannelType(compID)), chromaQpOffset, tu.chromaFormat, dqp); +#endif } @@ -397,8 +421,13 @@ void Quant::dequant(const TransformUnit &tu, const bool needSqrtAdjustment = TU::needsBlockSizeTrafoScale( tu, compID ); const int iTransformShift = (bClipTransformShiftTo0 ? std::max<int>(0, originalTransformShift) : originalTransformShift) + (needSqrtAdjustment?-1:0); +#if JVET_O0919_TS_MIN_QP + const int QP_per = cQP.per(isTransformSkip); + const int QP_rem = cQP.rem(isTransformSkip); +#else const int QP_per = cQP.per; const int QP_rem = cQP.rem; +#endif const int rightShift = (IQUANT_SHIFT - (iTransformShift + QP_per)) + (enableScalingLists ? LOG2_SCALING_LIST_NEUTRAL_VALUE : 0); @@ -943,7 +972,11 @@ void Quant::quant(TransformUnit &tu, const ComponentID &compID, const CCoeffBuf CHECK(scalingListType >= SCALING_LIST_NUM, "Invalid scaling list"); const uint32_t uiLog2TrWidth = g_aucLog2[uiWidth]; const uint32_t uiLog2TrHeight = g_aucLog2[uiHeight]; +#if JVET_O0919_TS_MIN_QP + int *piQuantCoeff = getQuantCoeff(scalingListType, cQP.rem(useTransformSkip), uiLog2TrWidth, uiLog2TrHeight); +#else int *piQuantCoeff = getQuantCoeff(scalingListType, cQP.rem, uiLog2TrWidth, uiLog2TrHeight); +#endif const bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, useTransformSkip); @@ -951,7 +984,11 @@ void Quant::quant(TransformUnit &tu, const ComponentID &compID, const CCoeffBuf // compensated by a bit-shift (the quantised result will be sqrt(2) * larger than required). // The quantScale table and shift is used to compensate for this. 
const bool needSqrtAdjustment= TU::needsBlockSizeTrafoScale( tu, compID ); +#if JVET_O0919_TS_MIN_QP + const int defaultQuantisationCoefficient = g_quantScales[needSqrtAdjustment?1:0][cQP.rem(useTransformSkip)]; +#else const int defaultQuantisationCoefficient = g_quantScales[needSqrtAdjustment?1:0][cQP.rem]; +#endif int iTransformShift = getTransformShift(channelBitDepth, rect.size(), maxLog2TrDynamicRange) + ( needSqrtAdjustment?-1:0); if (useTransformSkip && sps.getSpsRangeExtension().getExtendedPrecisionProcessingFlag()) @@ -960,7 +997,11 @@ void Quant::quant(TransformUnit &tu, const ComponentID &compID, const CCoeffBuf } +#if JVET_O0919_TS_MIN_QP + const int iQBits = QUANT_SHIFT + cQP.per(useTransformSkip) + iTransformShift; +#else const int iQBits = QUANT_SHIFT + cQP.per + iTransformShift; +#endif // QBits will be OK for any internal bit depth as the reduction in transform shift is balanced by an increase in Qp_per due to QpBDOffset const int64_t iAdd = int64_t(tu.cs->slice->isIRAP() ? 
171 : 85) << int64_t(iQBits - 9); @@ -1014,7 +1055,11 @@ bool Quant::xNeedRDOQ(TransformUnit &tu, const ComponentID &compID, const CCoeff const uint32_t uiLog2TrWidth = g_aucLog2[uiWidth]; const uint32_t uiLog2TrHeight = g_aucLog2[uiHeight]; +#if JVET_O0919_TS_MIN_QP + int *piQuantCoeff = getQuantCoeff(scalingListType, cQP.rem(useTransformSkip), uiLog2TrWidth, uiLog2TrHeight); +#else int *piQuantCoeff = getQuantCoeff(scalingListType, cQP.rem, uiLog2TrWidth, uiLog2TrHeight); +#endif const bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, (useTransformSkip != 0)); @@ -1024,7 +1069,11 @@ bool Quant::xNeedRDOQ(TransformUnit &tu, const ComponentID &compID, const CCoeff * Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result */ const bool needSqrtAdjustment= TU::needsBlockSizeTrafoScale( tu, compID ); +#if JVET_O0919_TS_MIN_QP + const int defaultQuantisationCoefficient = g_quantScales[needSqrtAdjustment?1:0][cQP.rem(useTransformSkip)]; +#else const int defaultQuantisationCoefficient = g_quantScales[needSqrtAdjustment?1:0][cQP.rem]; +#endif int iTransformShift = getTransformShift(channelBitDepth, rect.size(), maxLog2TrDynamicRange) + (needSqrtAdjustment?-1:0); if (useTransformSkip && sps.getSpsRangeExtension().getExtendedPrecisionProcessingFlag()) @@ -1033,7 +1082,11 @@ bool Quant::xNeedRDOQ(TransformUnit &tu, const ComponentID &compID, const CCoeff } +#if JVET_O0919_TS_MIN_QP + const int iQBits = QUANT_SHIFT + cQP.per(useTransformSkip) + iTransformShift; +#else const int iQBits = QUANT_SHIFT + cQP.per + iTransformShift; +#endif assert(iQBits>=0); // QBits will be OK for any internal bit depth as the reduction in transform shift is balanced by an increase in Qp_per due to QpBDOffset @@ -1066,13 +1119,22 @@ void Quant::transformSkipQuantOneSample(TransformUnit &tu, const ComponentID &co const int iTransformShift = getTransformShift(channelBitDepth, rect.size(), maxLog2TrDynamicRange); const int scalingListType = 
getScalingListType(tu.cu->predMode, compID); const bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, true); +#if JVET_O0919_TS_MIN_QP + const bool useTransformSkip = tu.mtsIdx == MTS_SKIP && isLuma(compID); + const int defaultQuantisationCoefficient = g_quantScales[0][cQP.rem(useTransformSkip)]; +#else const int defaultQuantisationCoefficient = g_quantScales[0][cQP.rem]; +#endif CHECK( scalingListType >= SCALING_LIST_NUM, "Invalid scaling list" ); const uint32_t uiLog2TrWidth = g_aucLog2[uiWidth]; const uint32_t uiLog2TrHeight = g_aucLog2[uiHeight]; +#if JVET_O0919_TS_MIN_QP + const int *const piQuantCoeff = getQuantCoeff(scalingListType, cQP.rem(useTransformSkip), uiLog2TrWidth, uiLog2TrHeight); +#else const int *const piQuantCoeff = getQuantCoeff(scalingListType, cQP.rem, uiLog2TrWidth, uiLog2TrHeight); +#endif /* for 422 chroma blocks, the effective scaling applied during transformation is not a power of 2, hence it cannot be * implemented as a bit-shift (the quantised result will be sqrt(2) * larger than required). Alternatively, adjust the @@ -1080,7 +1142,11 @@ void Quant::transformSkipQuantOneSample(TransformUnit &tu, const ComponentID &co * Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result */ +#if JVET_O0919_TS_MIN_QP + const int iQBits = QUANT_SHIFT + cQP.per(useTransformSkip) + iTransformShift; +#else const int iQBits = QUANT_SHIFT + cQP.per + iTransformShift; +#endif // QBits will be OK for any internal bit depth as the reduction in transform shift is balanced by an increase in Qp_per due to QpBDOffset const int iAdd = int64_t(bUseHalfRoundingPoint ? 256 : (tu.cs->slice->isIRAP() ? 
171 : 85)) << int64_t(iQBits - 9); TCoeff transformedCoefficient; @@ -1117,8 +1183,13 @@ void Quant::invTrSkipDeQuantOneSample(TransformUnit &tu, const ComponentID &comp const CompArea &rect = tu.blocks[compID]; const uint32_t uiWidth = rect.width; const uint32_t uiHeight = rect.height; +#if JVET_O0919_TS_MIN_QP + const int QP_per = cQP.per(tu.mtsIdx==MTS_SKIP && isLuma(compID)); + const int QP_rem = cQP.rem(tu.mtsIdx==MTS_SKIP && isLuma(compID)); +#else const int QP_per = cQP.per; const int QP_rem = cQP.rem; +#endif const int maxLog2TrDynamicRange = sps.getMaxLog2TrDynamicRange(toChannelType(compID)); const int channelBitDepth = sps.getBitDepth(toChannelType(compID)); const int iTransformShift = getTransformShift(channelBitDepth, rect.size(), maxLog2TrDynamicRange); diff --git a/source/Lib/CommonLib/Quant.h b/source/Lib/CommonLib/Quant.h index b53de50b4884da9a9f8f4b2c5238beaf5eb56078..16eba76cbc871781b021e5b6ebfe1909bc67c25f 100644 --- a/source/Lib/CommonLib/Quant.h +++ b/source/Lib/CommonLib/Quant.h @@ -65,17 +65,30 @@ struct TrQuantParams }; /// QP struct +#if JVET_O0919_TS_MIN_QP +class QpParam +#else struct QpParam +#endif { +#if JVET_O0919_TS_MIN_QP + int Qps[2]; + int pers[2]; + int rems[2]; +#else int Qp; int per; int rem; +#endif private: QpParam(const int qpy, const ChannelType chType, const int qpBdOffset, +#if JVET_O0919_TS_MIN_QP + const int minQpPrimeTsMinus4, +#endif const int chromaQPOffset, const ChromaFormat chFmt, const int dqp ); @@ -84,6 +97,12 @@ public: QpParam(const TransformUnit& tu, const ComponentID &compID, const int QP = -MAX_INT); +#if JVET_O0919_TS_MIN_QP + int Qp ( const bool ts ) const { return Qps [ts?1:0]; } + int per( const bool ts ) const { return pers[ts?1:0]; } + int rem( const bool ts ) const { return rems[ts?1:0]; } +#endif + }; // END STRUCT DEFINITION QpParam /// transform and quantization class diff --git a/source/Lib/CommonLib/QuantRDOQ.cpp b/source/Lib/CommonLib/QuantRDOQ.cpp index 
30e3ba2e25340ca6c46e851055c8e5b475cb26b3..add6b384f439d91995bc054809056ede253450b6 100644 --- a/source/Lib/CommonLib/QuantRDOQ.cpp +++ b/source/Lib/CommonLib/QuantRDOQ.cpp @@ -628,13 +628,25 @@ void QuantRDOQ::xRateDistOptQuant(TransformUnit &tu, const ComponentID &compID, const bool needSqrtAdjustment= TU::needsBlockSizeTrafoScale( tu, compID ); +#if JVET_O0919_TS_MIN_QP + const bool isTransformSkip = tu.mtsIdx==MTS_SKIP && isLuma(compID); + const double *const pdErrScale = xGetErrScaleCoeffSL(scalingListType, uiLog2BlockWidth, uiLog2BlockHeight, cQP.rem(isTransformSkip)); + const int *const piQCoef = getQuantCoeff(scalingListType, cQP.rem(isTransformSkip), uiLog2BlockWidth, uiLog2BlockHeight); +#else const double *const pdErrScale = xGetErrScaleCoeffSL(scalingListType, uiLog2BlockWidth, uiLog2BlockHeight, cQP.rem); const int *const piQCoef = getQuantCoeff(scalingListType, cQP.rem, uiLog2BlockWidth, uiLog2BlockHeight); const bool isTransformSkip = tu.mtsIdx==MTS_SKIP && isLuma(compID); +#endif const bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, isTransformSkip); +#if JVET_O0919_TS_MIN_QP + const int defaultQuantisationCoefficient = g_quantScales[ needSqrtAdjustment ?1:0][cQP.rem(isTransformSkip)]; + const double defaultErrorScale = xGetErrScaleCoeffNoScalingList(scalingListType, (uiLog2BlockWidth-1), (uiLog2BlockHeight-1), cQP.rem(isTransformSkip)); + const int iQBits = QUANT_SHIFT + cQP.per(isTransformSkip) + iTransformShift + (needSqrtAdjustment?-1:0); // Right shift of non-RDOQ quantizer; level = (coeff*uiQ + offset)>>q_bits +#else const int defaultQuantisationCoefficient = g_quantScales[ needSqrtAdjustment ?1:0][cQP.rem]; const double defaultErrorScale = xGetErrScaleCoeffNoScalingList(scalingListType, (uiLog2BlockWidth-1), (uiLog2BlockHeight-1), cQP.rem); const int iQBits = QUANT_SHIFT + cQP.per + iTransformShift + (needSqrtAdjustment?-1:0); // Right shift of non-RDOQ quantizer; level = (coeff*uiQ + offset)>>q_bits +#endif const TCoeff 
entropyCodingMinimum = -(1 << maxLog2TrDynamicRange); @@ -1078,10 +1090,17 @@ void QuantRDOQ::xRateDistOptQuant(TransformUnit &tu, const ComponentID &compID, if( cctx.signHiding() && uiAbsSum>=2) { +#if JVET_O0919_TS_MIN_QP + const double inverseQuantScale = double(g_invQuantScales[0][cQP.rem(isTransformSkip)]); + int64_t rdFactor = (int64_t)(inverseQuantScale * inverseQuantScale * (1 << (2 * cQP.per(isTransformSkip))) / m_dLambda / 16 + / (1 << (2 * DISTORTION_PRECISION_ADJUSTMENT(channelBitDepth))) + + 0.5); +#else const double inverseQuantScale = double(g_invQuantScales[0][cQP.rem]); int64_t rdFactor = (int64_t)(inverseQuantScale * inverseQuantScale * (1 << (2 * cQP.per)) / m_dLambda / 16 / (1 << (2 * DISTORTION_PRECISION_ADJUSTMENT(channelBitDepth))) + 0.5); +#endif int lastCG = -1; int absSum = 0 ; @@ -1252,9 +1271,16 @@ void QuantRDOQ::xRateDistOptQuantTS( TransformUnit &tu, const ComponentID &compI #endif const bool needsSqrt2Scale = TU::needsSqrt2Scale( tu, compID ); // should always be false - transform-skipped blocks don't require sqrt(2) compensation. +#if JVET_O0919_TS_MIN_QP + const bool isTransformSkip = tu.mtsIdx==MTS_SKIP && isLuma(compID); + const int qBits = QUANT_SHIFT + qp.per(isTransformSkip) + transformShift + ( needsSqrt2Scale ? 
-1 : 0 ); // Right shift of non-RDOQ quantizer; level = (coeff*uiQ + offset)>>q_bits + const int quantisationCoefficient = g_quantScales[needsSqrt2Scale?1:0][qp.rem(isTransformSkip)]; + const double errorScale = xGetErrScaleCoeff( TU::needsSqrt2Scale( tu, compID ), width, height, qp.rem(isTransformSkip), maxLog2TrDynamicRange, channelBitDepth ); +#else const int qBits = QUANT_SHIFT + qp.per + transformShift + (needsSqrt2Scale?-1:0); // Right shift of non-RDOQ quantizer; level = (coeff*uiQ + offset)>>q_bits const int quantisationCoefficient = g_quantScales[needsSqrt2Scale?1:0][qp.rem]; const double errorScale = xGetErrScaleCoeff( TU::needsSqrt2Scale( tu, compID ), width, height, qp.rem, maxLog2TrDynamicRange, channelBitDepth ); +#endif const TCoeff entropyCodingMaximum = ( 1 << maxLog2TrDynamicRange ) - 1; @@ -1500,13 +1526,25 @@ void QuantRDOQ::forwardRDPCM( TransformUnit &tu, const ComponentID &compID, cons #endif const bool needsSqrt2Scale = TU::needsSqrt2Scale(tu, compID); // should always be false - transform-skipped blocks don't require sqrt(2) compensation. +#if JVET_O0919_TS_MIN_QP + const bool isTransformSkip = tu.mtsIdx==MTS_SKIP && isLuma(compID); + const int qBits = QUANT_SHIFT + qp.per(isTransformSkip) + transformShift + ( needsSqrt2Scale ? -1 : 0 ); // Right shift of non-RDOQ quantizer; level = (coeff*uiQ + offset)>>q_bits + const int quantisationCoefficient = g_quantScales[needsSqrt2Scale ? 1 : 0][qp.rem(isTransformSkip)]; + const double errorScale = xGetErrScaleCoeff(TU::needsSqrt2Scale(tu, compID), width, height, qp.rem(isTransformSkip), maxLog2TrDynamicRange, channelBitDepth); +#else const int qBits = QUANT_SHIFT + qp.per + transformShift + (needsSqrt2Scale ? -1 : 0); // Right shift of non-RDOQ quantizer; level = (coeff*uiQ + offset)>>q_bits const int quantisationCoefficient = g_quantScales[needsSqrt2Scale ? 
1 : 0][qp.rem]; const double errorScale = xGetErrScaleCoeff(TU::needsSqrt2Scale(tu, compID), width, height, qp.rem, maxLog2TrDynamicRange, channelBitDepth); +#endif TrQuantParams trQuantParams; +#if JVET_O0919_TS_MIN_QP + trQuantParams.rightShift = (IQUANT_SHIFT - (transformShift + qp.per(isTransformSkip))); + trQuantParams.qScale = g_invQuantScales[needsSqrt2Scale ? 1 : 0][qp.rem(isTransformSkip)]; +#else trQuantParams.rightShift = (IQUANT_SHIFT - (transformShift + qp.per)); trQuantParams.qScale = g_invQuantScales[needsSqrt2Scale ? 1 : 0][qp.rem]; +#endif const TCoeff entropyCodingMaximum = (1 << maxLog2TrDynamicRange) - 1; diff --git a/source/Lib/CommonLib/Slice.cpp b/source/Lib/CommonLib/Slice.cpp index b7bccee56532eb6a1520331742354b75d3be1129..e2d06bda5921ad3659b0fc6a7affead2ff84d56e 100644 --- a/source/Lib/CommonLib/Slice.cpp +++ b/source/Lib/CommonLib/Slice.cpp @@ -1453,6 +1453,9 @@ SPS::SPS() , m_LFNST ( false ) , m_Affine ( false ) , m_AffineType ( false ) +#if JVET_O0070_PROF +, m_PROF ( false ) +#endif , m_MHIntra ( false ) , m_Triangle ( false ) #if LUMA_ADAPTIVE_DEBLOCKING_FILTER_QP_OFFSET diff --git a/source/Lib/CommonLib/Slice.h b/source/Lib/CommonLib/Slice.h index 1446966ad0d511659282272a6e578eb98149b6ee..cc195899c8e3deb1852af8ad2d5a38dac5b14509 100644 --- a/source/Lib/CommonLib/Slice.h +++ b/source/Lib/CommonLib/Slice.h @@ -735,6 +735,9 @@ private: // Parameter BitDepths m_bitDepths; int m_qpBDOffset[MAX_NUM_CHANNEL_TYPE]; +#if JVET_O0919_TS_MIN_QP + int m_minQpMinus4[MAX_NUM_CHANNEL_TYPE]; // minimum QP for transform skip, minus 4 (see getMinQpPrimeTsMinus4) +#endif int m_pcmBitDepths[MAX_NUM_CHANNEL_TYPE]; bool m_bPCMFilterDisableFlag; @@ -797,6 +800,9 @@ private: bool m_SMVD; bool m_Affine; bool m_AffineType; +#if JVET_O0070_PROF + bool m_PROF; +#endif bool m_GBi; // bool m_MHIntra; bool m_Triangle; @@ -933,6 +939,10 @@ public: int getDifferentialLumaChromaBitDepth() const { return int(m_bitDepths.recon[CHANNEL_TYPE_LUMA]) - int(m_bitDepths.recon[CHANNEL_TYPE_CHROMA]); } int
getQpBDOffset(ChannelType type) const { return m_qpBDOffset[type]; } void setQpBDOffset(ChannelType type, int i) { m_qpBDOffset[type] = i; } +#if JVET_O0919_TS_MIN_QP + int getMinQpPrimeTsMinus4(ChannelType type) const { return m_minQpMinus4[type]; } + void setMinQpPrimeTsMinus4(ChannelType type, int i) { m_minQpMinus4[type] = i; } +#endif void setSAOEnabledFlag(bool bVal) { m_saoEnabledFlag = bVal; } bool getSAOEnabledFlag() const { return m_saoEnabledFlag; } @@ -1016,6 +1026,10 @@ public: bool getUseAffine () const { return m_Affine; } void setUseAffineType ( bool b ) { m_AffineType = b; } bool getUseAffineType () const { return m_AffineType; } +#if JVET_O0070_PROF + void setUsePROF ( bool b ) { m_PROF = b; } + bool getUsePROF () const { return m_PROF; } +#endif void setUseLMChroma ( bool b ) { m_LMChroma = b; } bool getUseLMChroma () const { return m_LMChroma; } void setCclmCollocatedChromaFlag( bool b ) { m_cclmCollocatedChromaFlag = b; } @@ -1559,7 +1573,11 @@ private: uint32_t m_uiMaxTTSizeIChroma; uint32_t m_uiMaxBTSize; +#if JVET_O_MAX_NUM_ALF_APS_8 + APS* m_alfApss[ALF_CTB_MAX_NUM_APS]; +#else APS* m_alfApss[MAX_NUM_APS]; +#endif bool m_tileGroupAlfEnabledFlag[MAX_NUM_COMPONENT]; int m_tileGroupNumAps; std::vector<int> m_tileGroupLumaApsId; @@ -2067,7 +2085,11 @@ protected: ParameterSetMap<APS> m_apsMap; ParameterSetMap<DPS> m_dpsMap; +#if JVET_O_MAX_NUM_ALF_APS_8 + APS* m_apss[ALF_CTB_MAX_NUM_APS]; +#else APS* m_apss[MAX_NUM_APS]; +#endif int m_activeDPSId; // -1 for nothing active int m_activeSPSId; // -1 for nothing active diff --git a/source/Lib/CommonLib/TrQuant.cpp b/source/Lib/CommonLib/TrQuant.cpp index ab45af5ed2e5cae336fec0a6d378c24c5bfd5bc4..d45791d5d0dc44ec8d24bd8292d3c2fbc87faf4a 100644 --- a/source/Lib/CommonLib/TrQuant.cpp +++ b/source/Lib/CommonLib/TrQuant.cpp @@ -774,6 +774,11 @@ void TrQuant::getTrTypes(const TransformUnit tu, const ComponentID compID, int & trTypeHor = DCT2; trTypeVer = DCT2; +#if JVET_O0538_SPS_CONTROL_ISP_SBT + if 
(!tu.cs->sps->getUseMTS()) + return; +#endif + if (isImplicitMTS || isISP) { int width = tu.blocks[compID].width; diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h index 9e4a29405954f4db0f8489466768a2c5e68eb1f6..d5138ac17133ff8b7bb0d3191ae30e3a646ad1f1 100644 --- a/source/Lib/CommonLib/TypeDef.h +++ b/source/Lib/CommonLib/TypeDef.h @@ -50,15 +50,27 @@ #include <assert.h> #include <cassert> -#define JVET_O0455_IBC_MAX_MERGE_NUM 1 // JVET-O0455: Control the max number of IBC merge candidates independently from regular merge candidates +#define JVET_O0640_PICTURE_SIZE_CONSTRAINT 1 // JVET-O0640: Picture width and height shall be a multiple of Max(8, minCU size) + +#define JVET_O_MAX_NUM_ALF_APS_8 1 // JVET-O: number of ALF APSs is reduced to 8 + +#define JVET_O0070_PROF 1 // JVET-O0070 method 4-2.1a: Prediction refinement with optical flow for affine mode + +#define JVET_O0570_GRAD_SIMP 1 // JVET-O0570/JVET-O0211, SIMD friendly spatial gradient calculation #define JVET_O1170_IBC_VIRTUAL_BUFFER 1 // JVET-O1170/O1171: IBC virtual buffer #if JVET_O1170_IBC_VIRTUAL_BUFFER #define JVET_O1170_CHECK_BV_AT_DECODER 1 // For decoder to check if a BV is valid or not #endif +#define JVET_O0538_SPS_CONTROL_ISP_SBT 1 // JVET-O0538: SPS control for ISP and SBT transform + +#define JVET_O0634_BDOF_SIZE_CONSTRAINT 1 // JVET-O0634: BDOF applied CU size align with DMVR + #define JVET_O0213_RESTRICT_LFNST_TO_MAX_TB_SIZE 1 // JVET-O0213: Block size restriction of LFNST to maximum transform size +#define JVET_O0617_SIG_FLAG_CONTEXT_REDUCTION 1 // JVET-O0617: Significant flag context reduction + #define JVET_O0244_DELTA_POC 1 // JVET-O0244: weighted prediction in SPS and delta POC #define JVET_O1153_INTRA_CHROMAMODE_CODING 1 //JVET-O1153: simplified intra chromamode coding @@ -91,6 +103,8 @@ #define JVET_O0366_AFFINE_BCW 1 // JVET-O0366: Simplifications on BCW index derivation process +#define JVET_O0919_TS_MIN_QP 1 // JVET-O0919: Minimum QP for Transform Skip
Mode + #define JVET_O1168_CU_CHROMA_QP_OFFSET 1 // JVET-O1168: cu chroma QP offset #define JVET_O0368_LFNST_WITH_DCT2_ONLY 1 // JVET-O0368/O0292/O0521/O0466: disable LFNST for non-DCT2 MTS candidates normatively diff --git a/source/Lib/CommonLib/x86/BufferX86.h b/source/Lib/CommonLib/x86/BufferX86.h index ae39695890e2df4acf95c4e5e35f739a9c9971e7..6e82e3f411a3b3666cc0ec707988be68d023abd2 100644 --- a/source/Lib/CommonLib/x86/BufferX86.h +++ b/source/Lib/CommonLib/x86/BufferX86.h @@ -43,7 +43,7 @@ #include "CommonDefX86.h" #include "CommonLib/Unit.h" #include "CommonLib/Buffer.h" - +#include "CommonLib/InterpolationFilter.h" #if ENABLE_SIMD_OPT_BUFFER #ifdef TARGET_SIMD_X86 @@ -278,10 +278,222 @@ void addBIOAvg4_SSE(const Pel* src0, int src0Stride, const Pel* src1, int src1St } } +#if JVET_O0070_PROF +template< X86_VEXT vext > +void applyPROF_SSE(Pel* dstPel, int dstStride, const Pel* srcPel, int srcStride, int width, int height, const Pel* gradX, const Pel* gradY, int gradStride, const int* dMvX, const int* dMvY, int dMvStride, int shiftNum, Pel offset, const ClpRng& clpRng) +{ + CHECKD((width & 3), "block width error!"); + + __m128i mm_dmvx, mm_dmvy, mm_gradx, mm_grady, mm_dI, mm_src; + __m128i mm_dIoffset = _mm_set1_epi32(1); + __m128i mm_offset = _mm_set1_epi32(offset); + __m128i vibdimin = _mm_set1_epi32(clpRng.min); + __m128i vibdimax = _mm_set1_epi32(clpRng.max); + __m128i vzero = _mm_setzero_si128(); + + for (int h = 0; h < height; h++) + { + const int* vX = dMvX; + const int* vY = dMvY; + const Pel* gX = gradX; + const Pel* gY = gradY; + const Pel* src = srcPel; + Pel* dst = dstPel; + + for (int w = 0; w < width; w += 4) + { + mm_dmvx = _mm_loadu_si128((const __m128i *)vX); + mm_dmvy = _mm_loadu_si128((const __m128i *)vY); + mm_gradx = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)gX)); + mm_grady = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)gY)); + mm_src = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)src)); + + mm_dI = 
_mm_add_epi32(_mm_mullo_epi32(mm_dmvx, mm_gradx), _mm_mullo_epi32(mm_dmvy, mm_grady)); + mm_dI = _mm_srai_epi32(_mm_add_epi32(mm_dI, mm_dIoffset), 1); + mm_dI = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(mm_dI, mm_src), mm_offset), shiftNum); + mm_dI = _mm_packs_epi32(_mm_min_epi32(vibdimax, _mm_max_epi32(vibdimin, mm_dI)), vzero); + _mm_storel_epi64((__m128i *)dst, mm_dI); + + vX += 4; vY += 4; gX += 4; gY += 4; src += 4; dst += 4; + } + dMvX += dMvStride; + dMvY += dMvStride; + gradX += gradStride; + gradY += gradStride; + srcPel += srcStride; + dstPel += dstStride; + } +} + +template< X86_VEXT vext, bool l1PROFEnabled = true> +void applyBiPROF_SSE(Pel* dst, int dstStride, const Pel* src0, const Pel* src1, int srcStride, int width, int height, const Pel* gradX0, const Pel* gradY0, const Pel* gradX1, const Pel* gradY1, int gradStride, const int* dMvX0, const int* dMvY0, const int* dMvX1, const int* dMvY1, int dMvStride, const int8_t w0, const ClpRng& clpRng) +{ + const int rShift = IF_INTERNAL_PREC - clpRng.bd; + const int shiftNum = (rShift > 2 ? 
rShift : 2) + g_GbiLog2WeightBase; + const int offset = (1 << (shiftNum - 1)) + (IF_INTERNAL_OFFS << g_GbiLog2WeightBase); + const int8_t w1 = g_GbiWeightBase - w0; + + __m128i mm_offset = _mm_set1_epi32(offset); + __m128i mm_w0 = _mm_set1_epi32(w0); + __m128i mm_w1 = _mm_set1_epi32(w1); + __m128i vibdimin = _mm_set1_epi32(clpRng.min); + __m128i vibdimax = _mm_set1_epi32(clpRng.max); + __m128i vzero = _mm_setzero_si128(); + + __m128i mm_dmvx0, mm_dmvy0, mm_dmvx1, mm_dmvy1, mm_gradx0, mm_grady0, mm_gradx1, mm_grady1, mm_src0, mm_src1; + __m128i mm_dI0, mm_dI1, mm_dI; + __m128i mm_dIoffset = _mm_set1_epi32(1); + const int *mmMvX0, *mmMvY0, *mmMvX1, *mmMvY1; + const Pel *gX0, *gY0, *gX1, *gY1; + + for (int h = 0; h < height; h++) + { + if (!(h & 3)) + { + mmMvX0 = dMvX0; + mmMvY0 = dMvY0; + if (l1PROFEnabled) + { + mmMvX1 = dMvX1; + mmMvY1 = dMvY1; + } + } + + mm_dmvx0 = _mm_loadu_si128((const __m128i *)mmMvX0); + mm_dmvy0 = _mm_loadu_si128((const __m128i *)mmMvY0); + gX0 = gradX0; + gY0 = gradY0; + + if (l1PROFEnabled) + { + mm_dmvx1 = _mm_loadu_si128((const __m128i *)mmMvX1); + mm_dmvy1 = _mm_loadu_si128((const __m128i *)mmMvY1); + gX1 = gradX1; + gY1 = gradY1; + } + + const Pel* pSrc0 = src0; + const Pel* pSrc1 = src1; + Pel* pDst = dst; + + for (int w = 0; w < width; w += 4) + { + mm_src0 = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)pSrc0)); + mm_src1 = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)pSrc1)); + mm_gradx0 = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)gX0)); + mm_grady0 = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)gY0)); + mm_dI0 = _mm_add_epi32(_mm_mullo_epi32(mm_dmvx0, mm_gradx0), _mm_mullo_epi32(mm_dmvy0, mm_grady0)); + mm_dI0 = _mm_srai_epi32(_mm_add_epi32(mm_dI0, mm_dIoffset), 1); + mm_dI0 = _mm_mullo_epi32(_mm_add_epi32(mm_src0, mm_dI0), mm_w0); + gX0 += 4; gY0 += 4; + + if (l1PROFEnabled) + { + mm_gradx1 = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)gX1)); + mm_grady1 = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)gY1)); + mm_dI1 = 
_mm_add_epi32(_mm_mullo_epi32(mm_dmvx1, mm_gradx1), _mm_mullo_epi32(mm_dmvy1, mm_grady1)); + mm_dI1 = _mm_srai_epi32(_mm_add_epi32(mm_dI1, mm_dIoffset), 1); + mm_dI1 = _mm_mullo_epi32(_mm_add_epi32(mm_src1, mm_dI1), mm_w1); + gX1 += 4; gY1 += 4; + } + else + mm_dI1 = _mm_mullo_epi32(mm_src1, mm_w1); + + mm_dI = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(mm_dI0, mm_dI1), mm_offset), shiftNum); + mm_dI = _mm_packs_epi32(_mm_min_epi32(vibdimax, _mm_max_epi32(vibdimin, mm_dI)), vzero); + _mm_storel_epi64((__m128i *)pDst, mm_dI); + + pSrc0 += 4; pSrc1 += 4; pDst += 4; + } + + mmMvX0 += dMvStride; + mmMvY0 += dMvStride; + gradX0 += gradStride; + gradY0 += gradStride; + + if (l1PROFEnabled) + { + mmMvX1 += dMvStride; + mmMvY1 += dMvStride; + gradX1 += gradStride; + gradY1 += gradStride; + } + + src0 += srcStride; + src1 += srcStride; + dst += dstStride; + } +} + template< X86_VEXT vext > +void roundIntVector_SIMD(int* v, int size, unsigned int nShift, const int dmvLimit) +{ + CHECKD(size % 16 != 0, "Size must be multiple of 16!"); +#ifdef USE_AVX512 + if (vext >= AVX512 && size >= 16) + { + __m512i dMvMin = _mm256_set1_epi32(-dmvLimit); + __m512i dMvMax = _mm256_set1_epi32(dmvLimit - 1 ); + __m512i nOffset = _mm512_set1_epi32((1 << (nShift - 1))); + __m512i vones = _mm512_set1_epi32(1); + __m512i vzero = _mm512_setzero_si512(); + for (int i = 0; i < size; i += 16, v += 16) + { + __m512i src = _mm512_loadu_si512(v); + __mmask16 mask = _mm512_cmpge_epi32_mask(src, vzero); + src = __mm512_add_epi32(src, nOffset); + __mm512i dst = _mm512_srai_epi32(_mm512_mask_sub_epi32(src, mask, src, vones), nShift); + dst = _mm512_min_epi32(dMvMax, _mm512_max_epi32(dMvMin, dst)); + _mm512_storeu_si512(v, dst); + } + } + else +#endif +#ifdef USE_AVX2 + if (vext >= AVX2 && size >= 8) + { + __m256i dMvMin = _mm256_set1_epi32(-dmvLimit); + __m256i dMvMax = _mm256_set1_epi32(dmvLimit - 1); + __m256i nOffset = _mm256_set1_epi32(1 << (nShift - 1)); + __m256i vzero = _mm256_setzero_si256(); + for 
(int i = 0; i < size; i += 8, v += 8) + { + __m256i src = _mm256_lddqu_si256((__m256i*)v); + __m256i of = _mm256_cmpgt_epi32(src, vzero); + __m256i dst = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(src, nOffset), of), nShift); + dst = _mm256_min_epi32(dMvMax, _mm256_max_epi32(dMvMin, dst)); + _mm256_storeu_si256((__m256i*)v, dst); + } + } + else +#endif + { + __m128i dMvMin = _mm_set1_epi32(-dmvLimit); + __m128i dMvMax = _mm_set1_epi32(dmvLimit - 1); + __m128i nOffset = _mm_set1_epi32((1 << (nShift - 1))); + __m128i vzero = _mm_setzero_si128(); + for (int i = 0; i < size; i += 4, v += 4) + { + __m128i src = _mm_loadu_si128((__m128i*)v); + __m128i of = _mm_cmpgt_epi32(src, vzero); + __m128i dst = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(src, nOffset), of), nShift); + dst = _mm_min_epi32(dMvMax, _mm_max_epi32(dMvMin, dst)); + _mm_storeu_si128((__m128i*)v, dst); + } + } +} +#endif + +#if JVET_O0070_PROF +template< X86_VEXT vext, bool PAD = true> +#else +template< X86_VEXT vext > +#endif void gradFilter_SSE(Pel* src, int srcStride, int width, int height, int gradStride, Pel* gradX, Pel* gradY, const int bitDepth) { +#if !JVET_O0570_GRAD_SIMP __m128i vzero = _mm_setzero_si128(); +#endif Pel* srcTmp = src + srcStride + 1; Pel* gradXTmp = gradX + gradStride + 1; Pel* gradYTmp = gradY + gradStride + 1; @@ -289,33 +501,84 @@ void gradFilter_SSE(Pel* src, int srcStride, int width, int height, int gradStri int widthInside = width - 2 * BIO_EXTEND_SIZE; int heightInside = height - 2 * BIO_EXTEND_SIZE; int shift1 = std::max<int>(6, bitDepth - 6); - +#if JVET_O0570_GRAD_SIMP + __m128i mmShift1 = _mm_cvtsi32_si128( shift1 ); +#endif assert((widthInside & 3) == 0); - for (int y = 0; y < heightInside; y++) +#if JVET_O0570_GRAD_SIMP + if ( ( widthInside & 7 ) == 0 ) { - int x = 0; - for (; x < widthInside; x += 4) +#endif + for (int y = 0; y < heightInside; y++) { - __m128i mmPixTop = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x - srcStride))); - __m128i 
mmPixBottom = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x + srcStride))); - __m128i mmPixLeft = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x - 1))); - __m128i mmPixRight = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x + 1))); + int x = 0; +#if JVET_O0570_GRAD_SIMP + for ( ; x < widthInside; x += 8 ) + { + __m128i mmPixTop = _mm_sra_epi16( _mm_loadu_si128( ( __m128i* ) ( srcTmp + x - srcStride ) ), mmShift1 ); + __m128i mmPixBottom = _mm_sra_epi16( _mm_loadu_si128( ( __m128i* ) ( srcTmp + x + srcStride ) ), mmShift1 ); + __m128i mmPixLeft = _mm_sra_epi16( _mm_loadu_si128( ( __m128i* ) ( srcTmp + x - 1 ) ), mmShift1 ); + __m128i mmPixRight = _mm_sra_epi16( _mm_loadu_si128( ( __m128i* ) ( srcTmp + x + 1 ) ), mmShift1 ); - __m128i mmGradVer = _mm_sra_epi32(_mm_sub_epi32(mmPixBottom, mmPixTop), _mm_cvtsi32_si128(shift1)); - __m128i mmGradHor = _mm_sra_epi32(_mm_sub_epi32(mmPixRight, mmPixLeft), _mm_cvtsi32_si128(shift1)); - mmGradVer = _mm_packs_epi32(mmGradVer, vzero); - mmGradHor = _mm_packs_epi32(mmGradHor, vzero); + __m128i mmGradVer = _mm_sub_epi16( mmPixBottom, mmPixTop ); + __m128i mmGradHor = _mm_sub_epi16( mmPixRight, mmPixLeft ); - _mm_storel_epi64((__m128i *)(gradYTmp + x), mmGradVer); - _mm_storel_epi64((__m128i *)(gradXTmp + x), mmGradHor); + _mm_storeu_si128( ( __m128i * ) ( gradYTmp + x ), mmGradVer ); + _mm_storeu_si128( ( __m128i * ) ( gradXTmp + x ), mmGradHor ); + } +#else + for (; x < widthInside; x += 4) + { + __m128i mmPixTop = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x - srcStride))); + __m128i mmPixBottom = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x + srcStride))); + __m128i mmPixLeft = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x - 1))); + __m128i mmPixRight = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x + 1))); + + __m128i mmGradVer = _mm_sra_epi32(_mm_sub_epi32(mmPixBottom, mmPixTop), _mm_cvtsi32_si128(shift1)); + __m128i mmGradHor = 
_mm_sra_epi32(_mm_sub_epi32(mmPixRight, mmPixLeft), _mm_cvtsi32_si128(shift1)); + mmGradVer = _mm_packs_epi32(mmGradVer, vzero); + mmGradHor = _mm_packs_epi32(mmGradHor, vzero); + + _mm_storel_epi64((__m128i *)(gradYTmp + x), mmGradVer); + _mm_storel_epi64((__m128i *)(gradXTmp + x), mmGradHor); + } +#endif + gradXTmp += gradStride; + gradYTmp += gradStride; + srcTmp += srcStride; } - - gradXTmp += gradStride; - gradYTmp += gradStride; - srcTmp += srcStride; +#if JVET_O0570_GRAD_SIMP } + else + { + __m128i mmPixTop = _mm_sra_epi16( _mm_unpacklo_epi64( _mm_loadl_epi64( (__m128i*) ( srcTmp - srcStride ) ), _mm_loadl_epi64( (__m128i*) ( srcTmp ) ) ), mmShift1 ); + for ( int y = 0; y < heightInside; y += 2 ) + { + __m128i mmPixBottom = _mm_sra_epi16( _mm_unpacklo_epi64( _mm_loadl_epi64( (__m128i*) ( srcTmp + srcStride ) ), _mm_loadl_epi64( (__m128i*) ( srcTmp + ( srcStride << 1 ) ) ) ), mmShift1 ); + __m128i mmPixLeft = _mm_sra_epi16( _mm_unpacklo_epi64( _mm_loadl_epi64( (__m128i*) ( srcTmp - 1 ) ), _mm_loadl_epi64( (__m128i*) ( srcTmp - 1 + srcStride ) ) ), mmShift1 ); + __m128i mmPixRight = _mm_sra_epi16( _mm_unpacklo_epi64( _mm_loadl_epi64( (__m128i*) ( srcTmp + 1 ) ), _mm_loadl_epi64( (__m128i*) ( srcTmp + 1 + srcStride ) ) ), mmShift1 ); + + __m128i mmGradVer = _mm_sub_epi16( mmPixBottom, mmPixTop ); + __m128i mmGradHor = _mm_sub_epi16( mmPixRight, mmPixLeft ); + + _mm_storel_epi64( (__m128i *) gradYTmp, mmGradVer ); + _mm_storel_epi64( (__m128i *) ( gradYTmp + gradStride ), _mm_unpackhi_epi64( mmGradVer, mmGradHor ) ); + _mm_storel_epi64( (__m128i *) gradXTmp, mmGradHor ); + _mm_storel_epi64( (__m128i *) ( gradXTmp + gradStride ), _mm_unpackhi_epi64( mmGradHor, mmGradVer ) ); + + mmPixTop = mmPixBottom; + gradXTmp += gradStride << 1; + gradYTmp += gradStride << 1; + srcTmp += srcStride << 1; + } + } +#endif +#if JVET_O0070_PROF + if (PAD) + { +#endif gradXTmp = gradX + gradStride + 1; gradYTmp = gradY + gradStride + 1; for (int y = 0; y < heightInside; y++) @@ 
-335,6 +598,9 @@ void gradFilter_SSE(Pel* src, int srcStride, int width, int height, int gradStri ::memcpy(gradXTmp + heightInside*gradStride, gradXTmp + (heightInside - 1)*gradStride, sizeof(Pel)*(width)); ::memcpy(gradYTmp - gradStride, gradYTmp, sizeof(Pel)*(width)); ::memcpy(gradYTmp + heightInside*gradStride, gradYTmp + (heightInside - 1)*gradStride, sizeof(Pel)*(width)); +#if JVET_O0070_PROF + } +#endif } template< X86_VEXT vext > @@ -934,6 +1200,13 @@ void PelBufferOps::_initPelBufOpsX86() removeHighFreq8 = removeHighFreq_SSE<vext, 8>; removeHighFreq4 = removeHighFreq_SSE<vext, 4>; #endif +#if JVET_O0070_PROF + profGradFilter = gradFilter_SSE<vext, false>; + applyPROF = applyPROF_SSE<vext>; + applyBiPROF[1] = applyBiPROF_SSE<vext>; + applyBiPROF[0] = applyBiPROF_SSE<vext, false>; + roundIntVector = roundIntVector_SIMD<vext>; +#endif } template void PelBufferOps::_initPelBufOpsX86<SIMDX86>(); diff --git a/source/Lib/DecoderLib/CABACReader.cpp b/source/Lib/DecoderLib/CABACReader.cpp index ea18a3700b54f48b2e38f21104ca2a13978448ee..1289aeaa535a7efa74f84384425c757e6064cf81 100755 --- a/source/Lib/DecoderLib/CABACReader.cpp +++ b/source/Lib/DecoderLib/CABACReader.cpp @@ -3032,7 +3032,11 @@ void CABACReader::residual_lfnst_mode( CodingUnit& cu ) if( cu.ispMode != NOT_INTRA_SUBPARTITIONS || cu.mipFlag == true || ( CS::isDualITree( *cu.cs ) && cu.chType == CHANNEL_TYPE_CHROMA && std::min( cu.blocks[ 1 ].width, cu.blocks[ 1 ].height ) < 4 ) #if JVET_O0213_RESTRICT_LFNST_TO_MAX_TB_SIZE +#if JVET_O0545_MAX_TB_SIGNALLING + || ( cu.blocks[ 0 ].width > cu.cs->sps->getMaxTbSize() || cu.blocks[ 0 ].height > cu.cs->sps->getMaxTbSize() ) +#else || ( cu.blocks[ 0 ].width > MAX_TB_SIZEY || cu.blocks[ 0 ].height > MAX_TB_SIZEY ) +#endif #endif ) { diff --git a/source/Lib/DecoderLib/DecLib.cpp b/source/Lib/DecoderLib/DecLib.cpp index 6bfde1cb5e2e09fa89f18e0020379ccc0571a1cf..a8b680961d7c05a057bdd5c29dd37484560c3795 100644 --- a/source/Lib/DecoderLib/DecLib.cpp +++ 
b/source/Lib/DecoderLib/DecLib.cpp @@ -206,10 +206,8 @@ bool tryDecodePicture( Picture* pcEncPic, const int expectedPoc, const std::stri { std::copy( pic->getAlfCtuEnableFlag()[compIdx].begin(), pic->getAlfCtuEnableFlag()[compIdx].end(), pcEncPic->getAlfCtuEnableFlag()[compIdx].begin() ); } -#if JVET_N0415_CTB_ALF pcEncPic->resizeAlfCtbFilterIndex(pic->cs->pcv->sizeInCtus); memcpy( pcEncPic->getAlfCtbFilterIndex(), pic->getAlfCtbFilterIndex(), sizeof(short)*pic->cs->pcv->sizeInCtus ); -#endif #if JVET_O0090_ALF_CHROMA_FILTER_ALTERNATIVES_CTB std::copy( pic->getAlfCtuAlternative(COMPONENT_Cb).begin(), pic->getAlfCtuAlternative(COMPONENT_Cb).end(), pcEncPic->getAlfCtuAlternative(COMPONENT_Cb).begin() ); @@ -742,7 +740,11 @@ void DecLib::xActivateParameterSets() if (m_bFirstSliceInPicture) { APS** apss = m_parameterSetManager.getAPSs(); +#if JVET_O_MAX_NUM_ALF_APS_8 + memset(apss, 0, sizeof(*apss) * ALF_CTB_MAX_NUM_APS); +#else memset(apss, 0, sizeof(*apss) * MAX_NUM_APS); +#endif const PPS *pps = m_parameterSetManager.getPPS(m_apcSlicePilot->getPPSId()); // this is a temporary PPS object. 
Do not store this value CHECK(pps == 0, "No PPS present"); @@ -937,7 +939,11 @@ void DecLib::xActivateParameterSets() { EXIT("Error - a new PPS has been decoded while processing a picture"); } +#if JVET_O_MAX_NUM_ALF_APS_8 + for (int i = 0; i < ALF_CTB_MAX_NUM_APS; i++) +#else for (int i = 0; i < MAX_NUM_APS; i++) +#endif { APS* aps = m_parameterSetManager.getAPS(i, ALF_APS); if (aps && m_parameterSetManager.getAPSChangedFlag(i, ALF_APS)) @@ -1195,7 +1201,7 @@ bool DecLib::xDecodeSlice(InputNALUnit &nalu, int &iSkipFrame, int iPOCLastDispl if (endCtuIdx == numberOfCtusInFrame) EXIT("Cannot find the last CTU index of the current slice"); - while (pcSlice->getSliceCurEndBrickIdx() == tileMap.getBrickIdxBsMap(endCtuIdx) && endCtuIdx < numberOfCtusInFrame) + while ( (endCtuIdx < numberOfCtusInFrame) && (pcSlice->getSliceCurEndBrickIdx() == tileMap.getBrickIdxBsMap(endCtuIdx)) ) { endCtuIdx++; } @@ -1437,6 +1443,7 @@ void DecLib::xDecodeVPS( InputNALUnit& nalu ) VPS* vps = new VPS(); m_HLSReader.setBitstream( &nalu.getBitstream() ); m_HLSReader.parseVPS( vps ); + delete vps; } void DecLib::xDecodeDPS( InputNALUnit& nalu ) diff --git a/source/Lib/DecoderLib/VLCReader.cpp b/source/Lib/DecoderLib/VLCReader.cpp index 03fbd78e7725625814697478597aaaddaf669940..992ef6f58986835074fecc0eade0fc3fd3d14717 100644 --- a/source/Lib/DecoderLib/VLCReader.cpp +++ b/source/Lib/DecoderLib/VLCReader.cpp @@ -1102,6 +1102,11 @@ void HLSyntaxReader::parseSPS(SPS* pcSPS) pcSPS->setBitDepth(CHANNEL_TYPE_CHROMA, 8 + uiCode); pcSPS->setQpBDOffset(CHANNEL_TYPE_CHROMA, (int) (6*uiCode) ); +#if JVET_O0919_TS_MIN_QP + READ_UVLC( uiCode, "min_qp_prime_ts_minus4" ); + pcSPS->setMinQpPrimeTsMinus4(CHANNEL_TYPE_LUMA, uiCode); +#endif + READ_UVLC( uiCode, "log2_max_pic_order_cnt_lsb_minus4" ); pcSPS->setBitsForPOC( 4 + uiCode ); CHECK(uiCode > 12, "Invalid code"); READ_FLAG( uiCode, "sps_idr_rpl_present_flag" ); pcSPS->setIDRRefParamListPresent( (bool) uiCode); @@ -1184,6 +1189,12 @@ void 
HLSyntaxReader::parseSPS(SPS* pcSPS) READ_UVLC(uiCode, "log2_min_luma_coding_block_size_minus2"); int log2MinCUSize = uiCode + 2; pcSPS->setLog2MinCodingBlockSize(log2MinCUSize); + +#if JVET_O0640_PICTURE_SIZE_CONSTRAINT + CHECK((pcSPS->getPicWidthInLumaSamples() % (std::max(8, int(pcSPS->getMaxCUWidth() >> (pcSPS->getMaxCodingDepth() - 1))))) != 0, "Coded frame width must be a multiple of Max(8, the minimum unit size)"); + CHECK((pcSPS->getPicHeightInLumaSamples() % (std::max(8, int(pcSPS->getMaxCUHeight() >> (pcSPS->getMaxCodingDepth() - 1))))) != 0, "Coded frame height must be a multiple of Max(8, the minimum unit size)"); +#endif + READ_FLAG(uiCode, "partition_constraints_override_enabled_flag"); pcSPS->setSplitConsOverrideEnabledFlag(uiCode); READ_UVLC(uiCode, "sps_log2_diff_min_qt_min_cb_intra_tile_group_luma"); minQT[0] = 1 << (uiCode + pcSPS->getLog2MinCodingBlockSize()); READ_UVLC(uiCode, "sps_log2_diff_min_qt_min_cb_inter_tile_group"); minQT[1] = 1 << (uiCode + pcSPS->getLog2MinCodingBlockSize()); @@ -1307,6 +1318,9 @@ void HLSyntaxReader::parseSPS(SPS* pcSPS) if ( pcSPS->getUseAffine() ) { READ_FLAG( uiCode, "affine_type_flag" ); pcSPS->setUseAffineType ( uiCode != 0 ); +#if JVET_O0070_PROF + READ_FLAG( uiCode, "sps_prof_enabled_flag"); pcSPS->setUsePROF ( uiCode != 0 ); +#endif #if JVET_O0438_SPS_AFFINE_AMVR_FLAG READ_FLAG( uiCode, "sps_affine_amvr_enabled_flag" ); pcSPS->setAffineAmvrEnabledFlag ( uiCode != 0 ); #endif @@ -1804,7 +1818,11 @@ void HLSyntaxReader::parseSliceHeader (Slice* pcSlice, ParameterSetManager *para if (uiCode) { #if JVET_O0288_UNIFY_ALF_SLICE_TYPE_REMOVAL +#if JVET_O_MAX_NUM_ALF_APS_8 + READ_CODE(3, uiCode, "tile_group_num_APS"); +#else xReadTruncBinCode(uiCode, ALF_CTB_MAX_NUM_APS + 1); +#endif #else if (pcSlice->isIntra()) { @@ -1812,7 +1830,11 @@ void HLSyntaxReader::parseSliceHeader (Slice* pcSlice, ParameterSetManager *para } else { +#if JVET_O_MAX_NUM_ALF_APS_8 + READ_CODE(3, uiCode, "tile_group_num_APS"); +#else 
xReadTruncBinCode(uiCode, ALF_CTB_MAX_NUM_APS + 1); +#endif } #endif int numAps = uiCode; @@ -1820,7 +1842,11 @@ void HLSyntaxReader::parseSliceHeader (Slice* pcSlice, ParameterSetManager *para std::vector<int> apsId(numAps, -1); for (int i = 0; i < numAps; i++) { +#if JVET_O_MAX_NUM_ALF_APS_8 + READ_CODE(3, uiCode, "tile_group_aps_id"); +#else READ_CODE(5, uiCode, "tile_group_aps_id"); +#endif apsId[i] = uiCode; } @@ -1841,7 +1867,11 @@ void HLSyntaxReader::parseSliceHeader (Slice* pcSlice, ParameterSetManager *para if (alfChromaIdc) { #if JVET_O0288_UNIFY_ALF_SLICE_TYPE_REMOVAL +#if JVET_O_MAX_NUM_ALF_APS_8 + READ_CODE(3, uiCode, "tile_group_aps_id_chroma"); +#else READ_CODE(5, uiCode, "tile_group_aps_id_chroma"); +#endif #else if (pcSlice->isIntra() && pcSlice->getTileGroupNumAps() == 1) { @@ -1849,7 +1879,11 @@ void HLSyntaxReader::parseSliceHeader (Slice* pcSlice, ParameterSetManager *para } else { +#if JVET_O_MAX_NUM_ALF_APS_8 + READ_CODE(3, uiCode, "tile_group_aps_id_chroma"); +#else READ_CODE(5, uiCode, "tile_group_aps_id_chroma"); +#endif } #endif pcSlice->setTileGroupApsIdChroma(uiCode); diff --git a/source/Lib/EncoderLib/CABACWriter.cpp b/source/Lib/EncoderLib/CABACWriter.cpp index b86518d2c0323170de0afc46b056a4e9d6da48f4..c8e8f8f89b0246d1055e3ab167ca50dd7b86e341 100755 --- a/source/Lib/EncoderLib/CABACWriter.cpp +++ b/source/Lib/EncoderLib/CABACWriter.cpp @@ -2915,7 +2915,11 @@ void CABACWriter::residual_lfnst_mode( const CodingUnit& cu, CUCtx& cuCtx ) if( cu.ispMode != NOT_INTRA_SUBPARTITIONS || cu.mipFlag == true || ( CS::isDualITree( *cu.cs ) && cu.chType == CHANNEL_TYPE_CHROMA && std::min( cu.blocks[ 1 ].width, cu.blocks[ 1 ].height ) < 4 ) #if JVET_O0213_RESTRICT_LFNST_TO_MAX_TB_SIZE +#if JVET_O0545_MAX_TB_SIGNALLING + || ( cu.blocks[ 0 ].width > cu.cs->sps->getMaxTbSize() || cu.blocks[ 0 ].height > cu.cs->sps->getMaxTbSize() ) +#else || ( cu.blocks[ 0 ].width > MAX_TB_SIZEY || cu.blocks[ 0 ].height > MAX_TB_SIZEY ) +#endif #endif ) { diff --git 
a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp index 97956172c11aea7968071fdc8191c530a11b7eed..a3c85e561c39afd65f859d1c686060d30ed4b4ab 100644 --- a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp +++ b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp @@ -501,7 +501,11 @@ void EncAdaptiveLoopFilter::create( const EncCfg* encCfg, const int picWidth, co m_diffFilterCoeff[i] = new int[MAX_NUM_ALF_LUMA_COEFF]; } +#if JVET_O_MAX_NUM_ALF_APS_8 + m_apsIdStart = ALF_CTB_MAX_NUM_APS; +#else m_apsIdStart = (int)MAX_NUM_APS; +#endif m_ctbDistortionFixedFilter = new double[m_numCTUsInPic]; for (int comp = 0; comp < MAX_NUM_COMPONENT; comp++) { @@ -654,15 +658,27 @@ void EncAdaptiveLoopFilter::ALFProcess(CodingStructure& cs, const double *lambda { if (cs.slice->getPendingRasInit() || cs.slice->isIDRorBLA()) { +#if JVET_O_MAX_NUM_ALF_APS_8 + memset(cs.slice->getAlfAPSs(), 0, sizeof(*cs.slice->getAlfAPSs())*ALF_CTB_MAX_NUM_APS); + m_apsIdStart = ALF_CTB_MAX_NUM_APS; +#else memset(cs.slice->getAlfAPSs(), 0, sizeof(*cs.slice->getAlfAPSs())*MAX_NUM_APS); m_apsIdStart = (int)MAX_NUM_APS; +#endif m_apsMap->clear(); +#if JVET_O_MAX_NUM_ALF_APS_8 + for (int i = 0; i < ALF_CTB_MAX_NUM_APS; i++) +#else for (int i = 0; i < MAX_NUM_APS; i++) +#endif { APS* alfAPS = m_apsMap->getPS((i << NUM_APS_TYPE_LEN) + ALF_APS); m_apsMap->clearChangedFlag((i << NUM_APS_TYPE_LEN) + ALF_APS); if (alfAPS) + { + alfAPS->getAlfAPSParam().reset(); alfAPS = nullptr; + } } } AlfParam alfParam; @@ -2963,16 +2979,28 @@ void EncAdaptiveLoopFilter::setCtuEnableFlag( uint8_t** ctuFlags, ChannelType ch std::vector<int> EncAdaptiveLoopFilter::getAvaiApsIdsLuma(CodingStructure& cs, int &newApsId) { APS** apss = cs.slice->getAlfAPSs(); +#if JVET_O_MAX_NUM_ALF_APS_8 + for (int i = 0; i < ALF_CTB_MAX_NUM_APS; i++) +#else for (int i = 0; i < MAX_NUM_APS; i++) +#endif { apss[i] = m_apsMap->getPS((i << NUM_APS_TYPE_LEN) + ALF_APS); } std::vector<int> result; int 
apsIdChecked = 0, curApsId = m_apsIdStart; +#if JVET_O_MAX_NUM_ALF_APS_8 + if (curApsId < ALF_CTB_MAX_NUM_APS) +#else if (curApsId < int(MAX_NUM_APS)) +#endif { +#if JVET_O_MAX_NUM_ALF_APS_8 + while (apsIdChecked < ALF_CTB_MAX_NUM_APS && !cs.slice->isIntra() && result.size() < ALF_CTB_MAX_NUM_APS && !cs.slice->getPendingRasInit() && !cs.slice->isIDRorBLA()) +#else while (apsIdChecked < MAX_NUM_APS && !cs.slice->isIntra() && result.size() < (ALF_CTB_MAX_NUM_APS - 1) && !cs.slice->getPendingRasInit() && !cs.slice->isIDRorBLA()) +#endif { APS* curAPS = cs.slice->getAlfAPSs()[curApsId]; if (curAPS && curAPS->getTemporalId() <= cs.slice->getTLayer() && curAPS->getAlfAPSParam().newFilterFlag[CHANNEL_TYPE_LUMA]) @@ -2980,7 +3008,11 @@ std::vector<int> EncAdaptiveLoopFilter::getAvaiApsIdsLuma(CodingStructure& cs, i result.push_back(curApsId); } apsIdChecked++; +#if JVET_O_MAX_NUM_ALF_APS_8 + curApsId = (curApsId + 1) % ALF_CTB_MAX_NUM_APS; +#else curApsId = (curApsId + 1) % MAX_NUM_APS; +#endif } } cs.slice->setTileGroupNumAps((int)result.size()); @@ -2988,10 +3020,17 @@ std::vector<int> EncAdaptiveLoopFilter::getAvaiApsIdsLuma(CodingStructure& cs, i newApsId = m_apsIdStart - 1; if (newApsId < 0) { +#if JVET_O_MAX_NUM_ALF_APS_8 + newApsId = ALF_CTB_MAX_NUM_APS - 1; +#else newApsId = (int)MAX_NUM_APS - 1; +#endif } - +#if JVET_O_MAX_NUM_ALF_APS_8 + CHECK(newApsId >= ALF_CTB_MAX_NUM_APS, "Wrong APS index assignment in getAvaiApsIdsLuma"); +#else CHECK(newApsId >= (int)MAX_NUM_APS, "Wrong APS index assignment in getAvaiApsIdsLuma"); +#endif return result; } void EncAdaptiveLoopFilter::initDistortion() @@ -3059,6 +3098,12 @@ void EncAdaptiveLoopFilter::alfEncoderCtb(CodingStructure& cs, AlfParam& alfPar int numIter = useNewFilter ? 
2 : 1; for (int numTemporalAps = 0; numTemporalAps <= apsIds.size(); numTemporalAps++) { +#if JVET_O_MAX_NUM_ALF_APS_8 + if (numTemporalAps + useNewFilter >= ALF_CTB_MAX_NUM_APS) + { + continue; + } +#endif cs.slice->setTileGroupNumAps(numTemporalAps + useNewFilter); int numFilterSet = NUM_FIXED_FILTER_SETS + numTemporalAps + useNewFilter; if (numTemporalAps == apsIds.size() && numTemporalAps > 0 && useNewFilter && newApsId == apsIds.back()) //last temporalAPS is occupied by new filter set and this temporal APS becomes unavailable @@ -3069,7 +3114,11 @@ void EncAdaptiveLoopFilter::alfEncoderCtb(CodingStructure& cs, AlfParam& alfPar { m_alfParamTemp = alfParamNewFilters; m_alfParamTemp.enabledFlag[CHANNEL_TYPE_LUMA] = true; +#if JVET_O_MAX_NUM_ALF_APS_8 + double curCost = 3 * m_lambda[CHANNEL_TYPE_LUMA]; +#else double curCost = getTBlength(numTemporalAps + useNewFilter, ALF_CTB_MAX_NUM_APS + 1) * m_lambda[CHANNEL_TYPE_LUMA]; +#endif if (iter > 0) //re-derive new filter-set { double dDistOrgNewFilter = 0; @@ -3229,9 +3278,17 @@ void EncAdaptiveLoopFilter::alfEncoderCtb(CodingStructure& cs, AlfParam& alfPar } } //for(ctbIdx) #if JVET_O0288_UNIFY_ALF_SLICE_TYPE_REMOVAL +#if JVET_O_MAX_NUM_ALF_APS_8 + int tmpBits = bitsNewFilter + 3 * (numFilterSet - NUM_FIXED_FILTER_SETS); +#else int tmpBits = bitsNewFilter + 5 * (numFilterSet - NUM_FIXED_FILTER_SETS) + getTBlength(numFilterSet - NUM_FIXED_FILTER_SETS, ALF_CTB_MAX_NUM_APS + 1); +#endif +#else +#if JVET_O_MAX_NUM_ALF_APS_8 + int tmpBits = bitsNewFilter + 3 * (numFilterSet - NUM_FIXED_FILTER_SETS) + (cs.slice->isIntra() ? 1 : 3); #else int tmpBits = bitsNewFilter + 5 * (numFilterSet - NUM_FIXED_FILTER_SETS) + (cs.slice->isIntra() ? 
1 : getTBlength(numFilterSet - NUM_FIXED_FILTER_SETS, ALF_CTB_MAX_NUM_APS + 1)); +#endif #endif curCost += tmpBits * m_lambda[COMPONENT_Y]; if (curCost < costMin) @@ -3291,6 +3348,7 @@ void EncAdaptiveLoopFilter::alfEncoderCtb(CodingStructure& cs, AlfParam& alfPar newAPS->setAPSType(ALF_APS); } newAPS->setAlfAPSParam(alfParamNewFiltersBest); + newAPS->getAlfAPSParam().newFilterFlag[CHANNEL_TYPE_CHROMA] = false; m_apsMap->setChangedFlag((newApsId << NUM_APS_TYPE_LEN) + ALF_APS); m_apsIdStart = newApsId; } @@ -3330,7 +3388,11 @@ void EncAdaptiveLoopFilter::alfEncoderCtb(CodingStructure& cs, AlfParam& alfPar curId--; if (curId < 0) { +#if JVET_O_MAX_NUM_ALF_APS_8 + curId = ALF_CTB_MAX_NUM_APS - 1; +#else curId = (int)MAX_NUM_APS - 1; +#endif } if (std::find(bestApsIds.begin(), bestApsIds.end(), curId) == bestApsIds.end()) { @@ -3338,7 +3400,11 @@ void EncAdaptiveLoopFilter::alfEncoderCtb(CodingStructure& cs, AlfParam& alfPar } } } +#if JVET_O_MAX_NUM_ALF_APS_8 + for (int curApsId = 0; curApsId < ALF_CTB_MAX_NUM_APS; curApsId++) +#else for (int curApsId = 0; curApsId < MAX_NUM_APS; curApsId++) +#endif { if ((cs.slice->getPendingRasInit() || cs.slice->isIDRorBLA() || cs.slice->isIntra()) && curApsId != newApsIdChroma) { @@ -3346,9 +3412,17 @@ void EncAdaptiveLoopFilter::alfEncoderCtb(CodingStructure& cs, AlfParam& alfPar } APS* curAPS = m_apsMap->getPS((curApsId << NUM_APS_TYPE_LEN) + ALF_APS); #if JVET_O0288_UNIFY_ALF_SLICE_TYPE_REMOVAL +#if JVET_O_MAX_NUM_ALF_APS_8 + double curCost = m_lambda[CHANNEL_TYPE_CHROMA] * 3; +#else double curCost = m_lambda[CHANNEL_TYPE_CHROMA] * 5; +#endif +#else +#if JVET_O_MAX_NUM_ALF_APS_8 + double curCost = (cs.slice->isIntra() && cs.slice->getTileGroupNumAps() == 1) ? 0 : (m_lambda[CHANNEL_TYPE_CHROMA] * 3); #else double curCost = (cs.slice->isIntra() && cs.slice->getTileGroupNumAps() == 1) ? 
0 : (m_lambda[CHANNEL_TYPE_CHROMA] * 5); +#endif #endif if (curApsId == newApsIdChroma) { @@ -3501,6 +3575,10 @@ void EncAdaptiveLoopFilter::alfEncoderCtb(CodingStructure& cs, AlfParam& alfPar newAPS->getAlfAPSParam().reset(); } newAPS->getAlfAPSParam().newFilterFlag[CHANNEL_TYPE_CHROMA] = true; + if (!alfParamNewFiltersBest.newFilterFlag[CHANNEL_TYPE_LUMA]) + { + newAPS->getAlfAPSParam().newFilterFlag[CHANNEL_TYPE_LUMA] = false; + } #if JVET_O0090_ALF_CHROMA_FILTER_ALTERNATIVES_CTB newAPS->getAlfAPSParam().numAlternativesChroma = alfParamNewFilters.numAlternativesChroma; for( int altIdx = 0; altIdx < MAX_NUM_ALF_ALTERNATIVES_CHROMA; ++altIdx ) diff --git a/source/Lib/EncoderLib/EncCfg.h b/source/Lib/EncoderLib/EncCfg.h index 1e4eac65681e3d329b4d2a915d273abd1459615b..53f499ce905001924a86f96635d6d4fb98ed8fde 100644 --- a/source/Lib/EncoderLib/EncCfg.h +++ b/source/Lib/EncoderLib/EncCfg.h @@ -276,6 +276,9 @@ protected: int m_SubPuMvpMode; bool m_Affine; bool m_AffineType; +#if JVET_O0070_PROF + bool m_PROF; +#endif bool m_BIO; bool m_SMVD; @@ -819,6 +822,10 @@ public: bool getAffine () const { return m_Affine; } void setAffineType( bool b ) { m_AffineType = b; } bool getAffineType() const { return m_AffineType; } +#if JVET_O0070_PROF + void setPROF (bool b) { m_PROF = b; } + bool getPROF () const { return m_PROF; } +#endif void setBIO(bool b) { m_BIO = b; } bool getBIO() const { return m_BIO; } diff --git a/source/Lib/EncoderLib/EncCu.cpp b/source/Lib/EncoderLib/EncCu.cpp index ec041e0b1dd662cde38d747146cc91a938189bd2..fe5b1633dd3eeefe92f4f1847336c1a60cf289bc 100644 --- a/source/Lib/EncoderLib/EncCu.cpp +++ b/source/Lib/EncoderLib/EncCu.cpp @@ -1243,7 +1243,13 @@ void EncCu::xCheckModeSplit(CodingStructure *&tempCS, CodingStructure *&bestCS, bestSubCS->sharedBndPos.y = (m_shareState == SHARING) ? m_shareBndPosY : tempSubCS->area.Y().lumaPos().y; bestSubCS->sharedBndSize.width = (m_shareState == SHARING) ? 
m_shareBndSizeW : tempSubCS->area.lwidth(); bestSubCS->sharedBndSize.height = (m_shareState == SHARING) ? m_shareBndSizeH : tempSubCS->area.lheight(); +#if JVET_O0070_PROF + tempSubCS->bestParent = bestSubCS->bestParent = bestCS; +#endif xCompressCU( tempSubCS, bestSubCS, partitioner ); +#if JVET_O0070_PROF + tempSubCS->bestParent = bestSubCS->bestParent = nullptr; +#endif if( bestSubCS->cost == MAX_DOUBLE ) { @@ -1395,7 +1401,11 @@ void EncCu::xCheckRDCostIntra( CodingStructure *&tempCS, CodingStructure *&bestC #if JVET_O0213_RESTRICT_LFNST_TO_MAX_TB_SIZE const int maxLfnstIdx = ( CS::isDualITree( *tempCS ) && partitioner.chType == CHANNEL_TYPE_CHROMA && ( partitioner.currArea().lwidth() < 8 || partitioner.currArea().lheight() < 8 ) ) +#if JVET_O0545_MAX_TB_SIGNALLING + || ( partitioner.currArea().lwidth() > sps.getMaxTbSize() || partitioner.currArea().lheight() > sps.getMaxTbSize() ) ? 0 : 2; +#else || ( partitioner.currArea().lwidth() > MAX_TB_SIZEY || partitioner.currArea().lheight() > MAX_TB_SIZEY ) ? 0 : 2; +#endif #else const int maxLfnstIdx = CS::isDualITree( *tempCS ) && partitioner.chType == CHANNEL_TYPE_CHROMA && ( partitioner.currArea().lwidth() < 8 || partitioner.currArea().lheight() < 8 ) ? 0 : 2; #endif diff --git a/source/Lib/EncoderLib/EncGOP.cpp b/source/Lib/EncoderLib/EncGOP.cpp index c348ae0e23623207e95894ef085a82df0ac29ec8..c6dcaa5c6b03e0c659b03e8cad17ef6ed9a1ca89 100644 --- a/source/Lib/EncoderLib/EncGOP.cpp +++ b/source/Lib/EncoderLib/EncGOP.cpp @@ -2494,7 +2494,11 @@ void EncGOP::compressGOP( int iPOCLast, int iNumPicRcvd, PicList& rcListPic, if (pcSlice->getSPS()->getALFEnabledFlag() && pcSlice->getTileGroupAlfEnabledFlag(COMPONENT_Y)) { +#if JVET_O_MAX_NUM_ALF_APS_8 + for (int apsId = 0; apsId < ALF_CTB_MAX_NUM_APS; apsId++) +#else for (int apsId = 0; apsId < MAX_NUM_APS; apsId++) //HD: shouldn't this be looping over slice_alf_aps_id_luma[ i ]? By looping over MAX_NUM_APS, it is possible unused ALF APS is written. Please check! 
+#endif { ParameterSetMap<APS> *apsMap = m_pcEncLib->getApsMap(); diff --git a/source/Lib/EncoderLib/EncLib.cpp b/source/Lib/EncoderLib/EncLib.cpp index 6fdade2d5d17d78b44901c3ceb22bfa2e3dded80..dd743359da715987bb89226d26cce6ec2fc741fc 100644 --- a/source/Lib/EncoderLib/EncLib.cpp +++ b/source/Lib/EncoderLib/EncLib.cpp @@ -904,6 +904,9 @@ void EncLib::xInitSPS(SPS &sps) sps.setBDOFEnabledFlag ( m_BIO ); sps.setUseAffine ( m_Affine ); sps.setUseAffineType ( m_AffineType ); +#if JVET_O0070_PROF + sps.setUsePROF ( m_PROF ); +#endif sps.setUseLMChroma ( m_LMChroma ? true : false ); sps.setCclmCollocatedChromaFlag( m_cclmCollocatedChromaFlag ); sps.setUseMTS ( m_IntraMTS || m_InterMTS || m_ImplicitMTS ); @@ -980,6 +983,9 @@ void EncLib::xInitSPS(SPS &sps) { sps.setBitDepth (ChannelType(channelType), m_bitDepth[channelType] ); sps.setQpBDOffset (ChannelType(channelType), (6 * (m_bitDepth[channelType] - 8))); +#if JVET_O0919_TS_MIN_QP + sps.setMinQpPrimeTsMinus4(ChannelType(channelType), (6 * (m_bitDepth[channelType] - m_inputBitDepth[channelType]))); +#endif sps.setPCMBitDepth (ChannelType(channelType), m_PCMBitDepth[channelType] ); } diff --git a/source/Lib/EncoderLib/EncLib.h b/source/Lib/EncoderLib/EncLib.h index d1ea9637df1cdfaa0f45384972bd31cc9dab3190..71bcf0d1b5abc5faa62e6359660f179ffc9b0ef7 100644 --- a/source/Lib/EncoderLib/EncLib.h +++ b/source/Lib/EncoderLib/EncLib.h @@ -138,7 +138,11 @@ private: CacheModel m_cacheModel; #endif +#if JVET_O_MAX_NUM_ALF_APS_8 + APS* m_apss[ALF_CTB_MAX_NUM_APS]; +#else APS* m_apss[MAX_NUM_APS]; +#endif APS* m_lmcsAPS; diff --git a/source/Lib/EncoderLib/InterSearch.cpp b/source/Lib/EncoderLib/InterSearch.cpp index b1314953456596e4be9fd29616d3493cb45d0dcc..aa185316bef82966e93cd0a1f7feed517fb2d049 100644 --- a/source/Lib/EncoderLib/InterSearch.cpp +++ b/source/Lib/EncoderLib/InterSearch.cpp @@ -2339,6 +2339,13 @@ void InterSearch::predInterSearch(CodingUnit& cu, Partitioner& partitioner) } { +#if JVET_O0070_PROF + if 
(pu.cu->cs->bestParent != nullptr && pu.cu->cs->bestParent->getCU(CHANNEL_TYPE_LUMA) != nullptr && pu.cu->cs->bestParent->getCU(CHANNEL_TYPE_LUMA)->affine == false) + { + m_skipPROF = true; + } + m_encOnly = true; +#endif // motion estimation only evaluates luma component m_maxCompIDToPred = MAX_NUM_COMPONENT; // m_maxCompIDToPred = COMPONENT_Y; @@ -3090,6 +3097,10 @@ void InterSearch::predInterSearch(CodingUnit& cu, Partitioner& partitioner) PU::spanMotionInfo( pu, mergeCtx ); } +#if JVET_O0070_PROF + m_skipPROF = false; + m_encOnly = false; +#endif // MC PelUnitBuf predBuf = pu.cs->getPredBuf(pu); if ( gbiIdx == GBI_DEFAULT || !m_affineMotion.affine4ParaAvail || !m_affineMotion.affine6ParaAvail ) @@ -3324,6 +3335,9 @@ Distortion InterSearch::xGetAffineTemplateCost( PredictionUnit& pu, PelUnitBuf& const bool bi = pu.cu->slice->testWeightPred() && pu.cu->slice->getSliceType()==P_SLICE; Mv mv[3]; memcpy(mv, acMvCand, sizeof(mv)); +#if JVET_O0070_PROF + m_iRefListIdx = eRefPicList; +#endif xPredAffineBlk(COMPONENT_Y, pu, picRef, mv, predBuf, bi, pu.cu->slice->clpRng(COMPONENT_Y)); if( bi ) { @@ -4989,6 +5003,9 @@ void InterSearch::xPredAffineInterSearch( PredictionUnit& pu, { tryBipred = 1; pu.interDir = 3; +#if JVET_O0070_PROF + m_isBi = true; +#endif // Set as best list0 and list1 iRefIdxBi[0] = iRefIdx[0]; iRefIdxBi[1] = iRefIdx[1]; @@ -5189,6 +5206,9 @@ void InterSearch::xPredAffineInterSearch( PredictionUnit& pu, } } // for loop-iter } +#if JVET_O0070_PROF + m_isBi = false; +#endif } // if (B_SLICE) pu.mv [REF_PIC_LIST_0] = Mv(); @@ -5462,6 +5482,9 @@ void InterSearch::xAffineMotionEstimation( PredictionUnit& pu, PelUnitBuf origBufTmp = m_tmpStorageLCU.getBuf( UnitAreaRelative( *pu.cu, pu ) ); enum DFunc distFunc = (pu.cu->transQuantBypass || pu.cs->slice->getDisableSATDForRD()) ? 
DF_SAD : DF_HAD; +#if JVET_O0070_PROF + m_iRefListIdx = eRefPicList; +#endif // if Bi, set to ( 2 * Org - ListX ) if ( bBi ) diff --git a/source/Lib/EncoderLib/VLCWriter.cpp b/source/Lib/EncoderLib/VLCWriter.cpp index 7b93acc8c47fcc1bb37ac741664ffcc4bbb16621..f562e86d0da6e16d0c77114beddc91d16f2845d5 100644 --- a/source/Lib/EncoderLib/VLCWriter.cpp +++ b/source/Lib/EncoderLib/VLCWriter.cpp @@ -748,6 +748,10 @@ void HLSWriter::codeSPS( const SPS* pcSPS ) const bool chromaEnabled = isChromaEnabled(format); WRITE_UVLC( chromaEnabled ? (pcSPS->getBitDepth(CHANNEL_TYPE_CHROMA) - 8):0, "bit_depth_chroma_minus8" ); +#if JVET_O0919_TS_MIN_QP + WRITE_UVLC( pcSPS->getMinQpPrimeTsMinus4(CHANNEL_TYPE_LUMA), "min_qp_prime_ts_minus4" ); +#endif + WRITE_UVLC( pcSPS->getBitsForPOC()-4, "log2_max_pic_order_cnt_lsb_minus4" ); WRITE_FLAG( pcSPS->getIDRRefParamListPresent(), "sps_idr_rpl_present_flag" ); // KJS: Marakech decision: sub-layers added back @@ -904,6 +908,9 @@ void HLSWriter::codeSPS( const SPS* pcSPS ) if ( pcSPS->getUseAffine() ) { WRITE_FLAG( pcSPS->getUseAffineType() ? 1 : 0, "affine_type_flag" ); +#if JVET_O0070_PROF + WRITE_FLAG( pcSPS->getUsePROF() ? 1 : 0, "sps_prof_enabled_flag" ); +#endif #if JVET_O0438_SPS_AFFINE_AMVR_FLAG WRITE_FLAG( pcSPS->getAffineAmvrEnabledFlag() ? 
1 : 0, "sps_affine_amvr_enabled_flag" ); #endif @@ -1271,7 +1278,11 @@ void HLSWriter::codeSliceHeader ( Slice* pcSlice ) if (alfEnabled) { #if JVET_O0288_UNIFY_ALF_SLICE_TYPE_REMOVAL +#if JVET_O_MAX_NUM_ALF_APS_8 + WRITE_CODE(pcSlice->getTileGroupNumAps(), 3, "tile_group_num_aps"); +#else xWriteTruncBinCode(pcSlice->getTileGroupNumAps(), ALF_CTB_MAX_NUM_APS + 1); +#endif #else if (pcSlice->isIntra()) { @@ -1279,13 +1290,21 @@ void HLSWriter::codeSliceHeader ( Slice* pcSlice ) } else { +#if JVET_O_MAX_NUM_ALF_APS_8 + WRITE_CODE(pcSlice->getTileGroupNumAps(), 3, "tile_group_num_aps"); +#else xWriteTruncBinCode(pcSlice->getTileGroupNumAps(), ALF_CTB_MAX_NUM_APS + 1); +#endif } #endif const std::vector<int>& apsId = pcSlice->getTileGroupApsIdLuma(); for (int i = 0; i < pcSlice->getTileGroupNumAps(); i++) { +#if JVET_O_MAX_NUM_ALF_APS_8 + WRITE_CODE(apsId[i], 3, "tile_group_aps_id"); +#else WRITE_CODE(apsId[i], 5, "tile_group_aps_id"); +#endif } const int alfChromaIdc = pcSlice->getTileGroupAlfEnabledFlag(COMPONENT_Cb) + pcSlice->getTileGroupAlfEnabledFlag(COMPONENT_Cr) * 2 ; @@ -1300,7 +1319,11 @@ void HLSWriter::codeSliceHeader ( Slice* pcSlice ) if (alfChromaIdc) { #if JVET_O0288_UNIFY_ALF_SLICE_TYPE_REMOVAL +#if JVET_O_MAX_NUM_ALF_APS_8 + WRITE_CODE(pcSlice->getTileGroupApsIdChroma(), 3, "tile_group_aps_id_chroma"); +#else WRITE_CODE(pcSlice->getTileGroupApsIdChroma(), 5, "tile_group_aps_id_chroma"); +#endif #else if (pcSlice->isIntra()&& pcSlice->getTileGroupNumAps() == 1) { @@ -1308,7 +1331,11 @@ void HLSWriter::codeSliceHeader ( Slice* pcSlice ) } else { +#if JVET_O_MAX_NUM_ALF_APS_8 + WRITE_CODE(pcSlice->getTileGroupApsIdChroma(), 3, "tile_group_aps_id_chroma"); +#else WRITE_CODE(pcSlice->getTileGroupApsIdChroma(), 5, "tile_group_aps_id_chroma"); +#endif } #endif }