From c0f0d7b6993e9be41d18e289538e85a985f016e2 Mon Sep 17 00:00:00 2001 From: Ruoyang Yu <ruoyyu@qti.qualcomm.com> Date: Fri, 15 Nov 2024 17:37:12 +0000 Subject: [PATCH] JVET-AJ0237: 12-bit internal bit depth modifications for ECM --- source/App/EncoderApp/EncAppCfg.cpp | 7 + source/Lib/CommonLib/AdaptiveLoopFilter.cpp | 81 +++++++ source/Lib/CommonLib/AdaptiveLoopFilter.h | 8 + source/Lib/CommonLib/AlfParameters.h | 15 +- source/Lib/CommonLib/BilateralFilter.cpp | 107 ++++++++- source/Lib/CommonLib/BilateralFilter.h | 17 ++ source/Lib/CommonLib/CommonDef.h | 11 + source/Lib/CommonLib/InterPrediction.cpp | 107 +++++++++ source/Lib/CommonLib/InterPrediction.h | 12 + source/Lib/CommonLib/InterpolationFilter.cpp | 28 +++ source/Lib/CommonLib/InterpolationFilter.h | 4 + source/Lib/CommonLib/IntraPrediction.cpp | 222 +++++++++++++++++- source/Lib/CommonLib/IntraPrediction.h | 28 +++ source/Lib/CommonLib/Picture.cpp | 3 + source/Lib/CommonLib/Rom.cpp | 12 + source/Lib/CommonLib/Rom.h | 12 + source/Lib/CommonLib/SampleAdaptiveOffset.cpp | 173 ++++++++++++++ source/Lib/CommonLib/Slice.h | 14 ++ source/Lib/CommonLib/TrQuant.cpp | 60 +++++ source/Lib/CommonLib/TypeDef.h | 1 + .../Lib/CommonLib/x86/AdaptiveLoopFilterX86.h | 152 +++++++++++- source/Lib/CommonLib/x86/BilateralFilterX86.h | 107 +++++++++ source/Lib/CommonLib/x86/BufferX86.h | 4 + .../CommonLib/x86/InterpolationFilterX86.h | 160 +++++++++++++ source/Lib/CommonLib/x86/IntraX86.h | 56 +++++ source/Lib/CommonLib/x86/RdCostX86.h | 40 ++++ source/Lib/DecoderLib/DecLib.cpp | 23 ++ source/Lib/DecoderLib/VLCReader.cpp | 15 ++ .../Lib/EncoderLib/EncAdaptiveLoopFilter.cpp | 12 + source/Lib/EncoderLib/EncCu.cpp | 3 + source/Lib/EncoderLib/EncGOP.cpp | 27 +++ source/Lib/EncoderLib/EncGOP.h | 8 + source/Lib/EncoderLib/EncLib.cpp | 9 + .../EncoderLib/EncSampleAdaptiveOffset.cpp | 128 ++++++++++ .../Lib/EncoderLib/EncSampleAdaptiveOffset.h | 9 + source/Lib/EncoderLib/EncTemporalFilter.cpp | 67 ++++++ 
source/Lib/EncoderLib/EncTemporalFilter.h | 16 ++ source/Lib/EncoderLib/InterSearch.cpp | 8 + source/Lib/EncoderLib/IntraSearch.cpp | 24 ++ source/Lib/EncoderLib/IntraSearch.h | 4 + source/Lib/EncoderLib/VLCWriter.cpp | 11 + 41 files changed, 1797 insertions(+), 8 deletions(-) diff --git a/source/App/EncoderApp/EncAppCfg.cpp b/source/App/EncoderApp/EncAppCfg.cpp index 75c962989..64ff0e13e 100644 --- a/source/App/EncoderApp/EncAppCfg.cpp +++ b/source/App/EncoderApp/EncAppCfg.cpp @@ -3588,6 +3588,13 @@ bool EncAppCfg::parseCfg( int argc, char* argv[] ) } #endif +#if JVET_AJ0237_INTERNAL_12BIT + if ((m_internalBitDepth[CHANNEL_TYPE_LUMA] > 10) && m_CCSAO && (m_iQP >= 37) && (m_sourceWidth * m_sourceHeight > 1920 * 1080)) + { + m_CCSAO = false; + } +#endif + // check validity of input parameters if( xCheckParameter() ) { diff --git a/source/Lib/CommonLib/AdaptiveLoopFilter.cpp b/source/Lib/CommonLib/AdaptiveLoopFilter.cpp index ceaa4d3d1..875310240 100644 --- a/source/Lib/CommonLib/AdaptiveLoopFilter.cpp +++ b/source/Lib/CommonLib/AdaptiveLoopFilter.cpp @@ -2544,7 +2544,11 @@ void AdaptiveLoopFilter::alfAddCorrect( AlfClassifier** classifier, const PelUn int dstStride2 = dstStride * clsSizeY; int srcStride2 = srcStride * clsSizeY; +#if JVET_AJ0237_INTERNAL_12BIT + const Pel currBase = 1 << (clpRng.bd - 1); +#else const Pel currBase = 512; +#endif const int adjustOffCorr = (1 << (shiftCorr + shiftPrecis - 1)); const int offsetN = adjustOffCorr; const int offsetP = offsetN - 1; @@ -3182,7 +3186,11 @@ double AdaptiveLoopFilter::getScaleCorrDouble( const int s ) #if JVET_AG0157_ALF_CHROMA_FIXED_FILTER void AdaptiveLoopFilter::deriveFixFilterResultsBlkChroma(AlfClassifier ***classifier, Pel ***fixedFilterResults, const CPelBuf &src, const CPelBuf &srcBeforeDb, const Area &blkDst, const Area &blk, const int bits, CodingStructure& cs, const ClpRng &clpRng, const Pel clippingValues[4], int qp, int fixedFilterSetIdx, int mappingDir[NUM_DIR_FIX][NUM_DIR_FIX], uint32_t 
**laplacian[NUM_DIRECTIONS]) { +#if JVET_AJ0237_INTERNAL_12BIT + m_deriveVariance(src, blkDst, blk, laplacian, bits); +#else m_deriveVariance(src, blkDst, blk, laplacian); +#endif m_deriveClassificationLaplacian(src, blkDst, blk, laplacian, ALF_CLASSIFIER_FL_CHROMA); m_calcClass0(classifier[ALF_NUM_CLASSIFIER + 1], blkDst, blk, ALF_CLASSIFIER_FL_CHROMA + 10, 1, NUM_DIR_FIX, NUM_ACT_FIX, bits, 2, mappingDir, laplacian); @@ -5054,8 +5062,12 @@ void AdaptiveLoopFilter::deriveClassificationAndFixFilterResultsBlk( AlfClassifi #endif { #if JVET_AE0139_ALF_IMPROVED_FIXFILTER +#if JVET_AJ0237_INTERNAL_12BIT + m_deriveVariance(srcLuma, blkDst, blk, laplacian, bits); +#else m_deriveVariance(srcLuma, blkDst, blk, laplacian); #endif +#endif #if JVET_Z0105_LOOP_FILTER_VIRTUAL_BOUNDARY #if JVET_AG0157_ALF_CHROMA_FIXED_FILTER m_deriveClassificationLaplacian(srcLuma, blkDst, blk, laplacian, ALF_CLASSIFIER_FL); @@ -5622,7 +5634,11 @@ void AdaptiveLoopFilter::filterBlk(AlfClassifier **classifier, const PelUnitBuf adjustShift -= shiftPrecis; // add more precision } const int shift = adjustShift; +#if JVET_AJ0237_INTERNAL_12BIT + const Pel currBase = 1 << (clpRng.bd - 1); +#else const Pel currBase = 512; // 10-bits +#endif #else #if JVET_AG0158_ALF_LUMA_COEFF_PRECISION const int shift = coeffBits - 1; @@ -7055,8 +7071,16 @@ void AdaptiveLoopFilter::paddingFixedFilterResultsCtu(Pel*** fixedFilterResultsP } #if JVET_AE0139_ALF_IMPROVED_FIXFILTER +#if JVET_AJ0237_INTERNAL_12BIT +void AdaptiveLoopFilter::deriveVariance(const CPelBuf& srcLuma, const Area& blkDst, const Area& blk, uint32_t*** laplacian, int bits) +#else void AdaptiveLoopFilter::deriveVariance(const CPelBuf &srcLuma, const Area &blkDst, const Area &blk, uint32_t ***laplacian) +#endif { +#if JVET_AJ0237_INTERNAL_12BIT + int64_t tempData[4][(m_CLASSIFICATION_BLK_SIZE + 10) >> 1][((m_CLASSIFICATION_BLK_SIZE + 16) >> 1) + 8] = { { { 0 } } }; +#endif + int fl = DIST_CLASS; int stride = srcLuma.stride; int stride2 = 2 * stride; @@ 
-7072,12 +7096,43 @@ void AdaptiveLoopFilter::deriveVariance(const CPelBuf &srcLuma, const Area &blkD for (int j = 0; j < blk.width + fl2; j += 2) { int jOffset = j >> 1; + +#if JVET_AJ0237_INTERNAL_12BIT + tempData[0][iOffset][jOffset] = src[j] + src[j + 1] + src1[j] + src1[j + 1]; + tempData[1][iOffset][jOffset] = src[j] * src[j] + src[j + 1] * src[j + 1] + src1[j] * src1[j] + src1[j + 1] * src1[j + 1]; +#else laplacian[0][iOffset][jOffset] = src[j] + src[j + 1] + src1[j] + src1[j + 1]; laplacian[1][iOffset][jOffset] = src[j] * src[j] + src[j + 1] * src[j + 1] + src1[j] * src1[j] + src1[j + 1] * src1[j + 1]; +#endif int iOffsetM4 = iOffset - 4; int jOffsetM4 = jOffset - 4; +#if JVET_AJ0237_INTERNAL_12BIT + if (jOffsetM4 == 0) + { + tempData[2][iOffset][jOffsetM4] = tempData[0][iOffset][jOffset - 4] + tempData[0][iOffset][jOffset - 3] + tempData[0][iOffset][jOffset - 2] + tempData[0][iOffset][jOffset - 1] + tempData[0][iOffset][jOffset]; + tempData[3][iOffset][jOffsetM4] = tempData[1][iOffset][jOffset - 4] + tempData[1][iOffset][jOffset - 3] + tempData[1][iOffset][jOffset - 2] + tempData[1][iOffset][jOffset - 1] + tempData[1][iOffset][jOffset]; + } + else if (jOffsetM4 > 0) + { + tempData[2][iOffset][jOffsetM4] = tempData[2][iOffset][jOffset - 5] - tempData[0][iOffset][jOffset - 5] + tempData[0][iOffset][jOffset]; + tempData[3][iOffset][jOffsetM4] = tempData[3][iOffset][jOffset - 5] - tempData[1][iOffset][jOffset - 5] + tempData[1][iOffset][jOffset]; + } + + if ((iOffsetM4 >= 0) && (jOffsetM4 >= 0)) + { + if (iOffsetM4 == 0) + { + tempData[0][iOffsetM4][jOffsetM4] = tempData[2][iOffsetM4][jOffsetM4] + tempData[2][iOffset - 3][jOffsetM4] + tempData[2][iOffset - 2][jOffsetM4] + tempData[2][iOffset - 1][jOffsetM4] + tempData[2][iOffset][jOffsetM4]; + tempData[1][iOffsetM4][jOffsetM4] = tempData[3][iOffsetM4][jOffsetM4] + tempData[3][iOffset - 3][jOffsetM4] + tempData[3][iOffset - 2][jOffsetM4] + tempData[3][iOffset - 1][jOffsetM4] + tempData[3][iOffset][jOffsetM4]; + 
} + else + { + tempData[0][iOffsetM4][jOffsetM4] = tempData[0][iOffsetM4 - 1][jOffsetM4] - tempData[2][iOffsetM4 - 1][jOffsetM4] + tempData[2][iOffset][jOffsetM4]; + tempData[1][iOffsetM4][jOffsetM4] = tempData[1][iOffsetM4 - 1][jOffsetM4] - tempData[3][iOffsetM4 - 1][jOffsetM4] + tempData[3][iOffset][jOffsetM4]; + } +#else if (jOffsetM4 == 0) { laplacian[2][iOffset][jOffsetM4] = laplacian[0][iOffset][jOffset - 4] + laplacian[0][iOffset][jOffset - 3] + laplacian[0][iOffset][jOffset - 2] + laplacian[0][iOffset][jOffset - 1] + laplacian[0][iOffset][jOffset]; @@ -7101,8 +7156,14 @@ void AdaptiveLoopFilter::deriveVariance(const CPelBuf &srcLuma, const Area &blkD laplacian[0][iOffsetM4][jOffsetM4] = laplacian[0][iOffsetM4 - 1][jOffsetM4] - laplacian[2][iOffsetM4 - 1][jOffsetM4] + laplacian[2][iOffset][jOffsetM4]; laplacian[1][iOffsetM4][jOffsetM4] = laplacian[1][iOffsetM4 - 1][jOffsetM4] - laplacian[3][iOffsetM4 - 1][jOffsetM4] + laplacian[3][iOffset][jOffsetM4]; } +#endif +#if JVET_AJ0237_INTERNAL_12BIT + int bdShift = 2 * std::max(0, bits - 10); + laplacian[VARIANCE][iOffsetM4][jOffsetM4] = (uint32_t)((13 * ((numSample * tempData[1][iOffsetM4][jOffsetM4] - tempData[0][iOffsetM4][jOffsetM4] * tempData[0][iOffsetM4][jOffsetM4] + offset) >> 3)) >> (14 + bdShift)); +#else laplacian[VARIANCE][iOffsetM4][jOffsetM4] = (13 * ((numSample * laplacian[1][iOffsetM4][jOffsetM4] - laplacian[0][iOffsetM4][jOffsetM4] * laplacian[0][iOffsetM4][jOffsetM4] + offset) >> 3)) >> 14; +#endif } } src += stride2; @@ -7184,7 +7245,11 @@ void AdaptiveLoopFilter::deriveFixedFilterResultsPerBlkChroma(AlfClassifier ***c if (useSimd) { +#if JVET_AJ0237_INTERNAL_12BIT + m_deriveVariance(src, blk, blk, laplacian, bits); +#else m_deriveVariance(src, blk, blk, laplacian); +#endif m_deriveClassificationLaplacian(src, blk, blk, laplacian, ALF_CLASSIFIER_FL_CHROMA); m_calcClass0(classifier[ALF_NUM_CLASSIFIER + 1], blk, blk, ALF_CLASSIFIER_FL_CHROMA + 10, 1, NUM_DIR_FIX, NUM_ACT_FIX, bits, 2, mappingDir, 
laplacian); alfFixedFilterBlk(classifier[ALF_NUM_CLASSIFIER + 1], src, blk, blk, srcBeforeDb, fixedFilterResults, m_picWidth, fixedFilterSetIdx, targetFixedFilterSetInd, 0, clpRng, clippingValues, false @@ -7195,7 +7260,11 @@ void AdaptiveLoopFilter::deriveFixedFilterResultsPerBlkChroma(AlfClassifier ***c } else { +#if JVET_AJ0237_INTERNAL_12BIT + deriveVariance(src, blk, blk, laplacian, bits); +#else deriveVariance(src, blk, blk, laplacian); +#endif deriveClassificationLaplacian(src, blk, blk, laplacian, ALF_CLASSIFIER_FL_CHROMA); calcClass0Var(classifier[ALF_NUM_CLASSIFIER + 1], blk, blk, ALF_CLASSIFIER_FL_CHROMA + 10, 1, NUM_DIR_FIX, NUM_ACT_FIX, bits, 2, mappingDir, laplacian); alfFixedFilterBlkNonSimd(classifier[ALF_NUM_CLASSIFIER + 1], src, blk, blk, srcBeforeDb, fixedFilterResults, m_picWidth, fixedFilterSetIdx, targetFixedFilterSetInd, 0, clpRng, clippingValues, false @@ -7704,8 +7773,12 @@ void AdaptiveLoopFilter::deriveFixedFilterResultsPerBlk( AlfClassifier **classif if(useSimd) { #if JVET_AE0139_ALF_IMPROVED_FIXFILTER +#if JVET_AJ0237_INTERNAL_12BIT + m_deriveVariance(srcLuma, blkCur, blkCur, laplacian, bits); +#else m_deriveVariance(srcLuma, blkCur, blkCur, laplacian); #endif +#endif #if JVET_AG0157_ALF_CHROMA_FIXED_FILTER m_deriveClassificationLaplacian(srcLuma, blkCur, blkCur, laplacian, ALF_CLASSIFIER_FL); #else @@ -7715,8 +7788,12 @@ void AdaptiveLoopFilter::deriveFixedFilterResultsPerBlk( AlfClassifier **classif else { #if JVET_AE0139_ALF_IMPROVED_FIXFILTER +#if JVET_AJ0237_INTERNAL_12BIT + deriveVariance(srcLuma, blkCur, blkCur, laplacian, bits); +#else deriveVariance(srcLuma, blkCur, blkCur, laplacian); #endif +#endif #if JVET_AG0157_ALF_CHROMA_FIXED_FILTER deriveClassificationLaplacian(srcLuma, blkCur, blkCur, laplacian, ALF_CLASSIFIER_FL); #else @@ -8126,7 +8203,11 @@ void AdaptiveLoopFilter::gaussFiltering(CodingStructure &cs, Pel ***gaussPic, co int padSize = ALF_PADDING_SIZE_GAUSS_RESULTS; int shift = 10; const int numCoeff = 12; +#if 
JVET_AJ0237_INTERNAL_12BIT + int diffTH = 32 << std::max(0, cs.sps->getBitDepth(CHANNEL_TYPE_LUMA) - 10); +#else int diffTH = 32; +#endif #if JVET_AJ0188_CODING_INFO_CLASSIFICATION const bool isIntraSlice = cs.slice->isIntra(); const bool isSpsAdjust = cs.sps->getAlfLumaFixedFilterAdjust(); diff --git a/source/Lib/CommonLib/AdaptiveLoopFilter.h b/source/Lib/CommonLib/AdaptiveLoopFilter.h index d2dacb490..6a31efba7 100644 --- a/source/Lib/CommonLib/AdaptiveLoopFilter.h +++ b/source/Lib/CommonLib/AdaptiveLoopFilter.h @@ -180,14 +180,22 @@ public: void deriveFixedFilterResultsBlk( AlfClassifier*** classifier, const CPelBuf& srcLuma, const CPelBuf& srcLumaBeforeDb, const Area& blkDst, const Area& blk, const int bits, CodingStructure &cs, const ClpRng &clpRng, const Pel clippingValues[4], int qp, int winIdx, int fixedFilterSetIdx ); void deriveFixedFilterResults( AlfClassifier*** classifier, const CPelBuf& srcLuma, const CPelBuf& srcLumaBeforeDb, const Area& blkDst, const Area& blk, CodingStructure &cs, int winIdx, int fixedFilterSetIdx ); static void calcClass0Var( AlfClassifier **classifier, const Area &blkDst, const Area &cu, int dirWindSize, int classDir, int noDir, int noAct, int bitDepth, int subBlkSize, int mappingDir[NUM_DIR_FIX][NUM_DIR_FIX], uint32_t **laplacian[NUM_DIRECTIONS] ); +#if JVET_AJ0237_INTERNAL_12BIT + static void deriveVariance(const CPelBuf& srcLuma, const Area& blkDst, const Area& blk, uint32_t ***laplacian, int bd); +#else static void deriveVariance( const CPelBuf &srcLuma, const Area &blkDst, const Area &blk, uint32_t ***laplacian ); +#endif void deriveFixedFilterResultsCtuBoundary( AlfClassifier ***classifier, Pel ***fixedFilterResults, const CPelBuf &srcLuma, const CPelBuf &srcLumaBeforeDb, const Area &blkDst, const int bits, CodingStructure& cs, const ClpRng &clpRng, const Pel clippingValues[4], int qp, int fixedFilterSetIdx, int mappingDir[NUM_DIR_FIX][NUM_DIR_FIX], uint32_t **laplacian[NUM_DIRECTIONS], uint8_t* ctuEnableFlagLuma, uint8_t* 
ctuEnableOnlineLuma, int ctuIdx, int classifierIdx #if JVET_AJ0188_CODING_INFO_CLASSIFICATION , const CPelBuf& srcCodingInfo, const CPelBuf& srcResi #endif ); void deriveFixedFilterResultsPerBlk( AlfClassifier ***classifier, Pel ***fixedFilterResults, const CPelBuf &srcLuma, const CPelBuf &srcLumaBeforeDb, const Area &blkCur, const int bits, CodingStructure& cs, const ClpRng &clpRng, const Pel clippingValues[4], int qp, int fixedFilterSetIdx, int mappingDir[NUM_DIR_FIX][NUM_DIR_FIX], uint32_t **laplacian[NUM_DIRECTIONS], const int classifierIdx ); +#if JVET_AJ0237_INTERNAL_12BIT + void(*m_deriveVariance)(const CPelBuf& srcLuma, const Area& blkDst, const Area& blk, uint32_t ***variance, int bd); +#else void(*m_deriveVariance)(const CPelBuf &srcLuma, const Area &blkDst, const Area &blk, uint32_t ***variance); +#endif #if JVET_AG0157_ALF_CHROMA_FIXED_FILTER void deriveFixedFilterResultsCtuBoundaryChroma(AlfClassifier ***classifier, Pel ***fixedFilterResults, const CPelBuf &src, const CPelBuf &srcBeforeDb, const Area &blkDst, const int bits, CodingStructure& cs, const ClpRng &clpRng, const Pel clippingValues[4], int qp, int fixedFilterSetIdx, int mappingDir[NUM_DIR_FIX][NUM_DIR_FIX], uint32_t **laplacian[NUM_DIRECTIONS], uint8_t* ctuEnableFlag, int ctuIdx); void deriveFixedFilterResultsPerBlkChroma(AlfClassifier ***classifier, Pel ***fixedFilterResults, const CPelBuf &src, const CPelBuf &srcBeforeDb, const Area &blk, const int bits, CodingStructure& cs, const ClpRng &clpRng, const Pel clippingValues[4], int qp, int fixedFilterSetIdx, int mappingDir[NUM_DIR_FIX][NUM_DIR_FIX], uint32_t **laplacian[NUM_DIRECTIONS]); diff --git a/source/Lib/CommonLib/AlfParameters.h b/source/Lib/CommonLib/AlfParameters.h index 0a507c46b..1712e1ba5 100644 --- a/source/Lib/CommonLib/AlfParameters.h +++ b/source/Lib/CommonLib/AlfParameters.h @@ -584,6 +584,9 @@ struct ScaleAlf bool usePrev; int apsIdx; +#if JVET_AJ0237_INTERNAL_12BIT + int bitDepth; +#endif void reset() { @@ -608,14 +611,20 
@@ struct ScaleAlf void setMinMax( const Pel lumaMin = 0, const Pel lumaMax = 1024, const bool bCheckClassifier = true ) { const int c = classifierIdx; +#if !JVET_AJ0237_INTERNAL_12BIT const int bitDepth = 10; +#endif idxClassMin = (!bCheckClassifier || c == 1) ? ((lumaMin * ALF_NUM_CLASSES_CLASSIFIER[c]) >> bitDepth) : 0 ; idxClassMax = (!bCheckClassifier || c == 1) ? ((lumaMax * ALF_NUM_CLASSES_CLASSIFIER[c]) >> bitDepth) : (ALF_NUM_CLASSES_CLASSIFIER[c] - 1) ; initMinMaxDone = true; } - void init( const int f, const int a, const int c ) +#if JVET_AJ0237_INTERNAL_12BIT + void init(const int f, const int a, const int c, const int bDepth) +#else + void init( const int f, const int a, const int c ) +#endif { filterSetIndex = f; alt_num = a; @@ -625,7 +634,9 @@ struct ScaleAlf idxClassMin = 0 ; idxClassMax = ALF_NUM_CLASSES_CLASSIFIER[c] - 1 ; - +#if JVET_AJ0237_INTERNAL_12BIT + bitDepth = bDepth; +#endif initDone = true; } diff --git a/source/Lib/CommonLib/BilateralFilter.cpp b/source/Lib/CommonLib/BilateralFilter.cpp index 7d3b615d8..0f6c0f4e3 100644 --- a/source/Lib/CommonLib/BilateralFilter.cpp +++ b/source/Lib/CommonLib/BilateralFilter.cpp @@ -62,6 +62,9 @@ BilateralFilter::BilateralFilter() initBilateralFilterX86(); #endif #endif +#if JVET_AJ0237_INTERNAL_12BIT + internalBitDepth = 10; +#endif } BilateralFilter::~BilateralFilter() @@ -101,6 +104,12 @@ const char* BilateralFilter::getFilterLutParameters(int16_t* block, const int st int h = floorLog2(height); int mad = m_calcMAD(block, stride, width, height, w + h); +#if JVET_AJ0237_INTERNAL_12BIT + int bdShift = std::max(0, internalBitDepth - 10); + int offset = (bdShift == 0) ? 
0 : (1 << (bdShift - 1)); + mad = (mad + offset) >> bdShift; +#endif + w = std::min(w, 7); h = std::min(h, 7); mad = std::min(mad >> 4, 15); @@ -163,7 +172,11 @@ const char* BilateralFilter::getFilterLutParameters(int16_t* block, const int st } #endif +#if JVET_AJ0237_INTERNAL_12BIT +inline void bifApplyLut(int diff, int& res, int cutBitsNum, int bitsRound, int bitsRound2, int shift, const char* lutRowPtr, int lutShift, int bdShift) +#else inline void bifApplyLut(int diff, int& res, int cutBitsNum, int bitsRound, int bitsRound2, int shift, const char* lutRowPtr, int lutShift) +#endif { int sg0 = diff >> shift; int v0 = (diff + sg0) ^ sg0; @@ -177,14 +190,26 @@ inline void bifApplyLut(int diff, int& res, int cutBitsNum, int bitsRound, int b int idx = (v0 + 4) >> 3; idx = 15 + ((idx - 15) & ((idx - 15) >> shift)); int w0 = lutRowPtr[idx] >> lutShift; +#endif +#if JVET_AJ0237_INTERNAL_12BIT + w0 = (w0 << bdShift); #endif res = (w0 + sg0) ^ sg0; } +#if JVET_AJ0237_INTERNAL_12BIT +void BilateralFilter::blockBilateralFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bifRoundAdd, int bifRoundShift, bool isRDO, const char* lutRowPtr, bool noClip, int cutBitsNum, int bdShift) +#else void BilateralFilter::blockBilateralFilterDiamond5x5( uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bifRoundAdd, int bifRoundShift, bool isRDO, const char* lutRowPtr, bool noClip, int cutBitsNum) +#endif { int pad = NUMBER_PADDED_SAMPLES; +#if JVET_AJ0237_INTERNAL_12BIT + cutBitsNum += bdShift; +#endif + + int padwidth = iWidthExtSIMD; int downbuffer[128]; int downleftbuffer[129]; @@ -204,13 +229,25 @@ void BilateralFilter::blockBilateralFilterDiamond5x5( uint32_t uiWidth, uint32_t { int pixel = block[(-1 + pad)*padwidth + x + pad]; int below = block[(-1 + pad + 
1)*padwidth + x + pad]; +#if JVET_AJ0237_INTERNAL_12BIT + bifApplyLut(below - pixel, downbuffer[x], cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift1, bdShift); +#else bifApplyLut(below - pixel, downbuffer[x], cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift1); +#endif int belowright = block[(-1 + pad + 1)*padwidth + x + pad + 1]; +#if JVET_AJ0237_INTERNAL_12BIT + bifApplyLut(belowright - pixel, downrightbuffer[1][x + 1], cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift2, bdShift); +#else bifApplyLut(belowright - pixel, downrightbuffer[1][x + 1], cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift2); +#endif int belowleft = block[(-1 + pad + 1)*padwidth + x + pad - 1]; +#if JVET_AJ0237_INTERNAL_12BIT + bifApplyLut(belowleft - pixel, downleftbuffer[x], cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift2, bdShift); +#else bifApplyLut(belowleft - pixel, downleftbuffer[x], cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift2); +#endif } int width = uiWidth; for( int y = 0; y < uiHeight; y++ ) @@ -220,15 +257,27 @@ void BilateralFilter::blockBilateralFilterDiamond5x5( uint32_t uiWidth, uint32_t int pixel = rowStart[-1]; int right = rowStart[0], rightmod = 0; +#if JVET_AJ0237_INTERNAL_12BIT + bifApplyLut(right - pixel, rightmod, cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift1, bdShift); +#else bifApplyLut(right - pixel, rightmod, cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift1); +#endif pixel = rowStart[-padwidth - 1]; int belowright = right; +#if JVET_AJ0237_INTERNAL_12BIT + bifApplyLut(belowright - pixel, downrightbuffer[(y + 1) % 2][0], cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift2, bdShift); +#else bifApplyLut(belowright - pixel, downrightbuffer[(y + 1) % 2][0], cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift2); +#endif pixel = rowStart[-padwidth + width]; int belowleft = rowStart[width - 1]; +#if JVET_AJ0237_INTERNAL_12BIT + 
bifApplyLut(belowleft - pixel, downleftbuffer[width], cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift2, bdShift); +#else bifApplyLut(belowleft - pixel, downleftbuffer[width], cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift2); +#endif for( int x = 0; x < uiWidth; x++ ) { @@ -242,12 +291,20 @@ void BilateralFilter::blockBilateralFilterDiamond5x5( uint32_t uiWidth, uint32_t modsum += leftmod; right = rowStart[x + 1]; +#if JVET_AJ0237_INTERNAL_12BIT + bifApplyLut(right - pixel, mod, cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift1, bdShift); +#else bifApplyLut(right - pixel, mod, cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift1); +#endif modsum += mod; rightmod = mod; int below = rowStart[x + padwidth]; +#if JVET_AJ0237_INTERNAL_12BIT + bifApplyLut(below - pixel, mod, cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift1, bdShift); +#else bifApplyLut(below - pixel, mod, cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift1); +#endif modsum += mod; downbuffer[x] = mod; @@ -258,12 +315,20 @@ void BilateralFilter::blockBilateralFilterDiamond5x5( uint32_t uiWidth, uint32_t modsum += aboveleftmod; int belowleft = rowStart[x + padwidth - 1]; +#if JVET_AJ0237_INTERNAL_12BIT + bifApplyLut(belowleft - pixel, mod, cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift2, bdShift); +#else bifApplyLut(belowleft - pixel, mod, cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift2); +#endif modsum += mod; downleftbuffer[x] = mod; int belowright = rowStart[x + padwidth + 1]; +#if JVET_AJ0237_INTERNAL_12BIT + bifApplyLut(belowright - pixel, mod, cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift2, bdShift); +#else bifApplyLut(belowright - pixel, mod, cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift2); +#endif modsum += mod; downrightbuffer[y % 2][x + 1] = mod; @@ -272,23 +337,43 @@ void BilateralFilter::blockBilateralFilterDiamond5x5( uint32_t uiWidth, uint32_t // speed when 
SIMD is turned off. int above = rowStart[x - 2 * padwidth]; +#if JVET_AJ0237_INTERNAL_12BIT + bifApplyLut(above - pixel, mod, cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift3, bdShift); +#else bifApplyLut(above - pixel, mod, cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift3); +#endif modsum += mod; below = rowStart[x + 2 * padwidth]; +#if JVET_AJ0237_INTERNAL_12BIT + bifApplyLut(below - pixel, mod, cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift3, bdShift); +#else bifApplyLut(below - pixel, mod, cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift3); +#endif modsum += mod; int left = rowStart[x - 2]; +#if JVET_AJ0237_INTERNAL_12BIT + bifApplyLut(left - pixel, mod, cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift3, bdShift); +#else bifApplyLut(left - pixel, mod, cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift3); +#endif modsum += mod; right = rowStart[x + 2]; +#if JVET_AJ0237_INTERNAL_12BIT + bifApplyLut(right - pixel, mod, cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift3, bdShift); +#else bifApplyLut(right - pixel, mod, cutBitsNum, bitsRound, bitsRound2, shift, lutRowPtr, lutShift3); +#endif modsum += mod; #if JVET_AF0112_BIF_DYNAMIC_SCALING +#if JVET_AJ0237_INTERNAL_12BIT + blkFilt[(y + pad) * padwidth + x + pad] = ((int16_t)((modsum * bfac + (bifRoundAdd << 3)) >> (bifRoundShift + 3))); +#else blkFilt[(y + pad) * padwidth + x + pad] = ((int16_t)((uint16_t)((modsum * bfac + (bifRoundAdd << 3)) >> (bifRoundShift + 3)))); +#endif #else blkFilt[(y + pad) * padwidth + x + pad] = (( int16_t ) (( uint16_t ) ((modsum*bfac + bifRoundAdd) >> bifRoundShift))); #endif @@ -592,7 +677,12 @@ void BilateralFilter::bilateralFilterRDOdiamond5x5(const ComponentID compID, Pel CHECK(doReshape, "Reshape domain is not used for chroma"); #endif } +#if JVET_AJ0237_INTERNAL_12BIT + int bdShift = std::max(0, internalBitDepth - 10); + m_bilateralFilterDiamond5x5(uiWidth, uiHeight, tempblock, 
tempblockFiltered, clpRng, piReco, uiRecStride, iWidthExtSIMD, bfac, bifRoundAdd, bifRoundShift, true, lutRowPtr, false, cutBitsNum, bdShift); +#else m_bilateralFilterDiamond5x5(uiWidth, uiHeight, tempblock, tempblockFiltered, clpRng, piReco, uiRecStride, iWidthExtSIMD, bfac, bifRoundAdd, bifRoundShift, true, lutRowPtr, false, cutBitsNum); +#endif if( !useReco ) { @@ -619,6 +709,9 @@ void BilateralFilter::bilateralFilterDiamond5x5( const ComponentID compID, const #endif ) { +#if JVET_AJ0237_INTERNAL_12BIT + int bdShift = std::max(0, internalBitDepth - 10); +#endif #if JVET_Z0105_LOOP_FILTER_VIRTUAL_BOUNDARY const int scaleX = getChannelTypeScaleX( toChannelType( compID ), currTU.cu->cs->pcv->chrFormat ); const int scaleY = getChannelTypeScaleY( toChannelType( compID ), currTU.cu->cs->pcv->chrFormat ); @@ -902,7 +995,11 @@ void BilateralFilter::bilateralFilterDiamond5x5( const ComponentID compID, const int bifRoundAdd = BIF_ROUND_ADD >> currTU.cs->pps->getBIFStrength(); int bifRoundShift = BIF_ROUND_SHIFT - currTU.cs->pps->getBIFStrength(); +#if JVET_AJ0237_INTERNAL_12BIT + m_bilateralFilterDiamond5x5(uiWidth, uiHeight, tempblock, tempblockFiltered, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bifRoundAdd, bifRoundShift, false, lutRowPtr, false, cutBitsNum, bdShift); +#else m_bilateralFilterDiamond5x5(uiWidth, uiHeight, tempblock, tempblockFiltered, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bifRoundAdd, bifRoundShift, false, lutRowPtr, false, cutBitsNum); +#endif xStart = xEnd; } @@ -1168,8 +1265,11 @@ void BilateralFilter::bilateralFilterDiamond5x5( const ComponentID compID, const int bifRoundAdd = BIF_ROUND_ADD >> currTU.cs->pps->getBIFStrength(); int bifRoundShift = BIF_ROUND_SHIFT - currTU.cs->pps->getBIFStrength(); - +#if JVET_AJ0237_INTERNAL_12BIT + m_bilateralFilterDiamond5x5(uiWidth, uiHeight, tempblock, tempblockFiltered, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bifRoundAdd, bifRoundShift, false, lutRowPtr, noClip, cutBitsNum, bdShift); 
+#else m_bilateralFilterDiamond5x5(uiWidth, uiHeight, tempblock, tempblockFiltered, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bifRoundAdd, bifRoundShift, false, lutRowPtr, noClip, cutBitsNum); +#endif } } void BilateralFilter::clipNotBilaterallyFilteredBlocks(const ComponentID compID, const CPelUnitBuf& src, PelUnitBuf& rec, const ClpRng& clpRng, TransformUnit & currTU) @@ -1595,6 +1695,11 @@ const char* BilateralFilter::getFilterLutParametersChroma(int16_t* block, const int h = floorLog2(heightForStrength); int mad = m_calcMAD(block, stride, width, height, floorLog2(width) + floorLog2(height)); +#if JVET_AJ0237_INTERNAL_12BIT + int bdShift = std::max(0, internalBitDepth - 10); + int offset = (bdShift == 0) ? 0 : (1 << (bdShift - 1)); + mad = (mad + offset) >> bdShift; +#endif w = std::min(w, 7); h = std::min(h, 7); diff --git a/source/Lib/CommonLib/BilateralFilter.h b/source/Lib/CommonLib/BilateralFilter.h index 27cece33b..3cba3eefc 100644 --- a/source/Lib/CommonLib/BilateralFilter.h +++ b/source/Lib/CommonLib/BilateralFilter.h @@ -67,8 +67,13 @@ private: Pel *tempblock = (Pel*)tempblockSIMD; Pel* tempblockFiltered = (Pel*)tempblockFilteredSIMD; +#if JVET_AJ0237_INTERNAL_12BIT + void (*m_bilateralFilterDiamond5x5)(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bifRoundAdd, int bifRoundShift, bool isRDO, const char* lutRowPtr, bool noClip, int cutBitsNum, int bdShift); + static void blockBilateralFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bifRoundAdd, int bifRoundShift, bool isRDO, const char* lutRowPtr, bool noClip, int cutBitsNum, int bdShift); +#else void (*m_bilateralFilterDiamond5x5)(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int 
iWidthExtSIMD, int bfac, int bifRoundAdd, int bifRoundShift, bool isRDO, const char* lutRowPtr, bool noClip, int cutBitsNum); static void blockBilateralFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bifRoundAdd, int bifRoundShift, bool isRDO, const char* lutRowPtr, bool noClip, int cutBitsNum); +#endif #if JVET_AF0112_BIF_DYNAMIC_SCALING int (*m_calcMAD)(int16_t* block, int stride, int width, int height, int whlog2); @@ -213,12 +218,19 @@ private: { 0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1, }, }; #endif + +#if JVET_AJ0237_INTERNAL_12BIT + int internalBitDepth; +#endif public: BilateralFilter(); ~BilateralFilter(); void create(); void destroy(); +#if JVET_AJ0237_INTERNAL_12BIT + void setInternalBitDepth(int bdDepth) { internalBitDepth = bdDepth; } +#endif #if JVET_V0094_BILATERAL_FILTER void bilateralFilterRDOdiamond5x5(const ComponentID compID, PelBuf& resiBuf, const CPelBuf& predBuf, PelBuf& recoBuf, int32_t qp, const CPelBuf& recIPredBuf, const ClpRng& clpRng, TransformUnit & currTU, bool useReco, bool doReshape = false, std::vector<Pel>* pLUT = nullptr); void bilateralFilterPicRDOperCTU(const ComponentID compID, CodingStructure& cs, PelUnitBuf& src,BIFCabacEst* bifCABACEstimator); @@ -243,8 +255,13 @@ public: #if ENABLE_SIMD_BILATERAL_FILTER || JVET_X0071_CHROMA_BILATERAL_FILTER_ENABLE_SIMD #ifdef TARGET_SIMD_X86 +#if JVET_AJ0237_INTERNAL_12BIT + template<X86_VEXT vext> + static void simdFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bifRoundAdd, int bifRoundShift, bool isRDO, const char* lutRowPtr, bool noClip, int cutBitsNum, int bdShift); +#else template<X86_VEXT vext> static void simdFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int 
recStride, int iWidthExtSIMD, int bfac, int bifRoundAdd, int bifRoundShift, bool isRDO, const char* lutRowPtr, bool noClip, int cutBitsNum); +#endif #if JVET_AF0112_BIF_DYNAMIC_SCALING template<X86_VEXT vext> static int simdCalcMAD(int16_t* block, int stride, int width, int height, int whlog2); diff --git a/source/Lib/CommonLib/CommonDef.h b/source/Lib/CommonLib/CommonDef.h index 29b7eb3dd..9bebc2f78 100644 --- a/source/Lib/CommonLib/CommonDef.h +++ b/source/Lib/CommonLib/CommonDef.h @@ -1419,6 +1419,10 @@ static const int CCCM_MAX_REF_SAMPLES = 4 * ( 2 * CCCM_WINDOW_SIZE * ( 2 * M #else static const int CCCM_MAX_REF_SAMPLES = ( 2 * CCCM_WINDOW_SIZE * ( 2 * MAX_CU_SIZE + CCCM_WINDOW_SIZE ) ); #endif +#if JVET_AJ0237_INTERNAL_12BIT +static const int CCCM_MATRIX_BITS_HBD = 32; +static const int CCCM_DECIM_BITS_HBD = 22; +#endif #if JVET_AB0174_CCCM_DIV_FREE static const int CCCM_MATRIX_BITS = 22; static const int CCCM_DECIM_BITS = 16; @@ -1426,7 +1430,9 @@ static const int CCCM_DECIM_BITS = 16; static const int CCCM_MATRIX_BITS = 28; static const int CCCM_DECIM_BITS = 22; #endif +#if !JVET_AJ0237_INTERNAL_12BIT static const int CCCM_DECIM_ROUND = ( 1 << (CCCM_DECIM_BITS - 1 ) ); +#endif #if JVET_AB0143_CCCM_TS #if MMLM #if JVET_AC0054_GLCCCM @@ -1531,6 +1537,11 @@ static const int MAX_DELTA_QP = 7; ///< static const int MAX_TESTED_QPs = ( 1 + 1 + ( MAX_DELTA_QP << 1 ) ); ///< dqp=0 +- max_delta_qp + lossless mode static const int COM16_C806_TRANS_PREC = 0; + +#if JVET_AJ0237_INTERNAL_12BIT +#define DECIM_BITS(x) ( (x) > 10 ? CCCM_DECIM_BITS_HBD : CCCM_DECIM_BITS ) +#endif + #if IF_12TAP #define NTAPS_LUMA(x) ( (x) == 0 ? 12 : 8 ) // 12-tap filter for index 0. 8-tap fitler for other indices. 
#else diff --git a/source/Lib/CommonLib/InterPrediction.cpp b/source/Lib/CommonLib/InterPrediction.cpp index 70b847afd..806653b25 100644 --- a/source/Lib/CommonLib/InterPrediction.cpp +++ b/source/Lib/CommonLib/InterPrediction.cpp @@ -347,6 +347,9 @@ InterPrediction::InterPrediction() #if JVET_AG0276_NLIC m_skipDoLic = false; #endif +#if JVET_AJ0237_INTERNAL_12BIT + m_dmvrCostLambda = 1; +#endif } InterPrediction::~InterPrediction() @@ -570,10 +573,18 @@ void InterPrediction::destroy() #if INTER_LIC || (TM_AMVP || TM_MRG || JVET_Z0084_IBC_TM) || JVET_W0090_ARMC_TM || JVET_Z0056_GPM_SPLIT_MODE_REORDERING || JVET_Z0061_TM_OBMC #if JVET_Z0153_IBC_EXT_REF #if JVET_AJ0172_IBC_ITMP_ALIGN_REF_AREA +#if JVET_AJ0237_INTERNAL_12BIT +void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chromaFormatIDC, const int ctuSize, Reshape* reshape, const int picWidth, const int picHeight, const int bitDepth ) +#else void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chromaFormatIDC, const int ctuSize, Reshape* reshape, const int picWidth, const int picHeight ) +#endif +#else +#if JVET_AJ0237_INTERNAL_12BIT +void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chromaFormatIDC, const int ctuSize, Reshape* reshape, const int picWidth, const int bitDepth ) #else void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chromaFormatIDC, const int ctuSize, Reshape* reshape, const int picWidth ) #endif +#endif #else void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chromaFormatIDC, const int ctuSize, Reshape* reshape ) #endif @@ -593,6 +604,10 @@ void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chromaFormatIDC, cons } m_currChromaFormat = chromaFormatIDC; +#if JVET_AJ0237_INTERNAL_12BIT + m_dmvrCostLambda = 1 << std::max(0, std::min(14, bitDepth) - 10); // 14 is the maximum possible DMVR internal precision value, 10 is the baseline +#endif + if( m_acYuvPred[REF_PIC_LIST_0][COMPONENT_Y] == nullptr ) // check if first is null (in which case, nothing 
initialised yet) { #if JVET_AF0057 @@ -15486,7 +15501,11 @@ Distortion InterPrediction::deriveBcwBlending( PredictionUnit& pu, bool bUniDir[ blendModel.params[1] = (int)((bcwModel.params[1] + offsetA) >> shiftA); blendModel.params[2] = (int)((bcwModel.params[2] + offsetA) >> shiftA); +#if JVET_AJ0237_INTERNAL_12BIT + blendModel.shift = bcwModel.decimBits - shiftA - bcwBlendingLog2WeightBase; +#else blendModel.shift = CCCM_DECIM_BITS - shiftA - bcwBlendingLog2WeightBase; +#endif blendModel.offset = blendModel.shift ? (1 << (blendModel.shift - 1)) : 0; if (blendModel.shift < 0) { @@ -27643,7 +27662,11 @@ bool InterPrediction::processBDMVRPU2Dir(PredictionUnit& pu, bool subPURefine[2] #else Distortion initCost = xBDMVRGetMatchingError(pu, mvInitial_PU, bUseMR, false); #endif +#if JVET_AJ0237_INTERNAL_12BIT + if (initCost < lumaArea * m_dmvrCostLambda) +#else if (initCost < lumaArea) +#endif { subPURefine[0] = false; subPURefine[1] = false; @@ -27661,7 +27684,11 @@ bool InterPrediction::processBDMVRPU2Dir(PredictionUnit& pu, bool subPURefine[2] #else minCost = xBDMVRMvOneTemplateHPelSquareSearch<1>(mvFinal, initCost, pu, mvInitial_PU, 2, MV_FRACTIONAL_BITS_INTERNAL - 1, bUseMR, false); #endif +#if JVET_AJ0237_INTERNAL_12BIT + subPURefine[0] = minCost >= lumaArea * m_dmvrCostLambda; +#else subPURefine[0] = minCost >= lumaArea; +#endif finalMvDir[0] = mvFinal[0]; #if JVET_AA0093_REFINED_MOTION_FOR_ARMC } @@ -27678,7 +27705,11 @@ bool InterPrediction::processBDMVRPU2Dir(PredictionUnit& pu, bool subPURefine[2] #else minCost = xBDMVRMvOneTemplateHPelSquareSearch<2>(mvFinal, initCost, pu, mvInitial_PU, 2, MV_FRACTIONAL_BITS_INTERNAL - 1, bUseMR, false); #endif +#if JVET_AJ0237_INTERNAL_12BIT + subPURefine[1] = minCost >= lumaArea * m_dmvrCostLambda; +#else subPURefine[1] = minCost >= lumaArea; +#endif finalMvDir[1] = mvFinal[1]; #if JVET_AA0093_REFINED_MOTION_FOR_ARMC } @@ -27732,7 +27763,11 @@ void InterPrediction::processBDMVRSubPU(PredictionUnit& pu, bool subPURefine) Mv 
mvFinal[2] = { pu.mv[0], pu.mv[1] }; Mv mvOffset; +#if JVET_AJ0237_INTERNAL_12BIT + const Distortion earlyTerminateTh = dx * dy * m_dmvrCostLambda; +#else const Distortion earlyTerminateTh = dx * dy; +#endif const int adaptiveSearchRangeHor = (dx >> 1) < BDMVR_INTME_RANGE ? (dx >> 1) : BDMVR_INTME_RANGE; const int adaptiveSearchRangeVer = (dy >> 1) < BDMVR_INTME_RANGE ? (dy >> 1) : BDMVR_INTME_RANGE; const bool adaptRange = (adaptiveSearchRangeHor != BDMVR_INTME_RANGE || adaptiveSearchRangeVer != BDMVR_INTME_RANGE); @@ -28154,7 +28189,11 @@ void InterPrediction::bmAdaptiveAffineIntSearch(const PredictionUnit &pu, Mv(&mv } else { +#if JVET_AJ0237_INTERNAL_12BIT + bmCostShift = bitDepth > 8 ? 2 : 0; +#else bmCostShift = bitDepth > 8 ? bitDepth - 8 : 0; +#endif } #else bmCostShift = 0; @@ -28585,7 +28624,11 @@ void InterPrediction::bmAffineIntSearch(const PredictionUnit &pu, Mv(&mvOffset)[ } else { +#if JVET_AJ0237_INTERNAL_12BIT + bmCostShift = bitDepth > 8 ? 2 : 0; +#else bmCostShift = bitDepth > 8 ? bitDepth - 8 : 0; +#endif } #else bmCostShift = 0; @@ -28802,7 +28845,11 @@ void InterPrediction::xInitBilateralMatching(const int width, const int height, } else { +#if JVET_AJ0237_INTERNAL_12BIT + m_bmCostShift = bitDepth > 8 ? 2 : 0; +#else m_bmCostShift = bitDepth > 8 ? 
bitDepth - 8 : 0; +#endif } #else m_bmCostShift = 0; @@ -29395,7 +29442,11 @@ bool InterPrediction::processBDMVR4Affine(PredictionUnit& pu minCost = xGetBilateralMatchingErrorAffine(pu, pu.mvAffi, true); } const int lumaArea = pu.lumaSize().area(); +#if JVET_AJ0237_INTERNAL_12BIT + const bool isTooSmallDist = minCost < lumaArea * m_dmvrCostLambda; +#else const bool isTooSmallDist = minCost < lumaArea; +#endif if (!isTooSmallDist) { minCost = xBDMVRMv6ParameterSearchAffine(minCost, pu); @@ -30555,7 +30606,11 @@ bool InterPrediction::processBDMVR4AdaptiveAffine(PredictionUnit& pu, Mv(&mvAffi #else bmAdaptiveAffineIntSearch(pu, mvFinalPUL0, minCostL0, mvFinalPUL1, minCostL1); #endif +#if JVET_AJ0237_INTERNAL_12BIT + const int lumaArea = pu.lumaSize().area() * m_dmvrCostLambda; +#else const int lumaArea = pu.lumaSize().area(); +#endif // sub-pel search for L0 if (minCostL0 > lumaArea) @@ -30680,7 +30735,11 @@ bool InterPrediction::processBDMVR4AdaptiveAffine(PredictionUnit& pu, Mv(&mvAffi { minCostL0 = xGetBilateralMatchingErrorAffine(pu, pu.mvAffi, true); } +#if JVET_AJ0237_INTERNAL_12BIT + const int lumaArea = pu.lumaSize().area() * m_dmvrCostLambda; +#else const int lumaArea = pu.lumaSize().area(); +#endif const bool isTooSmallDist = minCostL0 < lumaArea; if (!isTooSmallDist) { @@ -30761,7 +30820,11 @@ bool InterPrediction::processBDMVR4AdaptiveAffine(PredictionUnit& pu, Mv(&mvAffi { minCostL1 = xGetBilateralMatchingErrorAffine(pu, pu.mvAffi, true); } +#if JVET_AJ0237_INTERNAL_12BIT + const int lumaArea = pu.lumaSize().area() * m_dmvrCostLambda; +#else const int lumaArea = pu.lumaSize().area(); +#endif const bool isTooSmallDist = minCostL1 < lumaArea; if (!isTooSmallDist) { @@ -30848,7 +30911,11 @@ bool InterPrediction::processBDMVR4AdaptiveAffine(PredictionUnit& pu, Mv(&mvAffi { minCost = xGetBilateralMatchingErrorAffine(pu, pu.mvAffi, true); } +#if JVET_AJ0237_INTERNAL_12BIT + const int lumaArea = pu.lumaSize().area() * m_dmvrCostLambda; +#else const int lumaArea 
= pu.lumaSize().area(); +#endif const bool isTooSmallDist = minCost < lumaArea; if (!isTooSmallDist) { @@ -30934,7 +31001,11 @@ bool InterPrediction::processBDMVR(PredictionUnit& pu) #else minCost = xBDMVRGetMatchingError(pu, mvInitial_PU, bUseMR, false); #endif +#if JVET_AJ0237_INTERNAL_12BIT + if (minCost >= lumaArea * m_dmvrCostLambda) +#else if (minCost >= lumaArea) +#endif { #if JVET_AI0185_ADAPTIVE_COST_IN_MERGE_MODE minCost = xBDMVRMvOneTemplateHPelSquareSearch<1>(mvFinal_PU, minCost, pu, mvInitial_PU, 2, MV_FRACTIONAL_BITS_INTERNAL - 1, bUseMR, useHadmard); @@ -30950,7 +31021,11 @@ bool InterPrediction::processBDMVR(PredictionUnit& pu) #else minCost = xBDMVRGetMatchingError(pu, mvInitial_PU, bUseMR, false); #endif +#if JVET_AJ0237_INTERNAL_12BIT + if (minCost >= lumaArea * m_dmvrCostLambda) +#else if (minCost >= lumaArea) +#endif { #if JVET_AI0185_ADAPTIVE_COST_IN_MERGE_MODE minCost = xBDMVRMvOneTemplateHPelSquareSearch<2>(mvFinal_PU, minCost, pu, mvInitial_PU, 2, MV_FRACTIONAL_BITS_INTERNAL - 1, bUseMR, useHadmard); @@ -31106,7 +31181,11 @@ bool InterPrediction::processBDMVR(PredictionUnit& pu) minCost = xBDMVRMvSquareSearch( mvFinal_PU, minCost, pu, mvInitial_PU, 2, MV_FRACTIONAL_BITS_INTERNAL - 1, bUseMR, false ); #endif +#if JVET_AJ0237_INTERNAL_12BIT + subPURefine = minCost >= (lumaArea * m_dmvrCostLambda); +#else subPURefine = minCost >= lumaArea; +#endif #if JVET_AG0067_DMVR_EXTENSIONS pu.mv[REF_PIC_LIST_0] = (mvFinal_PU[0] - puOrgMv[0]).scaleMv(scale0) + puOrgMv[0]; pu.mv[REF_PIC_LIST_1] = (mvFinal_PU[1] - puOrgMv[1]).scaleMv(scale1) + puOrgMv[1]; @@ -31188,7 +31267,11 @@ bool InterPrediction::processBDMVR(PredictionUnit& pu) Mv mvFinal[2] = { pu.mv[0], pu.mv[1] }; Mv mvOffset; +#if JVET_AJ0237_INTERNAL_12BIT + const Distortion earlyTerminateTh = dx * dy * m_dmvrCostLambda; +#else const Distortion earlyTerminateTh = dx * dy; +#endif const int adaptiveSearchRangeHor = (dx >> 1) < BDMVR_INTME_RANGE ? 
(dx >> 1) : BDMVR_INTME_RANGE; const int adaptiveSearchRangeVer = (dy >> 1) < BDMVR_INTME_RANGE ? (dy >> 1) : BDMVR_INTME_RANGE; const bool adaptRange = (adaptiveSearchRangeHor != BDMVR_INTME_RANGE || adaptiveSearchRangeVer != BDMVR_INTME_RANGE); @@ -32077,7 +32160,11 @@ Distortion InterPrediction::xBDMVRMvIntPelFullSearch(Mv&mvOffset, Distortion cur } else { +#if JVET_AJ0237_INTERNAL_12BIT + int32_t precisionAdj = cDistParam.bitDepth > 8 ? 2 : 0; +#else int32_t precisionAdj = cDistParam.bitDepth > 8 ? cDistParam.bitDepth - 8 : 0; +#endif curBestCost = cDistParam.distFunc(cDistParam) >> precisionAdj; } #else @@ -32124,7 +32211,11 @@ Distortion InterPrediction::xBDMVRMvIntPelFullSearch(Mv&mvOffset, Distortion cur } else { +#if JVET_AJ0237_INTERNAL_12BIT + int32_t precisionAdj = cDistParam.bitDepth > 8 ? 2 : 0; +#else int32_t precisionAdj = cDistParam.bitDepth > 8 ? cDistParam.bitDepth - 8 : 0; +#endif m_sadEnlargeArrayBilMrg[searchOffsetIdx] = cDistParam.distFunc(cDistParam) >> precisionAdj; } #else @@ -33093,7 +33184,11 @@ Distortion InterPrediction::xBDMVRGetMatchingError(const PredictionUnit& pu, con } else { +#if JVET_AJ0237_INTERNAL_12BIT + int32_t precisionAdj = cDistParam.bitDepth > 8 ? 2 : 0; +#else int32_t precisionAdj = cDistParam.bitDepth > 8 ? cDistParam.bitDepth - 8 : 0; +#endif return cDistParam.distFunc(cDistParam) >> precisionAdj; } #else @@ -33259,7 +33354,11 @@ Distortion InterPrediction::xBDMVRGetMatchingError(const PredictionUnit& pu, con } else { +#if JVET_AJ0237_INTERNAL_12BIT + int32_t precisionAdj = cDistParam.bitDepth > 8 ? 2 : 0; +#else int32_t precisionAdj = cDistParam.bitDepth > 8 ? 
cDistParam.bitDepth - 8 : 0; +#endif return cDistParam.distFunc( cDistParam ) >> precisionAdj; } #else @@ -41495,6 +41594,10 @@ std::vector<Mv> InterPrediction::deriveMVDFromMVSDIdxAffineSI(PredictionUnit& pu { const ComponentID ch = ComponentID(chan); +#if JVET_AJ0237_INTERNAL_12BIT + const int maxValue = (1 << slice.getSPS()->getBitDepth(toChannelType(ch))) - 1; +#endif + Pel *piTxtBuff = pPadBuffYUV->getBuf(blkUnitAreaBuff) .bufs[ch] @@ -41512,7 +41615,11 @@ std::vector<Mv> InterPrediction::deriveMVDFromMVSDIdxAffineSI(PredictionUnit& pu piTxtBuff[idx] += CompDiff[chan]; piTxtBuff[idx] = (piTxtBuff[idx] < 0) ? 0 : piTxtBuff[idx]; +#if JVET_AJ0237_INTERNAL_12BIT + piTxtBuff[idx] = (piTxtBuff[idx] > maxValue) ? maxValue : piTxtBuff[idx]; +#else piTxtBuff[idx] = (piTxtBuff[idx] > 1023) ? 1023 : piTxtBuff[idx]; +#endif } piTxtBuff += iStrideBuff; piTmpBuff += iStrideTmp; diff --git a/source/Lib/CommonLib/InterPrediction.h b/source/Lib/CommonLib/InterPrediction.h index 9e7ba7f3e..f1d9664ef 100644 --- a/source/Lib/CommonLib/InterPrediction.h +++ b/source/Lib/CommonLib/InterPrediction.h @@ -310,6 +310,10 @@ protected: PelStorage m_obmcPelStorage; #endif +#if JVET_AJ0237_INTERNAL_12BIT + uint8_t m_dmvrCostLambda; +#endif + ChromaFormat m_currChromaFormat; ComponentID m_maxCompIDToPred; ///< tells the predictor to only process the components up to (inklusive) this one - useful to skip chroma components during RD-search @@ -687,10 +691,18 @@ public: #if INTER_LIC || (TM_AMVP || TM_MRG || JVET_Z0084_IBC_TM) || JVET_W0090_ARMC_TM || JVET_Z0056_GPM_SPLIT_MODE_REORDERING || JVET_Z0061_TM_OBMC #if JVET_Z0153_IBC_EXT_REF #if JVET_AJ0172_IBC_ITMP_ALIGN_REF_AREA +#if JVET_AJ0237_INTERNAL_12BIT + void init (RdCost* pcRdCost, ChromaFormat chromaFormatIDC, const int ctuSize, Reshape* reshape, const int picWidth, const int picHeight, const int bitDepth); +#else void init (RdCost* pcRdCost, ChromaFormat chromaFormatIDC, const int ctuSize, Reshape* reshape, const int picWidth, const 
int picHeight); +#endif +#else +#if JVET_AJ0237_INTERNAL_12BIT + void init (RdCost* pcRdCost, ChromaFormat chromaFormatIDC, const int ctuSize, Reshape* reshape, const int picWidth, const int bitDepth); #else void init (RdCost* pcRdCost, ChromaFormat chromaFormatIDC, const int ctuSize, Reshape* reshape, const int picWidth); #endif +#endif #else void init (RdCost* pcRdCost, ChromaFormat chromaFormatIDC, const int ctuSize, Reshape* reshape); #endif diff --git a/source/Lib/CommonLib/InterpolationFilter.cpp b/source/Lib/CommonLib/InterpolationFilter.cpp index 09be2f148..914cceb7d 100644 --- a/source/Lib/CommonLib/InterpolationFilter.cpp +++ b/source/Lib/CommonLib/InterpolationFilter.cpp @@ -1596,9 +1596,17 @@ void InterpolationFilter::filterCopy( const ClpRng& clpRng, const Pel *src, int if (biMCForDMVR) { int shift10BitOut, offset; +#if JVET_AJ0237_INTERNAL_12BIT + if ((clpRng.bd - IF_INTERNAL_PREC_BILINEAR(clpRng.bd)) > 0) +#else if ((clpRng.bd - IF_INTERNAL_PREC_BILINEAR) > 0) +#endif { +#if JVET_AJ0237_INTERNAL_12BIT + shift10BitOut = (clpRng.bd - IF_INTERNAL_PREC_BILINEAR(clpRng.bd)); +#else shift10BitOut = (clpRng.bd - IF_INTERNAL_PREC_BILINEAR); +#endif offset = (1 << (shift10BitOut - 1)); for (row = 0; row < height; row++) { @@ -1612,7 +1620,11 @@ void InterpolationFilter::filterCopy( const ClpRng& clpRng, const Pel *src, int } else { +#if JVET_AJ0237_INTERNAL_12BIT + shift10BitOut = (IF_INTERNAL_PREC_BILINEAR(clpRng.bd) - clpRng.bd); +#else shift10BitOut = (IF_INTERNAL_PREC_BILINEAR - clpRng.bd); +#endif for (row = 0; row < height; row++) { for (col = 0; col < width; col++) @@ -1649,9 +1661,17 @@ void InterpolationFilter::filterCopy( const ClpRng& clpRng, const Pel *src, int if (biMCForDMVR) { int shift10BitOut, offset; +#if JVET_AJ0237_INTERNAL_12BIT + if ((clpRng.bd - IF_INTERNAL_PREC_BILINEAR(clpRng.bd)) > 0) +#else if ((clpRng.bd - IF_INTERNAL_PREC_BILINEAR) > 0) +#endif { +#if JVET_AJ0237_INTERNAL_12BIT + shift10BitOut = (clpRng.bd - 
IF_INTERNAL_PREC_BILINEAR(clpRng.bd)); +#else shift10BitOut = (clpRng.bd - IF_INTERNAL_PREC_BILINEAR); +#endif offset = (1 << (shift10BitOut - 1)); for (row = 0; row < height; row++) { @@ -1665,7 +1685,11 @@ void InterpolationFilter::filterCopy( const ClpRng& clpRng, const Pel *src, int } else { +#if JVET_AJ0237_INTERNAL_12BIT + shift10BitOut = (IF_INTERNAL_PREC_BILINEAR(clpRng.bd) - clpRng.bd); +#else shift10BitOut = (IF_INTERNAL_PREC_BILINEAR - clpRng.bd); +#endif for (row = 0; row < height; row++) { for (col = 0; col < width; col++) @@ -1865,7 +1889,11 @@ void InterpolationFilter::filter(const ClpRng& clpRng, Pel const *src, int srcSt { if( isFirst ) { +#if JVET_AJ0237_INTERNAL_12BIT + shift = IF_FILTER_PREC_BILINEAR - (IF_INTERNAL_PREC_BILINEAR(clpRng.bd) - clpRng.bd); +#else shift = IF_FILTER_PREC_BILINEAR - (IF_INTERNAL_PREC_BILINEAR - clpRng.bd); +#endif offset = 1 << (shift - 1); } else diff --git a/source/Lib/CommonLib/InterpolationFilter.h b/source/Lib/CommonLib/InterpolationFilter.h index 827258848..4b494a388 100644 --- a/source/Lib/CommonLib/InterpolationFilter.h +++ b/source/Lib/CommonLib/InterpolationFilter.h @@ -52,7 +52,11 @@ #define IF_FILTER_PREC 6 ///< Log2 of sum of filter taps #endif #define IF_INTERNAL_OFFS (1<<(IF_INTERNAL_PREC-1)) ///< Offset used internally +#if JVET_AJ0237_INTERNAL_12BIT +#define IF_INTERNAL_PREC_BILINEAR(bd) std::min(IF_INTERNAL_PREC, int(bd)) +#else #define IF_INTERNAL_PREC_BILINEAR 10 ///< Number of bits for internal precision +#endif #define IF_FILTER_PREC_BILINEAR 4 ///< Bilinear filter coeff precision so that intermediate value will not exceed 16 bit for SIMD - bit exact #if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT #define IF_INTERNAL_FRAC_BITS(bd) std::max(2, IF_INTERNAL_PREC - int(bd)) diff --git a/source/Lib/CommonLib/IntraPrediction.cpp b/source/Lib/CommonLib/IntraPrediction.cpp index c18de13d3..c6631009b 100644 --- a/source/Lib/CommonLib/IntraPrediction.cpp +++ b/source/Lib/CommonLib/IntraPrediction.cpp @@ -16521,7 
+16521,11 @@ void IntraPrediction::reorderPLT(CodingStructure& cs, Partitioner& partitioner, } #if MMLM && LMS_LINEAR_MODEL +#if JVET_AJ0237_INTERNAL_12BIT +int IntraPrediction::xCalcLMParametersGeneralized(int64_t x, int64_t y, int64_t xx, int64_t xy, int count, int bitDepth, int& a, int& b, int& iShift) +#else int IntraPrediction::xCalcLMParametersGeneralized(int x, int y, int xx, int xy, int count, int bitDepth, int &a, int &b, int &iShift) +#endif { uint32_t uiInternalBitDepth = bitDepth; @@ -16537,23 +16541,42 @@ int IntraPrediction::xCalcLMParametersGeneralized(int x, int y, int xx, int xy, int iCountShift = g_aucLog2[count]; +#if JVET_AJ0237_INTERNAL_12BIT + int iTempShift = uiInternalBitDepth + iCountShift - ((uiInternalBitDepth > 10) ? 31 : 15); +#else int iTempShift = uiInternalBitDepth + iCountShift - 15; +#endif if (iTempShift > 0) { +#if JVET_AJ0237_INTERNAL_12BIT + x = (x + ((int64_t)1 << (iTempShift - 1))) >> iTempShift; + y = (y + ((int64_t)1 << (iTempShift - 1))) >> iTempShift; + xx = (xx + ((int64_t)1 << (iTempShift - 1))) >> iTempShift; + xy = (xy + ((int64_t)1 << (iTempShift - 1))) >> iTempShift; +#else x = (x + (1 << (iTempShift - 1))) >> iTempShift; y = (y + (1 << (iTempShift - 1))) >> iTempShift; xx = (xx + (1 << (iTempShift - 1))) >> iTempShift; xy = (xy + (1 << (iTempShift - 1))) >> iTempShift; +#endif iCountShift -= iTempShift; } /////// xCalcLMParameters +#if JVET_AJ0237_INTERNAL_12BIT + int64_t avgX = x >> iCountShift; + int64_t avgY = y >> iCountShift; + + int64_t RErrX = x & ((1 << iCountShift) - 1); + int64_t RErrY = y & ((1 << iCountShift) - 1); +#else int avgX = x >> iCountShift; int avgY = y >> iCountShift; int RErrX = x & ((1 << iCountShift) - 1); int RErrY = y & ((1 << iCountShift) - 1); +#endif int iB = 7; iShift = 13 - iB; @@ -16566,19 +16589,33 @@ int IntraPrediction::xCalcLMParametersGeneralized(int x, int y, int xx, int xy, } else { +#if JVET_AJ0237_INTERNAL_12BIT + int64_t a1 = xy - (avgX * avgY << iCountShift) - avgX * 
RErrY - avgY * RErrX; + int64_t a2 = xx - (avgX * avgX << iCountShift) - 2 * avgX * RErrX; +#else int a1 = xy - (avgX * avgY << iCountShift) - avgX * RErrY - avgY * RErrX; int a2 = xx - (avgX * avgX << iCountShift) - 2 * avgX * RErrX; +#endif const int iShiftA1 = uiInternalBitDepth - 2; const int iShiftA2 = 5; const int iAccuracyShift = uiInternalBitDepth + 4; int iScaleShiftA2 = 0; int iScaleShiftA1 = 0; + +#if JVET_AJ0237_INTERNAL_12BIT + int64_t a1s = a1; + int64_t a2s = a2; + + iScaleShiftA1 = a1 == 0 ? 0 : floorLog2Uint64(abs(a1)) - iShiftA1; + iScaleShiftA2 = a2 == 0 ? 0 : floorLog2Uint64(abs(a2)) - iShiftA2; +#else int a1s = a1; int a2s = a2; iScaleShiftA1 = a1 == 0 ? 0 : floorLog2(abs(a1)) - iShiftA1; iScaleShiftA2 = a2 == 0 ? 0 : floorLog2(abs(a2)) - iShiftA2; +#endif if (iScaleShiftA1 < 0) { @@ -16599,7 +16636,11 @@ int IntraPrediction::xCalcLMParametersGeneralized(int x, int y, int xx, int xy, if (a2s >= 32) { uint32_t a2t = m_auShiftLM[a2s - 32]; +#if JVET_AJ0237_INTERNAL_12BIT + a = int(a1s * a2t); +#else a = a1s * a2t; +#endif } else { @@ -16625,8 +16666,11 @@ int IntraPrediction::xCalcLMParametersGeneralized(int x, int y, int xx, int xy, iShift = (iShift + iB) - n; a = a >> n; - +#if JVET_AJ0237_INTERNAL_12BIT + b = int(avgY - ((a * avgX) >> iShift)); +#else b = avgY - ((a * avgX) >> iShift); +#endif } return 0; } @@ -16699,7 +16743,11 @@ int IntraPrediction::xLMSampleClassifiedTraining(int count, int mean, int meanC, } } +#if JVET_AJ0237_INTERNAL_12BIT + int64_t x[2], y[2], xy[2], xx[2]; +#else int x[2], y[2], xy[2], xx[2]; +#endif for (int group = 0; group < 2; group++) { x[group] = y[group] = xy[group] = xx[group] = 0; @@ -16851,7 +16899,11 @@ void IntraPrediction::xGetLMParametersLMS(const PredictionUnit &pu, const Compon srcColor0 = temp.bufAt(0, 0); curChroma0 = getPredictorPtr(compID); +#if JVET_AJ0237_INTERNAL_12BIT + int64_t x = 0, y = 0, xx = 0, xy = 0; +#else int x = 0, y = 0, xx = 0, xy = 0; +#endif int iCountShift = 0; unsigned 
uiInternalBitDepth = sps.getBitDepth(CHANNEL_TYPE_CHROMA); @@ -17091,25 +17143,42 @@ void IntraPrediction::xGetLMParametersLMS(const PredictionUnit &pu, const Compon return; } } - +#if JVET_AJ0237_INTERNAL_12BIT + int iTempShift = uiInternalBitDepth + iCountShift - ((uiInternalBitDepth > 10) ? 31 : 15); +#else int iTempShift = uiInternalBitDepth + iCountShift - 15; +#endif if (iTempShift > 0) { +#if JVET_AJ0237_INTERNAL_12BIT + x = (x + ((int64_t)1 << (iTempShift - 1))) >> iTempShift; + y = (y + ((int64_t)1 << (iTempShift - 1))) >> iTempShift; + xx = (xx + ((int64_t)1 << (iTempShift - 1))) >> iTempShift; + xy = (xy + ((int64_t)1 << (iTempShift - 1))) >> iTempShift; +#else x = (x + (1 << (iTempShift - 1))) >> iTempShift; y = (y + (1 << (iTempShift - 1))) >> iTempShift; xx = (xx + (1 << (iTempShift - 1))) >> iTempShift; xy = (xy + (1 << (iTempShift - 1))) >> iTempShift; +#endif iCountShift -= iTempShift; } /////// xCalcLMParameters +#if JVET_AJ0237_INTERNAL_12BIT + int64_t avgX = x >> iCountShift; + int64_t avgY = y >> iCountShift; + int64_t RErrX = x & ((1 << iCountShift) - 1); + int64_t RErrY = y & ((1 << iCountShift) - 1); +#else int avgX = x >> iCountShift; int avgY = y >> iCountShift; int RErrX = x & ((1 << iCountShift) - 1); int RErrY = y & ((1 << iCountShift) - 1); +#endif int iB = 7; int a = 0; @@ -17122,20 +17191,32 @@ void IntraPrediction::xGetLMParametersLMS(const PredictionUnit &pu, const Compon } else { +#if JVET_AJ0237_INTERNAL_12BIT + int64_t a1 = xy - (avgX * avgY << iCountShift) - avgX * RErrY - avgY * RErrX; + int64_t a2 = xx - (avgX * avgX << iCountShift) - 2 * avgX * RErrX; +#else int a1 = xy - (avgX * avgY << iCountShift) - avgX * RErrY - avgY * RErrX; int a2 = xx - (avgX * avgX << iCountShift) - 2 * avgX * RErrX; +#endif const int iShiftA1 = uiInternalBitDepth - 2; const int iShiftA2 = 5; const int iAccuracyShift = uiInternalBitDepth + 4; int iScaleShiftA2 = 0; int iScaleShiftA1 = 0; +#if JVET_AJ0237_INTERNAL_12BIT + int64_t a1s = a1; + int64_t 
a2s = a2; + + iScaleShiftA1 = a1 == 0 ? 0 : floorLog2Uint64(abs(a1)) - iShiftA1; + iScaleShiftA2 = a2 == 0 ? 0 : floorLog2Uint64(abs(a2)) - iShiftA2; +#else int a1s = a1; int a2s = a2; iScaleShiftA1 = a1 == 0 ? 0 : floorLog2(abs(a1)) - iShiftA1; iScaleShiftA2 = a2 == 0 ? 0 : floorLog2(abs(a2)) - iShiftA2; - +#endif if (iScaleShiftA1 < 0) { iScaleShiftA1 = 0; @@ -17155,7 +17236,11 @@ void IntraPrediction::xGetLMParametersLMS(const PredictionUnit &pu, const Compon if (a2s >= 32) { uint32_t a2t = m_auShiftLM[a2s - 32]; +#if JVET_AJ0237_INTERNAL_12BIT + a = int(a1s * a2t); +#else a = a1s * a2t; +#endif } else { @@ -17181,7 +17266,11 @@ void IntraPrediction::xGetLMParametersLMS(const PredictionUnit &pu, const Compon iShift = (iShift + iB) - n; a = a >> n; +#if JVET_AJ0237_INTERNAL_12BIT + b = int(avgY - ((a * avgX) >> iShift)); +#else b = avgY - ((a * avgX) >> iShift); +#endif cclmModel.setFirstModel( a, b, iShift ); } @@ -23308,7 +23397,11 @@ void IntraPrediction::xCclmApplyModel(const PredictionUnit &pu, const ComponentI samples[0] = refLumaBlk.at(x, y); // C samples[1] = cccmModel.bias(); +#if JVET_AJ0237_INTERNAL_12BIT + piPred.at(x, y) = ClipPel<Pel>(Pel((cccmModel.params[0] * samples[0] + cccmModel.params[1] * samples[1] + cccmModel.decimRound) >> cccmModel.decimBits), clpRng); +#else piPred.at(x, y) = ClipPel<Pel>(Pel((cccmModel.params[0] * samples[0] + cccmModel.params[1] * samples[1] + CCCM_DECIM_ROUND) >> CCCM_DECIM_BITS), clpRng); +#endif } } } @@ -24292,7 +24385,11 @@ void IntraPrediction::combineCcpAndInter(PredictionUnit& pu, PelBuf& inPredCb, P #define DIV_INTR_BITS (DIV_PREC_BITS - DIV_SLOT_BITS) #define DIV_INTR_ROUND (1 << DIV_INTR_BITS >> 1) +#if JVET_AJ0237_INTERNAL_12BIT +int64_t xDivide(int64_t num, int64_t denom, int decimBits) // Note: assumes positive denominator +#else int64_t xDivide(int64_t num, int64_t denom) // Note: assumes positive denominator +#endif { static const int pow2W[8] = { 214, 153, 113, 86, 67, 53, 43, 35 }; // 
DIV_PREC_BITS_POW2 static const int pow2O[8] = { 4822, 5952, 6624, 6792, 6408, 5424, 3792, 1466 }; // DIV_PREC_BITS @@ -24306,11 +24403,19 @@ int64_t xDivide(int64_t num, int64_t denom) // Note: assumes positive denominato int scale = ((pow2W[diffFull] * ((normDiff2 * normDiff2) >> DIV_PREC_BITS)) >> DIV_PREC_BITS_POW2) - (normDiff2 >> 1) + pow2B[diffFull]; +#if JVET_AJ0237_INTERNAL_12BIT + return ((num << (decimBits - DIV_PREC_BITS)) * scale + round) >> shift; +#else return ( (num << (CCCM_DECIM_BITS - DIV_PREC_BITS)) * scale + round) >> shift; +#endif } #if JVET_AC0053_GAUSSIAN_SOLVER +#if JVET_AJ0237_INTERNAL_12BIT +void xGetDivScaleRoundShift(int64_t denom, int decimBits, int& scale, int& round, int& shift) // Note: assumes positive denominator +#else void xGetDivScaleRoundShift(int64_t denom, int &scale, int &round, int &shift) // Note: assumes positive denominator +#endif { static const int pow2W[8] = { 214, 153, 113, 86, 67, 53, 43, 35 }; // DIV_PREC_BITS_POW2 static const int pow2O[8] = { 4822, 5952, 6624, 6792, 6408, 5424, 3792, 1466 }; // DIV_PREC_BITS @@ -24323,7 +24428,11 @@ void xGetDivScaleRoundShift(int64_t denom, int &scale, int &round, int &shift) / int normDiff2 = normDiff - pow2O[diffFull]; scale = ((pow2W[diffFull] * ((normDiff2 * normDiff2) >> DIV_PREC_BITS)) >> DIV_PREC_BITS_POW2) - (normDiff2 >> 1) + pow2B[diffFull]; +#if JVET_AJ0237_INTERNAL_12BIT + scale <<= decimBits - DIV_PREC_BITS; +#else scale <<= CCCM_DECIM_BITS - DIV_PREC_BITS; +#endif } #endif @@ -24333,8 +24442,22 @@ void xGetDivScaleRoundShift(int64_t denom, int &scale, int &round, int &shift) / #undef DIV_INTR_BITS #undef DIV_INTR_ROUND +#if JVET_AJ0237_INTERNAL_12BIT +int xCccmDivideLowPrec(int64_t num, int64_t denom, int decimBits) +#else int xCccmDivideLowPrec(int64_t num, int64_t denom) +#endif { +#if JVET_AJ0237_INTERNAL_12BIT + if (num < 0) + { + return -int(xDivide(-num, denom, decimBits) >> decimBits); + } + else + { + return int(xDivide(num, denom, decimBits) >> 
decimBits); + } +#else if ( num < 0 ) { return -int(xDivide(-num, denom) >> CCCM_DECIM_BITS); @@ -24343,13 +24466,21 @@ int xCccmDivideLowPrec(int64_t num, int64_t denom) { return int(xDivide(num, denom) >> CCCM_DECIM_BITS); } +#endif } +#if JVET_AJ0237_INTERNAL_12BIT +int64_t xCccmDivide(int64_t num, int64_t denom, int decimBits) // Note: assumes positive denominator +{ + return xDivide(num, denom, decimBits); +} +#else int64_t xCccmDivide(int64_t num, int64_t denom) // Note: assumes positive denominator { return xDivide(num, denom); } #endif +#endif #if JVET_AD0120_LBCCP || JVET_AG0154_DECODER_DERIVED_CCP_FUSION #if JVET_AA0057_CCCM || JVET_AG0154_DECODER_DERIVED_CCP_FUSION @@ -26328,7 +26459,16 @@ int IntraPrediction::xBvgCccmCalcBlkAver(const PredictionUnit& pu) const } } #if JVET_AB0174_CCCM_DIV_FREE +#if JVET_AJ0237_INTERNAL_12BIT +#if JVET_AJ0237_INTERNAL_12BIT + const int bd = pu.cu->slice->getSPS()->getBitDepth(CHANNEL_TYPE_LUMA); + return numSamples == 0 ? (1 << (bd - 1)) : xCccmDivideLowPrec(sumSamples, numSamples, DECIM_BITS(bd)); +#else + return numSamples == 0 ? ((1 << pu.cu->slice->getSPS()->getBitDepth(CHANNEL_TYPE_LUMA)) - 1) : xCccmDivideLowPrec(sumSamples, numSamples); +#endif +#else return numSamples == 0 ? 512 : xCccmDivideLowPrec(sumSamples, numSamples); +#endif #else return numSamples == 0 ? 512 : ( sumSamples + numSamples/2) / numSamples; #endif @@ -26856,7 +26996,16 @@ int IntraPrediction::xCccmCalcRefAver(const PredictionUnit& pu #endif #if JVET_AB0174_CCCM_DIV_FREE +#if JVET_AJ0237_INTERNAL_12BIT +#if JVET_AJ0237_INTERNAL_12BIT + const int bd = pu.cu->slice->getSPS()->getBitDepth(CHANNEL_TYPE_LUMA); + return numSamples == 0 ? (1 << (bd - 1)) : xCccmDivideLowPrec(sumSamples, numSamples, DECIM_BITS(bd)); +#else + return numSamples == 0 ? ((1 << pu.cu->slice->getSPS()->getBitDepth(CHANNEL_TYPE_LUMA)) - 1) : xCccmDivideLowPrec(sumSamples, numSamples); +#endif +#else return numSamples == 0 ? 
512 : xCccmDivideLowPrec(sumSamples, numSamples); +#endif #else return numSamples == 0 ? 512 : ( sumSamples + numSamples/2) / numSamples; #endif @@ -27787,7 +27936,11 @@ int IntraPrediction::xCflmCalcRefAver(const PredictionUnit& pu, const CompArea& } #if JVET_AD0184_REMOVAL_OF_DIVISION_OPERATIONS +#if JVET_AJ0237_INTERNAL_12BIT + return numSamples == 0 ? (1 << (pu.cu->slice->getSPS()->getBitDepth(CHANNEL_TYPE_LUMA) - 1)) : PU::getMeanValue(sumSamples + (numSamples >> 1), numSamples); +#else return numSamples == 0 ? 512 : PU::getMeanValue( sumSamples + (numSamples >> 1), numSamples); +#endif #else return numSamples == 0 ? 512 : (sumSamples + numSamples / 2) / numSamples; #endif @@ -27797,7 +27950,11 @@ int IntraPrediction::xCflmCalcRefAver(const PredictionUnit& pu, const CompArea& #if JVET_AA0057_CCCM || JVET_AB0092_GLM_WITH_LUMA || JVET_AC0119_LM_CHROMA_FUSION || JVET_AG0058_EIP || JVET_AG0154_DECODER_DERIVED_CCP_FUSION #if JVET_AC0053_GAUSSIAN_SOLVER +#if JVET_AJ0237_INTERNAL_12BIT +void CccmCovariance::gaussBacksubstitution( TCccmCoeff* x, int numEq, int col, int round, int bits) +#else void CccmCovariance::gaussBacksubstitution( TCccmCoeff* x, int numEq, int col ) +#endif { x[numEq-1] = C[numEq-1][col]; @@ -27807,7 +27964,11 @@ void CccmCovariance::gaussBacksubstitution( TCccmCoeff* x, int numEq, int col ) for( int j = i+1; j < numEq; j++ ) { +#if JVET_AJ0237_INTERNAL_12BIT + x[i] -= FIXED_MULT(C[i][j], x[j], round, bits); +#else x[i] -= FIXED_MULT(C[i][j], x[j]); +#endif } } } @@ -27825,6 +27986,11 @@ void CccmCovariance::gaussElimination( TCccmCoeff A[CCCM_NUM_PARAMS_MAX][CCCM_NU #else int reg = 2 << (bd - 8); #endif + +#if JVET_AJ0237_INTERNAL_12BIT + const int decimBits = DECIM_BITS(bd); + const int decimRound = (1 << (decimBits - 1)); +#endif // Create an [M][M+2] matrix system (could have been done already when calculating auto/cross-correlations) for( int i = 0; i < numEq; i++ ) @@ -27847,7 +28013,11 @@ void CccmCovariance::gaussElimination( TCccmCoeff 
A[CCCM_NUM_PARAMS_MAX][CCCM_NU #if JVET_AB0174_CCCM_DIV_FREE int scale, round, shift; +#if JVET_AJ0237_INTERNAL_12BIT + xGetDivScaleRoundShift(diag, decimBits, scale, round, shift); +#else xGetDivScaleRoundShift(diag, scale, round, shift); +#endif #endif for( int j = i+1; j < numEq+numFilters; j++ ) @@ -27867,7 +28037,11 @@ void CccmCovariance::gaussElimination( TCccmCoeff A[CCCM_NUM_PARAMS_MAX][CCCM_NU // On row j all elements with k < i+1 are now zero (not zeroing those here as backsubstitution does not need them) for( int k = i + 1; k < numEq+numFilters; k++ ) { +#if JVET_AJ0237_INTERNAL_12BIT + dst[k] -= FIXED_MULT(scale, src[k], decimRound, decimBits); +#else dst[k] -= FIXED_MULT(scale, src[k]); +#endif } } } @@ -27875,12 +28049,21 @@ void CccmCovariance::gaussElimination( TCccmCoeff A[CCCM_NUM_PARAMS_MAX][CCCM_NU // Solve with backsubstitution if ( numFilters == 2 ) { +#if JVET_AJ0237_INTERNAL_12BIT + gaussBacksubstitution(x0, numEq, colChr0, decimRound, decimBits); + gaussBacksubstitution(x1, numEq, colChr1, decimRound, decimBits); +#else gaussBacksubstitution(x0, numEq, colChr0); gaussBacksubstitution(x1, numEq, colChr1); +#endif } else { +#if JVET_AJ0237_INTERNAL_12BIT + gaussBacksubstitution(x0, numEq, colChr0, decimRound, decimBits); +#else gaussBacksubstitution(x0, numEq, colChr0); +#endif } } @@ -28055,7 +28238,11 @@ void CccmCovariance::solve1( const Pel A[CCCM_NUM_PARAMS_MAX][CCCM_REF_SAMPLES_M #endif // Scale the matrix and vector to selected dynamic range +#if JVET_AJ0237_INTERNAL_12BIT + int matrixShift = ((model.bd > 10) ? 
CCCM_MATRIX_BITS_HBD : 28) - 2 * model.bd - ceilLog2(sampleNum); +#else int matrixShift = 28 - 2 * model.bd - ceilLog2( sampleNum ); +#endif if( matrixShift > 0 ) { @@ -28104,8 +28291,12 @@ void CccmCovariance::solve1( const Pel A[CCCM_NUM_PARAMS_MAX][CCCM_REF_SAMPLES_M #if JVET_AB0174_CCCM_DIV_FREE // Add the chroma offset to bias term (after shifting up by CCCM_DECIM_BITS and down by cccmModelCb.bd - 1) +#if JVET_AJ0237_INTERNAL_12BIT + model.params[numParams - 1] += chromaOffset << (model.decimBits - (model.bd - 1)); +#else model.params[numParams - 1] += chromaOffset << (CCCM_DECIM_BITS - (model.bd - 1)); #endif +#endif } #if JVET_AB0174_CCCM_DIV_FREE @@ -28174,7 +28365,11 @@ void CccmCovariance::solve2( const Pel A[CCCM_NUM_PARAMS_MAX][CCCM_REF_SAMPLES_M // Scale the matrix and vector to selected dynamic range CHECK( modelCb.bd != modelCr.bd, "Bitdepth of Cb and Cr is different" ); #if JVET_AE0059_INTER_CCCM +#if JVET_AJ0237_INTERNAL_12BIT + int matrixShift = ((modelCb.bd > 10) ? CCCM_MATRIX_BITS_HBD : (interCccmMode ? 28 : CCCM_MATRIX_BITS)) - 2 * modelCb.bd - ceilLog2(sampleNum); +#else int matrixShift = (interCccmMode ? 
28 : CCCM_MATRIX_BITS) - 2 * modelCb.bd - ceilLog2( sampleNum ); +#endif #else int matrixShift = CCCM_MATRIX_BITS - 2 * modelCb.bd - ceilLog2( sampleNum ); #endif @@ -28242,9 +28437,14 @@ void CccmCovariance::solve2( const Pel A[CCCM_NUM_PARAMS_MAX][CCCM_REF_SAMPLES_M #if JVET_AB0174_CCCM_DIV_FREE // Add the chroma offset to bias term (after shifting up by CCCM_DECIM_BITS and down by cccmModelCb.bd - 1) +#if JVET_AJ0237_INTERNAL_12BIT + modelCb.params[numParams - 1] += chromaOffsetCb << (modelCb.decimBits - (modelCb.bd - 1)); + modelCr.params[numParams - 1] += chromaOffsetCr << (modelCr.decimBits - (modelCr.bd - 1)); +#else modelCb.params[numParams - 1] += chromaOffsetCb << (CCCM_DECIM_BITS - (modelCb.bd - 1)); modelCr.params[numParams - 1] += chromaOffsetCr << (CCCM_DECIM_BITS - (modelCr.bd - 1)); #endif +#endif } #endif @@ -30445,8 +30645,16 @@ void CccmCovariance::solveEip(const TCccmCoeff* A, const TCccmCoeff* Y, const in { regularizationParam = (sampleNum <= REGULARIZED_EIP_L2_SAMPLE_THRESHOLD) ? REGULARIZED_EIP_L2_SMALL * numParams : REGULARIZED_EIP_L2_LARGE * numParams; } +#if JVET_AJ0237_INTERNAL_12BIT + regularizationParam <<= 2 * std::max(0, model.bd - 10); +#endif +#else +#if JVET_AJ0237_INTERNAL_12BIT + int regShift = 2 * std::max(0, model.bd - 10); + const int regularizationParam = ((sampleNum <= REGULARIZED_EIP_L2_SAMPLE_THRESHOLD) ? REGULARIZED_EIP_L2_SMALL * numParams : REGULARIZED_EIP_L2_LARGE * numParams) << regShift; #else const int regularizationParam = (sampleNum <= REGULARIZED_EIP_L2_SAMPLE_THRESHOLD) ? REGULARIZED_EIP_L2_SMALL * numParams : REGULARIZED_EIP_L2_LARGE * numParams; +#endif #endif for (int coli0 = 0; coli0 < numParams - 1; coli0++) // The last term (bias) is not regularized. { @@ -30467,7 +30675,11 @@ void CccmCovariance::solveEip(const TCccmCoeff* A, const TCccmCoeff* Y, const in } #endif // Scale the matrix and vector to selected dynamic range +#if JVET_AJ0237_INTERNAL_12BIT + int matrixShift = ((model.bd > 10) ? 
CCCM_MATRIX_BITS_HBD : 28) - 2 * model.bd - ceilLog2(sampleNum); +#else int matrixShift = 28 - 2 * model.bd - ceilLog2(sampleNum); +#endif if (matrixShift > 0) { @@ -30515,8 +30727,12 @@ void CccmCovariance::solveEip(const TCccmCoeff* A, const TCccmCoeff* Y, const in #if JVET_AB0174_CCCM_DIV_FREE // Add the chroma offset to bias term (after shifting up by CCCM_DECIM_BITS and down by cccmModelCb.bd - 1) +#if JVET_AJ0237_INTERNAL_12BIT + model.params[numParams - 1] += lumaOffset << (model.decimBits - (model.bd - 1)); +#else model.params[numParams - 1] += lumaOffset << (CCCM_DECIM_BITS - (model.bd - 1)); #endif +#endif } void IntraPrediction::initEipParams(const PredictionUnit& pu, const ComponentID compId) diff --git a/source/Lib/CommonLib/IntraPrediction.h b/source/Lib/CommonLib/IntraPrediction.h index 9601371d8..66ae8a1e5 100644 --- a/source/Lib/CommonLib/IntraPrediction.h +++ b/source/Lib/CommonLib/IntraPrediction.h @@ -145,7 +145,11 @@ typedef short TrainDataType; #if JVET_AA0057_CCCM || JVET_AB0092_GLM_WITH_LUMA || JVET_AC0119_LM_CHROMA_FUSION || JVET_AG0058_EIP || JVET_AG0154_DECODER_DERIVED_CCP_FUSION typedef int64_t TCccmCoeff; +#if JVET_AJ0237_INTERNAL_12BIT +#define FIXED_MULT(x, y, round, bits) TCccmCoeff((int64_t(x)*(y) + (round)) >> (bits) ) /* round/bits are parenthesized so expression arguments expand safely */ +#else #define FIXED_MULT(x, y) TCccmCoeff((int64_t(x)*(y) + CCCM_DECIM_ROUND) >> CCCM_DECIM_BITS ) +#endif #if !JVET_AB0174_CCCM_DIV_FREE #define FIXED_DIV(x, y) TCccmCoeff((int64_t(x) << CCCM_DECIM_BITS ) / (y) ) #endif @@ -157,6 +161,10 @@ struct CccmModel bd = bitdepth; midVal = ( 1 << ( bitdepth - 1 ) ); params.resize( num ); +#if JVET_AJ0237_INTERNAL_12BIT + decimBits = DECIM_BITS(bd); + decimRound = (1 << (decimBits - 1)); +#endif } ~CccmModel() {} @@ -164,6 +172,10 @@ struct CccmModel std::vector<TCccmCoeff> params; int bd; int midVal; +#if JVET_AJ0237_INTERNAL_12BIT + int decimRound; + int decimBits; +#endif const int getNumParams() const { @@ -176,7 +188,11 @@ struct CccmModel std::fill( params.begin(), 
params.end(), 0 ); +#if JVET_AJ0237_INTERNAL_12BIT + params[numParams - 1] = (TCccmCoeff)1 << decimBits; // Default bias to 1 +#else params[numParams - 1] = 1 << CCCM_DECIM_BITS; // Default bias to 1 +#endif } Pel convolve(Pel* vector) @@ -188,7 +204,11 @@ struct CccmModel sum += params[i] * vector[i]; } +#if JVET_AJ0237_INTERNAL_12BIT + return Pel( (sum + decimRound) >> decimBits); +#else return Pel( (sum + CCCM_DECIM_ROUND ) >> CCCM_DECIM_BITS ); +#endif } Pel nonlinear(const Pel val) { return (val * val + midVal) >> bd; } @@ -238,7 +258,11 @@ private: TCccmCoeff C[CCCM_NUM_PARAMS_MAX][CCCM_NUM_PARAMS_MAX + 2]; #if JVET_AC0053_GAUSSIAN_SOLVER +#if JVET_AJ0237_INTERNAL_12BIT + void gaussBacksubstitution ( TCccmCoeff* x, int numEq, int col, int round, int bits); +#else void gaussBacksubstitution ( TCccmCoeff* x, int numEq, int col ); +#endif #if JVET_AE0059_INTER_CCCM void gaussElimination ( TCccmCoeff A[CCCM_NUM_PARAMS_MAX][CCCM_NUM_PARAMS_MAX], TCccmCoeff* y0, TCccmCoeff* x0, TCccmCoeff* y1, TCccmCoeff* x1, int numEq, int numFilters, int bd, const bool interCccmMode = false); #else @@ -659,7 +683,11 @@ public: int b; int shift; }; +#if JVET_AJ0237_INTERNAL_12BIT + int xCalcLMParametersGeneralized(int64_t x, int64_t y, int64_t xx, int64_t xy, int count, int bitDepth, int& a, int& b, int& iShift); +#else int xCalcLMParametersGeneralized(int x, int y, int xx, int xy, int count, int bitDepth, int &a, int &b, int &iShift); +#endif int xLMSampleClassifiedTraining (int count, int mean, int meanC, int lumaSamples[], int chrmSamples[], int bitDepth, MMLMParameters parameters[]); #if JVET_AG0136_INTRA_TMP_LIC std::array<int, 7>& getMemLicParams(const int licIdc, const int idx) { return m_memLicParams[licIdc][idx]; } diff --git a/source/Lib/CommonLib/Picture.cpp b/source/Lib/CommonLib/Picture.cpp index c3e95010c..e1e609a75 100644 --- a/source/Lib/CommonLib/Picture.cpp +++ b/source/Lib/CommonLib/Picture.cpp @@ -1810,6 +1810,9 @@ void Picture::calcLumaClpParams() 
clipDeltaShift = ADAPTIVE_CLIP_SHIFT_DELTA_VALUE_0; cs->slice->setAdaptiveClipQuant(false); } +#if JVET_AJ0237_INTERNAL_12BIT + clipDeltaShift += std::max(0, cs->sps->getBitDepth(toChannelType(COMPONENT_Y)) - 10); +#endif int pelMaxOF = 0; int pelMinOF = (1 << cs->sps->getBitDepth(toChannelType(COMPONENT_Y))) - 1; const int orgPelMin = pelMin; diff --git a/source/Lib/CommonLib/Rom.cpp b/source/Lib/CommonLib/Rom.cpp index 3dfaab825..b4aba4478 100644 --- a/source/Lib/CommonLib/Rom.cpp +++ b/source/Lib/CommonLib/Rom.cpp @@ -625,14 +625,26 @@ MsgLevel g_verbosity = VERBOSE; #if JVET_Y0141_SIGN_PRED_IMPROVE #if JVET_W0119_LFNST_EXTENSION || EXTENDED_LFNST #if JVET_AJ0175_NSPT_FOR_NONREG_MODES +#if JVET_AJ0237_INTERNAL_12BIT +int16_t* g_resiBorderTemplateLFNST[NUM_NSPT_BLOCK_TYPES][6][6][210]; +#else int8_t * g_resiBorderTemplateLFNST[NUM_NSPT_BLOCK_TYPES][6][6][210]; +#endif +#else +#if JVET_AJ0237_INTERNAL_12BIT +int16_t* g_resiBorderTemplateLFNST[6][6][210]; #else int8_t * g_resiBorderTemplateLFNST[6][6][210]; #endif +#endif #else int8_t * g_resiBorderTemplateLFNST[6][6][16]; #endif +#if JVET_AJ0237_INTERNAL_12BIT +int16_t* g_resiBorderTemplate[6][6][NUM_TRANS_TYPE * NUM_TRANS_TYPE]; +#else int8_t * g_resiBorderTemplate[6][6][NUM_TRANS_TYPE*NUM_TRANS_TYPE]; +#endif #else const int8_t * g_resiBorderTemplate[6][6][NUM_TRANS_TYPE*NUM_TRANS_TYPE]; #endif diff --git a/source/Lib/CommonLib/Rom.h b/source/Lib/CommonLib/Rom.h index eb045838d..6f832c7a0 100644 --- a/source/Lib/CommonLib/Rom.h +++ b/source/Lib/CommonLib/Rom.h @@ -82,14 +82,26 @@ void destroyMipFilters(); #if JVET_Y0141_SIGN_PRED_IMPROVE #if JVET_W0119_LFNST_EXTENSION || EXTENDED_LFNST #if JVET_AJ0175_NSPT_FOR_NONREG_MODES +#if JVET_AJ0237_INTERNAL_12BIT +extern int16_t* g_resiBorderTemplateLFNST[NUM_NSPT_BLOCK_TYPES][6][6][210]; +#else extern int8_t * g_resiBorderTemplateLFNST[NUM_NSPT_BLOCK_TYPES][6][6][210]; +#endif +#else +#if JVET_AJ0237_INTERNAL_12BIT +extern int16_t* g_resiBorderTemplateLFNST[6][6][210]; 
#else extern int8_t * g_resiBorderTemplateLFNST[6][6][210]; #endif +#endif #else extern int8_t * g_resiBorderTemplateLFNST[6][6][16]; #endif +#if JVET_AJ0237_INTERNAL_12BIT +extern int16_t* g_resiBorderTemplate[6][6][NUM_TRANS_TYPE * NUM_TRANS_TYPE]; +#else extern int8_t * g_resiBorderTemplate[6][6][NUM_TRANS_TYPE*NUM_TRANS_TYPE]; +#endif #else extern const int8_t * g_resiBorderTemplate[6][6][NUM_TRANS_TYPE*NUM_TRANS_TYPE]; #endif diff --git a/source/Lib/CommonLib/SampleAdaptiveOffset.cpp b/source/Lib/CommonLib/SampleAdaptiveOffset.cpp index 04f37c011..f2a946054 100644 --- a/source/Lib/CommonLib/SampleAdaptiveOffset.cpp +++ b/source/Lib/CommonLib/SampleAdaptiveOffset.cpp @@ -2158,7 +2158,12 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClipEdge(const ComponentID compID, const int edgePosXB = g_ccSaoEdgePosX[edgeDir][1], edgePosYB = g_ccSaoEdgePosY[edgeDir][1]; const int bandCmp = g_ccSaoBandTab [bandIdc][0]; const int bandNum = g_ccSaoBandTab [bandIdc][1]; +#if JVET_AJ0237_INTERNAL_12BIT + const int bdShift = std::max(0, bitDepth - 10); + const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr] << bdShift; +#else const int edgeThrVal = g_ccSaoEdgeThr [edgeIdc][edgeThr]; +#endif const int edgeNum = g_ccSaoEdgeNum [edgeIdc][0]; const int edgeNumUni = g_ccSaoEdgeNum [edgeIdc][1]; const int srcStrideE = edgeCmp == COMPONENT_Y ? srcStrideY : edgeCmp == COMPONENT_Cb ? 
srcStrideU : srcStrideV; @@ -2216,7 +2221,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClipEdge(const ComponentID compID, const int bandIdx = (*col[bandCmp] * bandNum) >> bitDepth; const int classIdx = bandIdx * edgeNum + edgeIdx; +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif #else const Pel *colY = srcY + x; const Pel *colA = srcY + x + srcStrideY * candPosYYA + candPosYXA; @@ -2295,7 +2304,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClipEdge(const ComponentID compID, const int bandIdx = (*col[bandCmp] * bandNum) >> bitDepth; const int classIdx = bandIdx * edgeNum + edgeIdx; +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif #else const Pel *colY = srcY + x; const Pel *colA = srcY + x + srcStrideY * candPosYYA + candPosYXA; @@ -2369,7 +2382,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClipEdge(const ComponentID compID, const int bandIdx = (*col[bandCmp] * bandNum) >> bitDepth; const int classIdx = bandIdx * edgeNum + edgeIdx; +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif #else const Pel *colY = srcY + x; const Pel *colA = srcY + x + srcStrideY * candPosYYA + candPosYXA; @@ -2436,7 +2453,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClipEdge(const ComponentID compID, const int bandIdx = (*col[bandCmp] * bandNum) >> bitDepth; const int classIdx = bandIdx * edgeNum + edgeIdx; +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif #else const Pel *colY = srcY + x; const Pel *colA = srcY + x + srcStrideY * candPosYYA + candPosYXA; @@ -2504,7 +2525,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClipEdge(const ComponentID compID, const int 
bandIdx = (*col[bandCmp] * bandNum) >> bitDepth; const int classIdx = bandIdx * edgeNum + edgeIdx; +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif #else const Pel *colY = srcY + x; const Pel *colA = srcY + x + srcStrideY * candPosYYA + candPosYXA; @@ -2573,7 +2598,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClipEdge(const ComponentID compID, const int bandIdx = (*col[bandCmp] * bandNum) >> bitDepth; const int classIdx = bandIdx * edgeNum + edgeIdx; +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif #else const Pel *colY = srcY + x; const Pel *colA = srcY + x + srcStrideY * candPosYYA + candPosYXA; @@ -2640,7 +2669,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClipEdge(const ComponentID compID, const int bandIdx = (*col[bandCmp] * bandNum) >> bitDepth; const int classIdx = bandIdx * edgeNum + edgeIdx; +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif #else const Pel *colY = srcY + x; const Pel *colA = srcY + x + srcStrideY * candPosYYA + candPosYXA; @@ -2708,7 +2741,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClipEdge(const ComponentID compID, const int bandIdx = (*col[bandCmp] * bandNum) >> bitDepth; const int classIdx = bandIdx * edgeNum + edgeIdx; +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif #else const Pel *colY = srcY + x; const Pel *colA = srcY + x + srcStrideY * candPosYYA + candPosYXA; @@ -2788,7 +2825,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClipEdge(const ComponentID compID, const int bandIdx = (*col[bandCmp] * bandNum) >> bitDepth; const int classIdx = bandIdx * edgeNum + edgeIdx; +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = 
dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif #else const Pel *colY = srcY + (x << chromaScaleX); const Pel *colA = srcY + (x << chromaScaleX) + srcStrideY * candPosYYA + candPosYXA; @@ -2868,7 +2909,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClipEdge(const ComponentID compID, const int bandIdx = (*col[bandCmp] * bandNum) >> bitDepth; const int classIdx = bandIdx * edgeNum + edgeIdx; +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif #else const Pel *colY = srcY + (x << chromaScaleX); const Pel *colA = srcY + (x << chromaScaleX) + srcStrideY * candPosYYA + candPosYXA; @@ -2943,7 +2988,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClipEdge(const ComponentID compID, const int bandIdx = (*col[bandCmp] * bandNum) >> bitDepth; const int classIdx = bandIdx * edgeNum + edgeIdx; +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif #else const Pel *colY = srcY + (x << chromaScaleX); const Pel *colA = srcY + (x << chromaScaleX) + srcStrideY * candPosYYA + candPosYXA; @@ -3011,7 +3060,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClipEdge(const ComponentID compID, const int bandIdx = (*col[bandCmp] * bandNum) >> bitDepth; const int classIdx = bandIdx * edgeNum + edgeIdx; +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif #else const Pel *colY = srcY + (x << chromaScaleX); const Pel *colA = srcY + (x << chromaScaleX) + srcStrideY * candPosYYA + candPosYXA; @@ -3080,7 +3133,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClipEdge(const ComponentID compID, const int bandIdx = (*col[bandCmp] * bandNum) >> bitDepth; const int classIdx = bandIdx * edgeNum + edgeIdx; +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = 
dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif #else const Pel *colY = srcY + (x << chromaScaleX); const Pel *colA = srcY + (x << chromaScaleX) + srcStrideY * candPosYYA + candPosYXA; @@ -3150,7 +3207,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClipEdge(const ComponentID compID, const int bandIdx = (*col[bandCmp] * bandNum) >> bitDepth; const int classIdx = bandIdx * edgeNum + edgeIdx; +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif #else const Pel *colY = srcY + (x << chromaScaleX); const Pel *colA = srcY + (x << chromaScaleX) + srcStrideY * candPosYYA + candPosYXA; @@ -3218,7 +3279,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClipEdge(const ComponentID compID, const int bandIdx = (*col[bandCmp] * bandNum) >> bitDepth; const int classIdx = bandIdx * edgeNum + edgeIdx; +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif #else const Pel *colY = srcY + (x << chromaScaleX); const Pel *colA = srcY + (x << chromaScaleX) + srcStrideY * candPosYYA + candPosYXA; @@ -3287,7 +3352,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClipEdge(const ComponentID compID, const int bandIdx = (*col[bandCmp] * bandNum) >> bitDepth; const int classIdx = bandIdx * edgeNum + edgeIdx; +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif #else const Pel *colY = srcY + (x << chromaScaleX); const Pel *colA = srcY + (x << chromaScaleX) + srcStrideY * candPosYYA + candPosYXA; @@ -3538,7 +3607,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + 
(offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif } srcY += srcStrideY; srcU += srcStrideU * chromaScaleYM1; @@ -3565,7 +3638,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif } srcY += srcStrideY; srcU += srcStrideU * ((y & 0x1) | chromaScaleYM1); @@ -3604,7 +3681,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif } srcY += srcStrideY; @@ -3638,7 +3719,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif } srcY += srcStrideY; @@ -3666,7 +3751,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif } srcY += srcStrideY; @@ -3700,7 +3789,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] 
+ offset[classIdx]; +#endif } srcY += srcStrideY; srcU += srcStrideU * ((y & 0x1) | chromaScaleYM1); @@ -3727,7 +3820,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif } srcY += srcStrideY; srcU += srcStrideU * ((y & 0x1) | chromaScaleYM1); @@ -3760,7 +3857,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif } srcY += srcStrideY; srcU += srcStrideU * ((y & 0x1) | chromaScaleYM1); @@ -3793,7 +3894,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif } srcY += srcStrideY; srcU += srcStrideU * ((y & 0x1) | chromaScaleYM1); @@ -3822,7 +3927,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif } break; } @@ -3850,7 +3959,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << 
m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif } srcY += srcStrideY; srcU += srcStrideU * ((y & 0x1) | chromaScaleYM1); @@ -3883,7 +3996,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif } srcY += srcStrideY; srcU += srcStrideU * ((y & 0x1) | chromaScaleYM1); @@ -3911,7 +4028,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif } break; } @@ -3950,7 +4071,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif } srcY += srcStrideY << chromaScaleY; srcU += srcStrideU; @@ -3977,7 +4102,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif } srcY += srcStrideY << chromaScaleY; srcU += srcStrideU; @@ -4016,7 +4145,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] 
<< m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif } srcY += srcStrideY << chromaScaleY; @@ -4050,7 +4183,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif } srcY += srcStrideY << chromaScaleY; @@ -4078,7 +4215,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif } srcY += srcStrideY << chromaScaleY; @@ -4112,7 +4253,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif } srcY += srcStrideY << chromaScaleY; srcU += srcStrideU; @@ -4139,7 +4284,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif } srcY += srcStrideY << chromaScaleY; srcU += srcStrideU; @@ -4172,7 +4321,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else 
dst[x] = dst[x] + offset[classIdx]; +#endif } srcY += srcStrideY << chromaScaleY; srcU += srcStrideU; @@ -4205,7 +4358,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif } srcY += srcStrideY << chromaScaleY; srcU += srcStrideU; @@ -4234,7 +4391,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif } break; } @@ -4262,7 +4423,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif } srcY += srcStrideY << chromaScaleY; srcU += srcStrideU; @@ -4295,7 +4460,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + offset[classIdx]; +#endif } srcY += srcStrideY << chromaScaleY; srcU += srcStrideU; @@ -4322,7 +4491,11 @@ void SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, cons const int classIdx = bandIdx; // dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); +#if JVET_AJ0237_INTERNAL_12BIT + dst[x] = dst[x] + (offset[classIdx] << m_offsetStepLog2[compID]); +#else dst[x] = dst[x] + 
offset[classIdx]; +#endif } } break; diff --git a/source/Lib/CommonLib/Slice.h b/source/Lib/CommonLib/Slice.h index 29864a45a..ec4cababa 100644 --- a/source/Lib/CommonLib/Slice.h +++ b/source/Lib/CommonLib/Slice.h @@ -3531,8 +3531,15 @@ public: void setColRefIdx( uint32_t refIdx) { m_colRefIdx = refIdx; } uint32_t getColRefIdx() { return m_colRefIdx; } #if JVET_AA0093_DIVERSITY_CRITERION_FOR_ARMC +#if JVET_AJ0237_INTERNAL_12BIT + void setCostForARMC(uint32_t cost, int bitDepth) { m_costForARMC = (cost << (std::max(0, bitDepth - 10))); } +#else void setCostForARMC(uint32_t cost) { m_costForARMC = cost; } +#endif uint32_t getCostForARMC() { return m_costForARMC; } +#if JVET_AJ0237_INTERNAL_12BIT + uint32_t getCostForARMC(int bitDepth) { return m_costForARMC >> (std::max(0, bitDepth - 10)); } // for header parsing/writing purpose +#endif #endif #if JVET_AC0185_ENHANCED_TEMPORAL_MOTION_DERIVATION void setPicColFromL0Flag2nd(bool val) { m_picColFromL0Flag2nd = val; } @@ -4084,6 +4091,9 @@ public: #endif void checkColRefIdx(uint32_t curSliceSegmentIdx, const Picture* pic); #if JVET_AA0093_DIVERSITY_CRITERION_FOR_ARMC +#if JVET_AJ0237_INTERNAL_12BIT + uint32_t getCostForARMC(int bitDepth) const { return m_costForARMC >> (std::max(0, bitDepth - 10)); } // for header parsing/writing purpose +#endif uint32_t getCostForARMC() const { return m_costForARMC; } #endif #if JVET_Y0134_TMVP_NAMVP_CAND_REORDERING @@ -4284,7 +4294,11 @@ public: void setExtAmvpLevel(int b) { m_extAmvpLevel = b; } #endif #if JVET_AA0093_DIVERSITY_CRITERION_FOR_ARMC +#if JVET_AJ0237_INTERNAL_12BIT + void setCostForARMC(uint32_t cost, int bitDepth) { m_costForARMC = (cost << (std::max(0, bitDepth - 10))); } +#else void setCostForARMC(uint32_t cost) { m_costForARMC = cost; } +#endif #endif void setBiDirPred( bool b, int refIdx0, int refIdx1 ) { m_biDirPred = b; m_symRefIdx[0] = refIdx0; m_symRefIdx[1] = refIdx1; } bool getBiDirPred() const { return m_biDirPred; } diff --git 
a/source/Lib/CommonLib/TrQuant.cpp b/source/Lib/CommonLib/TrQuant.cpp index ac707949d..c731c3976 100644 --- a/source/Lib/CommonLib/TrQuant.cpp +++ b/source/Lib/CommonLib/TrQuant.cpp @@ -2529,7 +2529,11 @@ void TrQuant::predCoeffSigns(TransformUnit &tu, const ComponentID compID, const ComponentID residCompID = compID; bool bJccrWithCr = bIsJCCR && !(tu.jointCbCr >> 1); #if JVET_AI0096_SIGN_PRED_BIT_DEPTH_FIX +#if JVET_AJ0237_INTERNAL_12BIT + const int signPredShift = SIGN_PRED_RESIDUAL_BITS; +#else const int signPredShift = 10 + SIGN_PRED_RESIDUAL_BITS - tu.cs->sps->getBitDepth(toChannelType(COMPONENT_Y)); +#endif const int signPredOffset = 1 << (signPredShift - 1); #endif if(bJccrWithCr) @@ -2627,14 +2631,23 @@ void TrQuant::predCoeffSigns(TransformUnit &tu, const ComponentID compID, const int spArea = tu.cs->sps->getSignPredArea(); int signPredWidth = std::min((int)width, spArea); int signPredHeight = std::min((int)height, spArea); +#if JVET_AJ0237_INTERNAL_12BIT + int16_t* pTemplate = (int16_t*)xMalloc(int16_t, stride * h * w); + AreaBuf<int16_t> templateBuf(pTemplate, stride, length, h * w); +#else int8_t *pTemplate = (int8_t *) xMalloc(int8_t, stride * h * w); AreaBuf<int8_t> templateBuf(pTemplate, stride, length, h * w); +#endif #else int8_t *pTemplate = (int8_t *) xMalloc(int8_t, stride * SIGN_PRED_FREQ_RANGE * SIGN_PRED_FREQ_RANGE); AreaBuf<int8_t> templateBuf(pTemplate, stride, length, SIGN_PRED_FREQ_RANGE * SIGN_PRED_FREQ_RANGE); #endif Position prev(0,0); +#if JVET_AJ0237_INTERNAL_12BIT + int16_t* templ = templateBuf.buf; +#else int8_t *templ = templateBuf.buf; +#endif #if JVET_Y0141_SIGN_PRED_IMPROVE for (int j = 0; j < signPredHeight*signPredWidth; ++j) { @@ -2660,8 +2673,12 @@ void TrQuant::predCoeffSigns(TransformUnit &tu, const ComponentID compID, const for (uint32_t i = 0; i < height; i++) { +#if JVET_AJ0237_INTERNAL_12BIT + templ[i] = (int16_t)(*pelResi); +#else CHECK(*pelResi < -128 || *pelResi > 127, "value exceeds 8-bit range"); templ[i] = 
(int8_t)(*pelResi); +#endif pelResi -= resi.stride; } @@ -2669,8 +2686,12 @@ void TrQuant::predCoeffSigns(TransformUnit &tu, const ComponentID compID, const for (uint32_t i = 0; i < width; i++) { +#if JVET_AJ0237_INTERNAL_12BIT + templ[i + height] = (int16_t)pelResi[i]; +#else CHECK(pelResi[i] < -128 || pelResi[i] > 127, "value exceeds 8-bit range"); templ[i + height] = (int8_t) pelResi[i]; +#endif } #if !JVET_Y0141_SIGN_PRED_IMPROVE templ += templateBuf.stride; @@ -2701,9 +2722,15 @@ void TrQuant::predCoeffSigns(TransformUnit &tu, const ComponentID compID, const PelBuf resi(memTmpResid, width, height); int signPredHeight = 4; int signPredWidth = 4; +#if JVET_AJ0237_INTERNAL_12BIT + int16_t* pTemplate = (int16_t*)xMalloc(int16_t, stride * signPredHeight * signPredWidth); + AreaBuf<int16_t> templateBuf(pTemplate, stride, length, signPredHeight* signPredWidth); + int16_t* templ = templateBuf.buf; +#else int8_t *pTemplate = (int8_t *) xMalloc(int8_t, stride * signPredHeight * signPredWidth); AreaBuf<int8_t> templateBuf(pTemplate, stride, length, signPredHeight * signPredWidth); int8_t *templ = templateBuf.buf; +#endif for (int j = 0; j < signPredHeight*signPredWidth; ++j) { coeff.fill(0); @@ -2721,8 +2748,12 @@ void TrQuant::predCoeffSigns(TransformUnit &tu, const ComponentID compID, const for (uint32_t i = 0; i < height; i++) { +#if JVET_AJ0237_INTERNAL_12BIT + templ[i] = (int16_t)(*pelResi); +#else CHECK(*pelResi < -128 || *pelResi > 127, "value exceeds 8-bit range"); templ[i] = (int8_t)(*pelResi); +#endif pelResi -= resi.stride; } @@ -2730,8 +2761,12 @@ void TrQuant::predCoeffSigns(TransformUnit &tu, const ComponentID compID, const for (uint32_t i = 0; i < width; i++) { +#if JVET_AJ0237_INTERNAL_12BIT + templ[i + height] = (int16_t)pelResi[i]; +#else CHECK(pelResi[i] < -128 || pelResi[i] > 127, "value exceeds 8-bit range"); templ[i + height] = (int8_t) pelResi[i]; +#endif } templ += templateBuf.stride; } @@ -2838,23 +2873,44 @@ void 
TrQuant::predCoeffSigns(TransformUnit &tu, const ComponentID compID, const const uint32_t w = std::min(uiWidth, (uint32_t)SIGN_PRED_FREQ_RANGE); const uint32_t h = std::min(uiHeight, (uint32_t)SIGN_PRED_FREQ_RANGE); +#if JVET_AJ0237_INTERNAL_12BIT + AreaBuf<const int16_t> templateNormalizedBuf = + (lfnstEnabled ? AreaBuf<const int16_t>() + : AreaBuf<const int16_t>(g_resiBorderTemplate[log2Width - 2][log2Height - 2][actualTrIdx], stride, + length, w * h)); +#else AreaBuf<const int8_t> templateNormalizedBuf = (lfnstEnabled ? AreaBuf<const int8_t>() : AreaBuf<const int8_t>(g_resiBorderTemplate[log2Width - 2][log2Height - 2][actualTrIdx], stride, length, w * h)); +#endif #if JVET_AJ0175_NSPT_FOR_NONREG_MODES bool allowNSPT = CU::isNSPTAllowed( tu, compID, uiWidth, uiHeight, spsIntraLfnstEnabled && CU::isIntra( *( tu.cu ) ) ); int nsptBucketIdx = allowNSPT ? PU::getNSPTBucket(tu) : 0; +#if JVET_AJ0237_INTERNAL_12BIT + AreaBuf<const int16_t> templateLfnstNormalizedBuf = + (lfnstEnabled ? AreaBuf<const int16_t>(g_resiBorderTemplateLFNST[nsptBucketIdx][log2Width - 2][log2Height - 2][actualLfnstIdx], + stride, length, signPredWidth * signPredHeight) + : AreaBuf<const int16_t>()); +#else AreaBuf<const int8_t> templateLfnstNormalizedBuf = (lfnstEnabled ? AreaBuf<const int8_t>(g_resiBorderTemplateLFNST[nsptBucketIdx][log2Width - 2][log2Height - 2][actualLfnstIdx], stride, length, signPredWidth * signPredHeight) : AreaBuf<const int8_t>()); +#endif +#else +#if JVET_AJ0237_INTERNAL_12BIT + AreaBuf<const int16_t> templateLfnstNormalizedBuf = + (lfnstEnabled ? AreaBuf<const int16_t>(g_resiBorderTemplateLFNST[log2Width - 2][log2Height - 2][actualLfnstIdx], + stride, length, signPredWidth * signPredHeight) + : AreaBuf<const int16_t>()); #else AreaBuf<const int8_t> templateLfnstNormalizedBuf = (lfnstEnabled ? 
AreaBuf<const int8_t>(g_resiBorderTemplateLFNST[log2Width - 2][log2Height - 2][actualLfnstIdx], stride, length, signPredWidth * signPredHeight) : AreaBuf<const int8_t>()); #endif +#endif #else AreaBuf<const int8_t> templateNormalizedBuf(g_resiBorderTemplate[log2Width - 2][log2Height - 2][actualTrIdx], stride, length, SIGN_PRED_FREQ_RANGE * SIGN_PRED_FREQ_RANGE); @@ -2915,7 +2971,11 @@ void TrQuant::predCoeffSigns(TransformUnit &tu, const ComponentID compID, const CHECK(coeffVal == 0, "coefficient value should be nonzero"); #endif +#if JVET_AJ0237_INTERNAL_12BIT + const int16_t* templateBasisVec; +#else const int8_t *templateBasisVec; +#endif #if JVET_Y0141_SIGN_PRED_IMPROVE if (lfnstEnabled) diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h index a2ab6604d..d5e901ea3 100644 --- a/source/Lib/CommonLib/TypeDef.h +++ b/source/Lib/CommonLib/TypeDef.h @@ -525,6 +525,7 @@ #define JVET_Z0150_MEMORY_USAGE_PRINT 1 // JVET-Z0150: Print memory usage #define JVET_Z0118_GDR 1 // JVET-Z0118: GDR #define JVET_AD0169_SMALL_SCALE_DOWNSAMPLING 1 // JVET-AD0169: Downsampling filters in range 1.1 to 1.35 based on Kaiser(7) windowed sinc +#define JVET_AJ0237_INTERNAL_12BIT 1 // JVET-AJ0237: Modifications for better operation at 12-bit internal bitdepth #if JVET_Z0118_GDR #define GDR_LEAK_TEST 0 diff --git a/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h b/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h index bd42c0c21..b6f449d96 100644 --- a/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h +++ b/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h @@ -3761,7 +3761,11 @@ static void simdFilter13x13BlkExtDbResiDirect( adjustShift -= shiftPrecis; // add more precision } const int shift = adjustShift; +#if JVET_AJ0237_INTERNAL_12BIT + const Pel currBase = 1 << (clpRng.bd - 1); +#else const Pel currBase = 512; +#endif int round = 1 << (shift - 1); #if !( USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION ) @@ -4976,7 +4980,11 @@ static void 
simdFilter13x13BlkExtDbResi( adjustShift -= shiftPrecis; // add more precision } const int shift = adjustShift; +#if JVET_AJ0237_INTERNAL_12BIT + const Pel currBase = 1 << (clpRng.bd - 1); +#else const Pel currBase = 512; +#endif int round = 1 << (shift - 1); #if !( USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION ) @@ -6115,7 +6123,11 @@ static void simdGaussFiltering(CodingStructure &cs, Pel ***gaussPic, const CPelB int clipIdx = gaussClipIdxTable[filterSetIdx][i]; gaussClipTable[i] = clippingValues[clipIdx]; } +#if JVET_AJ0237_INTERNAL_12BIT + int16_t diffTH = 32 << std::max(0, cs.sps->getBitDepth(CHANNEL_TYPE_LUMA) - 10); +#else int16_t diffTH = 32; +#endif #if JVET_AJ0188_CODING_INFO_CLASSIFICATION const bool isIntraSlice = cs.slice->isIntra(); @@ -8372,9 +8384,19 @@ static void simdFixFilter9x9Db9Blk(AlfClassifier **classifier, const CPelBuf &sr #endif } +#if JVET_AJ0237_INTERNAL_12BIT +template<X86_VEXT vext> +static void simdDeriveVariance(const CPelBuf& srcLuma, const Area& blkDst, const Area& blk, uint32_t ***variance, int bits) +#else template<X86_VEXT vext> static void simdDeriveVariance(const CPelBuf &srcLuma, const Area &blkDst, const Area &blk, uint32_t ***variance) +#endif { +#if JVET_AJ0237_INTERNAL_12BIT + // temporary buffer, could be optimized + int64_t tempData[2][(256 + 10) >> 1][((256 + 16) >> 1) + 8] = { { { 0 } } }; + int bdShift = 2 * std::max(0, bits - 10); +#endif const size_t imgStride = srcLuma.stride; const Pel * srcExt = srcLuma.buf; int fl = DIST_CLASS; @@ -8386,16 +8408,19 @@ static void simdDeriveVariance(const CPelBuf &srcLuma, const Area &blkDst, const int numSample2 = 128 * 128; int offset = numSample2 >> 1; +#if !JVET_AJ0237_INTERNAL_12BIT int num[8]{ numSample, numSample, numSample, numSample, numSample, numSample, numSample, numSample }; int mul[8]{ 13, 13, 13, 13, 13, 13, 13, 13 }; int off[8]{ offset, offset, offset, offset, offset, offset, offset, offset }; +#endif #if USE_AVX2 if (vext >= AVX2 && (blk.width % 32) == 0) 
{ +#if !JVET_AJ0237_INTERNAL_12BIT __m256i n = _mm256_loadu_si256((__m256i *) num); __m256i m13 = _mm256_loadu_si256((__m256i *) mul); __m256i o = _mm256_loadu_si256((__m256i *) off); - +#endif const int posX = blk.pos().x; const int posY = blk.pos().y; @@ -8525,6 +8550,41 @@ static void simdDeriveVariance(const CPelBuf &srcLuma, const Area &blkDst, const if (i == 8) { +#if JVET_AJ0237_INTERNAL_12BIT + for (int kk = 0; kk < 4; kk++) + { + __m256i x8Low = _mm256_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[2][iOffset - 4][jOffset + kk * 4])); + __m256i y8Low = _mm256_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[3][iOffset - 4][jOffset + kk * 4])); + __m256i x6Low = _mm256_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[2][iOffset - 3][jOffset + kk * 4])); + __m256i y6Low = _mm256_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[3][iOffset - 3][jOffset + kk * 4])); + __m256i x4Low = _mm256_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[2][iOffset - 2][jOffset + kk * 4])); + __m256i y4Low = _mm256_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[3][iOffset - 2][jOffset + kk * 4])); + __m256i x2Low = _mm256_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[2][iOffset - 1][jOffset + kk * 4])); + __m256i y2Low = _mm256_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[3][iOffset - 1][jOffset + kk * 4])); + __m256i sumLow = _mm256_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[2][iOffset ][jOffset + kk * 4])); + __m256i sum2Low = _mm256_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[3][iOffset ][jOffset + kk * 4])); + + x8Low = _mm256_add_epi64(sumLow, x8Low); + y8Low = _mm256_add_epi64(sum2Low, y8Low); + + x4Low = _mm256_add_epi64(x6Low, x4Low); + y4Low = _mm256_add_epi64(y6Low, y4Low); + + x2Low = _mm256_add_epi64(x8Low, x2Low); + y2Low = _mm256_add_epi64(y8Low, y2Low); + + sumLow = _mm256_add_epi64(x4Low, x2Low); + sum2Low = _mm256_add_epi64(y4Low, y2Low); + + _mm256_storeu_si256((__m256i*) &tempData[0][iOffset - 
4][jOffset + kk * 4], sumLow); + _mm256_storeu_si256((__m256i*) &tempData[1][iOffset - 4][jOffset + kk * 4], sum2Low); + + variance[VARIANCE][iOffset - 4][jOffset + kk * 4] = (uint32_t)((13 * ((numSample * tempData[1][iOffset - 4][jOffset + kk * 4] - tempData[0][iOffset - 4][jOffset + kk * 4] * tempData[0][iOffset - 4][jOffset + kk * 4] + offset) >> 3)) >> (14 + bdShift)); + variance[VARIANCE][iOffset - 4][jOffset + kk * 4 + 1] = (uint32_t)((13 * ((numSample * tempData[1][iOffset - 4][jOffset + kk * 4 + 1] - tempData[0][iOffset - 4][jOffset + kk * 4 + 1] * tempData[0][iOffset - 4][jOffset + kk * 4 + 1] + offset) >> 3)) >> (14 + bdShift)); + variance[VARIANCE][iOffset - 4][jOffset + kk * 4 + 2] = (uint32_t)((13 * ((numSample * tempData[1][iOffset - 4][jOffset + kk * 4 + 2] - tempData[0][iOffset - 4][jOffset + kk * 4 + 2] * tempData[0][iOffset - 4][jOffset + kk * 4 + 2] + offset) >> 3)) >> (14 + bdShift)); + variance[VARIANCE][iOffset - 4][jOffset + kk * 4 + 3] = (uint32_t)((13 * ((numSample * tempData[1][iOffset - 4][jOffset + kk * 4 + 3] - tempData[0][iOffset - 4][jOffset + kk * 4 + 3] * tempData[0][iOffset - 4][jOffset + kk * 4 + 3] + offset) >> 3)) >> (14 + bdShift)); + } +#else x8 = _mm256_loadu_si256((__m256i *)&variance[2][iOffset - 4][jOffset]); y8 = _mm256_loadu_si256((__m256i *)&variance[3][iOffset - 4][jOffset]); x6 = _mm256_loadu_si256((__m256i *)&variance[2][iOffset - 3][jOffset]); @@ -8583,9 +8643,37 @@ static void simdDeriveVariance(const CPelBuf &srcLuma, const Area &blkDst, const summ2 = _mm256_srli_epi32(summ2, 14); _mm256_storeu_si256((__m256i *) &variance[VARIANCE][iOffset - 4][jOffset], sum2); _mm256_storeu_si256((__m256i *) &variance[VARIANCE][iOffset - 4][jOffset + 8], summ2); +#endif } else if (i > 8) { +#if JVET_AJ0237_INTERNAL_12BIT + for (int kk = 0; kk < 4; kk++) + { + __m256i x8Low = _mm256_cvtepi32_epi64(_mm_loadu_si128((__m128i*) &variance[2][iOffset - 5][jOffset + kk * 4])); + __m256i y8Low = 
_mm256_cvtepi32_epi64(_mm_loadu_si128((__m128i*) &variance[3][iOffset - 5][jOffset + kk * 4])); + + __m256i x6Low = _mm256_loadu_si256((__m256i*) &tempData[0][iOffset - 5][jOffset + kk * 4]); + __m256i y6Low = _mm256_loadu_si256((__m256i*) &tempData[1][iOffset - 5][jOffset + kk * 4]); + + __m256i sumLow = _mm256_cvtepi32_epi64(_mm_loadu_si128((__m128i*) &variance[2][iOffset][jOffset + kk * 4])); + __m256i sum2Low = _mm256_cvtepi32_epi64(_mm_loadu_si128((__m128i*) &variance[3][iOffset][jOffset + kk * 4])); + + x6Low = _mm256_sub_epi64(x6Low, x8Low); + y6Low = _mm256_sub_epi64(y6Low, y8Low); + + sumLow = _mm256_add_epi64(x6Low, sumLow); + sum2Low = _mm256_add_epi64(y6Low, sum2Low); + + _mm256_storeu_si256((__m256i*)& tempData[0][iOffset - 4][jOffset + kk * 4], sumLow); + _mm256_storeu_si256((__m256i*)& tempData[1][iOffset - 4][jOffset + kk * 4], sum2Low); + + variance[VARIANCE][iOffset - 4][jOffset + kk * 4] = (uint32_t)((13 * ((numSample * tempData[1][iOffset - 4][jOffset + kk * 4] - tempData[0][iOffset - 4][jOffset + kk * 4] * tempData[0][iOffset - 4][jOffset + kk * 4] + offset) >> 3)) >> (14 + bdShift)); + variance[VARIANCE][iOffset - 4][jOffset + kk * 4 + 1] = (uint32_t)((13 * ((numSample * tempData[1][iOffset - 4][jOffset + kk * 4 + 1] - tempData[0][iOffset - 4][jOffset + kk * 4 + 1] * tempData[0][iOffset - 4][jOffset + kk * 4 + 1] + offset) >> 3)) >> (14 + bdShift)); + variance[VARIANCE][iOffset - 4][jOffset + kk * 4 + 2] = (uint32_t)((13 * ((numSample * tempData[1][iOffset - 4][jOffset + kk * 4 + 2] - tempData[0][iOffset - 4][jOffset + kk * 4 + 2] * tempData[0][iOffset - 4][jOffset + kk * 4 + 2] + offset) >> 3)) >> (14 + bdShift)); + variance[VARIANCE][iOffset - 4][jOffset + kk * 4 + 3] = (uint32_t)((13 * ((numSample * tempData[1][iOffset - 4][jOffset + kk * 4 + 3] - tempData[0][iOffset - 4][jOffset + kk * 4 + 3] * tempData[0][iOffset - 4][jOffset + kk * 4 + 3] + offset) >> 3)) >> (14 + bdShift)); + } +#else x8 = _mm256_loadu_si256((__m256i 
*)&variance[2][iOffset - 5][jOffset]); xx8 = _mm256_loadu_si256((__m256i *)&variance[2][iOffset - 5][jOffset + 8]); y8 = _mm256_loadu_si256((__m256i *)&variance[3][iOffset - 5][jOffset]); @@ -8625,6 +8713,7 @@ static void simdDeriveVariance(const CPelBuf &srcLuma, const Area &blkDst, const summ2 = _mm256_srli_epi32(summ2, 14); _mm256_storeu_si256((__m256i *) &variance[VARIANCE][iOffset - 4][jOffset], sum2); _mm256_storeu_si256((__m256i *) &variance[VARIANCE][iOffset - 4][jOffset + 8], summ2); +#endif } } @@ -8633,10 +8722,11 @@ static void simdDeriveVariance(const CPelBuf &srcLuma, const Area &blkDst, const else { #endif +#if !JVET_AJ0237_INTERNAL_12BIT __m128i n = _mm_loadu_si128((__m128i *) num); __m128i m13 = _mm_loadu_si128((__m128i *) mul); __m128i o = _mm_loadu_si128((__m128i *) off); - +#endif const int posX = blk.pos().x; const int posY = blk.pos().y; @@ -8706,6 +8796,39 @@ static void simdDeriveVariance(const CPelBuf &srcLuma, const Area &blkDst, const if (i == 8) { +#if JVET_AJ0237_INTERNAL_12BIT + for (int kk = 0; kk < 2; kk++) + { + __m128i x8Low = _mm_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[2][iOffset - 4][jOffset + kk * 2])); + __m128i y8Low = _mm_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[3][iOffset - 4][jOffset + kk * 2])); + __m128i x6Low = _mm_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[2][iOffset - 3][jOffset + kk * 2])); + __m128i y6Low = _mm_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[3][iOffset - 3][jOffset + kk * 2])); + __m128i x4Low = _mm_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[2][iOffset - 2][jOffset + kk * 2])); + __m128i y4Low = _mm_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[3][iOffset - 2][jOffset + kk * 2])); + __m128i x2Low = _mm_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[2][iOffset - 1][jOffset + kk * 2])); + __m128i y2Low = _mm_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[3][iOffset - 1][jOffset + kk * 2])); + __m128i sumLow = 
_mm_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[2][iOffset ][jOffset + kk * 2])); + __m128i sum2Low = _mm_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[3][iOffset ][jOffset + kk * 2])); + + x8Low = _mm_add_epi64(sumLow, x8Low); + y8Low = _mm_add_epi64(sum2Low, y8Low); + + x4Low = _mm_add_epi64(x6Low, x4Low); + y4Low = _mm_add_epi64(y6Low, y4Low); + + x2Low = _mm_add_epi64(x8Low, x2Low); + y2Low = _mm_add_epi64(y8Low, y2Low); + + sumLow = _mm_add_epi64(x4Low, x2Low); + sum2Low = _mm_add_epi64(y4Low, y2Low); + + _mm_storeu_si128((__m128i*) &tempData[0][iOffset - 4][jOffset + kk * 2], sumLow); + _mm_storeu_si128((__m128i*) &tempData[1][iOffset - 4][jOffset + kk * 2], sum2Low); + + variance[VARIANCE][iOffset - 4][jOffset + kk * 2] = (uint32_t)((13 * ((numSample * tempData[1][iOffset - 4][jOffset + kk * 2] - tempData[0][iOffset - 4][jOffset + kk * 2] * tempData[0][iOffset - 4][jOffset + kk * 2] + offset) >> 3)) >> (14 + bdShift)); + variance[VARIANCE][iOffset - 4][jOffset + kk * 2 + 1] = (uint32_t)((13 * ((numSample * tempData[1][iOffset - 4][jOffset + kk * 2 + 1] - tempData[0][iOffset - 4][jOffset + kk * 2 + 1] * tempData[0][iOffset - 4][jOffset + kk * 2 + 1] + offset) >> 3)) >> (14 + bdShift)); + } +#else x8 = _mm_loadu_si128((__m128i *)&variance[2][iOffset - 4][jOffset]); y8 = _mm_loadu_si128((__m128i *)&variance[3][iOffset - 4][jOffset]); x6 = _mm_loadu_si128((__m128i *)&variance[2][iOffset - 3][jOffset]); @@ -8737,9 +8860,33 @@ static void simdDeriveVariance(const CPelBuf &srcLuma, const Area &blkDst, const sum2 = _mm_mullo_epi32(sum2, m13); sum2 = _mm_srli_epi32(sum2, 14); _mm_storeu_si128((__m128i *) &variance[VARIANCE][iOffset - 4][jOffset], sum2); +#endif } else if (i > 8) { +#if JVET_AJ0237_INTERNAL_12BIT + for (int kk = 0; kk < 2; kk++) + { + __m128i x8Low = _mm_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[2][iOffset - 5][jOffset + kk * 2])); + __m128i y8Low = _mm_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[3][iOffset - 
5][jOffset + kk * 2])); + __m128i x6Low = _mm_loadu_si128((__m128i*) &tempData[0][iOffset - 5][jOffset + kk * 2]); + __m128i y6Low = _mm_loadu_si128((__m128i*) &tempData[1][iOffset - 5][jOffset + kk * 2]); + __m128i sumLow = _mm_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[2][iOffset][jOffset + kk * 2])); + __m128i sum2Low = _mm_cvtepi32_epi64(_mm_loadu_si128((__m128i*) & variance[3][iOffset][jOffset + kk * 2])); + + x6Low = _mm_sub_epi64(x6Low, x8Low); /* 64-bit lanes: operands come from int64_t tempData and _mm_cvtepi32_epi64, same as the AVX2 path */ + y6Low = _mm_sub_epi64(y6Low, y8Low); + + sumLow = _mm_add_epi64(x6Low, sumLow); + sum2Low = _mm_add_epi64(y6Low, sum2Low); + + _mm_storeu_si128((__m128i*) & tempData[0][iOffset - 4][jOffset + kk * 2], sumLow); + _mm_storeu_si128((__m128i*) & tempData[1][iOffset - 4][jOffset + kk * 2], sum2Low); + + variance[VARIANCE][iOffset - 4][jOffset + kk * 2] = (uint32_t)((13 * ((numSample * tempData[1][iOffset - 4][jOffset + kk * 2] - tempData[0][iOffset - 4][jOffset + kk * 2] * tempData[0][iOffset - 4][jOffset + kk * 2] + offset) >> 3)) >> (14 + bdShift)); + variance[VARIANCE][iOffset - 4][jOffset + kk * 2 + 1] = (uint32_t)((13 * ((numSample * tempData[1][iOffset - 4][jOffset + kk * 2 + 1] - tempData[0][iOffset - 4][jOffset + kk * 2 + 1] * tempData[0][iOffset - 4][jOffset + kk * 2 + 1] + offset) >> 3)) >> (14 + bdShift)); + } +#else x8 = _mm_loadu_si128((__m128i *)&variance[2][iOffset - 5][jOffset]); y8 = _mm_loadu_si128((__m128i *)&variance[3][iOffset - 5][jOffset]); x6 = _mm_loadu_si128((__m128i *)&variance[0][iOffset - 5][jOffset]); @@ -8761,6 +8908,7 @@ static void simdDeriveVariance(const CPelBuf &srcLuma, const Area &blkDst, const sum2 = _mm_mullo_epi32(sum2, m13); sum2 = _mm_srli_epi32(sum2, 14); _mm_storeu_si128((__m128i *) &variance[VARIANCE][iOffset - 4][jOffset], sum2); +#endif } } diff --git a/source/Lib/CommonLib/x86/BilateralFilterX86.h b/source/Lib/CommonLib/x86/BilateralFilterX86.h index 9e3262765..2c60e66cc 100644 --- a/source/Lib/CommonLib/x86/BilateralFilterX86.h +++ 
b/source/Lib/CommonLib/x86/BilateralFilterX86.h @@ -48,7 +48,11 @@ #if USE_AVX2 #if JVET_AF0112_BIF_DYNAMIC_SCALING +#if JVET_AJ0237_INTERNAL_12BIT +inline void simdBifApplyLut(__m256i& val, __m256i& acc, int cutBitsNum, __m256i& bitsRound, __m256i& bitsRound2, __m256i& lut, int bdShift) +#else inline void simdBifApplyLut(__m256i& val, __m256i& acc, int cutBitsNum, __m256i& bitsRound, __m256i& bitsRound2, __m256i& lut) +#endif #else inline void simdBifApplyLut(__m256i& val, __m256i& acc, __m256i& lut, int lutShift) #endif @@ -74,23 +78,40 @@ inline void simdBifApplyLut(__m256i& val, __m256i& acc, __m256i& lut, int lutShi diffabs = _mm256_shuffle_epi8(lut, diffabs); /* lut */ diffabs = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(diffabs)); /* back to 16-bit */ diffabs = _mm256_srai_epi16(diffabs, lutShift); /* diagonal shift! */ +#endif +#if JVET_AJ0237_INTERNAL_12BIT + diffabs = _mm256_slli_epi16(diffabs, bdShift); #endif diffabs = _mm256_sign_epi16(diffabs, val); /* add original sign */ acc = _mm256_add_epi16(diffabs, acc); /* add to acc */ } +#if JVET_AJ0237_INTERNAL_12BIT +template<X86_VEXT vext> +void BilateralFilter::simdFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bifRoundAdd, int bifRoundShift, bool isRDO, const char* lutRowPtr, bool noClip, int cutBitsNum, int bdShift) +#else template<X86_VEXT vext> void BilateralFilter::simdFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bifRoundAdd, int bifRoundShift, bool isRDO, const char* lutRowPtr, bool noClip, int cutBitsNum) +#endif { //if( uiWidth < 4 || ( uiWidth < 8 && isRDO ) ) if (uiWidth < 4) { +#if JVET_AJ0237_INTERNAL_12BIT + return blockBilateralFilterDiamond5x5(uiWidth, uiHeight, block, blkFilt, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bifRoundAdd, 
bifRoundShift, isRDO, lutRowPtr, noClip, cutBitsNum, bdShift); +#else return blockBilateralFilterDiamond5x5(uiWidth, uiHeight, block, blkFilt, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bifRoundAdd, bifRoundShift, isRDO, lutRowPtr, noClip, cutBitsNum); +#endif } int pad = 2; int padwidth = iWidthExtSIMD; +#if JVET_AJ0237_INTERNAL_12BIT + cutBitsNum += bdShift; +#endif + + __m256i center, left, right, up, down, lu, ld, ru, rd, acc, roundAdd, clipmin, clipmax, inputVals; __m256i ll, rr, uu, dd; __m128i lutTmp; @@ -106,7 +127,11 @@ void BilateralFilter::simdFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, __m256i lut2 = _mm256_set_m128i(lutTmp, lutTmp); lutTmp = _mm_loadu_si128((__m128i*)(lutRowPtr + 32)); __m256i lut3 = _mm256_set_m128i(lutTmp, lutTmp); +#if JVET_AJ0237_INTERNAL_12BIT + __m256i mmBfac = _mm256_unpacklo_epi16(_mm256_set1_epi16(bfac), _mm256_set1_epi16(1)); +#else __m256i mmBfac = _mm256_set1_epi16(bfac); +#endif roundAdd = _mm256_set1_epi16(bifRoundAdd << 3); __m256i bitsRound = _mm256_set1_epi16(1 << (cutBitsNum - 2)); __m256i bitsRound2 = _mm256_set1_epi16((1 << (cutBitsNum - 2)) + (1 << (cutBitsNum - 1))); @@ -160,6 +185,22 @@ void BilateralFilter::simdFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, // apply LUT #if JVET_AF0112_BIF_DYNAMIC_SCALING +#if JVET_AJ0237_INTERNAL_12BIT + simdBifApplyLut(left, acc, cutBitsNum, bitsRound, bitsRound2, lut1, bdShift); + simdBifApplyLut(right, acc, cutBitsNum, bitsRound, bitsRound2, lut1, bdShift); + simdBifApplyLut(up, acc, cutBitsNum, bitsRound, bitsRound2, lut1, bdShift); + simdBifApplyLut(down, acc, cutBitsNum, bitsRound, bitsRound2, lut1, bdShift); + + simdBifApplyLut(lu, acc, cutBitsNum, bitsRound, bitsRound2, lut2, bdShift); + simdBifApplyLut(ld, acc, cutBitsNum, bitsRound, bitsRound2, lut2, bdShift); + simdBifApplyLut(ru, acc, cutBitsNum, bitsRound, bitsRound2, lut2, bdShift); + simdBifApplyLut(rd, acc, cutBitsNum, bitsRound, bitsRound2, lut2, bdShift); + + simdBifApplyLut(ll, acc, 
cutBitsNum, bitsRound, bitsRound2, lut3, bdShift); + simdBifApplyLut(rr, acc, cutBitsNum, bitsRound, bitsRound2, lut3, bdShift); + simdBifApplyLut(uu, acc, cutBitsNum, bitsRound, bitsRound2, lut3, bdShift); + simdBifApplyLut(dd, acc, cutBitsNum, bitsRound, bitsRound2, lut3, bdShift); +#else simdBifApplyLut(left, acc, cutBitsNum, bitsRound, bitsRound2, lut1); simdBifApplyLut(right, acc, cutBitsNum, bitsRound, bitsRound2, lut1); simdBifApplyLut(up, acc, cutBitsNum, bitsRound, bitsRound2, lut1); @@ -174,6 +215,7 @@ void BilateralFilter::simdFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, simdBifApplyLut(rr, acc, cutBitsNum, bitsRound, bitsRound2, lut3); simdBifApplyLut(uu, acc, cutBitsNum, bitsRound, bitsRound2, lut3); simdBifApplyLut(dd, acc, cutBitsNum, bitsRound, bitsRound2, lut3); +#endif #else simdBifApplyLut(left, acc, lut, lutShift1); simdBifApplyLut(right, acc, lut, lutShift1); @@ -193,9 +235,21 @@ void BilateralFilter::simdFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, // TU scaling #if JVET_AF0112_BIF_DYNAMIC_SCALING +#if JVET_AJ0237_INTERNAL_12BIT + __m256i accLow = _mm256_unpacklo_epi16(acc, roundAdd); + __m256i accHigh = _mm256_unpackhi_epi16(acc, roundAdd); + __m256i accLowPack = _mm256_madd_epi16(accLow, mmBfac); + __m256i accHighPack = _mm256_madd_epi16(accHigh, mmBfac); + + accLow = _mm256_srai_epi32(accLowPack, bifRoundShift + 3); + accHigh = _mm256_srai_epi32(accHighPack, bifRoundShift + 3); + + acc = _mm256_packs_epi32(accLow, accHigh); +#else acc = _mm256_mullo_epi16(acc, mmBfac); acc = _mm256_adds_epi16(acc, roundAdd); acc = _mm256_srai_epi16(acc, bifRoundShift + 3); +#endif #else if (bfac == 2) { @@ -293,7 +347,11 @@ int BilateralFilter::simdCalcMAD(int16_t* block, int stride, int width, int heig #else // USE_AVX2 #if JVET_AF0112_BIF_DYNAMIC_SCALING +#if JVET_AJ0237_INTERNAL_12BIT +inline void simdBifApplyLut(__m128i& val, __m128i& acc, int cutBitsNum, __m128i& bitsRound, __m128i& bitsRound2, __m128i& lut, int bdShift) +#else inline 
void simdBifApplyLut(__m128i& val, __m128i& acc, int cutBitsNum, __m128i& bitsRound, __m128i& bitsRound2, __m128i& lut) +#endif #else inline void simdBifApplyLut(__m128i& val, __m128i& acc, __m128i& lut, int lutShift) #endif @@ -318,23 +376,39 @@ inline void simdBifApplyLut(__m128i& val, __m128i& acc, __m128i& lut, int lutShi diffabs = _mm_shuffle_epi8(lut, diffabs); /* lut */ diffabs = _mm_cvtepi8_epi16(diffabs); /* back to 16-bit */ diffabs = _mm_srai_epi16(diffabs, lutShift); /* diagonal shift! */ +#endif +#if JVET_AJ0237_INTERNAL_12BIT + diffabs = _mm_slli_epi16(diffabs, bdShift); #endif diffabs = _mm_sign_epi16(diffabs, val); /* add original sign */ acc = _mm_add_epi16(diffabs, acc); /* add to acc */ } +#if JVET_AJ0237_INTERNAL_12BIT +template<X86_VEXT vext> +void BilateralFilter::simdFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bifRoundAdd, int bifRoundShift, bool isRDO, const char* lutRowPtr, bool noClip, int cutBitsNum, int bdShift) +#else template<X86_VEXT vext> void BilateralFilter::simdFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bifRoundAdd, int bifRoundShift, bool isRDO, const char* lutRowPtr, bool noClip, int cutBitsNum) +#endif { //if( uiWidth < 4 || ( uiWidth < 8 && isRDO ) ) if( uiWidth < 4 ) { +#if JVET_AJ0237_INTERNAL_12BIT + return blockBilateralFilterDiamond5x5(uiWidth, uiHeight, block, blkFilt, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bifRoundAdd, bifRoundShift, isRDO, lutRowPtr, noClip, cutBitsNum, bdShift); +#else return blockBilateralFilterDiamond5x5(uiWidth, uiHeight, block, blkFilt, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bifRoundAdd, bifRoundShift, isRDO, lutRowPtr, noClip, cutBitsNum); +#endif } int pad = 2; int padwidth = iWidthExtSIMD; +#if JVET_AJ0237_INTERNAL_12BIT + 
cutBitsNum += bdShift; +#endif + __m128i center, left, right, up, down, lu, ld, ru, rd, acc, roundAdd, clipmin, clipmax, inputVals; __m128i ll, rr, uu, dd; @@ -346,7 +420,11 @@ void BilateralFilter::simdFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, __m128i lut1 = _mm_loadu_si128((__m128i*)(lutRowPtr)); __m128i lut2 = _mm_loadu_si128((__m128i*)(lutRowPtr + 16)); __m128i lut3 = _mm_loadu_si128((__m128i*)(lutRowPtr + 32)); +#if JVET_AJ0237_INTERNAL_12BIT + __m128i mmBfac = _mm_unpacklo_epi16(_mm_set1_epi16(bfac), _mm_set1_epi16(1)); +#else __m128i mmBfac = _mm_set1_epi16(bfac); +#endif roundAdd = _mm_set1_epi16(bifRoundAdd << 3); __m128i bitsRound = _mm_set1_epi16(1 << (cutBitsNum - 2)); __m128i bitsRound2 = _mm_set1_epi16((1 << (cutBitsNum - 2)) + (1 << (cutBitsNum - 1))); @@ -399,6 +477,22 @@ void BilateralFilter::simdFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, // apply LUT #if JVET_AF0112_BIF_DYNAMIC_SCALING +#if JVET_AJ0237_INTERNAL_12BIT + simdBifApplyLut(left, acc, cutBitsNum, bitsRound, bitsRound2, lut1, bdShift); + simdBifApplyLut(right, acc, cutBitsNum, bitsRound, bitsRound2, lut1, bdShift); + simdBifApplyLut(up, acc, cutBitsNum, bitsRound, bitsRound2, lut1, bdShift); + simdBifApplyLut(down, acc, cutBitsNum, bitsRound, bitsRound2, lut1, bdShift); + + simdBifApplyLut(lu, acc, cutBitsNum, bitsRound, bitsRound2, lut2, bdShift); + simdBifApplyLut(ld, acc, cutBitsNum, bitsRound, bitsRound2, lut2, bdShift); + simdBifApplyLut(ru, acc, cutBitsNum, bitsRound, bitsRound2, lut2, bdShift); + simdBifApplyLut(rd, acc, cutBitsNum, bitsRound, bitsRound2, lut2, bdShift); + + simdBifApplyLut(ll, acc, cutBitsNum, bitsRound, bitsRound2, lut3, bdShift); + simdBifApplyLut(rr, acc, cutBitsNum, bitsRound, bitsRound2, lut3, bdShift); + simdBifApplyLut(uu, acc, cutBitsNum, bitsRound, bitsRound2, lut3, bdShift); + simdBifApplyLut(dd, acc, cutBitsNum, bitsRound, bitsRound2, lut3, bdShift); +#else simdBifApplyLut(left, acc, cutBitsNum, bitsRound, bitsRound2, lut1); 
simdBifApplyLut(right, acc, cutBitsNum, bitsRound, bitsRound2, lut1); simdBifApplyLut(up, acc, cutBitsNum, bitsRound, bitsRound2, lut1); @@ -413,6 +507,7 @@ void BilateralFilter::simdFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, simdBifApplyLut(rr, acc, cutBitsNum, bitsRound, bitsRound2, lut3); simdBifApplyLut(uu, acc, cutBitsNum, bitsRound, bitsRound2, lut3); simdBifApplyLut(dd, acc, cutBitsNum, bitsRound, bitsRound2, lut3); +#endif #else simdBifApplyLut(left, acc, lut, lutShift1); simdBifApplyLut(right, acc, lut, lutShift1); @@ -432,9 +527,21 @@ void BilateralFilter::simdFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, // TU scaling #if JVET_AF0112_BIF_DYNAMIC_SCALING +#if JVET_AJ0237_INTERNAL_12BIT + __m128i accLow = _mm_unpacklo_epi16(acc, roundAdd); + __m128i accHigh = _mm_unpackhi_epi16(acc, roundAdd); + __m128i accLowPack = _mm_madd_epi16(accLow, mmBfac); + __m128i accHighPack = _mm_madd_epi16(accHigh, mmBfac); + + accLow = _mm_srai_epi32(accLowPack, bifRoundShift + 3); + accHigh = _mm_srai_epi32(accHighPack, bifRoundShift + 3); + + acc = _mm_packs_epi32(accLow, accHigh); +#else acc = _mm_mullo_epi16(acc, mmBfac); acc = _mm_adds_epi16(acc, roundAdd); acc = _mm_srai_epi16(acc, bifRoundShift + 3); +#endif #else if (bfac == 2) { diff --git a/source/Lib/CommonLib/x86/BufferX86.h b/source/Lib/CommonLib/x86/BufferX86.h index 5730bd2ec..b5028b225 100644 --- a/source/Lib/CommonLib/x86/BufferX86.h +++ b/source/Lib/CommonLib/x86/BufferX86.h @@ -3907,7 +3907,11 @@ void getAbsoluteDifferencePerSample_SSE(Pel* dst, int dstStride, const Pel* src0 template <X86_VEXT vext, uint8_t maskType> int64_t getMaskedSampleSum_SSE(Pel* src, int srcStride, int width, int height, int bitDepth, short* weightMask, int maskStepX, int maskStride, int maskStride2) { +#if JVET_AJ0237_INTERNAL_12BIT + if ((width & 7) != 0 || bitDepth > 12) +#else if ((width & 7) != 0 || bitDepth > 10) +#endif { return getMaskedSampleSumCore<maskType>(src, srcStride, width, height, bitDepth, 
weightMask, maskStepX, maskStride, maskStride2); } diff --git a/source/Lib/CommonLib/x86/InterpolationFilterX86.h b/source/Lib/CommonLib/x86/InterpolationFilterX86.h index 0d92aca83..039b840f0 100644 --- a/source/Lib/CommonLib/x86/InterpolationFilterX86.h +++ b/source/Lib/CommonLib/x86/InterpolationFilterX86.h @@ -79,6 +79,11 @@ static void fullPelCopySSE( const ClpRng& clpRng, const void*_src, int srcStride #endif int headroom_offset = 1 << ( headroom - 1 ); int offset = IF_INTERNAL_OFFS; + +#if JVET_AJ0237_INTERNAL_12BIT + int dmvrHeadRoom = IF_INTERNAL_PREC_BILINEAR(clpRng.bd) - clpRng.bd; // in the current setup, dmvr headroom should either be 0 or negative +#endif + __m128i voffset = _mm_set1_epi16( offset ); __m128i voffset_headroom = _mm_set1_epi16( headroom_offset ); @@ -106,7 +111,29 @@ static void fullPelCopySSE( const ClpRng& clpRng, const void*_src, int srcStride #if MCIF_SIMD_NEW if (biMCForDMVR) { +#if JVET_AJ0237_INTERNAL_12BIT + if ((isFirst == isLast) || (isFirst && dmvrHeadRoom == 0)) + { + vsum = vsrc; + } + else if (isFirst) + { + if (dmvrHeadRoom > 0) + { + vsum = _mm_slli_epi16(vsrc, dmvrHeadRoom); + } + else + { + vsum = _mm_srai_epi16(vsrc, -dmvrHeadRoom); + } + } + else + { + CHECK(1, "Impossible to have isFirst being false and isLast being true, when biMCForDMVR is true"); + } +#else vsum = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, vsrc)); +#endif } else if (isFirst == isLast) { @@ -152,6 +179,10 @@ static void fullPelCopyVerSSE(const ClpRng& clpRng, const void*_src, int srcStri int headroom = IF_INTERNAL_PREC - clpRng.bd; int headroom_offset = 1 << (headroom - 1); int offset = IF_INTERNAL_OFFS; +#if JVET_AJ0237_INTERNAL_12BIT + int dmvrHeadRoom = IF_INTERNAL_PREC_BILINEAR(clpRng.bd) - clpRng.bd; // in the current setup, dmvr headroom should either be 0 or negative +#endif + __m128i voffset = _mm_set1_epi16(offset); __m128i voffset_headroom = _mm_set1_epi16(headroom_offset); @@ -176,7 +207,29 @@ static void fullPelCopyVerSSE(const 
ClpRng& clpRng, const void*_src, int srcStri } if (biMCForDMVR) { +#if JVET_AJ0237_INTERNAL_12BIT + if ((isFirst == isLast) || (isFirst && dmvrHeadRoom == 0)) + { + vsum = vsrc; + } + else if (isFirst) + { + if (dmvrHeadRoom > 0) + { + vsum = _mm_slli_epi16(vsrc, dmvrHeadRoom); + } + else + { + vsum = _mm_srai_epi16(vsrc, -dmvrHeadRoom); + } + } + else + { + CHECK(1, "Impossible to have isFirst being false and isLast being true, when biMCForDMVR is true"); + } +#else vsum = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, vsrc)); +#endif } else if (isFirst == isLast) { @@ -218,6 +271,10 @@ static void fullPelCopySSE_M4(const ClpRng& clpRng, const void*_src, ptrdiff_t s int headroom = IF_INTERNAL_PREC - clpRng.bd; int headroom_offset = 1 << (headroom - 1); int offset = IF_INTERNAL_OFFS; +#if JVET_AJ0237_INTERNAL_12BIT + int dmvrHeadRoom = IF_INTERNAL_PREC_BILINEAR(clpRng.bd) - clpRng.bd; // in the current setup, dmvr headroom should either be 0 or negative +#endif + __m128i voffset = _mm_set1_epi16(offset); __m128i voffset_headroom = _mm_set1_epi16(headroom_offset); __m128i vibdimin = _mm_set1_epi16(clpRng.min); @@ -243,7 +300,29 @@ static void fullPelCopySSE_M4(const ClpRng& clpRng, const void*_src, ptrdiff_t s } if (biMCForDMVR) { +#if JVET_AJ0237_INTERNAL_12BIT + if ((isFirst == isLast) || (isFirst && dmvrHeadRoom == 0)) + { + vsum = vsrc; + } + else if (isFirst) + { + if (dmvrHeadRoom > 0) + { + vsum = _mm_slli_epi16(vsrc, dmvrHeadRoom); + } + else + { + vsum = _mm_srai_epi16(vsrc, -dmvrHeadRoom); + } + } + else + { + CHECK(1, "Impossible to have isFirst being false and isLast being true, when biMCForDMVR is true"); + } +#else vsum = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, vsrc)); +#endif } else if (isFirst == isLast) { @@ -280,6 +359,10 @@ static void fullPelCopyVerSSE_M4(const ClpRng& clpRng, const void*_src, ptrdiff_ int headroom = IF_INTERNAL_PREC - clpRng.bd; int headroom_offset = 1 << (headroom - 1); int offset = IF_INTERNAL_OFFS; +#if 
JVET_AJ0237_INTERNAL_12BIT + int dmvrHeadRoom = IF_INTERNAL_PREC_BILINEAR(clpRng.bd) - clpRng.bd; // in the current setup, dmvr headroom should either be 0 or negative +#endif + __m128i voffset = _mm_set1_epi16(offset); __m128i voffset_headroom = _mm_set1_epi16(headroom_offset); __m128i vibdimin = _mm_set1_epi16(clpRng.min); @@ -301,7 +384,29 @@ static void fullPelCopyVerSSE_M4(const ClpRng& clpRng, const void*_src, ptrdiff_ } if (biMCForDMVR) { +#if JVET_AJ0237_INTERNAL_12BIT + if ((isFirst == isLast) || (isFirst && dmvrHeadRoom == 0)) + { + vsum = vsrc; + } + else if (isFirst) + { + if (dmvrHeadRoom > 0) + { + vsum = _mm_slli_epi16(vsrc, dmvrHeadRoom); + } + else + { + vsum = _mm_srai_epi16(vsrc, -dmvrHeadRoom); + } + } + else + { + CHECK(1, "Impossible to have isFirst being false and isLast being true, when biMCForDMVR is true"); + } +#else vsum = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, vsrc)); +#endif } else if (isFirst == isLast) { @@ -355,6 +460,9 @@ static void fullPelCopyAVX2( const ClpRng& clpRng, const void*_src, int srcStrid __m256i vibdimax = _mm256_set1_epi16( clpRng.max ); __m256i vsrc, vsum; +#if JVET_AJ0237_INTERNAL_12BIT + int dmvrHeadRoom = IF_INTERNAL_PREC_BILINEAR(clpRng.bd) - clpRng.bd; // in the current setup, dmvr headroom should either be 0 or negative +#endif for( int row = 0; row < height; row++ ) { @@ -376,7 +484,29 @@ static void fullPelCopyAVX2( const ClpRng& clpRng, const void*_src, int srcStrid #if MCIF_SIMD_NEW if (biMCForDMVR) { +#if JVET_AJ0237_INTERNAL_12BIT + if ((isFirst == isLast) || (isFirst && dmvrHeadRoom == 0)) + { + vsum = vsrc; + } + else if (isFirst) + { + if (dmvrHeadRoom > 0) + { + vsum = _mm256_slli_epi16(vsrc, dmvrHeadRoom); + } + else + { + vsum = _mm256_srai_epi16(vsrc, -dmvrHeadRoom); + } + } + else + { + CHECK(1, "Impossible to have isFirst being false and isLast being true, when biMCForDMVR is true"); + } +#else vsum = _mm256_min_epi16(vibdimax, _mm256_max_epi16(vibdimin, vsrc)); +#endif } else if 
(isFirst == isLast) { @@ -426,6 +556,10 @@ static void fullPelCopyVerAVX2(const ClpRng& clpRng, const void*_src, int srcStr int offset = 1 << (headroom - 1); int internal_offset = IF_INTERNAL_OFFS; +#if JVET_AJ0237_INTERNAL_12BIT + int dmvrHeadRoom = IF_INTERNAL_PREC_BILINEAR(clpRng.bd) - clpRng.bd; // in the current setup, dmvr headroom should either be 0 or negative +#endif + __m256i vinternal_offset = _mm256_set1_epi16(internal_offset); __m256i vheadroom_offset = _mm256_set1_epi16(offset); @@ -452,7 +586,29 @@ static void fullPelCopyVerAVX2(const ClpRng& clpRng, const void*_src, int srcStr } if (biMCForDMVR) { +#if JVET_AJ0237_INTERNAL_12BIT + if ((isFirst == isLast) || (isFirst && dmvrHeadRoom == 0)) + { + vsum = vsrc; + } + else if (isFirst) + { + if (dmvrHeadRoom > 0) + { + vsum = _mm256_slli_epi16(vsrc, dmvrHeadRoom); + } + else + { + vsum = _mm256_srai_epi16(vsrc, -dmvrHeadRoom); + } + } + else + { + CHECK(1, "Impossible to have isFirst being false and isLast being true, when biMCForDMVR is true"); + } +#else vsum = _mm256_min_epi16(vibdimax, _mm256_max_epi16(vibdimin, vsrc)); +#endif } else if (isFirst == isLast) { @@ -3262,7 +3418,11 @@ static void simdFilter( const ClpRng& clpRng, Pel const *src, int srcStride, Pel { if( isFirst ) { +#if JVET_AJ0237_INTERNAL_12BIT + shift = IF_FILTER_PREC_BILINEAR - (IF_INTERNAL_PREC_BILINEAR(clpRng.bd) - clpRng.bd); +#else shift = IF_FILTER_PREC_BILINEAR - (IF_INTERNAL_PREC_BILINEAR - clpRng.bd); +#endif offset = 1 << (shift - 1); } else diff --git a/source/Lib/CommonLib/x86/IntraX86.h b/source/Lib/CommonLib/x86/IntraX86.h index 8d03a3abc..d505792cd 100644 --- a/source/Lib/CommonLib/x86/IntraX86.h +++ b/source/Lib/CommonLib/x86/IntraX86.h @@ -2723,7 +2723,11 @@ bool xPredIntraOpt_SIMD(PelBuf &pDst, const PredictionUnit &pu, const uint32_t m const int addShift = 1 << 13; const __m128i offset = _mm_set1_epi32( addShift ); +#if JVET_AJ0237_INTERNAL_12BIT + const __m128i max = _mm_set1_epi32((1 << clpRng.bd) - 1); +#else 
const __m128i max = _mm_set1_epi32( 1023 ); +#endif const __m128i zeros = _mm_setzero_si128(); __m128i vmat[ 4 ], vcoef[ 4 ], vsrc; @@ -2862,6 +2866,27 @@ int64_t calcAeipGroupSumSIMD(const Pel* src1, const Pel* src2, const int numSamp __m256i vzero = _mm256_setzero_si256(); __m256i vsum32 = vzero; const int samplesBySIMD = (numSamples >> 4) << 4; +#if JVET_AJ0237_INTERNAL_12BIT + int64_t sum = 0; + const int simdSampleBatchCnt = (samplesBySIMD >> 4) >> 2; + for (int batchIdx = 0; batchIdx < simdSampleBatchCnt; batchIdx++) + { + vsum32 = vzero; + for (; i < ((batchIdx + 1) * 64); i += 16) + { + __m256i vsrc1 = _mm256_lddqu_si256((__m256i*)(&src1[i])); + __m256i vsrc2 = _mm256_lddqu_si256((__m256i*)(&src2[i])); + __m256i vsumtemp = _mm256_madd_epi16(vsrc1, vsrc2); + vsum32 = _mm256_add_epi32(vsum32, vsumtemp); + } + vsum32 = _mm256_hadd_epi32(vsum32, vsum32); + vsum32 = _mm256_hadd_epi32(vsum32, vsum32); + sum += (_mm_cvtsi128_si32(_mm256_castsi256_si128(vsum32)) + _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_permute2x128_si256(vsum32, vsum32, 0x11)))); + } + if (i < samplesBySIMD) + { + vsum32 = vzero; +#endif for (; i < samplesBySIMD; i += 16) { __m256i vsrc1 = _mm256_lddqu_si256((__m256i*)(&src1[i])); @@ -2871,11 +2896,37 @@ int64_t calcAeipGroupSumSIMD(const Pel* src1, const Pel* src2, const int numSamp } vsum32 = _mm256_hadd_epi32(vsum32, vsum32); vsum32 = _mm256_hadd_epi32(vsum32, vsum32); +#if JVET_AJ0237_INTERNAL_12BIT + sum += (_mm_cvtsi128_si32(_mm256_castsi256_si128(vsum32)) + _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_permute2x128_si256(vsum32, vsum32, 0x11)))); + } +#else int64_t sum = (_mm_cvtsi128_si32(_mm256_castsi256_si128(vsum32)) + _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_permute2x128_si256(vsum32, vsum32, 0x11)))); +#endif #else __m128i vzero = _mm_setzero_si128(); __m128i vsum32 = vzero; const int samplesBySIMD = (numSamples >> 4) << 4; +#if JVET_AJ0237_INTERNAL_12BIT + int64_t sum = 0; + const int simdSampleBatchCnt = 
(samplesBySIMD >> 4) >> 2; + for (int batchIdx = 0; batchIdx < simdSampleBatchCnt; batchIdx++) + { + vsum32 = vzero; + for (; i < ((batchIdx + 1) * 64); i += 8) + { + __m128i vsrc1 = _mm_loadu_si128((__m128i*)(&src1[i])); + __m128i vsrc2 = _mm_loadu_si128((__m128i*)(&src2[i])); + __m128i vsumtemp = _mm_madd_epi16(vsrc1, vsrc2); + vsum32 = _mm_add_epi32(vsum32, vsumtemp); + } + vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e)); + vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1)); + sum += _mm_cvtsi128_si32(vsum32); + } + if (i < samplesBySIMD) + { + vsum32 = vzero; +#endif for (; i < samplesBySIMD; i += 8) { __m128i vsrc1 = _mm_loadu_si128((__m128i*)(&src1[i])); @@ -2885,7 +2936,12 @@ int64_t calcAeipGroupSumSIMD(const Pel* src1, const Pel* src2, const int numSamp } vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e)); // 01001110 vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1)); // 10110001 +#if JVET_AJ0237_INTERNAL_12BIT + sum += _mm_cvtsi128_si32(vsum32); + } +#else int64_t sum = _mm_cvtsi128_si32(vsum32); +#endif #endif for (; i < numSamples; i++) { diff --git a/source/Lib/CommonLib/x86/RdCostX86.h b/source/Lib/CommonLib/x86/RdCostX86.h index 5a877a07e..f0c53c1a2 100644 --- a/source/Lib/CommonLib/x86/RdCostX86.h +++ b/source/Lib/CommonLib/x86/RdCostX86.h @@ -50,7 +50,11 @@ template<X86_VEXT vext > Distortion RdCost::xGetSSE_SIMD( const DistParam &rcDtParam ) { #if DIST_SSE_ENABLE +#if JVET_AJ0237_INTERNAL_12BIT + if (rcDtParam.org.width < 4 || rcDtParam.bitDepth > 12 || rcDtParam.applyWeight) +#else if (rcDtParam.org.width < 4 || rcDtParam.bitDepth > 10 || rcDtParam.applyWeight) +#endif #else if( rcDtParam.bitDepth > 10 ) #endif @@ -179,7 +183,11 @@ Distortion RdCost::xGetSSE_SIMD( const DistParam &rcDtParam ) template<int iWidth, X86_VEXT vext > Distortion RdCost::xGetSSE_NxN_SIMD( const DistParam &rcDtParam ) { +#if JVET_AJ0237_INTERNAL_12BIT + if (rcDtParam.bitDepth > 12 || rcDtParam.applyWeight) +#else 
if( rcDtParam.bitDepth > 10 || rcDtParam.applyWeight ) +#endif return RdCost::xGetSSE( rcDtParam ); const Torg* pSrc1 = (const Torg*)rcDtParam.org.buf; @@ -354,7 +362,11 @@ Distortion RdCost::xGetSSE_NxN_SIMD( const DistParam &rcDtParam ) template<X86_VEXT vext> Distortion RdCost::xGetSSE_NxN_SIMD(const DistParam &rcDtParam) { +#if JVET_AJ0237_INTERNAL_12BIT + if (rcDtParam.bitDepth > 12 || rcDtParam.applyWeight) +#else if (rcDtParam.bitDepth > 10 || rcDtParam.applyWeight) +#endif return RdCost::xGetSSE(rcDtParam); const Torg *pSrc1 = (const Torg *) rcDtParam.org.buf; @@ -500,7 +512,11 @@ Distortion RdCost::xGetSSE_NxN_SIMD(const DistParam &rcDtParam) template< X86_VEXT vext > Distortion RdCost::xGetSAD_SIMD( const DistParam &rcDtParam ) { +#if JVET_AJ0237_INTERNAL_12BIT + if (rcDtParam.org.width < 4 || rcDtParam.bitDepth > 12 || rcDtParam.applyWeight) +#else if( rcDtParam.org.width < 4 || rcDtParam.bitDepth > 10 || rcDtParam.applyWeight ) +#endif return RdCost::xGetSAD( rcDtParam ); const short* pSrc1 = (const short*)rcDtParam.org.buf; @@ -593,7 +609,11 @@ Distortion RdCost::xGetSAD_SIMD( const DistParam &rcDtParam ) template< X86_VEXT vext > Distortion RdCost::xGetSAD_IBD_SIMD(const DistParam &rcDtParam) { +#if JVET_AJ0237_INTERNAL_12BIT + if (rcDtParam.org.width < 4 || rcDtParam.bitDepth > 12 || rcDtParam.applyWeight) +#else if (rcDtParam.org.width < 4 || rcDtParam.bitDepth > 10 || rcDtParam.applyWeight) +#endif return RdCost::xGetSAD(rcDtParam); const short* src0 = (const short*)rcDtParam.org.buf; @@ -631,7 +651,11 @@ Distortion RdCost::xGetSAD_IBD_SIMD(const DistParam &rcDtParam) template< int iWidth, X86_VEXT vext > Distortion RdCost::xGetSAD_NxN_SIMD( const DistParam &rcDtParam ) { +#if JVET_AJ0237_INTERNAL_12BIT + if (rcDtParam.bitDepth > 12 || rcDtParam.applyWeight) +#else if( rcDtParam.bitDepth > 10 || rcDtParam.applyWeight ) +#endif return RdCost::xGetSAD( rcDtParam ); // assert( rcDtParam.iCols == iWidth); @@ -2493,7 +2517,11 @@ static uint32_t 
xCalcHAD8x16_AVX2( const Pel* piOrg, const Pel* piCur, const int template< X86_VEXT vext > Distortion RdCost::xGetSADwMask_SIMD( const DistParam &rcDtParam ) { +#if JVET_AJ0237_INTERNAL_12BIT + if (rcDtParam.org.width < 4 || rcDtParam.bitDepth > 12 || rcDtParam.applyWeight) +#else if (rcDtParam.org.width < 4 || rcDtParam.bitDepth > 10 || rcDtParam.applyWeight) +#endif return RdCost::xGetSADwMask( rcDtParam ); const short* src1 = (const short*)rcDtParam.org.buf; @@ -2583,7 +2611,11 @@ Distortion RdCost::xGetSADwMask_SIMD( const DistParam &rcDtParam ) template<X86_VEXT vext> Distortion RdCost::xGetHADs_SIMD( const DistParam &rcDtParam ) { +#if JVET_AJ0237_INTERNAL_12BIT + if (rcDtParam.bitDepth > 12 || rcDtParam.applyWeight) +#else if( rcDtParam.bitDepth > 10 || rcDtParam.applyWeight ) +#endif { return RdCost::xGetHADs( rcDtParam ); } @@ -2842,7 +2874,11 @@ Distortion RdCost::xGetMRSAD_SIMD(const DistParam &rcDtParam) { int width = rcDtParam.org.width; +#if JVET_AJ0237_INTERNAL_12BIT + if (width < 4 || rcDtParam.bitDepth > 12 || rcDtParam.applyWeight) +#else if (width < 4 || rcDtParam.bitDepth > 10 || rcDtParam.applyWeight) +#endif { return RdCost::xGetMRSAD(rcDtParam); } @@ -3056,7 +3092,11 @@ Distortion RdCost::xGetTMErrorFull_SIMD(const DistParam& rcDtParam) { if ( rcDtParam.org.width < 4 || ( trueAfalseL && (rcDtParam.org.width & 15) ) // (Above template) multiple of 16 +#if JVET_AJ0237_INTERNAL_12BIT + || rcDtParam.bitDepth > 12 +#else || rcDtParam.bitDepth > 10 +#endif || rcDtParam.applyWeight ) { diff --git a/source/Lib/DecoderLib/DecLib.cpp b/source/Lib/DecoderLib/DecLib.cpp index 955a7d0b2..8d36328a2 100644 --- a/source/Lib/DecoderLib/DecLib.cpp +++ b/source/Lib/DecoderLib/DecLib.cpp @@ -1119,9 +1119,17 @@ void DecLib::finishPicture(int& poc, PicList*& rpcListPic, MsgLevel msgl ) #if JVET_AA0096_MC_BOUNDARY_PADDING m_cFrameMcPadPrediction.init(&m_cRdCost, pcSlice->getSPS()->getChromaFormatIdc(), pcSlice->getSPS()->getMaxCUHeight(), #if 
JVET_AJ0172_IBC_ITMP_ALIGN_REF_AREA +#if JVET_AJ0237_INTERNAL_12BIT + NULL, m_pcPic->getPicWidthInLumaSamples(),m_pcPic->getPicHeightInLumaSamples(), pcSlice->getSPS()->getBitDepth(CHANNEL_TYPE_LUMA)); +#else NULL, m_pcPic->getPicWidthInLumaSamples(),m_pcPic->getPicHeightInLumaSamples()); +#endif +#else +#if JVET_AJ0237_INTERNAL_12BIT + NULL, m_pcPic->getPicWidthInLumaSamples(), pcSlice->getSPS()->getBitDepth(CHANNEL_TYPE_LUMA)); #else NULL, m_pcPic->getPicWidthInLumaSamples()); +#endif #endif m_cFrameMcPadPrediction.mcFramePad(m_pcPic, *(m_pcPic->slices[0])); #endif @@ -1969,6 +1977,10 @@ void DecLib::xActivateParameterSets( const InputNALUnit nalu ) sps->getMaxCUWidth(), sps->getMaxCUHeight(), maxDepth, log2SaoOffsetScaleLuma, log2SaoOffsetScaleChroma ); +#if JVET_AJ0237_INTERNAL_12BIT + m_cSAO.m_bilateralFilter.setInternalBitDepth(sps->getBitDepth(CHANNEL_TYPE_LUMA)); + m_cBilateralFilter.setInternalBitDepth(sps->getBitDepth(CHANNEL_TYPE_LUMA)); +#endif #if JVET_W0066_CCSAO pSlice->m_ccSaoControl[COMPONENT_Y ] = m_cSAO.getCcSaoControlIdc(COMPONENT_Y); pSlice->m_ccSaoControl[COMPONENT_Cb] = m_cSAO.getCcSaoControlIdc(COMPONENT_Cb); @@ -1979,10 +1991,18 @@ void DecLib::xActivateParameterSets( const InputNALUnit nalu ) #if INTER_LIC || (TM_AMVP || TM_MRG || JVET_Z0084_IBC_TM) || JVET_W0090_ARMC_TM || JVET_Z0056_GPM_SPLIT_MODE_REORDERING #if JVET_Z0153_IBC_EXT_REF #if JVET_AJ0172_IBC_ITMP_ALIGN_REF_AREA +#if JVET_AJ0237_INTERNAL_12BIT + m_cInterPred.init(&m_cRdCost, sps->getChromaFormatIdc(), sps->getMaxCUHeight(), &m_cReshaper, sps->getMaxPicWidthInLumaSamples(),sps->getMaxPicHeightInLumaSamples(), sps->getBitDepth(CHANNEL_TYPE_LUMA)); +#else m_cInterPred.init(&m_cRdCost, sps->getChromaFormatIdc(), sps->getMaxCUHeight(), &m_cReshaper, sps->getMaxPicWidthInLumaSamples(),sps->getMaxPicHeightInLumaSamples()); +#endif +#else +#if JVET_AJ0237_INTERNAL_12BIT + m_cInterPred.init(&m_cRdCost, sps->getChromaFormatIdc(), sps->getMaxCUHeight(), &m_cReshaper, 
sps->getMaxPicWidthInLumaSamples(), sps->getBitDepth(CHANNEL_TYPE_LUMA)); #else m_cInterPred.init(&m_cRdCost, sps->getChromaFormatIdc(), sps->getMaxCUHeight(), &m_cReshaper, sps->getMaxPicWidthInLumaSamples()); #endif +#endif #else m_cInterPred.init( &m_cRdCost, sps->getChromaFormatIdc(), sps->getMaxCUHeight(), &m_cReshaper); #endif @@ -3086,6 +3106,9 @@ bool DecLib::xDecodeSlice(InputNALUnit &nalu, int &iSkipFrame, int iPOCLastDispl { clipDeltaShift = ADAPTIVE_CLIP_SHIFT_DELTA_VALUE_0; } +#if JVET_AJ0237_INTERNAL_12BIT + clipDeltaShift += std::max(0, pcSlice->getSPS()->getBitDepth(toChannelType(COMPONENT_Y)) - 10); +#endif if (pcSlice->getSliceType() != I_SLICE) { int deltaMax = pcSlice->getLumaPelMax(); diff --git a/source/Lib/DecoderLib/VLCReader.cpp b/source/Lib/DecoderLib/VLCReader.cpp index f7a37c708..103394181 100644 --- a/source/Lib/DecoderLib/VLCReader.cpp +++ b/source/Lib/DecoderLib/VLCReader.cpp @@ -4711,7 +4711,11 @@ void HLSyntaxReader::parseScaleAlf( Slice* pcSlice, SPS* sps, ParameterSetManage { ScaleAlf& curScaleAlfParam = pcSlice->getAlfScale( i , j ); +#if JVET_AJ0237_INTERNAL_12BIT + curScaleAlfParam.init(apsIdx, j, alfParam.lumaClassifierIdx[j], sps->getBitDepth(CHANNEL_TYPE_LUMA)); +#else curScaleAlfParam.init( apsIdx, j, alfParam.lumaClassifierIdx[j] ); +#endif curScaleAlfParam.apsIdx = apsIdx; if ( !bReadUseAlfScale ) @@ -5807,7 +5811,11 @@ void HLSyntaxReader::parseSliceHeader (Slice* pcSlice, PicHeader* picHeader, Par if (index != -1) { lambdaCanBePredicted = true; +#if JVET_AJ0237_INTERNAL_12BIT + pcSlice->setCostForARMC(sps->getLambdaVal((int)index), sps->getBitDepth(CHANNEL_TYPE_LUMA)); +#else pcSlice->setCostForARMC(sps->getLambdaVal((int) index)); +#endif } if (!lambdaCanBePredicted) { @@ -5816,7 +5824,11 @@ void HLSyntaxReader::parseSliceHeader (Slice* pcSlice, PicHeader* picHeader, Par #else READ_CODE(9, uiCode, "Lambda"); #endif +#if JVET_AJ0237_INTERNAL_12BIT + pcSlice->setCostForARMC((uint32_t)uiCode, 
sps->getBitDepth(CHANNEL_TYPE_LUMA)); +#else pcSlice->setCostForARMC((uint32_t)uiCode); +#endif } } #endif @@ -5861,6 +5873,9 @@ void HLSyntaxReader::parseSliceHeader (Slice* pcSlice, PicHeader* picHeader, Par { clipDeltaShift = ADAPTIVE_CLIP_SHIFT_DELTA_VALUE_0; } +#if JVET_AJ0237_INTERNAL_12BIT + clipDeltaShift += std::max(0, sps->getBitDepth(toChannelType(COMPONENT_Y)) - 10); +#endif if (pcSlice->getSliceType() == I_SLICE) { READ_SVLC(iCode, "clip_luma_pel_max"); diff --git a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp index 38e77f597..79bad478e 100644 --- a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp +++ b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp @@ -7051,7 +7051,11 @@ void EncAdaptiveLoopFilter::alfCorrection( CodingStructure& cs, const PelUnitBuf { curScaleAlfEncParam.reset(); +#if JVET_AJ0237_INTERNAL_12BIT + curScaleAlfParam.init(filterSetIndex, alt_num, classifierIdx, cs.slice->getSPS()->getBitDepth(CHANNEL_TYPE_LUMA)); +#else curScaleAlfParam.init( filterSetIndex, alt_num, classifierIdx ); +#endif curScaleAlfParam.setMinMax( cs.slice->getLumaPelMin(), cs.slice->getLumaPelMax() ); const int apsIdx = cs.slice->getTileGroupApsIdLuma()[filterSetIndex - NUM_FIXED_FILTER_SETS]; @@ -7061,7 +7065,11 @@ void EncAdaptiveLoopFilter::alfCorrection( CodingStructure& cs, const PelUnitBuf CHECK( curScaleAlfParam.classifierIdx != classifierIdx || curScaleAlfParam.filterSetIndex != filterSetIndex || curScaleAlfParam.alt_num != alt_num, "alfCorrection() failed."); char coeffBits = m_coeffBitsApsLuma[filterSetIndex - NUM_FIXED_FILTER_SETS][alt_num]; +#if JVET_AJ0237_INTERNAL_12BIT + const Pel currBase = 1 << (curScaleAlfParam.bitDepth - 1); +#else const Pel currBase = 512; // 10-bits +#endif if ( !bModeAnalysis ) { @@ -9463,7 +9471,11 @@ void EncAdaptiveLoopFilter::countLumaSwingGreaterThanThreshold(const Pel* luma, void EncAdaptiveLoopFilter::countChromaSampleValueNearMidPoint(const Pel* chroma, int 
chromaStride, int height, int width, int log2BlockWidth, int log2BlockHeight, uint64_t* chromaSampleCountNearMidPoint, int chromaSampleCountNearMidPointStride) { const int midPoint = (1 << m_inputBitDepth[CH_C]) >> 1; +#if JVET_AJ0237_INTERNAL_12BIT + const int threshold = 16 << std::max(0, m_inputBitDepth[CH_C] - 10); +#else const int threshold = 16; +#endif for (int y = 0; y < height; y += (1 << log2BlockHeight)) { diff --git a/source/Lib/EncoderLib/EncCu.cpp b/source/Lib/EncoderLib/EncCu.cpp index f06e35c17..bcf1afb77 100644 --- a/source/Lib/EncoderLib/EncCu.cpp +++ b/source/Lib/EncoderLib/EncCu.cpp @@ -122,6 +122,9 @@ void EncCu::create( EncCfg* encCfg ) #if JVET_V0094_BILATERAL_FILTER || JVET_X0071_CHROMA_BILATERAL_FILTER m_bilateralFilter = new BilateralFilter();; m_bilateralFilter->create(); +#if JVET_AJ0237_INTERNAL_12BIT + m_bilateralFilter->setInternalBitDepth(encCfg->getBitDepth(CHANNEL_TYPE_LUMA)); +#endif #endif unsigned uiMaxWidth = encCfg->getMaxCUWidth(); diff --git a/source/Lib/EncoderLib/EncGOP.cpp b/source/Lib/EncoderLib/EncGOP.cpp index 472f0680d..fb066e072 100644 --- a/source/Lib/EncoderLib/EncGOP.cpp +++ b/source/Lib/EncoderLib/EncGOP.cpp @@ -2707,11 +2707,19 @@ void EncGOP::compressGOP(int iPOCLast, int iNumPicRcvd, PicList &rcListPic, std: if (index != -1) { const SPS* sps = pcSlice->getSPS(); +#if JVET_AJ0237_INTERNAL_12BIT + pcSlice->setCostForARMC(sps->getLambdaVal(index), sps->getBitDepth(CHANNEL_TYPE_LUMA)); +#else pcSlice->setCostForARMC(sps->getLambdaVal(index)); +#endif } else { +#if JVET_AJ0237_INTERNAL_12BIT + pcSlice->setCostForARMC((uint32_t)LAMBDA_DEC_SIDE[min(max(pcSlice->getSliceQp(), 0), MAX_QP)], pcSlice->getSPS()->getBitDepth(CHANNEL_TYPE_LUMA)); +#else pcSlice->setCostForARMC((uint32_t) LAMBDA_DEC_SIDE[min(max(pcSlice->getSliceQp(), 0), MAX_QP)]); +#endif } if (pcSlice->getCheckLDC()) @@ -2738,12 +2746,20 @@ void EncGOP::compressGOP(int iPOCLast, int iNumPicRcvd, PicList &rcListPic, std: } if (mindist != 1 ) { +#if 
JVET_AJ0237_INTERNAL_12BIT + pcSlice->setCostForARMC((uint32_t)LAMBDA_DEC_SIDE[min(max(pcSlice->getSliceQp() - 4, 0), MAX_QP)], pcSlice->getSPS()->getBitDepth(CHANNEL_TYPE_LUMA)); +#else pcSlice->setCostForARMC((uint32_t) LAMBDA_DEC_SIDE[min(max(pcSlice->getSliceQp() - 4, 0), MAX_QP)]); +#endif } } else { +#if JVET_AJ0237_INTERNAL_12BIT + pcSlice->setCostForARMC((uint32_t)LAMBDA_DEC_SIDE[min(max(pcSlice->getSliceQp() - 4, 0), MAX_QP)], pcSlice->getSPS()->getBitDepth(CHANNEL_TYPE_LUMA)); +#else pcSlice->setCostForARMC((uint32_t) LAMBDA_DEC_SIDE[min(max(pcSlice->getSliceQp() - 4, 0), MAX_QP)]); +#endif } } #endif @@ -3803,6 +3819,9 @@ void EncGOP::compressGOP(int iPOCLast, int iNumPicRcvd, PicList &rcListPic, std: if ( saoSize.width != picWidth || saoSize.height != picHeight ) { m_pcSAO->create(picWidth, picHeight, chromaFormatIDC, maxCUWidth, maxCUHeight, maxTotalCUDepth, log2SaoOffsetScaleLuma, log2SaoOffsetScaleChroma); +#if JVET_AJ0237_INTERNAL_12BIT + m_pcSAO->m_bilateralFilter.setInternalBitDepth(pcSlice->getSPS()->getBitDepth(CHANNEL_TYPE_LUMA)); +#endif m_pcSAO->setReshaper(m_pcReshaper); } @@ -4852,9 +4871,17 @@ void EncGOP::compressGOP(int iPOCLast, int iNumPicRcvd, PicList &rcListPic, std: #if JVET_AA0096_MC_BOUNDARY_PADDING m_pcFrameMcPadPrediction->init(m_pcEncLib->getRdCost(), pcSlice->getSPS()->getChromaFormatIdc(), #if JVET_AJ0172_IBC_ITMP_ALIGN_REF_AREA +#if JVET_AJ0237_INTERNAL_12BIT + pcSlice->getSPS()->getMaxCUHeight(), NULL, pcPic->getPicWidthInLumaSamples(),pcPic->getPicHeightInLumaSamples(), pcSlice->getSPS()->getBitDepth(CHANNEL_TYPE_LUMA)); +#else pcSlice->getSPS()->getMaxCUHeight(), NULL, pcPic->getPicWidthInLumaSamples(),pcPic->getPicHeightInLumaSamples()); +#endif +#else +#if JVET_AJ0237_INTERNAL_12BIT + pcSlice->getSPS()->getMaxCUHeight(), NULL, pcPic->getPicWidthInLumaSamples(), pcSlice->getSPS()->getBitDepth(CHANNEL_TYPE_LUMA)); #else pcSlice->getSPS()->getMaxCUHeight(), NULL, pcPic->getPicWidthInLumaSamples()); +#endif #endif 
m_pcFrameMcPadPrediction->mcFramePad(pcPic, *(pcPic->slices[0])); m_pcFrameMcPadPrediction->destroy(); diff --git a/source/Lib/EncoderLib/EncGOP.h b/source/Lib/EncoderLib/EncGOP.h index 907cf60ff..7fad71fa8 100644 --- a/source/Lib/EncoderLib/EncGOP.h +++ b/source/Lib/EncoderLib/EncGOP.h @@ -140,8 +140,10 @@ private: PicList* m_pcListPic; HLSWriter* m_HLSWriter; +#if !JVET_AJ0237_INTERNAL_12BIT #if JVET_V0094_BILATERAL_FILTER || JVET_X0071_CHROMA_BILATERAL_FILTER BilateralFilter m_cBilateralFilter; +#endif #endif LoopFilter* m_pcLoopFilter; @@ -214,6 +216,12 @@ private: #endif public: +#if JVET_AJ0237_INTERNAL_12BIT +#if JVET_V0094_BILATERAL_FILTER || JVET_X0071_CHROMA_BILATERAL_FILTER + BilateralFilter m_cBilateralFilter; +#endif +#endif + EncGOP(); virtual ~EncGOP(); diff --git a/source/Lib/EncoderLib/EncLib.cpp b/source/Lib/EncoderLib/EncLib.cpp index 964180341..613ba5a02 100644 --- a/source/Lib/EncoderLib/EncLib.cpp +++ b/source/Lib/EncoderLib/EncLib.cpp @@ -101,6 +101,9 @@ void EncLib::create( const int layerId ) m_iPOCLast = m_compositeRefEnabled ? -2 : -1; // create processing unit classes m_cGOPEncoder. create( ); +#if JVET_AJ0237_INTERNAL_12BIT + m_cGOPEncoder.m_cBilateralFilter.setInternalBitDepth(m_bitDepth[COMPONENT_Y]); +#endif #if ENABLE_SPLIT_PARALLELISM #if ENABLE_SPLIT_PARALLELISM m_numCuEncStacks = m_numSplitThreads == 1 ? 1 : NUM_RESERVERD_SPLIT_JOBS; @@ -131,6 +134,9 @@ void EncLib::create( const int layerId ) m_cCuEncoder. create( this ); #if JVET_V0094_BILATERAL_FILTER || JVET_X0071_CHROMA_BILATERAL_FILTER m_bilateralFilter. 
create(); +#if JVET_AJ0237_INTERNAL_12BIT + m_bilateralFilter.setInternalBitDepth(m_bitDepth[COMPONENT_Y]); +#endif #endif #endif #if JVET_J0090_MEMORY_BANDWITH_MEASURE @@ -219,6 +225,9 @@ void EncLib::create( const int layerId ) m_cEncSAO.create(m_iSourceWidth, m_iSourceHeight, m_chromaFormatIDC, m_maxCUWidth, m_maxCUHeight, floorLog2(m_maxCUWidth) - m_log2MinCUSize, (uint32_t)std::max(0, m_bitDepth[CHANNEL_TYPE_LUMA] - MAX_SAO_TRUNCATED_BITDEPTH), (uint32_t)std::max(0, m_bitDepth[CHANNEL_TYPE_CHROMA] - MAX_SAO_TRUNCATED_BITDEPTH)); #endif m_cEncSAO.createEncData(m_saoCtuBoundary, numCtuInFrame); +#if JVET_AJ0237_INTERNAL_12BIT + m_cEncSAO.m_bilateralFilter.setInternalBitDepth(m_bitDepth[COMPONENT_Y]); +#endif } } diff --git a/source/Lib/EncoderLib/EncSampleAdaptiveOffset.cpp b/source/Lib/EncoderLib/EncSampleAdaptiveOffset.cpp index e850b03da..04ff642e8 100644 --- a/source/Lib/EncoderLib/EncSampleAdaptiveOffset.cpp +++ b/source/Lib/EncoderLib/EncSampleAdaptiveOffset.cpp @@ -413,6 +413,9 @@ void EncSampleAdaptiveOffset::SAOProcess( CodingStructure& cs, bool* sliceEnable if(!cs.sps->getSAOEnabledFlag() && (cs.pps->getUseBIF() || cs.pps->getUseChromaBIF())) { bilateralFilter.create(); +#if JVET_AJ0237_INTERNAL_12BIT + bilateralFilter.setInternalBitDepth(cs.sps->getBitDepth(CHANNEL_TYPE_LUMA)); +#endif if( cs.pps->getUseBIF() ) { bilateralFilter.bilateralFilterPicRDOperCTU( COMPONENT_Y, cs, src, bifCABACEstimator ); // Filters from src to res @@ -468,6 +471,9 @@ void EncSampleAdaptiveOffset::SAOProcess( CodingStructure& cs, bool* sliceEnable if( cs.pps->getUseBIF() || cs.pps->getUseChromaBIF() ) { bilateralFilter.create(); +#if JVET_AJ0237_INTERNAL_12BIT + bilateralFilter.setInternalBitDepth(cs.sps->getBitDepth(CHANNEL_TYPE_LUMA)); +#endif if( cs.pps->getUseBIF() ) { bilateralFilter.bilateralFilterPicRDOperCTU( COMPONENT_Y, cs, src, bifCABACEstimator ); // Filters from src to res' @@ -774,9 +780,18 @@ int64_t EncSampleAdaptiveOffset::getDistortion(const int 
channelBitDepth, int ty return dist; } +#if JVET_AJ0237_INTERNAL_12BIT +inline int64_t EncSampleAdaptiveOffset::estSaoDist(int64_t count, int64_t offset, int64_t diffSum, int shift, int bdShift) +#else inline int64_t EncSampleAdaptiveOffset::estSaoDist(int64_t count, int64_t offset, int64_t diffSum, int shift) +#endif { +#if JVET_AJ0237_INTERNAL_12BIT + int64_t tmpOffset = offset << bdShift; + return ((count * tmpOffset * tmpOffset - diffSum * tmpOffset * 2) >> shift); +#else return (( count*offset*offset-diffSum*offset*2 ) >> shift); +#endif } @@ -1154,6 +1169,9 @@ void EncSampleAdaptiveOffset::decideBlkParams(CodingStructure& cs, bool* sliceEn #if JVET_V0094_BILATERAL_FILTER || JVET_X0071_CHROMA_BILATERAL_FILTER BilateralFilter bilateralFilter; bilateralFilter.create(); +#if JVET_AJ0237_INTERNAL_12BIT + bilateralFilter.setInternalBitDepth(cs.sps->getBitDepth(CHANNEL_TYPE_LUMA)); +#endif #endif const TempCtx ctxPicStart ( m_ctxCache, SAOCtx( m_CABACEstimator->getCtx() ) ); @@ -3011,6 +3029,19 @@ void EncSampleAdaptiveOffset::CCSAOProcess(CodingStructure& cs, const double* la } #endif +#if JVET_AJ0237_INTERNAL_12BIT + if (!cs.slice->isIntra() && !cs.slice->getCheckLDC() && (cs.slice->getSPS()->getBitDepth(CHANNEL_TYPE_LUMA) > 10) && (cs.slice->getSliceQp() > 45) && (m_picWidth * m_picHeight <= 1920 * 1080)) + { + for (int compIdx = COMPONENT_Y; compIdx < MAX_NUM_COMPONENT; compIdx++) + { + ComponentID compID = (ComponentID)compIdx; + m_ccSaoComParam.reset(compID); + memset(m_ccSaoControl[compID], 0, sizeof(uint8_t) * m_numCTUsInPic); + } + return; + } +#endif + PelUnitBuf orgYuv = cs.getOrgBuf(); PelUnitBuf dstYuv = cs.getRecoBuf(); PelUnitBuf srcYuv = m_ccSaoBuf.getBuf( cs.area ); @@ -3413,6 +3444,10 @@ void EncSampleAdaptiveOffset::setupInitCcSaoParam(CodingStructure& cs, const Com initCcSaoParam.reset(); memset(initCcSaoControl, 0, sizeof(uint8_t) * m_numCTUsInPic); +#if JVET_AJ0237_INTERNAL_12BIT + const int shift = 2 * 
DISTORTION_PRECISION_ADJUSTMENT(cs.sps->getBitDepth(CHANNEL_TYPE_LUMA)); +#endif + if (setNum == 1) { std::fill_n(initCcSaoControl, m_numCTUsInPic, 1); @@ -3432,7 +3467,11 @@ void EncSampleAdaptiveOffset::setupInitCcSaoParam(CodingStructure& cs, const Com #if JVET_AE0151_CCSAO_HISTORY_OFFSETS_AND_EXT_EO getCcSaoDistortion(compID, setIdx , bestCcSaoParam.setType[setIdx] == CCSAO_SET_TYPE_BAND ? blkStats : blkStatsEdge +#if JVET_AJ0237_INTERNAL_12BIT + , bestCcSaoParam.offset, trainingDistortion, shift); +#else , bestCcSaoParam.offset, trainingDistortion); +#endif #else if (bestCcSaoParam.setType[setIdx] == 0) /* band */ { @@ -3941,6 +3980,10 @@ void EncSampleAdaptiveOffset::getCcSaoBlkStatsEdgeNew(const ComponentID compID, const int srcStrideTab[MAX_NUM_COMPONENT] = { srcStrideY, srcStrideU, srcStrideU }; #endif +#if JVET_AJ0237_INTERNAL_12BIT + const int bdShift = std::max(0, bitDepth - 10); +#endif + #if JVET_Z0105_LOOP_FILTER_VIRTUAL_BOUNDARY int x, y, startX, startY, endX, endY; int firstLineStartX, firstLineEndX; @@ -3997,7 +4040,11 @@ void EncSampleAdaptiveOffset::getCcSaoBlkStatsEdgeNew(const ComponentID compID, const int edgeNumUni = g_ccSaoEdgeNum[edgeIdc][1]; for (int edgeThr = 0; edgeThr < MAX_CCSAO_EDGE_THR; edgeThr++) { +#if JVET_AJ0237_INTERNAL_12BIT + const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr] << bdShift; +#else const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr]; +#endif const int edgeIdxA = getCcSaoEdgeIdx(*colE, *colA, edgeThrVal, edgeIdc); const int edgeIdxB = getCcSaoEdgeIdx(*colE, *colB, edgeThrVal, edgeIdc); const int edgeIdx = edgeIdxA * edgeNumUni + edgeIdxB; @@ -4067,7 +4114,11 @@ void EncSampleAdaptiveOffset::getCcSaoBlkStatsEdgeNew(const ComponentID compID, const int edgeNumUni = g_ccSaoEdgeNum[edgeIdc][1]; for (int edgeThr = 0; edgeThr < MAX_CCSAO_EDGE_THR; edgeThr++) { +#if JVET_AJ0237_INTERNAL_12BIT + const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr] << bdShift; +#else const int edgeThrVal = 
g_ccSaoEdgeThr[edgeIdc][edgeThr]; +#endif const int edgeIdxA = getCcSaoEdgeIdx(*colE, *colA, edgeThrVal, edgeIdc); const int edgeIdxB = getCcSaoEdgeIdx(*colE, *colB, edgeThrVal, edgeIdc); const int edgeIdx = edgeIdxA * edgeNumUni + edgeIdxB; @@ -4131,7 +4182,11 @@ void EncSampleAdaptiveOffset::getCcSaoBlkStatsEdgeNew(const ComponentID compID, const int edgeNumUni = g_ccSaoEdgeNum[edgeIdc][1]; for (int edgeThr = 0; edgeThr < MAX_CCSAO_EDGE_THR; edgeThr++) { +#if JVET_AJ0237_INTERNAL_12BIT + const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr] << bdShift; +#else const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr]; +#endif const int edgeIdxA = getCcSaoEdgeIdx(*colE, *colA, edgeThrVal, edgeIdc); const int edgeIdxB = getCcSaoEdgeIdx(*colE, *colB, edgeThrVal, edgeIdc); const int edgeIdx = edgeIdxA * edgeNumUni + edgeIdxB; @@ -4188,7 +4243,11 @@ void EncSampleAdaptiveOffset::getCcSaoBlkStatsEdgeNew(const ComponentID compID, const int edgeNumUni = g_ccSaoEdgeNum[edgeIdc][1]; for (int edgeThr = 0; edgeThr < MAX_CCSAO_EDGE_THR; edgeThr++) { +#if JVET_AJ0237_INTERNAL_12BIT + const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr] << bdShift; +#else const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr]; +#endif const int edgeIdxA = getCcSaoEdgeIdx(*colE, *colA, edgeThrVal, edgeIdc); const int edgeIdxB = getCcSaoEdgeIdx(*colE, *colB, edgeThrVal, edgeIdc); const int edgeIdx = edgeIdxA * edgeNumUni + edgeIdxB; @@ -4252,7 +4311,11 @@ void EncSampleAdaptiveOffset::getCcSaoBlkStatsEdgeNew(const ComponentID compID, const int edgeNumUni = g_ccSaoEdgeNum[edgeIdc][1]; for (int edgeThr = 0; edgeThr < MAX_CCSAO_EDGE_THR; edgeThr++) { +#if JVET_AJ0237_INTERNAL_12BIT + const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr] << bdShift; +#else const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr]; +#endif const int edgeIdxA = getCcSaoEdgeIdx(*colE, *colA, edgeThrVal, edgeIdc); const int edgeIdxB = getCcSaoEdgeIdx(*colE, *colB, edgeThrVal, edgeIdc); const int edgeIdx = edgeIdxA * 
edgeNumUni + edgeIdxB; @@ -4309,7 +4372,11 @@ void EncSampleAdaptiveOffset::getCcSaoBlkStatsEdgeNew(const ComponentID compID, const int edgeNumUni = g_ccSaoEdgeNum[edgeIdc][1]; for (int edgeThr = 0; edgeThr < MAX_CCSAO_EDGE_THR; edgeThr++) { +#if JVET_AJ0237_INTERNAL_12BIT + const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr] << bdShift; +#else const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr]; +#endif const int edgeIdxA = getCcSaoEdgeIdx(*colE, *colA, edgeThrVal, edgeIdc); const int edgeIdxB = getCcSaoEdgeIdx(*colE, *colB, edgeThrVal, edgeIdc); const int edgeIdx = edgeIdxA * edgeNumUni + edgeIdxB; @@ -4779,7 +4846,11 @@ void EncSampleAdaptiveOffset::getCcSaoBlkStatsEdgeNew(const ComponentID compID, const int edgeNumUni = g_ccSaoEdgeNum[edgeIdc][1]; for (int edgeThr = 0; edgeThr < MAX_CCSAO_EDGE_THR; edgeThr++) { +#if JVET_AJ0237_INTERNAL_12BIT + const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr] << bdShift; +#else const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr]; +#endif const int edgeIdxA = getCcSaoEdgeIdx(*colE, *colA, edgeThrVal, edgeIdc); const int edgeIdxB = getCcSaoEdgeIdx(*colE, *colB, edgeThrVal, edgeIdc); const int edgeIdx = edgeIdxA * edgeNumUni + edgeIdxB; @@ -4849,7 +4920,11 @@ void EncSampleAdaptiveOffset::getCcSaoBlkStatsEdgeNew(const ComponentID compID, const int edgeNumUni = g_ccSaoEdgeNum[edgeIdc][1]; for (int edgeThr = 0; edgeThr < MAX_CCSAO_EDGE_THR; edgeThr++) { +#if JVET_AJ0237_INTERNAL_12BIT + const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr] << bdShift; +#else const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr]; +#endif const int edgeIdxA = getCcSaoEdgeIdx(*colE, *colA, edgeThrVal, edgeIdc); const int edgeIdxB = getCcSaoEdgeIdx(*colE, *colB, edgeThrVal, edgeIdc); const int edgeIdx = edgeIdxA * edgeNumUni + edgeIdxB; @@ -4913,7 +4988,11 @@ void EncSampleAdaptiveOffset::getCcSaoBlkStatsEdgeNew(const ComponentID compID, const int edgeNumUni = g_ccSaoEdgeNum[edgeIdc][1]; for (int edgeThr = 0; edgeThr < 
MAX_CCSAO_EDGE_THR; edgeThr++) { +#if JVET_AJ0237_INTERNAL_12BIT + const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr] << bdShift; +#else const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr]; +#endif const int edgeIdxA = getCcSaoEdgeIdx(*colE, *colA, edgeThrVal, edgeIdc); const int edgeIdxB = getCcSaoEdgeIdx(*colE, *colB, edgeThrVal, edgeIdc); const int edgeIdx = edgeIdxA * edgeNumUni + edgeIdxB; @@ -4970,7 +5049,11 @@ void EncSampleAdaptiveOffset::getCcSaoBlkStatsEdgeNew(const ComponentID compID, const int edgeNumUni = g_ccSaoEdgeNum[edgeIdc][1]; for (int edgeThr = 0; edgeThr < MAX_CCSAO_EDGE_THR; edgeThr++) { +#if JVET_AJ0237_INTERNAL_12BIT + const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr] << bdShift; +#else const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr]; +#endif const int edgeIdxA = getCcSaoEdgeIdx(*colE, *colA, edgeThrVal, edgeIdc); const int edgeIdxB = getCcSaoEdgeIdx(*colE, *colB, edgeThrVal, edgeIdc); const int edgeIdx = edgeIdxA * edgeNumUni + edgeIdxB; @@ -5034,7 +5117,11 @@ void EncSampleAdaptiveOffset::getCcSaoBlkStatsEdgeNew(const ComponentID compID, const int edgeNumUni = g_ccSaoEdgeNum[edgeIdc][1]; for (int edgeThr = 0; edgeThr < MAX_CCSAO_EDGE_THR; edgeThr++) { +#if JVET_AJ0237_INTERNAL_12BIT + const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr] << bdShift; +#else const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr]; +#endif const int edgeIdxA = getCcSaoEdgeIdx(*colE, *colA, edgeThrVal, edgeIdc); const int edgeIdxB = getCcSaoEdgeIdx(*colE, *colB, edgeThrVal, edgeIdc); const int edgeIdx = edgeIdxA * edgeNumUni + edgeIdxB; @@ -5091,7 +5178,11 @@ void EncSampleAdaptiveOffset::getCcSaoBlkStatsEdgeNew(const ComponentID compID, const int edgeNumUni = g_ccSaoEdgeNum[edgeIdc][1]; for (int edgeThr = 0; edgeThr < MAX_CCSAO_EDGE_THR; edgeThr++) { +#if JVET_AJ0237_INTERNAL_12BIT + const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr] << bdShift; +#else const int edgeThrVal = g_ccSaoEdgeThr[edgeIdc][edgeThr]; +#endif const int 
edgeIdxA = getCcSaoEdgeIdx(*colE, *colA, edgeThrVal, edgeIdc); const int edgeIdxB = getCcSaoEdgeIdx(*colE, *colB, edgeThrVal, edgeIdc); const int edgeIdx = edgeIdxA * edgeNumUni + edgeIdxB; @@ -6726,14 +6817,23 @@ void EncSampleAdaptiveOffset::deriveCcSaoOffsets(const ComponentID compID, const { int quantOffsets[MAX_CCSAO_CLASS_NUM] = { 0 }; +#if JVET_AJ0237_INTERNAL_12BIT + int shift = 2 * DISTORTION_PRECISION_ADJUSTMENT(bitDepth); +#endif + for(int k = 0; k < MAX_CCSAO_CLASS_NUM; k++) { if(frameStats[setIdx].count[k] == 0) continue; quantOffsets[k] = +#if JVET_AJ0237_INTERNAL_12BIT + (int) xRoundIbdi(bitDepth, (double)(frameStats[setIdx].diff [k] << DISTORTION_PRECISION_ADJUSTMENT(bitDepth)) + / (double)((int64_t)frameStats[setIdx].count[k] << m_offsetStepLog2[compID])); +#else (int) xRoundIbdi(bitDepth, (double)(frameStats[setIdx].diff [k] << DISTORTION_PRECISION_ADJUSTMENT(bitDepth)) / (double)(frameStats[setIdx].count[k])); +#endif quantOffsets[k] = Clip3(-MAX_CCSAO_OFFSET_THR, MAX_CCSAO_OFFSET_THR, quantOffsets[k]); } @@ -6744,7 +6844,11 @@ void EncSampleAdaptiveOffset::deriveCcSaoOffsets(const ComponentID compID, const cost[k] = m_lambda[compID]; if (quantOffsets[k] != 0) { +#if JVET_AJ0237_INTERNAL_12BIT + quantOffsets[k] = estCcSaoIterOffset(m_lambda[compID], quantOffsets[k], frameStats[setIdx].count[k], frameStats[setIdx].diff[k], shift, m_offsetStepLog2[compID], dist[k], cost[k], MAX_CCSAO_OFFSET_THR); +#else quantOffsets[k] = estCcSaoIterOffset(m_lambda[compID], quantOffsets[k], frameStats[setIdx].count[k], frameStats[setIdx].diff[k], 0, 0, dist[k], cost[k], MAX_CCSAO_OFFSET_THR); +#endif } } @@ -6757,7 +6861,11 @@ void EncSampleAdaptiveOffset::deriveCcSaoOffsets(const ComponentID compID, const void EncSampleAdaptiveOffset::getCcSaoDistortion(const ComponentID compID, const int setIdx, CcSaoStatData* blkStats[MAX_CCSAO_SET_NUM] , short offset[MAX_CCSAO_SET_NUM][MAX_CCSAO_CLASS_NUM] +#if JVET_AJ0237_INTERNAL_12BIT + , int64_t* 
trainingDistortion[MAX_CCSAO_SET_NUM], const int shift) +#else , int64_t* trainingDistortion[MAX_CCSAO_SET_NUM]) +#endif { ::memset(trainingDistortion[setIdx], 0, sizeof(int64_t) * m_numCTUsInPic); @@ -6766,7 +6874,11 @@ void EncSampleAdaptiveOffset::getCcSaoDistortion(const ComponentID compID, const for (int k = 0; k < MAX_CCSAO_CLASS_NUM; k++) { trainingDistortion[setIdx][ctbIdx] +#if JVET_AJ0237_INTERNAL_12BIT + += estSaoDist(blkStats[setIdx][ctbIdx].count[k], offset[setIdx][k], blkStats[setIdx][ctbIdx].diff[k], shift, m_offsetStepLog2[toChannelType(compID)]); +#else += estSaoDist(blkStats[setIdx][ctbIdx].count[k], offset[setIdx][k], blkStats[setIdx][ctbIdx].diff[k], 0); +#endif } } } @@ -7064,6 +7176,10 @@ void EncSampleAdaptiveOffset::deriveCcSaoRDO(CodingStructure& cs, const Componen const TempCtx ctxStartCcSaoControlFlag ( m_ctxCache, SubCtx( Ctx::CcSaoControlIdc, m_CABACEstimator->getCtx() ) ); +#if JVET_AJ0237_INTERNAL_12BIT + const int shift = 2 * DISTORTION_PRECISION_ADJUSTMENT(cs.sps->getBitDepth(toChannelType(compID))); +#endif + int trainingIter = 0; bool keepTraining = true; bool improved = false; @@ -7081,7 +7197,11 @@ void EncSampleAdaptiveOffset::deriveCcSaoRDO(CodingStructure& cs, const Componen { getCcSaoDistortion(compID, setIdx , tempCcSaoParam.setType[setIdx] == CCSAO_SET_TYPE_BAND ? 
blkStats : blkStatsEdge +#if JVET_AJ0237_INTERNAL_12BIT + , tempCcSaoParam.offset, trainingDistortion, shift); +#else , tempCcSaoParam.offset, trainingDistortion); +#endif } else { @@ -7099,12 +7219,20 @@ void EncSampleAdaptiveOffset::deriveCcSaoRDO(CodingStructure& cs, const Componen #endif { deriveCcSaoOffsets(compID, cs.sps->getBitDepth(toChannelType(compID)), setIdx, frameStats, tempCcSaoParam.offset); +#if JVET_AJ0237_INTERNAL_12BIT + getCcSaoDistortion(compID, setIdx, blkStats, tempCcSaoParam.offset, trainingDistortion, shift); +#else getCcSaoDistortion(compID, setIdx, blkStats, tempCcSaoParam.offset, trainingDistortion); +#endif } else { deriveCcSaoOffsets(compID, cs.sps->getBitDepth(toChannelType(compID)), setIdx, frameStatsEdge, tempCcSaoParam.offset); +#if JVET_AJ0237_INTERNAL_12BIT + getCcSaoDistortion(compID, setIdx, blkStatsEdge, tempCcSaoParam.offset, trainingDistortion, shift); +#else getCcSaoDistortion(compID, setIdx, blkStatsEdge, tempCcSaoParam.offset, trainingDistortion); +#endif } #else deriveCcSaoOffsets(compID, cs.sps->getBitDepth(toChannelType(compID)), setIdx, frameStats, diff --git a/source/Lib/EncoderLib/EncSampleAdaptiveOffset.h b/source/Lib/EncoderLib/EncSampleAdaptiveOffset.h index 7cfd6e040..1ef5732cb 100644 --- a/source/Lib/EncoderLib/EncSampleAdaptiveOffset.h +++ b/source/Lib/EncoderLib/EncSampleAdaptiveOffset.h @@ -216,7 +216,11 @@ private: //methods void deriveModeMergeRDO(const BitDepths &bitDepths, int ctuRsAddr, SAOBlkParam* mergeList[NUM_SAO_MERGE_TYPES], bool* sliceEnabled, std::vector<SAOStatData**>& blkStats, SAOBlkParam& modeParam, double& modeNormCost ); int64_t getDistortion(const int channelBitDepth, int typeIdc, int typeAuxInfo, int* offsetVal, SAOStatData& statData); void deriveOffsets(ComponentID compIdx, const int channelBitDepth, int typeIdc, SAOStatData& statData, int* quantOffsets, int& typeAuxInfo); +#if JVET_AJ0237_INTERNAL_12BIT + inline int64_t estSaoDist(int64_t count, int64_t offset, int64_t diffSum, int 
shift, int bdShift = 0); +#else inline int64_t estSaoDist(int64_t count, int64_t offset, int64_t diffSum, int shift); +#endif inline int estIterOffset(int typeIdx, double lambda, int offsetInput, int64_t count, int64_t diffSum, int shift, int bitIncrease, int64_t& bestDist, double& bestCost, int offsetTh ); void addPreDBFStatistics(std::vector<SAOStatData**>& blkStats); #if JVET_W0066_CCSAO @@ -335,8 +339,13 @@ private: //methods , CcSaoStatData frameStats[MAX_CCSAO_SET_NUM] , short offset[MAX_CCSAO_SET_NUM][MAX_CCSAO_CLASS_NUM]); inline int estCcSaoIterOffset(const double lambda, const int offsetInput, const int64_t count, const int64_t diffSum, const int shift, const int bitIncrease, int64_t& bestDist, double& bestCost, const int offsetTh); +#if JVET_AJ0237_INTERNAL_12BIT + void getCcSaoDistortion(const ComponentID compID, const int setIdx, CcSaoStatData* blkStats[MAX_CCSAO_SET_NUM] + , short offset[MAX_CCSAO_SET_NUM][MAX_CCSAO_CLASS_NUM], int64_t* trainingDistortion[MAX_CCSAO_SET_NUM], const int shift); +#else void getCcSaoDistortion(const ComponentID compID, const int setIdx, CcSaoStatData* blkStats[MAX_CCSAO_SET_NUM] , short offset[MAX_CCSAO_SET_NUM][MAX_CCSAO_CLASS_NUM], int64_t* trainingDistortion[MAX_CCSAO_SET_NUM]); +#endif #if JVET_Y0106_CCSAO_EDGE_CLASSIFIER && !JVET_AE0151_CCSAO_HISTORY_OFFSETS_AND_EXT_EO void getCcSaoDistortionEdge(const ComponentID compID, const int setIdx, CcSaoStatData *blkStatsEdge[MAX_CCSAO_SET_NUM], diff --git a/source/Lib/EncoderLib/EncTemporalFilter.cpp b/source/Lib/EncoderLib/EncTemporalFilter.cpp index 5cbf713dd..17e202fec 100644 --- a/source/Lib/EncoderLib/EncTemporalFilter.cpp +++ b/source/Lib/EncoderLib/EncTemporalFilter.cpp @@ -349,6 +349,16 @@ void EncTemporalFilter::subsampleLuma(const PelStorage &input, PelStorage &outpu output.extendBorderPel(m_padding, m_padding); } +#if JVET_AJ0237_INTERNAL_12BIT +int64_t EncTemporalFilter::motionErrorLuma(const PelStorage& orig, + const PelStorage& buffer, + const int x, + const 
int y, + int dx, + int dy, + const int bs, + const int64_t besterror) const +#else int EncTemporalFilter::motionErrorLuma(const PelStorage &orig, const PelStorage &buffer, const int x, @@ -357,13 +367,18 @@ int EncTemporalFilter::motionErrorLuma(const PelStorage &orig, int dy, const int bs, const int besterror = 8 * 8 * 1024 * 1024) const +#endif { const Pel* origOrigin = orig.Y().buf; const int origStride = orig.Y().stride; const Pel* buffOrigin = buffer.Y().buf; const int buffStride = buffer.Y().stride; +#if JVET_AJ0237_INTERNAL_12BIT + int64_t error = 0; +#else int error = 0; +#endif if (((dx | dy) & 0xF) == 0) { dx /= m_motionVectorFactor; @@ -454,6 +469,12 @@ void EncTemporalFilter::motionEstimationLuma(Array2D<MotionVector> &mvs, const P const int origWidth = orig.Y().width; const int origHeight = orig.Y().height; +#if JVET_AJ0237_INTERNAL_12BIT + const int bitShift = 2 * (16 - m_internalBitDepth[CHANNEL_TYPE_LUMA]); + const int denorm = 204800 / (1 << bitShift); + const double offset = 20480.0 / (1 << bitShift); /* floating-point division: int/int truncates to 0 once (1 << bitShift) > 20480 (8-bit depth), removing the divide-by-zero guard below */ +#endif + #if JVET_V0056 for (int blockY = 0; blockY + blockSize <= origHeight; blockY += stepSize) { @@ -489,7 +510,11 @@ void EncTemporalFilter::motionEstimationLuma(Array2D<MotionVector> &mvs, const P if ((testx >= 0) && (testx < origWidth / (2 * blockSize)) && (testy >= 0) && (testy < origHeight / (2 * blockSize))) { MotionVector old = previous->get(testx, testy); +#if JVET_AJ0237_INTERNAL_12BIT + int64_t error = motionErrorLuma(orig, buffer, blockX, blockY, old.x * factor, old.y * factor, blockSize, best.error); +#else int error = motionErrorLuma(orig, buffer, blockX, blockY, old.x * factor, old.y * factor, blockSize, best.error); +#endif if (error < best.error) { best.set(old.x * factor, old.y * factor, error); @@ -498,7 +523,11 @@ void EncTemporalFilter::motionEstimationLuma(Array2D<MotionVector> &mvs, const P } } #if JVET_V0056 +#if JVET_AJ0237_INTERNAL_12BIT + int64_t error = motionErrorLuma(orig, buffer, blockX, blockY, 0, 0, blockSize, 
best.error); +#else int error = motionErrorLuma(orig, buffer, blockX, blockY, 0, 0, blockSize, best.error); +#endif if (error < best.error) { best.set(0, 0, error); @@ -510,7 +539,11 @@ void EncTemporalFilter::motionEstimationLuma(Array2D<MotionVector> &mvs, const P { for (int x2 = prevBest.x / m_motionVectorFactor - range; x2 <= prevBest.x / m_motionVectorFactor + range; x2++) { +#if JVET_AJ0237_INTERNAL_12BIT + int64_t error = motionErrorLuma(orig, buffer, blockX, blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize, best.error); +#else int error = motionErrorLuma(orig, buffer, blockX, blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize, best.error); +#endif if (error < best.error) { best.set(x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, error); @@ -525,7 +558,11 @@ void EncTemporalFilter::motionEstimationLuma(Array2D<MotionVector> &mvs, const P { for (int x2 = prevBest.x - doubleRange; x2 <= prevBest.x + doubleRange; x2 += 4) { +#if JVET_AJ0237_INTERNAL_12BIT + int64_t error = motionErrorLuma(orig, buffer, blockX, blockY, x2, y2, blockSize, best.error); +#else int error = motionErrorLuma(orig, buffer, blockX, blockY, x2, y2, blockSize, best.error); +#endif if (error < best.error) { best.set(x2, y2, error); @@ -539,7 +576,11 @@ void EncTemporalFilter::motionEstimationLuma(Array2D<MotionVector> &mvs, const P { for (int x2 = prevBest.x - doubleRange; x2 <= prevBest.x + doubleRange; x2++) { +#if JVET_AJ0237_INTERNAL_12BIT + int64_t error = motionErrorLuma(orig, buffer, blockX, blockY, x2, y2, blockSize, best.error); +#else int error = motionErrorLuma(orig, buffer, blockX, blockY, x2, y2, blockSize, best.error); +#endif if (error < best.error) { best.set(x2, y2, error); @@ -552,7 +593,11 @@ void EncTemporalFilter::motionEstimationLuma(Array2D<MotionVector> &mvs, const P if (blockY > 0) { MotionVector aboveMV = mvs.get(blockX / stepSize, (blockY - stepSize) / stepSize); +#if JVET_AJ0237_INTERNAL_12BIT + int64_t error = 
motionErrorLuma(orig, buffer, blockX, blockY, aboveMV.x, aboveMV.y, blockSize, best.error); +#else int error = motionErrorLuma(orig, buffer, blockX, blockY, aboveMV.x, aboveMV.y, blockSize, best.error); +#endif if (error < best.error) { best.set(aboveMV.x, aboveMV.y, error); @@ -561,7 +606,11 @@ void EncTemporalFilter::motionEstimationLuma(Array2D<MotionVector> &mvs, const P if (blockX > 0) { MotionVector leftMV = mvs.get((blockX - stepSize) / stepSize, blockY / stepSize); +#if JVET_AJ0237_INTERNAL_12BIT + int64_t error = motionErrorLuma(orig, buffer, blockX, blockY, leftMV.x, leftMV.y, blockSize, best.error); +#else int error = motionErrorLuma(orig, buffer, blockX, blockY, leftMV.x, leftMV.y, blockSize, best.error); +#endif if (error < best.error) { best.set(leftMV.x, leftMV.y, error); @@ -589,7 +638,11 @@ void EncTemporalFilter::motionEstimationLuma(Array2D<MotionVector> &mvs, const P variance = variance + (pix - avg) * (pix - avg); } } +#if JVET_AJ0237_INTERNAL_12BIT + best.error = (int)(20 * ((best.error + offset) / (variance + offset)) + (best.error / (blockSize * blockSize)) / denorm); +#else best.error = (int)(20 * ((best.error + 5.0) / (variance + 5.0)) + (best.error / (blockSize * blockSize)) / 50); +#endif #endif mvs.get(blockX / stepSize, blockY / stepSize) = best; } @@ -731,6 +784,12 @@ void EncTemporalFilter::bilateralFilter(const PelStorage &orgPic, const double weightScaling = overallStrength * (isChroma(compID) ? 
m_chromaFactor : 0.4); const Pel maxSampleValue = (1 << m_internalBitDepth[toChannelType(compID)]) - 1; const double bitDepthDiffWeighting = 1024.0 / (maxSampleValue + 1); + +#if JVET_AJ0237_INTERNAL_12BIT + const int bitShift = 2 * (16 - m_internalBitDepth[toChannelType(compID)]); + const double offset = 20480.0 / (1 << bitShift); /* floating-point division: int/int truncates to 0 once (1 << bitShift) > 20480 (8-bit depth), which would let (diffsum + offset) reach zero */ +#endif + #if JVET_V0056 const int lumaBlockSize = 8; const int csx = getComponentScaleX(compID, m_chromaFormatIDC); @@ -783,7 +842,11 @@ void EncTemporalFilter::bilateralFilter(const PelStorage &orgPic, const int cntV = blockSizeX * blockSizeY; const int cntD = 2 * cntV - blockSizeX - blockSizeY; srcFrameInfo[i].mvs.get( x / blockSizeX, y / blockSizeY ).noise = +#if JVET_AJ0237_INTERNAL_12BIT + ( int ) round( (15.0 * cntD / cntV * variance + offset) / (diffsum + offset) ); +#else ( int ) round( (15.0 * cntD / cntV * variance + 5.0) / (diffsum + 5.0) ); +#endif } } double minError = 9999999; @@ -795,7 +858,11 @@ void EncTemporalFilter::bilateralFilter(const PelStorage &orgPic, for (int i = 0; i < numRefs; i++) { #if JVET_V0056 +#if JVET_AJ0237_INTERNAL_12BIT + const int64_t error = srcFrameInfo[i].mvs.get(x / blockSizeX, y / blockSizeY).error; +#else const int error = srcFrameInfo[i].mvs.get(x / blockSizeX, y / blockSizeY).error; +#endif const int noise = srcFrameInfo[i].mvs.get(x / blockSizeX, y / blockSizeY).noise; #endif const Pel* pCorrectedPelPtr = srcFrameInfo[i].picBuffer.bufs[c].buf + (y * srcFrameInfo[i].picBuffer.bufs[c].stride + x); diff --git a/source/Lib/EncoderLib/EncTemporalFilter.h b/source/Lib/EncoderLib/EncTemporalFilter.h index 38b4f3279..c5cb475d3 100644 --- a/source/Lib/EncoderLib/EncTemporalFilter.h +++ b/source/Lib/EncoderLib/EncTemporalFilter.h @@ -50,14 +50,26 @@ struct MotionVector { int x, y; +#if JVET_AJ0237_INTERNAL_12BIT + int64_t error; +#else int error; +#endif #if JVET_V0056 int noise; +#if JVET_AJ0237_INTERNAL_12BIT + MotionVector() : x(0), y(0), error(INT_LEAST64_MAX), noise(0) {} +#else MotionVector() : 
x(0), y(0), error(INT_LEAST32_MAX), noise(0) {} +#endif #else MotionVector() : x(0), y(0), error(INT_LEAST32_MAX) {} #endif +#if JVET_AJ0237_INTERNAL_12BIT + void set(int vectorX, int vectorY, int64_t errorValue) { x = vectorX; y = vectorY; error = errorValue; } +#else void set(int vectorX, int vectorY, int errorValue) { x = vectorX; y = vectorY; error = errorValue; } +#endif }; template <class T> @@ -169,7 +181,11 @@ private: // Private functions void subsampleLuma(const PelStorage &input, PelStorage &output, const int factor = 2) const; +#if JVET_AJ0237_INTERNAL_12BIT + int64_t motionErrorLuma(const PelStorage& orig, const PelStorage& buffer, const int x, const int y, int dx, int dy, const int bs, const int64_t besterror) const; +#else int motionErrorLuma(const PelStorage &orig, const PelStorage &buffer, const int x, const int y, int dx, int dy, const int bs, const int besterror) const; +#endif void motionEstimationLuma(Array2D<MotionVector> &mvs, const PelStorage &orig, const PelStorage &buffer, const int bs, const Array2D<MotionVector> *previous=0, const int factor = 1, const bool doubleRes = false) const; void motionEstimation(Array2D<MotionVector> &mvs, const PelStorage &orgPic, const PelStorage &buffer, const PelStorage &origSubsampled2, const PelStorage &origSubsampled4) const; diff --git a/source/Lib/EncoderLib/InterSearch.cpp b/source/Lib/EncoderLib/InterSearch.cpp index f4824391e..ff0f3e365 100644 --- a/source/Lib/EncoderLib/InterSearch.cpp +++ b/source/Lib/EncoderLib/InterSearch.cpp @@ -498,10 +498,18 @@ void InterSearch::init( EncCfg* pcEncCfg, #if INTER_LIC || (TM_AMVP || TM_MRG || JVET_Z0084_IBC_TM) || JVET_W0090_ARMC_TM || JVET_Z0056_GPM_SPLIT_MODE_REORDERING #if JVET_Z0153_IBC_EXT_REF #if JVET_AJ0172_IBC_ITMP_ALIGN_REF_AREA +#if JVET_AJ0237_INTERNAL_12BIT + InterPrediction::init( pcRdCost, cform, maxCUHeight, m_pcReshape, curPicWidthY, curPicHeightY, pcEncCfg->getBitDepth(CHANNEL_TYPE_LUMA)); +#else InterPrediction::init( pcRdCost, cform, 
maxCUHeight, m_pcReshape, curPicWidthY, curPicHeightY ); +#endif +#else +#if JVET_AJ0237_INTERNAL_12BIT + InterPrediction::init( pcRdCost, cform, maxCUHeight, m_pcReshape, curPicWidthY, pcEncCfg->getBitDepth(CHANNEL_TYPE_LUMA)); #else InterPrediction::init( pcRdCost, cform, maxCUHeight, m_pcReshape, curPicWidthY ); #endif +#endif #else InterPrediction::init( pcRdCost, cform, maxCUHeight, m_pcReshape ); #endif diff --git a/source/Lib/EncoderLib/IntraSearch.cpp b/source/Lib/EncoderLib/IntraSearch.cpp index f8f9b5bd8..8453b1015 100644 --- a/source/Lib/EncoderLib/IntraSearch.cpp +++ b/source/Lib/EncoderLib/IntraSearch.cpp @@ -114,8 +114,10 @@ IntraSearch::IntraSearch() m_dimdPredBuf = nullptr; m_obicPredBuf = nullptr; #endif +#if !JVET_AJ0237_INTERNAL_12BIT m_truncBinBits = nullptr; m_escapeNumBins = nullptr; +#endif m_minErrorIndexMap = nullptr; for (unsigned i = 0; i < (MAXPLTSIZE + 1); i++) { @@ -324,6 +326,7 @@ void IntraSearch::destroy() m_obicPredBuf = nullptr; #endif m_isInitialized = false; +#if !JVET_AJ0237_INTERNAL_12BIT if (m_truncBinBits != nullptr) { for (unsigned i = 0; i < m_symbolSize; i++) @@ -339,6 +342,7 @@ void IntraSearch::destroy() delete[] m_escapeNumBins; m_escapeNumBins = nullptr; } +#endif if (m_indexError[0] != nullptr) { for (unsigned i = 0; i < (MAXPLTSIZE + 1); i++) @@ -597,6 +601,7 @@ void IntraSearch::init( EncCfg* pcEncCfg, m_isInitialized = true; if (pcEncCfg->getPLTMode()) { +#if !JVET_AJ0237_INTERNAL_12BIT m_symbolSize = (1 << bitDepthY); // pixel values are within [0, SymbolSize-1] with size SymbolSize if (m_truncBinBits == nullptr) { @@ -611,6 +616,7 @@ void IntraSearch::init( EncCfg* pcEncCfg, m_escapeNumBins = new uint16_t[m_symbolSize]; } initTBCTable(bitDepthY); +#endif if (m_indexError[0] == nullptr) { for (unsigned i = 0; i < (MAXPLTSIZE + 1); i++) @@ -9119,7 +9125,11 @@ void IntraSearch::preCalcPLTIndexRD(CodingStructure& cs, Partitioner& partitione { if (lossless) { +#if JVET_AJ0237_INTERNAL_12BIT + rate += 
getEpExGolombNumBins(curPel[comp], 5); +#else rate += m_escapeNumBins[curPel[comp]]; +#endif } else { @@ -9132,7 +9142,11 @@ void IntraSearch::preCalcPLTIndexRD(CodingStructure& cs, Partitioner& partitione { error += tmpErr * tmpErr; } +#if JVET_AJ0237_INTERNAL_12BIT + rate += getEpExGolombNumBins(paPixelValue[comp], 5); +#else rate += m_escapeNumBins[paPixelValue[comp]]; // encode quantized escape color +#endif } } double rdCost = (double)error + m_pcRdCost->getLambda()*(double)rate; @@ -9439,7 +9453,11 @@ double IntraSearch::rateDistOptPLT( rdCost = MAX_DOUBLE; return rdCost; } +#if JVET_AJ0237_INTERNAL_12BIT + rdCost += m_pcRdCost->getLambda() * (getTruncBinBits((runIndex > refIndex) ? runIndex - 1 : runIndex, (scanPos == 0) ? (indexMaxValue + 1) : indexMaxValue) << SCALE_BITS); +#else rdCost += m_pcRdCost->getLambda()*(m_truncBinBits[(runIndex > refIndex) ? runIndex - 1 : runIndex][(scanPos == 0) ? (indexMaxValue + 1) : indexMaxValue] << SCALE_BITS); +#endif } rdCost += m_indexError[runIndex][m_scanOrder[scanPos].idx] * (1 << SCALE_BITS); if (scanPos > 0) @@ -9508,6 +9526,7 @@ uint32_t IntraSearch::getTruncBinBits(uint32_t symbol, uint32_t maxSymbol) return idxCodeBit; } +#if !JVET_AJ0237_INTERNAL_12BIT void IntraSearch::initTBCTable(int bitDepth) { for (uint32_t i = 0; i < m_symbolSize; i++) @@ -9527,6 +9546,7 @@ void IntraSearch::initTBCTable(int bitDepth) m_escapeNumBins[i] = getEpExGolombNumBins(i, 5); } } +#endif void IntraSearch::calcPixelPred(CodingStructure& cs, Partitioner& partitioner, uint32_t yPos, uint32_t xPos, ComponentID compBegin, uint32_t numComp) { @@ -9680,7 +9700,11 @@ void IntraSearch::derivePLTLossy(CodingStructure& cs, Partitioner& partitioner, TransformUnit &tu = *cs.getTU(partitioner.chType); QpParam cQP(tu, compBegin); +#if JVET_AJ0237_INTERNAL_12BIT + int qp = cQP.Qp(true) - 6 * (channelBitDepth_L - 8); +#else int qp = cQP.Qp(true) - 12; +#endif qp = (qp < 0) ? 0 : ((qp > 56) ? 
56 : qp); int errorLimit = g_paletteQuant[qp]; if (lossless) diff --git a/source/Lib/EncoderLib/IntraSearch.h b/source/Lib/EncoderLib/IntraSearch.h index 081574c43..d08dc5706 100644 --- a/source/Lib/EncoderLib/IntraSearch.h +++ b/source/Lib/EncoderLib/IntraSearch.h @@ -742,9 +742,11 @@ protected: CtxCache* m_ctxCache; bool m_isInitialized; +#if !JVET_AJ0237_INTERNAL_12BIT uint32_t m_symbolSize; uint16_t** m_truncBinBits; uint16_t* m_escapeNumBins; +#endif bool m_bestEscape; double* m_indexError[MAXPLTSIZE + 1]; uint8_t* m_minErrorIndexMap; // store the best index in terms of distortion for each pixel @@ -932,7 +934,9 @@ protected: void deriveIndexMap (CodingStructure& cs, Partitioner& partitioner, ComponentID compBegin, uint32_t numComp, PLTScanMode pltScanMode, double& dCost, bool* idxExist); bool deriveSubblockIndexMap(CodingStructure& cs, Partitioner& partitioner, ComponentID compBegin, PLTScanMode pltScanMode, int minSubPos, int maxSubPos, const BinFracBits& fracBitsPltRunType, const BinFracBits* fracBitsPltIndexINDEX, const BinFracBits* fracBitsPltIndexCOPY, const double minCost, bool useRotate); double rateDistOptPLT (bool RunType, uint8_t RunIndex, bool prevRunType, uint8_t prevRunIndex, uint8_t aboveRunIndex, bool& prevCodedRunType, int& prevCodedRunPos, int scanPos, uint32_t width, int dist, int indexMaxValue, const BinFracBits* IndexfracBits, const BinFracBits& TypefracBits); +#if !JVET_AJ0237_INTERNAL_12BIT void initTBCTable (int bitDepth); +#endif uint32_t getTruncBinBits (uint32_t symbol, uint32_t maxSymbol); uint32_t getEpExGolombNumBins (uint32_t symbol, uint32_t count); void xGetNextISPMode ( ModeInfo& modeInfo, const ModeInfo* lastMode, const Size cuSize ); diff --git a/source/Lib/EncoderLib/VLCWriter.cpp b/source/Lib/EncoderLib/VLCWriter.cpp index b92443151..af74cd1fb 100644 --- a/source/Lib/EncoderLib/VLCWriter.cpp +++ b/source/Lib/EncoderLib/VLCWriter.cpp @@ -3604,7 +3604,11 @@ void HLSWriter::codeSliceHeader ( Slice* pcSlice ) bool 
lambdaCanBePredicted = false; if (index !=-1) { +#if JVET_AJ0237_INTERNAL_12BIT + if (pcSlice->getSPS()->getLambdaVal(index) == pcSlice->getCostForARMC(pcSlice->getSPS()->getBitDepth(CHANNEL_TYPE_LUMA))) +#else if(pcSlice->getSPS()->getLambdaVal(index) == pcSlice->getCostForARMC()) +#endif { lambdaCanBePredicted = true; } @@ -3612,7 +3616,11 @@ void HLSWriter::codeSliceHeader ( Slice* pcSlice ) if (!lambdaCanBePredicted) { #if JVET_AB0082 +#if JVET_AJ0237_INTERNAL_12BIT + WRITE_CODE(pcSlice->getCostForARMC(pcSlice->getSPS()->getBitDepth(CHANNEL_TYPE_LUMA)), 10, "Lambda"); +#else WRITE_CODE(pcSlice->getCostForARMC(), 10, "Lambda"); +#endif #else WRITE_CODE(pcSlice->getCostForARMC(), 9, "Lambda"); #endif @@ -3648,6 +3656,9 @@ void HLSWriter::codeSliceHeader ( Slice* pcSlice ) { clipDeltaShift = ADAPTIVE_CLIP_SHIFT_DELTA_VALUE_0; } +#if JVET_AJ0237_INTERNAL_12BIT + clipDeltaShift += std::max(0, pcSlice->getSPS()->getBitDepth(toChannelType(COMPONENT_Y)) - 10); +#endif if (pcSlice->getSliceType() == I_SLICE) { #if JVET_AI0096_ADAPTIVE_CLIPPING_BIT_DEPTH_FIX -- GitLab