From 8607019780a132f7e01d6c5c2c5a6b3748bdb01b Mon Sep 17 00:00:00 2001 From: Karl Sharman <karl.sharman@sony.com> Date: Wed, 17 Jun 2020 01:10:08 +0200 Subject: [PATCH] JVET-R0351: High bit depth coding --- source/Lib/CommonLib/AdaptiveLoopFilter.cpp | 25 +- source/Lib/CommonLib/AdaptiveLoopFilter.h | 33 ++- source/Lib/CommonLib/AlfParameters.h | 8 + source/Lib/CommonLib/Buffer.cpp | 32 +++ source/Lib/CommonLib/Buffer.h | 5 + source/Lib/CommonLib/CommonDef.h | 6 + source/Lib/CommonLib/ContextModelling.cpp | 4 + source/Lib/CommonLib/ContextModelling.h | 24 ++ source/Lib/CommonLib/DepQuant.cpp | 8 + source/Lib/CommonLib/InterPrediction.cpp | 20 ++ source/Lib/CommonLib/InterpolationFilter.cpp | 16 ++ source/Lib/CommonLib/InterpolationFilter.h | 3 + source/Lib/CommonLib/Rom.cpp | 8 + source/Lib/CommonLib/Rom.h | 8 + source/Lib/CommonLib/RomTr.cpp | 86 ++++++- source/Lib/CommonLib/TrQuant.cpp | 41 ++- source/Lib/CommonLib/TrQuant.h | 5 + source/Lib/CommonLib/TrQuant_EMT.cpp | 237 +++++++++++++++++- source/Lib/CommonLib/TypeDef.h | 7 + source/Lib/CommonLib/WeightPrediction.cpp | 12 + .../Lib/CommonLib/x86/AdaptiveLoopFilterX86.h | 8 + .../CommonLib/x86/InterpolationFilterX86.h | 16 ++ source/Lib/DecoderLib/CABACReader.cpp | 25 ++ source/Lib/DecoderLib/VLCReader.cpp | 4 + .../Lib/EncoderLib/EncAdaptiveLoopFilter.cpp | 92 +++++++ source/Lib/EncoderLib/EncAdaptiveLoopFilter.h | 8 + source/Lib/EncoderLib/VLCWriter.cpp | 4 + 27 files changed, 733 insertions(+), 12 deletions(-) diff --git a/source/Lib/CommonLib/AdaptiveLoopFilter.cpp b/source/Lib/CommonLib/AdaptiveLoopFilter.cpp index c82551538..9e16dcac0 100644 --- a/source/Lib/CommonLib/AdaptiveLoopFilter.cpp +++ b/source/Lib/CommonLib/AdaptiveLoopFilter.cpp @@ -501,7 +501,11 @@ void AdaptiveLoopFilter::ALFProcess(CodingStructure& cs) deriveClassification( m_classifier, buf.get(COMPONENT_Y), blkDst, blkSrc ); short filterSetIndex = alfCtuFilterIndex[ctuIdx]; short *coeff; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + Pel *clip; +#else short *clip; +#endif if (filterSetIndex >= NUM_FIXED_FILTER_SETS) { coeff = m_coeffApsLuma[filterSetIndex - NUM_FIXED_FILTER_SETS]; @@ -565,7 +569,11 @@ void AdaptiveLoopFilter::ALFProcess(CodingStructure& cs) deriveClassification( m_classifier, tmpYuv.get( COMPONENT_Y ), blk, blk ); short filterSetIndex = alfCtuFilterIndex[ctuIdx]; short *coeff; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + Pel *clip; +#else short *clip; +#endif if (filterSetIndex >= NUM_FIXED_FILTER_SETS) { coeff = m_coeffApsLuma[filterSetIndex - NUM_FIXED_FILTER_SETS]; @@ -662,7 +670,11 @@ void AdaptiveLoopFilter::reconstructCoeff( AlfParam& alfParam, ChannelType chann { int numFilters = isLuma( channel ) ? alfParam.numLumaFilters : 1; short* coeff = isLuma( channel ) ? alfParam.lumaCoeff : alfParam.chromaCoeff[altIdx]; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + Pel* clipp = isLuma( channel ) ? alfParam.lumaClipp : alfParam.chromaClipp[altIdx]; +#else short* clipp = isLuma( channel ) ? alfParam.lumaClipp : alfParam.chromaClipp[altIdx]; +#endif for( int filterIdx = 0; filterIdx < numFilters; filterIdx++ ) { @@ -1058,7 +1070,11 @@ void AdaptiveLoopFilter::deriveClassificationBlk(AlfClassifier **classifier, int template<AlfFilterType filtType> void AdaptiveLoopFilter::filterBlk(AlfClassifier **classifier, const PelUnitBuf &recDst, const CPelUnitBuf &recSrc, const Area &blkDst, const Area &blk, const ComponentID compId, +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const short *filterSet, const Pel *fClipSet, const ClpRng &clpRng, +#else const short *filterSet, const short *fClipSet, const ClpRng &clpRng, +#endif CodingStructure &cs, const int vbCTUHeight, int vbPos) { CHECK((vbCTUHeight & (vbCTUHeight - 1)) != 0, "vbCTUHeight must be a power of 2"); @@ -1087,8 +1103,11 @@ void AdaptiveLoopFilter::filterBlk(AlfClassifier **classifier, const PelUnitBuf const Pel *pImg0, *pImg1, *pImg2, *pImg3, *pImg4, *pImg5, *pImg6; const short *coef = filterSet; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const Pel *clip = fClipSet; +#else const short *clip = fClipSet; - +#endif const int shift = m_NUM_BITS - 1; const int offset = 1 << ( shift - 1 ); @@ -1225,7 +1244,11 @@ void AdaptiveLoopFilter::filterBlk(AlfClassifier **classifier, const PelUnitBuf { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + Pel sum = 0; +#else int sum = 0; +#endif const Pel curr = pImg0[+0]; if( filtType == ALF_FILTER_7 ) { diff --git a/source/Lib/CommonLib/AdaptiveLoopFilter.h b/source/Lib/CommonLib/AdaptiveLoopFilter.h index 74c9bdbd0..be9819ada 100644 --- a/source/Lib/CommonLib/AdaptiveLoopFilter.h +++ b/source/Lib/CommonLib/AdaptiveLoopFilter.h @@ -67,10 +67,17 @@ enum Direction class AdaptiveLoopFilter { public: +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + static inline Pel clipALF(const Pel clip, const Pel ref, const Pel val0, const Pel val1) + { + return Clip3<Pel>(-clip, +clip, val0-ref) + Clip3<Pel>(-clip, +clip, val1-ref); + } +#else static inline int clipALF(const int clip, const short ref, const short val0, const short val1) { return Clip3<int>(-clip, +clip, val0-ref) + Clip3<int>(-clip, +clip, val1-ref); } +#endif static constexpr int AlfNumClippingValues[MAX_NUM_CHANNEL_TYPE] = { 4, 4 }; static constexpr int MaxAlfNumClippingValues = 4; @@ -99,8 +106,12 @@ public: template<AlfFilterType filtType> static void filterBlk(AlfClassifier **classifier, const PelUnitBuf &recDst, const CPelUnitBuf &recSrc, const Area &blkDst, const Area &blk, const ComponentID compId, const short *filterSet, - const short *fClipSet, const ClpRng &clpRng, CodingStructure &cs, const int vbCTUHeight, - int vbPos); +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const Pel *fClipSet, const ClpRng &clpRng, CodingStructure &cs, const int vbCTUHeight, +#else + const short *fClipSet, const ClpRng &clpRng, CodingStructure &cs, const int vbCTUHeight, +#endif + int vbPos); void (*m_deriveClassificationBlk)(AlfClassifier **classifier, int **laplacian[NUM_DIRECTIONS], const CPelBuf &srcLuma, const Area &blkDst, const Area &blk, const int shift, const int vbCTUHeight, int vbPos); @@ -116,11 +127,19 @@ public: uint8_t* getCcAlfControlIdc(const ComponentID compID) { return m_ccAlfFilterControl[compID-1]; } void (*m_filter5x5Blk)(AlfClassifier **classifier, const PelUnitBuf &recDst, const CPelUnitBuf &recSrc, const Area &blkDst, const Area &blk, const ComponentID compId, const short *filterSet, +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const Pel *fClipSet, const ClpRng &clpRng, CodingStructure &cs, const int vbCTUHeight, +#else const short *fClipSet, const ClpRng &clpRng, CodingStructure &cs, const int vbCTUHeight, +#endif int vbPos); void (*m_filter7x7Blk)(AlfClassifier **classifier, const PelUnitBuf &recDst, const CPelUnitBuf &recSrc, const Area &blkDst, const Area &blk, const ComponentID compId, const short *filterSet, +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const Pel *fClipSet, const ClpRng &clpRng, CodingStructure &cs, const int vbCTUHeight, +#else const short *fClipSet, const ClpRng &clpRng, CodingStructure &cs, const int vbCTUHeight, +#endif int vbPos); #ifdef TARGET_SIMD_X86 @@ -138,8 +157,13 @@ protected: static const int m_fixedFilterSetCoeff[ALF_FIXED_FILTER_NUM][MAX_NUM_ALF_LUMA_COEFF]; short m_fixedFilterSetCoeffDec[NUM_FIXED_FILTER_SETS][MAX_NUM_ALF_CLASSES * MAX_NUM_ALF_LUMA_COEFF]; short m_coeffApsLuma[ALF_CTB_MAX_NUM_APS][MAX_NUM_ALF_LUMA_COEFF * MAX_NUM_ALF_CLASSES]; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + Pel m_clippApsLuma[ALF_CTB_MAX_NUM_APS][MAX_NUM_ALF_LUMA_COEFF * MAX_NUM_ALF_CLASSES]; + Pel m_clipDefault[MAX_NUM_ALF_CLASSES * MAX_NUM_ALF_LUMA_COEFF]; +#else short m_clippApsLuma[ALF_CTB_MAX_NUM_APS][MAX_NUM_ALF_LUMA_COEFF * MAX_NUM_ALF_CLASSES]; short m_clipDefault[MAX_NUM_ALF_CLASSES * MAX_NUM_ALF_LUMA_COEFF]; +#endif bool m_created = false; short m_chromaCoeffFinal[MAX_NUM_ALF_ALTERNATIVES_CHROMA][MAX_NUM_ALF_CHROMA_COEFF]; AlfParam* m_alfParamChroma; @@ -148,8 +172,13 @@ protected: std::vector<AlfFilterShape> m_filterShapes[MAX_NUM_CHANNEL_TYPE]; AlfClassifier** m_classifier; short m_coeffFinal[MAX_NUM_ALF_CLASSES * MAX_NUM_ALF_LUMA_COEFF]; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + Pel m_clippFinal[MAX_NUM_ALF_CLASSES * MAX_NUM_ALF_LUMA_COEFF]; + Pel m_chromaClippFinal[MAX_NUM_ALF_ALTERNATIVES_CHROMA][MAX_NUM_ALF_CHROMA_COEFF]; +#else short m_clippFinal[MAX_NUM_ALF_CLASSES * MAX_NUM_ALF_LUMA_COEFF]; short m_chromaClippFinal[MAX_NUM_ALF_ALTERNATIVES_CHROMA][MAX_NUM_ALF_CHROMA_COEFF]; +#endif int** m_laplacian[NUM_DIRECTIONS]; int * m_laplacianPtr[NUM_DIRECTIONS][m_CLASSIFICATION_BLK_SIZE + 5]; int m_laplacianData[NUM_DIRECTIONS][m_CLASSIFICATION_BLK_SIZE + 5][m_CLASSIFICATION_BLK_SIZE + 5]; diff --git a/source/Lib/CommonLib/AlfParameters.h b/source/Lib/CommonLib/AlfParameters.h index 989952d81..fe7f23ae8 100644 --- a/source/Lib/CommonLib/AlfParameters.h +++ b/source/Lib/CommonLib/AlfParameters.h @@ -129,10 +129,18 @@ struct AlfParam bool enabledFlag[MAX_NUM_COMPONENT]; // alf_slice_enable_flag, alf_chroma_idc bool nonLinearFlag[MAX_NUM_CHANNEL_TYPE]; // alf_[luma/chroma]_clip_flag short lumaCoeff[MAX_NUM_ALF_CLASSES * MAX_NUM_ALF_LUMA_COEFF]; // alf_coeff_luma_delta[i][j] +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + Pel lumaClipp[MAX_NUM_ALF_CLASSES * MAX_NUM_ALF_LUMA_COEFF]; // alf_clipp_luma_[i][j] +#else short lumaClipp[MAX_NUM_ALF_CLASSES * MAX_NUM_ALF_LUMA_COEFF]; // alf_clipp_luma_[i][j] +#endif int numAlternativesChroma; // alf_chroma_num_alts_minus_one + 1 short chromaCoeff[MAX_NUM_ALF_ALTERNATIVES_CHROMA][MAX_NUM_ALF_CHROMA_COEFF]; // alf_coeff_chroma[i] +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + Pel chromaClipp[MAX_NUM_ALF_ALTERNATIVES_CHROMA][MAX_NUM_ALF_CHROMA_COEFF]; // alf_clipp_chroma[i] +#else short chromaClipp[MAX_NUM_ALF_ALTERNATIVES_CHROMA][MAX_NUM_ALF_CHROMA_COEFF]; // alf_clipp_chroma[i] +#endif short filterCoeffDeltaIdx[MAX_NUM_ALF_CLASSES]; // filter_coeff_delta[i] bool alfLumaCoeffFlag[MAX_NUM_ALF_CLASSES]; // alf_luma_coeff_flag[i] int numLumaFilters; // number_of_filters_minus1 + 1 diff --git a/source/Lib/CommonLib/Buffer.cpp b/source/Lib/CommonLib/Buffer.cpp index b1ed883e4..3db48cdb6 100644 --- a/source/Lib/CommonLib/Buffer.cpp +++ b/source/Lib/CommonLib/Buffer.cpp @@ -94,16 +94,32 @@ void addBIOAvgCore(const Pel* src0, int src0Stride, const Pel* src1, int src1Str for (int x = 0; x < width; x += 4) { b = tmpx * (gradX0[x] - gradX1[x]) + tmpy * (gradY0[x] - gradY1[x]); +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + dst[x] = ClipPel(rightShift((src0[x] + src1[x] + b + offset), shift), clpRng); +#else dst[x] = ClipPel((int16_t)rightShift((src0[x] + src1[x] + b + offset), shift), clpRng); +#endif b = tmpx * (gradX0[x + 1] - gradX1[x + 1]) + tmpy * (gradY0[x + 1] - gradY1[x + 1]); +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + dst[x + 1] = ClipPel(rightShift((src0[x + 1] + src1[x + 1] + b + offset), shift), clpRng); +#else dst[x + 1] = ClipPel((int16_t)rightShift((src0[x + 1] + src1[x + 1] + b + offset), shift), clpRng); +#endif b = tmpx * (gradX0[x + 2] - gradX1[x + 2]) + tmpy * (gradY0[x + 2] - gradY1[x + 2]); +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + dst[x + 2] = ClipPel(rightShift((src0[x + 2] + src1[x + 2] + b + offset), shift), clpRng); +#else dst[x + 2] = ClipPel((int16_t)rightShift((src0[x + 2] + src1[x + 2] + b + offset), shift), clpRng); +#endif b = tmpx * (gradX0[x + 3] - gradX1[x + 3]) + tmpy * (gradY0[x + 3] - gradY1[x + 3]); +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + dst[x + 3] = ClipPel(rightShift((src0[x + 3] + src1[x + 3] + b + offset), shift), clpRng); +#else dst[x + 3] = ClipPel((int16_t)rightShift((src0[x + 3] + src1[x + 3] + b + offset), shift), clpRng); +#endif } dst += dstStride; src0 += src0Stride; src1 += src1Stride; gradX0 += gradStride; gradX1 += gradStride; gradY0 += gradStride; gradY1 += gradStride; @@ -361,7 +377,11 @@ void AreaBuf<Pel>::addWeightedAvg(const AreaBuf<const Pel> &other1, const AreaBu const unsigned src2Stride = other2.stride; const unsigned destStride = stride; const int clipbd = clpRng.bd; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const int shiftNum = IF_INTERNAL_FRAC_BITS(clipbd) + log2WeightBase; +#else const int shiftNum = std::max<int>(2, (IF_INTERNAL_PREC - clipbd)) + log2WeightBase; +#endif const int offset = (1 << (shiftNum - 1)) + (IF_INTERNAL_OFFS << log2WeightBase); #define ADD_AVG_OP( ADDR ) dest[ADDR] = ClipPel( rightShift( ( src0[ADDR]*w0 + src2[ADDR]*w1 + offset ), shiftNum ), clpRng ) @@ -454,7 +474,11 @@ void AreaBuf<Pel>::addAvg( const AreaBuf<const Pel> &other1, const AreaBuf<const const unsigned src2Stride = other2.stride; const unsigned destStride = stride; const int clipbd = clpRng.bd; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const int shiftNum = IF_INTERNAL_FRAC_BITS(clipbd) + 1; +#else const int shiftNum = std::max<int>(2, (IF_INTERNAL_PREC - clipbd)) + 1; +#endif const int offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS; #if ENABLE_SIMD_OPT_BUFFER && defined(TARGET_SIMD_X86) @@ -489,7 +513,11 @@ void AreaBuf<Pel>::toLast( const ClpRng& clpRng ) const uint32_t srcStride = stride; const int clipbd = clpRng.bd; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const int shiftNum = IF_INTERNAL_FRAC_BITS(clipbd); +#else const int shiftNum = std::max<int>(2, (IF_INTERNAL_PREC - clipbd)); +#endif const int offset = ( 1 << ( shiftNum - 1 ) ) + IF_INTERNAL_OFFS; if (width == 1) @@ -562,7 +590,11 @@ void AreaBuf<Pel>::roundToOutputBitdepth( const AreaBuf<const Pel> &src, const C const unsigned destStride = stride; const int32_t clipbd = clpRng.bd; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const int32_t shiftDefault = IF_INTERNAL_FRAC_BITS(clipbd); +#else const int32_t shiftDefault = std::max<int>(2, (IF_INTERNAL_PREC - clipbd)); +#endif const int32_t offsetDefault = (1<<(shiftDefault-1)) + IF_INTERNAL_OFFS; if( width == 1 ) diff --git a/source/Lib/CommonLib/Buffer.h b/source/Lib/CommonLib/Buffer.h index 5719f521c..4dfe5a27c 100644 --- a/source/Lib/CommonLib/Buffer.h +++ b/source/Lib/CommonLib/Buffer.h @@ -440,8 +440,13 @@ void AreaBuf<T>::removeWeightHighFreq(const AreaBuf<T>& other, const bool bClip, { #endif int normalizer = ((1 << 16) + (bcwWeight > 0 ? (bcwWeight >> 1) : -(bcwWeight >> 1))) / bcwWeight; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + Intermediate_Int weight0 = normalizer << log2WeightBase; + Intermediate_Int weight1 = bcwWeightOther * normalizer; +#else int weight0 = normalizer << log2WeightBase; int weight1 = bcwWeightOther * normalizer; +#endif #define REM_HF_INC \ src += srcStride; \ dst += dstStride; \ diff --git a/source/Lib/CommonLib/CommonDef.h b/source/Lib/CommonLib/CommonDef.h index 2fa1f4a5e..40bf83cdf 100644 --- a/source/Lib/CommonLib/CommonDef.h +++ b/source/Lib/CommonLib/CommonDef.h @@ -203,8 +203,10 @@ static const int MAX_BDOF_APPLICATION_REGION = 16; static const int MAX_CPB_CNT = 32; ///< Upper bound of (cpb_cnt_minus1 + 1) static const int MAX_NUM_LAYER_IDS = 64; static const int COEF_REMAIN_BIN_REDUCTION = 5; ///< indicates the level at which the VLC transitions from Golomb-Rice to TU+EG(k) +#if !JVET_R0351_HIGH_BIT_DEPTH_SUPPORT static const int COEFF_MIN = -32768; static const int COEFF_MAX = 32767; +#endif static const int CU_DQP_TU_CMAX = 5; ///< max number bins for truncated unary static const int CU_DQP_EG_k = 0; ///< expgolomb order @@ -629,7 +631,11 @@ const char* read_x86_extension(const std::string &extStrId); template <typename ValueType> inline ValueType leftShift (const ValueType value, const int shift) { return (shift >= 0) ? ( value << shift) : ( value >> -shift); } template <typename ValueType> inline ValueType rightShift (const ValueType value, const int shift) { return (shift >= 0) ? ( value >> shift) : ( value << -shift); } template <typename ValueType> inline ValueType leftShift_round (const ValueType value, const int shift) { return (shift >= 0) ? ( value << shift) : ((value + (ValueType(1) << (-shift - 1))) >> -shift); } +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT +template <typename ValueType> inline ValueType rightShift_round(const ValueType value, const int shift) { return (shift > 0) ? ((value + (ValueType(1) << (shift - 1))) >> shift) : ( value << -shift); } +#else template <typename ValueType> inline ValueType rightShift_round(const ValueType value, const int shift) { return (shift >= 0) ? ((value + (ValueType(1) << (shift - 1))) >> shift) : ( value << -shift); } +#endif static inline int floorLog2(uint32_t x) { diff --git a/source/Lib/CommonLib/ContextModelling.cpp b/source/Lib/CommonLib/ContextModelling.cpp index 56562db08..34d20b05a 100644 --- a/source/Lib/CommonLib/ContextModelling.cpp +++ b/source/Lib/CommonLib/ContextModelling.cpp @@ -69,6 +69,10 @@ CoeffCodingContext::CoeffCodingContext( const TransformUnit& tu, ComponentID com , m_lastShiftX (0) , m_lastShiftY (0) , m_TrafoBypass (tu.cs->sps->getSpsRangeExtension().getTransformSkipContextEnabledFlag() && (tu.mtsIdx[m_compID] == MTS_SKIP)) +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + , m_minCoeff (-(1 << tu.cs->sps->getMaxLog2TrDynamicRange(m_chType))) + , m_maxCoeff ((1 << tu.cs->sps->getMaxLog2TrDynamicRange(m_chType)) - 1) +#endif , m_scanPosLast (-1) , m_subSetId (-1) , m_subSetPos (-1) diff --git a/source/Lib/CommonLib/ContextModelling.h b/source/Lib/CommonLib/ContextModelling.h index b06839a3b..10160254f 100644 --- a/source/Lib/CommonLib/ContextModelling.h +++ b/source/Lib/CommonLib/ContextModelling.h @@ -102,6 +102,11 @@ public: void decimateNumCtxBins(int n) { m_remainingContextBins -= n; } void increaseNumCtxBins(int n) { m_remainingContextBins += n; } +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + TCoeff minCoeff() const { return m_minCoeff; } + TCoeff maxCoeff() const { return m_maxCoeff; } +#endif + unsigned sigCtxIdAbs( int scanPos, const TCoeff* coeff, const int state ) { const uint32_t posY = m_scan[scanPos].y; @@ -241,6 +246,13 @@ public: return m_tsLrg1FlagCtxSet(numPos); } +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + template <typename T> int sgn(T val) + { + return (T(0) < val) - (val < T(0)); + } + +#endif unsigned signCtxIdAbsTS(int scanPos, const TCoeff* coeff, int bdpcm) { const uint32_t posY = m_scan[scanPos].y; @@ -252,11 +264,19 @@ public: if (posX > 0) { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + rightSign = sgn(pData[-1]); +#else rightSign = pData[-1]; +#endif } if (posY > 0) { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + belowSign = sgn(pData[-(int)m_width]); +#else belowSign = pData[-(int)m_width]; +#endif } if ((rightSign == 0 && belowSign == 0) || ((rightSign*belowSign) < 0)) @@ -380,6 +400,10 @@ private: const int m_lastShiftX; const int m_lastShiftY; const bool m_TrafoBypass; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const TCoeff m_minCoeff; + const TCoeff m_maxCoeff; +#endif // modified int m_scanPosLast; int m_subSetId; diff --git a/source/Lib/CommonLib/DepQuant.cpp b/source/Lib/CommonLib/DepQuant.cpp index 2bb92ba14..72f65178e 100644 --- a/source/Lib/CommonLib/DepQuant.cpp +++ b/source/Lib/CommonLib/DepQuant.cpp @@ -1084,7 +1084,11 @@ namespace DQIntern } #undef UPDATE TCoeff sumGt1 = sumAbs1 - sumNum; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + m_sigFracBits = m_sigFracBitsArray[scanInfo.sigCtxOffsetNext + std::min<TCoeff>( (sumAbs1+1)>>1, 3 )]; +#else m_sigFracBits = m_sigFracBitsArray[scanInfo.sigCtxOffsetNext + std::min( (sumAbs1+1)>>1, 3 )]; +#endif m_coeffFracBits = m_gtxFracBitsArray[scanInfo.gtxCtxOffsetNext + (sumGt1 < 4 ? sumGt1 : 4)]; TCoeff sumAbs = m_absLevelsAndCtxInit[8 + scanInfo.nextInsidePos] >> 8; @@ -1198,7 +1202,11 @@ namespace DQIntern TCoeff sumNum = tinit & 7; TCoeff sumAbs1 = ( tinit >> 3 ) & 31; TCoeff sumGt1 = sumAbs1 - sumNum; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + m_sigFracBits = m_sigFracBitsArray[ scanInfo.sigCtxOffsetNext + std::min<TCoeff>( (sumAbs1+1)>>1, 3 ) ]; +#else m_sigFracBits = m_sigFracBitsArray[ scanInfo.sigCtxOffsetNext + std::min( (sumAbs1+1)>>1, 3 ) ]; +#endif m_coeffFracBits = m_gtxFracBitsArray[ scanInfo.gtxCtxOffsetNext + ( sumGt1 < 4 ? sumGt1 : 4 ) ]; } } diff --git a/source/Lib/CommonLib/InterPrediction.cpp b/source/Lib/CommonLib/InterPrediction.cpp index 4adf3ac3c..ce9c6c1d5 100644 --- a/source/Lib/CommonLib/InterPrediction.cpp +++ b/source/Lib/CommonLib/InterPrediction.cpp @@ -779,7 +779,11 @@ void InterPrediction::xPredInterBlk ( const ComponentID& compID, const Predictio JVET_J0090_SET_CACHE_ENABLE((srcPadStride == 0) && (bioApplied == false)); // Enabled only in non-DMVR-non-BDOF process, In DMVR process, srcPadStride is always non-zero if (bioApplied && compID == COMPONENT_Y) { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const int shift = IF_INTERNAL_FRAC_BITS(clpRng.bd); +#else const int shift = std::max<int>(2, (IF_INTERNAL_PREC - clpRng.bd)); +#endif int xOffset = (xFrac < 8) ? 1 : 0; int yOffset = (yFrac < 8) ? 1 : 0; const Pel* refPel = refBuf.buf - yOffset * refBuf.stride - xOffset; @@ -1192,7 +1196,11 @@ void InterPrediction::xPredAffineBlk(const ComponentID &compID, const Prediction } if (enablePROF) { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const int shift = IF_INTERNAL_FRAC_BITS(clpRng.bd); +#else const int shift = std::max<int>(2, (IF_INTERNAL_PREC - clpRng.bd)); +#endif const int xOffset = xFrac >> 3; const int yOffset = yFrac >> 3; @@ -1219,15 +1227,23 @@ void InterPrediction::xPredAffineBlk(const ComponentID &compID, const Prediction PelBuf gradYBuf = gradYExt.subBuf(0, 0, blockWidth + 2, blockHeight + 2); g_pelBufOP.profGradFilter(dstExtBuf.buf, dstExtBuf.stride, blockWidth + 2, blockHeight + 2, gradXBuf.stride, gradXBuf.buf, gradYBuf.buf, clpRng.bd); +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const Pel offset = (1 << (shift - 1)) + IF_INTERNAL_OFFS; +#else const int shiftNum = std::max<int>(2, (IF_INTERNAL_PREC - clpRng.bd)); const Pel offset = (1 << (shiftNum - 1)) + IF_INTERNAL_OFFS; +#endif Pel* src = dstExtBuf.bufAt(PROF_BORDER_EXT_W, PROF_BORDER_EXT_H); Pel* gX = gradXBuf.bufAt(PROF_BORDER_EXT_W, PROF_BORDER_EXT_H); Pel* gY = gradYBuf.bufAt(PROF_BORDER_EXT_W, PROF_BORDER_EXT_H); Pel * dstY = dstBuf.bufAt(w, h); +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + g_pelBufOP.applyPROF(dstY, dstBuf.stride, src, dstExtBuf.stride, blockWidth, blockHeight, gX, gY, gradXBuf.stride, dMvScaleHor, dMvScaleVer, blockWidth, bi, shift, offset, clpRng); +#else g_pelBufOP.applyPROF(dstY, dstBuf.stride, src, dstExtBuf.stride, blockWidth, blockHeight, gX, gY, gradXBuf.stride, dMvScaleHor, dMvScaleVer, blockWidth, bi, shiftNum, offset, clpRng); +#endif } } } @@ -1280,7 +1296,11 @@ void InterPrediction::applyBiOptFlow(const PredictionUnit &pu, const CPelUnitBuf const ClpRng& clpRng = pu.cu->cs->slice->clpRng(COMPONENT_Y); const int bitDepth = clipBitDepths.recon[toChannelType(COMPONENT_Y)]; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const int shiftNum = IF_INTERNAL_FRAC_BITS(bitDepth) + 1; +#else const int shiftNum = IF_INTERNAL_PREC + 1 - bitDepth; +#endif const int offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS; const int limit = ( 1 << 4 ) - 1; diff --git a/source/Lib/CommonLib/InterpolationFilter.cpp b/source/Lib/CommonLib/InterpolationFilter.cpp index ba76ca68f..22148635c 100644 --- a/source/Lib/CommonLib/InterpolationFilter.cpp +++ b/source/Lib/CommonLib/InterpolationFilter.cpp @@ -415,7 +415,11 @@ void InterpolationFilter::filterCopy( const ClpRng& clpRng, const Pel *src, int } else if ( isFirst ) { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const int shift = IF_INTERNAL_FRAC_BITS(clpRng.bd); +#else const int shift = std::max<int>(2, (IF_INTERNAL_PREC - clpRng.bd)); +#endif if (biMCForDMVR) { @@ -464,7 +468,11 @@ void InterpolationFilter::filterCopy( const ClpRng& clpRng, const Pel *src, int } else { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const int shift = IF_INTERNAL_FRAC_BITS(clpRng.bd); +#else const int shift = std::max<int>(2, (IF_INTERNAL_PREC - clpRng.bd)); +#endif if (biMCForDMVR) { @@ -567,7 +575,11 @@ void InterpolationFilter::filter(const ClpRng& clpRng, Pel const *src, int srcSt src -= ( N/2 - 1 ) * cStride; int offset; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + int headRoom = IF_INTERNAL_FRAC_BITS(clpRng.bd); +#else int headRoom = std::max<int>(2, (IF_INTERNAL_PREC - clpRng.bd)); +#endif int shift = IF_FILTER_PREC; // with the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6), though headroom can be // negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20 @@ -899,7 +911,11 @@ void InterpolationFilter::xWeightedGeoBlk(const PredictionUnit &pu, const uint32 const char log2WeightBase = 3; const ClpRng clipRng = pu.cu->slice->clpRngs().comp[compIdx]; const int32_t clipbd = clipRng.bd; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const int32_t shiftWeighted = IF_INTERNAL_FRAC_BITS(clipbd) + log2WeightBase; +#else const int32_t shiftWeighted = std::max<int>(2, (IF_INTERNAL_PREC - clipbd)) + log2WeightBase; +#endif const int32_t offsetWeighted = (1 << (shiftWeighted - 1)) + (IF_INTERNAL_OFFS << log2WeightBase); const uint32_t scaleX = getComponentScaleX(compIdx, pu.chromaFormat); const uint32_t scaleY = getComponentScaleY(compIdx, pu.chromaFormat); diff --git a/source/Lib/CommonLib/InterpolationFilter.h b/source/Lib/CommonLib/InterpolationFilter.h index 58a811c38..bae5ddda6 100644 --- a/source/Lib/CommonLib/InterpolationFilter.h +++ b/source/Lib/CommonLib/InterpolationFilter.h @@ -50,6 +50,9 @@ #define IF_INTERNAL_OFFS (1<<(IF_INTERNAL_PREC-1)) ///< Offset used internally #define IF_INTERNAL_PREC_BILINEAR 10 ///< Number of bits for internal precision #define IF_FILTER_PREC_BILINEAR 4 ///< Bilinear filter coeff precision so that intermediate value will not exceed 16 bit for SIMD - bit exact +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT +#define IF_INTERNAL_FRAC_BITS(bd) std::max(2, IF_INTERNAL_PREC - int(bd)) +#endif /** * \brief Interpolation filter class */ diff --git a/source/Lib/CommonLib/Rom.cpp b/source/Lib/CommonLib/Rom.cpp index 61814be66..d00d1da9e 100644 --- a/source/Lib/CommonLib/Rom.cpp +++ b/source/Lib/CommonLib/Rom.cpp @@ -723,7 +723,11 @@ void initGeoTemplate() if (g_angle2mask[angleIdx] == -1) continue; g_globalGeoWeights[g_angle2mask[angleIdx]] = new int16_t[GEO_WEIGHT_MASK_SIZE * GEO_WEIGHT_MASK_SIZE]; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + g_globalGeoEncSADmask[g_angle2mask[angleIdx]] = new Pel[GEO_WEIGHT_MASK_SIZE * GEO_WEIGHT_MASK_SIZE]; +#else g_globalGeoEncSADmask[g_angle2mask[angleIdx]] = new int16_t[GEO_WEIGHT_MASK_SIZE * GEO_WEIGHT_MASK_SIZE]; +#endif int distanceX = angleIdx; int distanceY = (distanceX + (GEO_NUM_ANGLES >> 2)) % GEO_NUM_ANGLES; @@ -775,7 +779,11 @@ void initGeoTemplate() } int16_t** g_GeoParams; int16_t* g_globalGeoWeights [GEO_NUM_PRESTORED_MASK]; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT +Pel* g_globalGeoEncSADmask[GEO_NUM_PRESTORED_MASK]; +#else int16_t* g_globalGeoEncSADmask[GEO_NUM_PRESTORED_MASK]; +#endif int16_t g_weightOffset [GEO_NUM_PARTITION_MODE][GEO_NUM_CU_SIZE][GEO_NUM_CU_SIZE][2]; int8_t g_angle2mask[GEO_NUM_ANGLES] = { 0, -1, 1, 2, 3, 4, -1, -1, 5, -1, -1, 4, 3, 2, 1, -1, 0, -1, 1, 2, 3, 4, -1, -1, 5, -1, -1, 4, 3, 2, 1, -1 }; int8_t g_Dis[GEO_NUM_ANGLES] = { 8, 8, 8, 8, 4, 4, 2, 1, 0, -1, -2, -4, -4, -8, -8, -8, -8, -8, -8, -8, -4, -4, -2, -1, 0, 1, 2, 4, 4, 8, 8, 8 }; diff --git a/source/Lib/CommonLib/Rom.h b/source/Lib/CommonLib/Rom.h index 54e7a3b1c..42d43aa10 100644 --- a/source/Lib/CommonLib/Rom.h +++ b/source/Lib/CommonLib/Rom.h @@ -77,7 +77,11 @@ extern const int g_quantScales [2/*0=4^n blocks, 1=2*4^n blocks*/][SCALING_LIS extern const int g_invQuantScales[2/*0=4^n blocks, 1=2*4^n blocks*/][SCALING_LIST_REM_NUM]; // IQ(QP%6) static const int g_numTransformMatrixSizes = 6; +#if RExt__HIGH_PRECISION_FORWARD_TRANSFORM +static const int g_transformMatrixShift[TRANSFORM_NUMBER_OF_DIRECTIONS] = { 14, 6 }; +#else static const int g_transformMatrixShift[TRANSFORM_NUMBER_OF_DIRECTIONS] = { 6, 6 }; +#endif // ==================================================================================================================== @@ -224,7 +228,11 @@ const int g_IBCBufferSize = 256 * 128; void initGeoTemplate(); extern int16_t** g_GeoParams; extern int16_t* g_globalGeoWeights [GEO_NUM_PRESTORED_MASK]; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT +extern Pel* g_globalGeoEncSADmask[GEO_NUM_PRESTORED_MASK]; +#else extern int16_t* g_globalGeoEncSADmask[GEO_NUM_PRESTORED_MASK]; +#endif extern int16_t g_weightOffset [GEO_NUM_PARTITION_MODE][GEO_NUM_CU_SIZE][GEO_NUM_CU_SIZE][2]; extern int8_t g_angle2mask [GEO_NUM_ANGLES]; extern int8_t g_Dis[GEO_NUM_ANGLES]; diff --git a/source/Lib/CommonLib/RomTr.cpp b/source/Lib/CommonLib/RomTr.cpp index a60611a63..722df0e3c 100644 --- a/source/Lib/CommonLib/RomTr.cpp +++ b/source/Lib/CommonLib/RomTr.cpp @@ -345,6 +345,90 @@ { b, -d, f, -h, j, -l, n, -p, r, -t, v, -x, z, -B, D, -F, E, -C, A, -y, w, -u, s, -q, o, -m, k, -i, g, -e, c, -a,}, \ } +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT && RExt__HIGH_PRECISION_FORWARD_TRANSFORM +//-------------------------------------------------------------------------------------------------- +// DCT-2 +const TMatrixCoeff g_trCoreDCT2P2[TRANSFORM_NUMBER_OF_DIRECTIONS][2][2] = +{ + DEFINE_DCT2_P2_MATRIX(16384), + DEFINE_DCT2_P2_MATRIX(64) +}; + +const TMatrixCoeff g_trCoreDCT2P4 [TRANSFORM_NUMBER_OF_DIRECTIONS][4][4] = +{ + DEFINE_DCT2_P4_MATRIX (16384, 21266, 9224), + DEFINE_DCT2_P4_MATRIX ( 64, 83, 36) +}; + +const TMatrixCoeff g_trCoreDCT2P8[TRANSFORM_NUMBER_OF_DIRECTIONS][8][8] = +{ + DEFINE_DCT2_P8_MATRIX(16384, 21266, 9224, 22813, 19244, 12769, 4563), + DEFINE_DCT2_P8_MATRIX(64, 83, 36, 89, 75, 50, 18) +}; + +const TMatrixCoeff g_trCoreDCT2P16[TRANSFORM_NUMBER_OF_DIRECTIONS][16][16] = +{ + DEFINE_DCT2_P16_MATRIX(16384, 21266, 9224, 22813, 19244, 12769, 4563, 23120, 22063, 20450, 17972, 14642, 11109, 6446, 2316), + DEFINE_DCT2_P16_MATRIX( 64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9) +}; + +const TMatrixCoeff g_trCoreDCT2P32[TRANSFORM_NUMBER_OF_DIRECTIONS][32][32] = +{ + DEFINE_DCT2_P32_MATRIX(16384, 21266, 9224, 22813, 19244, 12769, 4563, 23120, 22063, 20450, 17972, 14642, 11109, 6446 , 2316, 23106, 22852, 22445, 21848, 20995, 19810, 18601, 17143, 15718, 13853, 11749, 9846, 7908, 5573, 3281, 946), + DEFINE_DCT2_P32_MATRIX( 64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9, 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4) +}; + +const TMatrixCoeff g_trCoreDCT2P64[TRANSFORM_NUMBER_OF_DIRECTIONS][64][64] = +{ + DEFINE_DCT2_P64_MATRIX(16384, 21266, 9224, 22813, 19244, 12769, 4563, 23129, 22063, 20450, 17972, 14642, 11109, 6446, 2316, 23106, 22852, 22445, 21848, 20995, 19810, 18601, 17143, 15718, 13853, 11749, 9846, 7908, 5573, 3281, 946, 23360, 23053, 23048, 23023, 22610, 22339, 21936, 21502, 21266, 20730, 20251, 19726, 18731, 18201, 17638, 16604, 15881, 15084, 14322, 13340, 12238, 11330, 10493, 9428, 8426, 7100, 6151, 5101, 3848, 2734, 1754, 574), + DEFINE_DCT2_P64_MATRIX( 64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9, 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, 91, 90, 90, 90, 88, 87, 86, 84, 83, 81, 79, 77, 73, 71, 69, 65, 62, 59, 56, 52, 48, 44, 41, 37, 33, 28, 24, 20, 15, 11, 7, 2) +}; + +// DCT-8 +const TMatrixCoeff g_trCoreDCT8P4[TRANSFORM_NUMBER_OF_DIRECTIONS][4][4] = +{ + DEFINE_DCT8_P4_MATRIX(21505, 18893, 14081, 7425), + DEFINE_DCT8_P4_MATRIX(84, 74, 55, 29) +}; +const TMatrixCoeff g_trCoreDCT8P8[TRANSFORM_NUMBER_OF_DIRECTIONS][8][8] = +{ + DEFINE_DCT8_P8_MATRIX(22018, 21790, 19958, 18154, 15363, 11754, 8148, 4350), + DEFINE_DCT8_P8_MATRIX( 86, 85, 78, 71, 60, 46, 32, 17) +}; +const TMatrixCoeff g_trCoreDCT8P16[TRANSFORM_NUMBER_OF_DIRECTIONS][16][16] = +{ + DEFINE_DCT8_P16_MATRIX(22569, 22542, 22202, 21664, 20754, 19738, 18787, 17369, 15781, 14044, 12209, 10360, 8498, 6421, 4295, 1967), + DEFINE_DCT8_P16_MATRIX( 88, 88, 87, 85, 81, 77, 73, 68, 62, 55, 48, 40, 33, 25, 17, 8) +}; +const TMatrixCoeff g_trCoreDCT8P32[TRANSFORM_NUMBER_OF_DIRECTIONS][32][32] = +{ + DEFINE_DCT8_P32_MATRIX(23065, 23136, 22715, 22533, 22544, 22053, 21901, 21463, 21131, 20385, 20019, 19708, 19007, 18415, 17448, 16894, 16143, 15230, 14312, 13616, 12679, 11526, 10770, 9720, 8606, 7734, 6623, 5414, 4478, 3225, 2291, 1043), + DEFINE_DCT8_P32_MATRIX( 90, 90, 89, 88, 87, 86, 85, 84, 82, 80, 78, 77, 74, 72, 68, 66, 63, 60, 56, 53, 50, 46, 42, 38, 34, 30, 26, 21, 17, 13, 9, 4) +}; + +// DST-7 +const TMatrixCoeff g_trCoreDST7P4[TRANSFORM_NUMBER_OF_DIRECTIONS][4][4] = +{ + DEFINE_DST7_P4_MATRIX( 7425, 14081, 18893, 21505), + DEFINE_DST7_P4_MATRIX( 29, 55, 74, 84) +}; +const TMatrixCoeff g_trCoreDST7P8[TRANSFORM_NUMBER_OF_DIRECTIONS][8][8] = +{ + DEFINE_DST7_P8_MATRIX( 4350, 8148, 11754, 15363, 18154, 19958, 21790, 22018), + DEFINE_DST7_P8_MATRIX( 17, 32, 46, 60, 71, 78, 85, 86) +}; +const TMatrixCoeff g_trCoreDST7P16[TRANSFORM_NUMBER_OF_DIRECTIONS][16][16] = +{ + DEFINE_DST7_P16_MATRIX(1967, 4295, 6421, 8498, 10360, 12209, 14044, 15781, 17369, 18787, 19738, 20754, 21664, 22202, 22542, 22569), + DEFINE_DST7_P16_MATRIX( 8, 17, 25, 33, 40, 48, 55, 62, 68, 73, 77, 81, 85, 87, 88, 88) +}; +const TMatrixCoeff g_trCoreDST7P32[TRANSFORM_NUMBER_OF_DIRECTIONS][32][32] = +{ + DEFINE_DST7_P32_MATRIX( 1043, 2291, 3225, 4478, 5414, 6623, 7734, 8606, 9720, 10770, 11526, 12679, 13616, 14312, 15230, 16143, 16894, 17448, 18415, 19007, 19708, 20019, 20385, 21131, 21463, 21901, 22053, 22544, 22533, 22715, 23136, 23065), + DEFINE_DST7_P32_MATRIX( 4, 9, 13, 17, 21, 26, 30, 34, 38, 42, 46, 50, 53, 56, 60, 63, 66, 68, 72, 74, 77, 78, 80, 82, 84, 85, 86, 87, 88, 89, 90, 90) +}; + +#else //-------------------------------------------------------------------------------------------------- // DCT-2 const TMatrixCoeff g_trCoreDCT2P2[TRANSFORM_NUMBER_OF_DIRECTIONS][2][2] = @@ -426,5 +510,5 @@ const TMatrixCoeff g_trCoreDST7P32[TRANSFORM_NUMBER_OF_DIRECTIONS][32][32] = DEFINE_DST7_P32_MATRIX(4, 9, 13, 17, 21, 26, 30, 34, 38, 42, 46, 50, 53, 56, 60, 63, 66, 68, 72, 74, 77, 78, 80, 82, 84, 85, 86, 87, 88, 89, 90, 90), DEFINE_DST7_P32_MATRIX(4, 9, 13, 17, 21, 26, 30, 34, 38, 42, 46, 50, 53, 56, 60, 63, 66, 68, 72, 74, 77, 78, 80, 82, 84, 85, 86, 87, 88, 89, 90, 90) }; - +#endif //-------------------------------------------------------------------------------------------------- diff --git a/source/Lib/CommonLib/TrQuant.cpp b/source/Lib/CommonLib/TrQuant.cpp index 7532aea6f..69908d9af 100644 --- a/source/Lib/CommonLib/TrQuant.cpp +++ b/source/Lib/CommonLib/TrQuant.cpp @@ -229,18 +229,30 @@ void TrQuant::init( const Quant* otherQuant, } } +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT +void TrQuant::fwdLfnstNxN( TCoeff* src, TCoeff* dst, const uint32_t mode, const uint32_t index, const uint32_t size, int zeroOutSize ) +#else void TrQuant::fwdLfnstNxN( int* src, int* dst, const uint32_t mode, const uint32_t index, const uint32_t size, int zeroOutSize ) +#endif { const int8_t* trMat = ( size > 4 ) ? g_lfnst8x8[ mode ][ index ][ 0 ] : g_lfnst4x4[ mode ][ index ][ 0 ]; const int trSize = ( size > 4 ) ? 48 : 16; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + TCoeff coef; + TCoeff* out = dst; +#else int coef; int* out = dst; - +#endif assert( index < 3 ); for( int j = 0; j < zeroOutSize; j++ ) { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + TCoeff* srcPtr = src; +#else int* srcPtr = src; +#endif const int8_t* trMatTmp = trMat; coef = 0; for( int i = 0; i < trSize; i++ ) @@ -254,29 +266,46 @@ void TrQuant::fwdLfnstNxN( int* src, int* dst, const uint32_t mode, const uint32 ::memset( out, 0, ( trSize - zeroOutSize ) * sizeof( int ) ); } +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT +void TrQuant::invLfnstNxN( TCoeff* src, TCoeff* dst, const uint32_t mode, const uint32_t index, const uint32_t size, int zeroOutSize, const int maxLog2TrDynamicRange ) +{ +#else void TrQuant::invLfnstNxN( int* src, int* dst, const uint32_t mode, const uint32_t index, const uint32_t size, int zeroOutSize ) { int maxLog2TrDynamicRange = 15; +#endif const TCoeff outputMinimum = -( 1 << maxLog2TrDynamicRange ); const TCoeff outputMaximum = ( 1 << maxLog2TrDynamicRange ) - 1; const int8_t* trMat = ( size > 4 ) ? g_lfnst8x8[ mode ][ index ][ 0 ] : g_lfnst4x4[ mode ][ index ][ 0 ]; const int trSize = ( size > 4 ) ? 48 : 16; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + TCoeff resi; + TCoeff* out = dst; +#else int resi; int* out = dst; - +#endif assert( index < 3 ); for( int j = 0; j < trSize; j++ ) { resi = 0; const int8_t* trMatTmp = trMat; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + TCoeff* srcPtr = src; +#else int* srcPtr = src; +#endif for( int i = 0; i < zeroOutSize; i++ ) { resi += *srcPtr++ * *trMatTmp; trMatTmp += trSize; } +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + *out++ = Clip3<TCoeff>( outputMinimum, outputMaximum, ( resi + 64 ) >> 7 ); +#else *out++ = Clip3( outputMinimum, outputMaximum, ( int ) ( resi + 64 ) >> 7 ); +#endif trMat++; } } @@ -309,6 +338,9 @@ bool TrQuant::getTransposeFlag( uint32_t intraMode ) void TrQuant::xInvLfnst( const TransformUnit &tu, const ComponentID compID ) { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const int maxLog2TrDynamicRange = tu.cs->sps->getMaxLog2TrDynamicRange(toChannelType(compID)); +#endif const CompArea& area = tu.blocks[ compID ]; const uint32_t width = area.width; const uint32_t height = area.height; @@ -352,8 +384,11 @@ void TrQuant::xInvLfnst( const TransformUnit &tu, const ComponentID compID ) scanPtr++; } +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + invLfnstNxN( m_tempInMatrix, m_tempOutMatrix, g_lfnstLut[ intraMode ], lfnstIdx - 1, sbSize, ( tu4x4Flag || tu8x8Flag ) ? 8 : 16, maxLog2TrDynamicRange ); +#else invLfnstNxN( m_tempInMatrix, m_tempOutMatrix, g_lfnstLut[ intraMode ], lfnstIdx - 1, sbSize, ( tu4x4Flag || tu8x8Flag ) ? 8 : 16 ); - +#endif lfnstTemp = m_tempOutMatrix; // inverse spectral rearrangement if( transposeFlag ) diff --git a/source/Lib/CommonLib/TrQuant.h b/source/Lib/CommonLib/TrQuant.h index 50f893da8..619f743f9 100644 --- a/source/Lib/CommonLib/TrQuant.h +++ b/source/Lib/CommonLib/TrQuant.h @@ -79,8 +79,13 @@ public: ); void getTrTypes(const TransformUnit tu, const ComponentID compID, int &trTypeHor, int &trTypeVer); +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + void fwdLfnstNxN( TCoeff* src, TCoeff* dst, const uint32_t mode, const uint32_t index, const uint32_t size, int zeroOutSize ); + void invLfnstNxN( TCoeff* src, TCoeff* dst, const uint32_t mode, const uint32_t index, const uint32_t size, int zeroOutSize, const int maxLog2TrDynamicRange ); +#else void fwdLfnstNxN( int* src, int* dst, const uint32_t mode, const uint32_t index, const uint32_t size, int zeroOutSize ); void invLfnstNxN( int* src, int* dst, const uint32_t mode, const uint32_t index, const uint32_t size, int zeroOutSize ); +#endif uint32_t getLFNSTIntraMode( int wideAngPredMode ); bool getTransposeFlag ( uint32_t intraMode ); diff --git a/source/Lib/CommonLib/TrQuant_EMT.cpp b/source/Lib/CommonLib/TrQuant_EMT.cpp index b21ede257..82e34f176 100644 --- a/source/Lib/CommonLib/TrQuant_EMT.cpp +++ b/source/Lib/CommonLib/TrQuant_EMT.cpp @@ -51,7 +51,11 @@ void fastForwardDCT2_B2(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2) { int j; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + TCoeff E, O; +#else int E, O; +#endif TCoeff add = (shift > 0) ? (1 << (shift - 1)) : 0; const TMatrixCoeff *iT = g_trCoreDCT2P2[TRANSFORM_FORWARD][0]; @@ -85,8 +89,13 @@ void fastForwardDCT2_B2(const TCoeff *src, TCoeff *dst, int shift, int line, int void fastInverseDCT2_B2(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum) { int j; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + TCoeff E, O; + TCoeff add = 1 << (shift - 1); +#else int E, O; int add = 1 << (shift - 1); +#endif const TMatrixCoeff *iT = g_trCoreDCT2P2[TRANSFORM_INVERSE][0]; @@ -98,8 +107,13 @@ void fastInverseDCT2_B2(const TCoeff *src, TCoeff *dst, int shift, int line, int O = iT[2] * (src[0] - src[line]); /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */ +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + dst[0] = Clip3<TCoeff>(outputMinimum, outputMaximum, (E + add) >> shift); + dst[1] = Clip3<TCoeff>(outputMinimum, outputMaximum, (O + add) >> shift); +#else dst[0] = Clip3(outputMinimum, outputMaximum, (E + add) >> shift); dst[1] = Clip3(outputMinimum, outputMaximum, (O + add) >> shift); +#endif src++; dst += 2; @@ -176,8 +190,13 @@ void fastForwardDCT2_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int void fastInverseDCT2_B4( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum ) { int j; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + TCoeff E[2], O[2]; + TCoeff add = 1 << ( shift - 1 ); +#else int E[2], O[2]; int add = 1 << ( shift - 1 ); +#endif const TMatrixCoeff *iT = g_trCoreDCT2P4[TRANSFORM_INVERSE][0]; @@ -191,10 +210,17 @@ void fastInverseDCT2_B4( const TCoeff *src, TCoeff *dst, int shift, int line, in E[1] = iT[0 * 4 + 1] * src[ 0] + iT[2 * 4 + 1] * src[2 * line]; /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */ +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + dst[0] = Clip3<TCoeff>( outputMinimum, outputMaximum, ( E[0] + O[0] + add ) >> shift ); + dst[1] = Clip3<TCoeff>( outputMinimum, outputMaximum, ( E[1] + O[1] + add ) >> shift ); + dst[2] = Clip3<TCoeff>( outputMinimum, outputMaximum, ( E[1] - O[1] + add ) >> shift ); + dst[3] = Clip3<TCoeff>( outputMinimum, outputMaximum, ( E[0] - O[0] + add ) >> shift ); +#else dst[0] = Clip3( outputMinimum, outputMaximum, ( E[0] + O[0] + add ) >> shift ); dst[1] = Clip3( outputMinimum, outputMaximum, ( E[1] + O[1] + add ) >> shift ); dst[2] = Clip3( outputMinimum, outputMaximum, ( E[1] - O[1] + add ) >> shift ); dst[3] = Clip3( outputMinimum, outputMaximum, ( E[0] - O[0] + add ) >> shift ); +#endif src++; dst += 4; @@ -209,7 +235,11 @@ void fastInverseDCT2_B4( const TCoeff *src, TCoeff *dst, int shift, int line, in template< int uiTrSize > inline void _fastInverseMM( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum, const TMatrixCoeff* iT ) { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const TCoeff rnd_factor = 1 << (shift - 1); +#else const int rnd_factor = 1 << (shift - 1); +#endif const int reducedLine = line - iSkipLine; const int cutoff = uiTrSize - iSkipLine2; @@ -217,12 +247,20 @@ inline void _fastInverseMM( const TCoeff *src, TCoeff *dst, int shift, int line, { for( int j = 0; j<uiTrSize; j++ ) { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + TCoeff iSum = 0; +#else int iSum = 0; +#endif for( int k = 0; k<cutoff; k++) { iSum += src[k*line + i] * iT[k*uiTrSize + j]; } +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + dst[i*uiTrSize + j] = Clip3<TCoeff>(outputMinimum, outputMaximum, (iSum + rnd_factor) >> shift); +#else dst[i*uiTrSize + j] = Clip3(outputMinimum, outputMaximum, (int)(iSum + rnd_factor) >> shift); +#endif } } @@ -236,7 +274,11 @@ inline void _fastInverseMM( const TCoeff *src, TCoeff *dst, int shift, int line, template< int uiTrSize > inline void _fastForwardMM( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TMatrixCoeff* tc ) { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const TCoeff rnd_factor = 1 << (shift - 1); +#else const int rnd_factor = 1 << (shift - 1); +#endif const int reducedLine = line - iSkipLine; const int cutoff = uiTrSize - iSkipLine2; TCoeff *pCoef; @@ -247,7 +289,11 @@ inline void _fastForwardMM( const TCoeff *src, TCoeff *dst, int shift, int line, const TMatrixCoeff* iT = tc; for( int j = 0; j<cutoff; j++ ) { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + TCoeff iSum = 0; +#else int iSum = 0; +#endif for( int k = 0; k<uiTrSize; k++ ) { iSum += src[k] * iT[k]; @@ -344,9 +390,15 @@ void fastForwardDCT2_B8( const TCoeff *src, TCoeff *dst, int shift, int line, in void fastInverseDCT2_B8(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum) { int j, k; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + TCoeff E[4], O[4]; + TCoeff EE[2], EO[2]; + TCoeff add = 1 << (shift - 1); +#else int E[4], O[4]; int EE[2], EO[2]; int add = 1 << (shift - 1); +#endif const TMatrixCoeff *iT = g_trCoreDCT2P8[TRANSFORM_INVERSE][0]; @@ -372,8 +424,13 @@ void fastInverseDCT2_B8(const TCoeff *src, TCoeff *dst, int shift, int line, int for( k = 0; k < 4; k++ ) { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + dst[k ] = Clip3<TCoeff>( outputMinimum, outputMaximum, ( E[ k] + O[ k] + add ) >> shift ); + dst[k + 4] = Clip3<TCoeff>( outputMinimum, outputMaximum, ( E[3 - k] - O[3 - k] + add ) >> shift ); +#else dst[k ] = Clip3( outputMinimum, outputMaximum, ( E[ k] + O[ k] + add ) >> shift ); dst[k + 4] = Clip3( outputMinimum, outputMaximum, ( E[3 - k] - O[3 - k] + add ) >> shift ); +#endif } src++; dst += 8; @@ -465,10 +522,17 @@ void fastForwardDCT2_B16(const TCoeff *src, TCoeff *dst, int shift, int line, in void fastInverseDCT2_B16( const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum ) { int j, k; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + TCoeff E [8], O [8]; + TCoeff EE [4], EO [4]; + TCoeff EEE[2], EEO[2]; + TCoeff add = 1 << ( shift - 1 ); +#else int E [8], O [8]; int EE [4], EO [4]; int EEE[2], EEO[2]; int add = 1 << ( shift - 1 ); +#endif const TMatrixCoeff *iT = g_trCoreDCT2P16[TRANSFORM_INVERSE][0]; @@ -504,8 +568,13 @@ void fastInverseDCT2_B16( const TCoeff *src, TCoeff *dst, int shift, int line, i } for( k = 0; k < 8; k++ ) { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + dst[k ] = Clip3<TCoeff>( outputMinimum, outputMaximum, ( E[ k] + O[ k] + add ) >> shift ); + dst[k + 8] = Clip3<TCoeff>( outputMinimum, outputMaximum, ( E[7 - k] - O[7 - k] + add ) >> shift ); +#else dst[k ] = Clip3( outputMinimum, outputMaximum, ( E[ k] + O[ k] + add ) >> shift ); dst[k + 8] = Clip3( outputMinimum, outputMaximum, ( E[7 - k] - O[7 - k] + add ) >> shift ); +#endif } src++; dst += 16; @@ -607,13 +676,21 @@ void fastForwardDCT2_B32( const TCoeff *src, TCoeff *dst, int shift, int line, i */ void fastInverseDCT2_B32(const TCoeff *src, TCoeff *dst, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum) { - +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + TCoeff j, k; + TCoeff E[16], O[16]; + TCoeff EE[8], EO[8]; + TCoeff EEE[4], EEO[4]; + TCoeff EEEE[2], EEEO[2]; + TCoeff add = 1 << (shift - 1); +#else int j, k; int E[16], O[16]; int EE[8], EO[8]; int EEE[4], EEO[4]; int EEEE[2], EEEO[2]; int add = 1 << (shift - 1); +#endif const TMatrixCoeff *iT = g_trCoreDCT2P32[TRANSFORM_INVERSE][0]; @@ -659,8 +736,13 @@ void fastInverseDCT2_B32(const TCoeff *src, TCoeff *dst, int shift, int line, in } for (k = 0;k<16;k++) { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + dst[k] = Clip3<TCoeff>(outputMinimum, outputMaximum, (E[k] + O[k] + add) >> shift); + dst[k + 16] = Clip3<TCoeff>(outputMinimum, outputMaximum, (E[15 - k] - O[15 - k] + add) >> shift); +#else dst[k] = Clip3(outputMinimum, outputMaximum, (E[k] + O[k] + add) >> shift); dst[k + 16] = Clip3(outputMinimum, outputMaximum, (E[15 - k] - O[15 - k] + add) >> shift); +#endif } src++; dst += 32; @@ -851,8 +933,13 @@ void fastInverseDCT2_B64(const TCoeff *src, TCoeff *dst, int shift, int line, in } for (k = 0;k<32;k++) { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + dst[k] = Clip3<TCoeff>(outputMinimum, outputMaximum, (E[k] + O[k] + rnd_factor) >> shift); + dst[k + 32] = Clip3<TCoeff>(outputMinimum, outputMaximum, (E[31 - k] - O[31 - k] + rnd_factor) >> shift); +#else dst[k] = Clip3(outputMinimum, outputMaximum, (E[k] + O[k] + rnd_factor) >> shift); dst[k + 32] = Clip3(outputMinimum, outputMaximum, (E[31 - k] - O[31 - k] + rnd_factor) >> shift); +#endif } src++; dst += uiTrSize; @@ -871,7 +958,11 @@ void fastForwardDST7_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int const TMatrixCoeff *iT = g_trCoreDST7P4[TRANSFORM_FORWARD][0]; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + TCoeff c[4]; +#else int c[4]; +#endif TCoeff *pCoeff = dst; const int reducedLine = line - iSkipLine; for (i = 0; i<reducedLine; i++) @@ -918,10 +1009,17 @@ void fastInverseDST7_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int c[2] = src[0 * line] - src[3 * line]; c[3] = iT[2] * src[1 * line]; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + dst[0] = Clip3<TCoeff>(outputMinimum, outputMaximum, (iT[0] * c[0] + iT[1] * c[1] + c[3] + rnd_factor) >> shift); + dst[1] = Clip3<TCoeff>(outputMinimum, outputMaximum, (iT[1] * c[2] - iT[0] * c[1] + c[3] + rnd_factor) >> shift); + dst[2] = Clip3<TCoeff>(outputMinimum, outputMaximum, (iT[2] * (src[0 * line] - src[2 * line] + src[3 * line]) + rnd_factor) >> shift); + dst[3] = Clip3<TCoeff>(outputMinimum, outputMaximum, (iT[1] * c[0] + iT[0] * c[2] - c[3] + rnd_factor) >> shift); +#else dst[0] = Clip3(outputMinimum, outputMaximum, (iT[0] * c[0] + iT[1] * c[1] + c[3] + rnd_factor) >> shift); dst[1] = Clip3(outputMinimum, outputMaximum, (iT[1] * c[2] - iT[0] * c[1] + c[3] + rnd_factor) >> shift); dst[2] = Clip3(outputMinimum, outputMaximum, (iT[2] * (src[0 * line] - src[2 * line] + src[3 * line]) + rnd_factor) >> shift); dst[3] = Clip3(outputMinimum, outputMaximum, (iT[1] * c[0] + iT[0] * c[2] - c[3] + rnd_factor) >> shift); +#endif dst += 4; src++; @@ -1037,6 +1135,28 @@ void fastInverseDST7_B16(const TCoeff *src, TCoeff *dst, int shift, int line, in t = iT[10] * src[5 * line]; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + dst[ 2] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[ 2]*d[0] + iT[ 8]*d[1] + iT[14]*d[2] + iT[11]*d[3] + iT[ 5]*d[4] + add ) >> shift); + dst[ 5] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[ 5]*d[0] + iT[14]*d[1] + iT[ 2]*d[2] - iT[ 8]*d[3] - iT[11]*d[4] + add ) >> shift); + dst[ 8] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[ 8]*d[0] + iT[ 5]*d[1] - iT[11]*d[2] - iT[ 2]*d[3] + iT[14]*d[4] + add ) >> shift); + dst[11] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[11]*d[0] - iT[ 2]*d[1] - iT[ 5]*d[2] + iT[14]*d[3] - iT[ 8]*d[4] + add ) >> shift); + dst[14] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[14]*d[0] - iT[11]*d[1] + iT[ 8]*d[2] - iT[ 5]*d[3] + iT[ 2]*d[4] + add ) >> shift); + + dst[10] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[10]*(src[ 0*line]-src[ 2*line]+src[ 3*line]-src[5*line] + +src[ 6*line]-src[ 8*line]+src[ 9*line]-src[11*line] + +src[12*line]-src[14*line]+src[15*line]) + add ) >> shift); + + dst[ 0] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0]*a[0] + iT[9]*b[0] + iT[2]*a[1] + iT[7]*b[1] + iT[4]*a[2] + iT[5]*b[2] + iT[6]*a[3] + iT[3]*b[3] + iT[8]*a[4] + iT[1]*b[4] + t + add ) >> shift); + dst[ 1] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[1]*c[0] - iT[8]*b[0] + iT[5]*c[1] - iT[4]*b[1] + iT[9]*c[2] - iT[0]*b[2] + iT[2]*a[3] + iT[7]*c[3] + iT[6]*a[4] + iT[3]*c[4] + t + add ) >> shift); + dst[ 3] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[3]*a[0] + iT[6]*b[0] + iT[0]*c[1] + iT[9]*a[1] + iT[1]*a[2] + iT[8]*c[2] + iT[4]*c[3] - iT[5]*b[3] - iT[2]*a[4] - iT[7]*b[4] - t + add ) >> shift); + dst[ 4] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[4]*c[0] - iT[5]*b[0] + iT[6]*c[1] + iT[3]*a[1] + iT[7]*a[2] + iT[2]*b[2] - iT[1]*c[3] + iT[8]*b[3] - iT[9]*c[4] - iT[0]*a[4] - t + add ) >> shift); + dst[ 6] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[6]*a[0] + iT[3]*b[0] + iT[9]*c[1] + iT[0]*a[1] - iT[1]*a[2] - iT[8]*b[2] - iT[4]*c[3] - iT[5]*a[3] - iT[2]*c[4] + iT[7]*b[4] + t + add ) >> shift); + dst[ 7] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[7]*c[0] - iT[2]*b[0] + iT[8]*a[1] + iT[1]*b[1] - iT[6]*c[2] + iT[3]*b[2] - iT[9]*a[3] - iT[0]*b[3] + iT[5]*c[4] - iT[4]*b[4] + t + add ) >> shift); + dst[ 9] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[9]*a[0] + iT[0]*b[0] + iT[2]*c[1] - iT[7]*b[1] - iT[5]*c[2] - iT[4]*a[2] + iT[3]*a[3] + iT[6]*b[3] + iT[8]*c[4] - iT[1]*b[4] - t + add ) >> shift); + dst[12] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[1]*c[0] + iT[8]*a[0] - iT[5]*a[1] - iT[4]*b[1] - iT[0]*c[2] + iT[9]*b[2] + iT[7]*c[3] - iT[2]*b[3] - iT[6]*c[4] - iT[3]*a[4] + t + add ) >> shift); + dst[13] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[7]*c[0] + iT[2]*a[0] - iT[8]*c[1] + iT[1]*b[1] + iT[3]*c[2] - iT[6]*b[2] + iT[0]*a[3] + iT[9]*b[3] - iT[5]*a[4] - iT[4]*b[4] + t + add ) >> shift); + dst[15] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[4]*c[0] + iT[5]*a[0] - iT[3]*c[1] - iT[6]*a[1] + iT[2]*c[2] + iT[7]*a[2] - iT[1]*c[3] - iT[8]*a[3] + iT[0]*c[4] + iT[9]*a[4] - t + add ) >> shift); +#else dst[ 2] = Clip3(outputMinimum, outputMaximum, (int)( iT[ 2]*d[0] + iT[ 8]*d[1] + iT[14]*d[2] + iT[11]*d[3] + iT[ 5]*d[4] + add ) >> shift); dst[ 5] = Clip3(outputMinimum, outputMaximum, (int)( iT[ 5]*d[0] + iT[14]*d[1] + iT[ 2]*d[2] - iT[ 8]*d[3] - iT[11]*d[4] + add ) >> shift); dst[ 8] = Clip3(outputMinimum, outputMaximum, (int)( iT[ 8]*d[0] + iT[ 5]*d[1] - iT[11]*d[2] - iT[ 2]*d[3] + iT[14]*d[4] + add ) >> shift); @@ -1057,7 +1177,7 @@ void fastInverseDST7_B16(const TCoeff *src, TCoeff *dst, int shift, int line, in dst[12] = Clip3(outputMinimum, outputMaximum, (int)( iT[1]*c[0] + iT[8]*a[0] - iT[5]*a[1] - iT[4]*b[1] - iT[0]*c[2] + iT[9]*b[2] + iT[7]*c[3] - iT[2]*b[3] - iT[6]*c[4] - iT[3]*a[4] + t + add ) >> shift); dst[13] = Clip3(outputMinimum, outputMaximum, (int)( iT[7]*c[0] + iT[2]*a[0] - iT[8]*c[1] + iT[1]*b[1] + iT[3]*c[2] - iT[6]*b[2] + iT[0]*a[3] + iT[9]*b[3] - iT[5]*a[4] - iT[4]*b[4] + t + add ) >> shift); dst[15] = Clip3(outputMinimum, outputMaximum, (int)( iT[4]*c[0] + iT[5]*a[0] - iT[3]*c[1] - iT[6]*a[1] + iT[2]*c[2] + iT[7]*a[2] - iT[1]*c[3] - iT[8]*a[3] + iT[0]*c[4] + iT[9]*a[4] - t + add ) >> shift); - +#endif src++; dst += 16; } @@ -1209,6 +1329,42 @@ void fastInverseDST7_B32(const TCoeff *src, TCoeff *dst, int shift, int line, in t[0] = iT[12] * src[6*line] + iT[25] * src[19*line]; t[1] = iT[25] * src[6*line] - iT[12] * src[19*line]; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + dst[ 0] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[1][0] - iT[11] * a[8][0] + iT[13] * a[7][0] + iT[24] * a[4][5] - iT[1] * a[8][5] + iT[10] * a[1][5] + iT[14] * a[4][0] + iT[23] * a[7][5] + iT[2] * a[1][1] - iT[9] * a[8][1] + iT[15] * a[7][1] + iT[22] * a[4][4] - iT[3] * a[8][4] + iT[8] * a[1][4] + iT[16] * a[4][1] + iT[21] * a[7][4] + iT[4] * a[1][2] - iT[7] * a[8][2] + iT[17] * a[7][2] + iT[20] * a[4][3] - iT[5] * a[8][3] + iT[6] * a[1][3] + iT[18] * a[4][2] + iT[19] * a[7][3] + t[0] + add) >> shift); + dst[ 1] = Clip3<TCoeff>(outputMinimum, outputMaximum, (-iT[0] * a[4][2] - iT[11] * a[6][2] + iT[13] * a[0][3] + iT[24] * a[5][2] + iT[1] * a[2][0] + iT[10] * a[7][0] + iT[14] * a[5][5] - iT[23] * a[9][5] + iT[2] * a[7][2] + iT[9] * a[2][2] - iT[15] * a[9][3] + iT[22] * a[5][3] - iT[3] * a[6][0] - iT[8] * a[4][0] + iT[16] * a[5][0] + iT[21] * a[0][5] - iT[4] * a[4][1] - iT[7] * a[6][1] + iT[17] * a[0][4] + iT[20] * a[5][1] + iT[5] * a[2][1] + iT[6] * a[7][1] + iT[18] * a[5][4] - iT[19] * a[9][4] + t[1] + add) >> shift); + dst[ 2] = Clip3<TCoeff>(outputMinimum, outputMaximum, (-iT[0] * a[2][4] - iT[11] * a[3][4] + iT[13] * a[0][4] + iT[24] * a[1][4] + iT[1] * a[4][3] + iT[10] * a[7][2] + iT[14] * a[1][2] - iT[23] * a[8][2] + iT[2] * a[3][0] - iT[9] * a[6][5] - iT[15] * a[8][0] + iT[22] * a[9][5] - iT[3] * a[6][4] + iT[8] * a[3][1] + iT[16] * a[9][4] - iT[21] * a[8][1] + iT[4] * a[7][3] + iT[7] * a[4][2] - iT[17] * a[8][3] + iT[20] * a[1][3] - iT[5] * a[3][5] - iT[6] * a[2][5] + iT[18] * a[1][5] + iT[19] * a[0][5] + t[1] + add) >> shift); + dst[ 3] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[5][4] + iT[11] * a[0][1] - iT[13] * a[4][4] - iT[24] * a[6][4] - iT[1] * a[1][3] - iT[10] * a[0][3] + iT[14] * a[2][3] + iT[23] * a[3][3] - iT[2] * a[0][4] - iT[9] * a[1][4] + iT[15] * a[3][4] + iT[22] * a[2][4] + iT[3] * a[0][0] + iT[8] * a[5][5] - iT[16] * a[6][5] - iT[21] * a[4][5] + iT[4] * a[5][0] - iT[7] * a[9][0] + iT[17] * a[7][5] + iT[20] * a[2][5] - iT[5] * a[8][2] + iT[6] * a[9][3] - iT[18] * a[6][3] + iT[19] * a[3][2] + t[0] + add) >> shift); + dst[ 5] = Clip3<TCoeff>(outputMinimum, outputMaximum, (-iT[0] * a[1][5] + iT[11] * a[8][5] - iT[13] * a[7][5] - iT[24] * a[4][0] + iT[1] * a[5][1] + iT[10] * a[0][4] - iT[14] * a[4][1] - iT[23] * a[6][1] - iT[2] * a[8][3] + iT[9] * a[9][2] - iT[15] * a[6][2] + iT[22] * a[3][3] - iT[3] * a[0][2] - iT[8] * a[1][2] + iT[16] * a[3][2] + iT[21] * a[2][2] - iT[4] * a[9][4] + iT[7] * a[5][4] + iT[17] * a[2][1] + iT[20] * a[7][1] + iT[5] * a[1][0] - iT[6] * a[8][0] + iT[18] * a[7][0] + iT[19] * a[4][5] - t[0] + add) >> shift); + dst[ 6] = Clip3<TCoeff>(outputMinimum, outputMaximum, (-iT[0] * a[7][5] - iT[11] * a[2][5] + iT[13] * a[9][0] - iT[24] * a[5][0] + iT[1] * a[3][4] - iT[10] * a[6][1] - iT[14] * a[8][4] + iT[23] * a[9][1] + iT[2] * a[4][2] + iT[9] * a[7][3] + iT[15] * a[1][3] - iT[22] * a[8][3] - iT[3] * a[2][2] - iT[8] * a[3][2] + iT[16] * a[0][2] + iT[21] * a[1][2] - iT[4] * a[6][4] - iT[7] * a[4][4] + iT[17] * a[5][4] + iT[20] * a[0][1] + iT[5] * a[7][0] + iT[6] * a[2][0] - iT[18] * a[9][5] + iT[19] * a[5][5] - t[1] + add) >> shift); + dst[ 7] = Clip3<TCoeff>(outputMinimum, outputMaximum, (-iT[0] * a[6][3] - iT[11] * a[4][3] + iT[13] * a[5][3] + iT[24] * a[0][2] + iT[1] * a[7][1] + iT[10] * a[4][4] - iT[14] * a[8][1] + iT[23] * a[1][1] - iT[2] * a[7][5] - iT[9] * a[4][0] + iT[15] * a[8][5] - iT[22] * a[1][5] + iT[3] * a[7][3] + iT[8] * a[2][3] - iT[16] * a[9][2] + iT[21] * a[5][2] - iT[4] * a[6][5] + iT[7] * a[3][0] + iT[17] * a[9][5] - iT[20] * a[8][0] + iT[5] * a[6][1] - iT[6] * a[3][4] - iT[18] * a[9][1] + iT[19] * a[8][4] - t[1] + add) >> shift); + dst[ 8] = Clip3<TCoeff>(outputMinimum, outputMaximum, (-iT[0] * a[1][1] - iT[11] * a[0][1] + iT[13] * a[2][1] + iT[24] * a[3][1] + iT[1] * a[1][3] - iT[10] * a[8][3] + iT[14] * a[7][3] + iT[23] * a[4][2] - iT[2] * a[9][1] + iT[9] * a[8][4] - iT[15] * a[3][4] + iT[22] * a[6][1] + iT[3] * a[5][5] + iT[8] * a[0][0] - iT[16] * a[4][5] - iT[21] * a[6][5] + iT[4] * a[0][5] + iT[7] * a[1][5] - iT[17] * a[3][5] - iT[20] * a[2][5] + iT[5] * a[5][3] - iT[6] * a[9][3] + iT[18] * a[7][2] + iT[19] * a[2][2] - t[0] + add) >> shift); + dst[10] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[8][3] - iT[11] * a[1][3] - iT[13] * a[4][2] - iT[24] * a[7][3] - iT[1] * a[8][0] + iT[10] * a[1][0] + iT[14] * a[4][5] + iT[23] * a[7][0] + iT[2] * a[5][3] + iT[9] * a[0][2] - iT[15] * a[4][3] - iT[22] * a[6][3] - iT[3] * a[5][0] - iT[8] * a[0][5] + iT[16] * a[4][0] + iT[21] * a[6][0] + iT[4] * a[1][4] + iT[7] * a[0][4] - iT[17] * a[2][4] - iT[20] * a[3][4] - iT[5] * a[1][1] - iT[6] * a[0][1] + iT[18] * a[2][1] + iT[19] * a[3][1] + t[0] + add) >> shift); + dst[11] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[7][0] + iT[11] * a[2][0] - iT[13] * a[9][5] + iT[24] * a[5][5] + iT[1] * a[2][5] + iT[10] * a[7][5] + iT[14] * a[5][0] - iT[23] * a[9][0] - iT[2] * a[2][1] - iT[9] * a[3][1] + iT[15] * a[0][1] + iT[22] * a[1][1] - iT[3] * a[7][4] - iT[8] * a[4][1] + iT[16] * a[8][4] - iT[21] * a[1][4] + iT[4] * a[3][2] - iT[7] * a[6][3] - iT[17] * a[8][2] + iT[20] * a[9][3] + iT[5] * a[4][2] + iT[6] * a[6][2] - iT[18] * a[0][3] - iT[19] * a[5][2] + t[1] + add) >> shift); + dst[13] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[9][5] - iT[11] * a[8][0] + iT[13] * a[3][0] - iT[24] * a[6][5] - iT[1] * a[8][5] + iT[10] * a[9][0] - iT[14] * a[6][0] + iT[23] * a[3][5] + iT[2] * a[5][4] - iT[9] * a[9][4] + iT[15] * a[7][1] + iT[22] * a[2][1] - iT[3] * a[1][4] + iT[8] * a[8][4] - iT[16] * a[7][4] - iT[21] * a[4][1] - iT[4] * a[0][2] - iT[7] * a[5][3] + iT[17] * a[6][3] + iT[20] * a[4][3] + iT[5] * a[0][3] + iT[6] * a[1][3] - iT[18] * a[3][3] - iT[19] * a[2][3] + t[0] + add) >> shift); + dst[15] = Clip3<TCoeff>(outputMinimum, outputMaximum, (-iT[0] * a[9][1] + iT[11] * a[5][1] + iT[13] * a[2][4] + iT[24] * a[7][4] + iT[1] * a[9][3] - iT[10] * a[5][3] - iT[14] * a[2][2] - iT[23] * a[7][2] - iT[2] * a[9][5] + iT[9] * a[5][5] + iT[15] * a[2][0] + iT[22] * a[7][0] + iT[3] * a[9][4] - iT[8] * a[8][1] + iT[16] * a[3][1] - iT[21] * a[6][4] - iT[4] * a[9][2] + iT[7] * a[8][3] - iT[17] * a[3][3] + iT[20] * a[6][2] + iT[5] * a[9][0] - iT[6] * a[8][5] + iT[18] * a[3][5] - iT[19] * a[6][0] - t[0] + add) >> shift); + dst[16] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[4][4] + iT[11] * a[7][1] + iT[13] * a[1][1] - iT[24] * a[8][1] + iT[1] * a[6][2] - iT[10] * a[3][3] - iT[14] * a[9][2] + iT[23] * a[8][3] - iT[2] * a[6][1] - iT[9] * a[4][1] + iT[15] * a[5][1] + iT[22] * a[0][4] - iT[3] * a[4][5] - iT[8] * a[6][5] + iT[16] * a[0][0] + iT[21] * a[5][5] - iT[4] * a[6][0] + iT[7] * a[3][5] + iT[17] * a[9][0] - iT[20] * a[8][5] + iT[5] * a[6][3] + iT[6] * a[4][3] - iT[18] * a[5][3] - iT[19] * a[0][2] - t[1] + add) >> shift); + dst[17] = Clip3<TCoeff>(outputMinimum, outputMaximum, (-iT[0] * a[7][2] - iT[11] * a[4][3] + iT[13] * a[8][2] - iT[24] * a[1][2] + iT[1] * a[7][1] + iT[10] * a[2][1] - iT[14] * a[9][4] + iT[23] * a[5][4] - iT[2] * a[3][5] + iT[9] * a[6][0] + iT[15] * a[8][5] - iT[22] * a[9][0] - iT[3] * a[2][3] - iT[8] * a[7][3] - iT[16] * a[5][2] + iT[21] * a[9][2] + iT[4] * a[4][5] + iT[7] * a[7][0] + iT[17] * a[1][0] - iT[20] * a[8][0] - iT[5] * a[2][4] - iT[6] * a[3][4] + iT[18] * a[0][4] + iT[19] * a[1][4] - t[1] + add) >> shift); + dst[18] = Clip3<TCoeff>(outputMinimum, outputMaximum, (-iT[0] * a[9][0] + iT[11] * a[8][5] - iT[13] * a[3][5] + iT[24] * a[6][0] + iT[1] * a[5][1] - iT[10] * a[9][1] + iT[14] * a[7][4] + iT[23] * a[2][4] + iT[2] * a[0][3] + iT[9] * a[5][2] - iT[15] * a[6][2] - iT[22] * a[4][2] + iT[3] * a[1][2] + iT[8] * a[0][2] - iT[16] * a[2][2] - iT[21] * a[3][2] - iT[4] * a[8][1] + iT[7] * a[1][1] + iT[17] * a[4][4] + iT[20] * a[7][1] + iT[5] * a[9][5] - iT[6] * a[8][0] + iT[18] * a[3][0] - iT[19] * a[6][5] - t[0] + add) >> shift); + dst[20] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[8][2] - iT[11] * a[9][3] + iT[13] * a[6][3] - iT[24] * a[3][2] + iT[1] * a[0][1] + iT[10] * a[5][4] - iT[14] * a[6][4] - iT[23] * a[4][4] + iT[2] * a[1][5] + iT[9] * a[0][5] - iT[15] * a[2][5] - iT[22] * a[3][5] - iT[3] * a[9][2] + iT[8] * a[5][2] + iT[16] * a[2][3] + iT[21] * a[7][3] + iT[4] * a[5][5] - iT[7] * a[9][5] + iT[17] * a[7][0] + iT[20] * a[2][0] + iT[5] * a[0][4] + iT[6] * a[5][1] - iT[18] * a[6][1] - iT[19] * a[4][1] + t[0] + add) >> shift); + dst[21] = Clip3<TCoeff>(outputMinimum, outputMaximum, (-iT[0] * a[2][1] - iT[11] * a[7][1] - iT[13] * a[5][4] + iT[24] * a[9][4] - iT[1] * a[6][2] - iT[10] * a[4][2] + iT[14] * a[5][2] + iT[23] * a[0][3] - iT[2] * a[2][4] - iT[9] * a[7][4] - iT[15] * a[5][1] + iT[22] * a[9][1] - iT[3] * a[6][5] - iT[8] * a[4][5] + iT[16] * a[5][5] + iT[21] * a[0][0] - iT[4] * a[4][0] - iT[7] * a[7][5] - iT[17] * a[1][5] + iT[20] * a[8][5] - iT[5] * a[7][2] - iT[6] * a[4][3] + iT[18] * a[8][2] - iT[19] * a[1][2] + t[1] + add) >> shift); + dst[22] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[6][1] - iT[11] * a[3][4] - iT[13] * a[9][1] + iT[24] * a[8][4] + iT[1] * a[4][3] + iT[10] * a[6][3] - iT[14] * a[0][2] - iT[23] * a[5][3] + iT[2] * a[7][0] + iT[9] * a[4][5] - iT[15] * a[8][0] + iT[22] * a[1][0] - iT[3] * a[3][1] + iT[8] * a[6][4] + iT[16] * a[8][1] - iT[21] * a[9][4] - iT[4] * a[2][3] - iT[7] * a[3][3] + iT[17] * a[0][3] + iT[20] * a[1][3] - iT[5] * a[7][5] - iT[6] * a[2][5] + iT[18] * a[9][0] - iT[19] * a[5][0] + t[1] + add) >> shift); + dst[23] = Clip3<TCoeff>(outputMinimum, outputMaximum, (-iT[0] * a[0][3] - iT[11] * a[1][3] + iT[13] * a[3][3] + iT[24] * a[2][3] - iT[1] * a[8][0] + iT[10] * a[9][5] - iT[14] * a[6][5] + iT[23] * a[3][0] + iT[2] * a[8][2] - iT[9] * a[1][2] - iT[15] * a[4][3] - iT[22] * a[7][2] + iT[3] * a[0][5] + iT[8] * a[5][0] - iT[16] * a[6][0] - iT[21] * a[4][0] + iT[4] * a[8][4] - iT[7] * a[9][1] + iT[17] * a[6][1] - iT[20] * a[3][4] - iT[5] * a[5][4] - iT[6] * a[0][1] + iT[18] * a[4][4] + iT[19] * a[6][4] + t[0] + add) >> shift); + dst[26] = Clip3<TCoeff>(outputMinimum, outputMaximum, (-iT[0] * a[3][0] - iT[11] * a[2][0] + iT[13] * a[1][0] + iT[24] * a[0][0] - iT[1] * a[2][5] - iT[10] * a[3][5] + iT[14] * a[0][5] + iT[23] * a[1][5] + iT[2] * a[4][4] + iT[9] * a[6][4] - iT[15] * a[0][1] - iT[22] * a[5][4] - iT[3] * a[4][1] - iT[8] * a[7][4] - iT[16] * a[1][4] + iT[21] * a[8][4] + iT[4] * a[2][2] + iT[7] * a[7][2] + iT[17] * a[5][3] - iT[20] * a[9][3] + iT[5] * a[3][3] - iT[6] * a[6][2] - iT[18] * a[8][3] + iT[19] * a[9][2] - t[1] + add) >> shift); + dst[27] = Clip3<TCoeff>(outputMinimum, outputMaximum, (-iT[0] * a[3][3] + iT[11] * a[6][2] + iT[13] * a[8][3] - iT[24] * a[9][2] - iT[1] * a[2][0] - iT[10] * a[3][0] + iT[14] * a[0][0] + iT[23] * a[1][0] - iT[2] * a[6][3] + iT[9] * a[3][2] + iT[15] * a[9][3] - iT[22] * a[8][2] - iT[3] * a[4][0] - iT[8] * a[6][0] + iT[16] * a[0][5] + iT[21] * a[5][0] - iT[4] * a[7][4] - iT[7] * a[2][4] + iT[17] * a[9][1] - iT[20] * a[5][1] - iT[5] * a[4][4] - iT[6] * a[7][1] - iT[18] * a[1][1] + iT[19] * a[8][1] - t[1] + add) >> shift); + dst[28] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[0][4] + iT[11] * a[5][1] - iT[13] * a[6][1] - iT[24] * a[4][1] + iT[1] * a[9][3] - iT[10] * a[8][2] + iT[14] * a[3][2] - iT[23] * a[6][3] - iT[2] * a[1][0] - iT[9] * a[0][0] + iT[15] * a[2][0] + iT[22] * a[3][0] + iT[3] * a[8][1] - iT[8] * a[9][4] + iT[16] * a[6][4] - iT[21] * a[3][1] - iT[4] * a[5][2] - iT[7] * a[0][3] + iT[17] * a[4][2] + iT[20] * a[6][2] + iT[5] * a[1][5] - iT[6] * a[8][5] + iT[18] * a[7][5] + iT[19] * a[4][0] - t[0] + add) >> shift); + dst[30] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[5][3] - iT[11] * a[9][3] + iT[13] * a[7][2] + iT[24] * a[2][2] + iT[1] * a[0][1] + iT[10] * a[1][1] - iT[14] * a[3][1] - iT[23] * a[2][1] + iT[2] * a[9][0] - iT[9] * a[5][0] - iT[15] * a[2][5] - iT[22] * a[7][5] - iT[3] * a[5][2] + iT[8] * a[9][2] - iT[16] * a[7][3] - iT[21] * a[2][3] - iT[4] * a[0][0] - iT[7] * a[1][0] + iT[17] * a[3][0] + iT[20] * a[2][0] - iT[5] * a[9][1] + iT[6] * a[5][1] + iT[18] * a[2][4] + iT[19] * a[7][4] + t[0] + add) >> shift); + dst[31] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[3][5] + iT[11] * a[2][5] - iT[13] * a[1][5] - iT[24] * a[0][5] - iT[1] * a[3][4] - iT[10] * a[2][4] + iT[14] * a[1][4] + iT[23] * a[0][4] + iT[2] * a[3][3] + iT[9] * a[2][3] - iT[15] * a[1][3] - iT[22] * a[0][3] - iT[3] * a[3][2] - iT[8] * a[2][2] + iT[16] * a[1][2] + iT[21] * a[0][2] + iT[4] * a[3][1] + iT[7] * a[2][1] - iT[17] * a[1][1] - iT[20] * a[0][1] - iT[5] * a[3][0] - iT[6] * a[2][0] + iT[18] * a[1][0] + iT[19] * a[0][0] + t[1] + add) >> shift); + + dst[ 4] = Clip3<TCoeff>(outputMinimum, outputMaximum, (iT[ 4] * b[0] + iT[14] * b[1] + iT[24] * b[2] + iT[29] * b[3] + iT[19] * b[4] + iT[ 9] * b[5] + add) >> shift); + dst[ 9] = Clip3<TCoeff>(outputMinimum, outputMaximum, (iT[ 9] * b[0] + iT[29] * b[1] + iT[14] * b[2] - iT[ 4] * b[3] - iT[24] * b[4] - iT[19] * b[5] + add) >> shift); + dst[14] = Clip3<TCoeff>(outputMinimum, outputMaximum, (iT[14] * b[0] + iT[19] * b[1] - iT[ 9] * b[2] - iT[24] * b[3] + iT[ 4] * b[4] + iT[29] * b[5] + add) >> shift); + dst[19] = Clip3<TCoeff>(outputMinimum, outputMaximum, (iT[19] * b[0] + iT[ 4] * b[1] - iT[29] * b[2] + iT[ 9] * b[3] + iT[14] * b[4] - iT[24] * b[5] + add) >> shift); + dst[24] = Clip3<TCoeff>(outputMinimum, outputMaximum, (iT[24] * b[0] - iT[ 9] * b[1] - iT[ 4] * b[2] + iT[19] * b[3] - iT[29] * b[4] + iT[14] * b[5] + add) >> shift); + dst[29] = Clip3<TCoeff>(outputMinimum, outputMaximum, (iT[29] * b[0] - iT[24] * b[1] + iT[19] * b[2] - iT[14] * b[3] + iT[ 9] * b[4] - iT[ 4] * b[5] + add) >> shift); + + dst[12] = Clip3<TCoeff>(outputMinimum, outputMaximum, (iT[12]*c[0] + iT[25]*c[1] + add) >> shift); + dst[25] = Clip3<TCoeff>(outputMinimum, outputMaximum, (iT[25]*c[0] - iT[12]*c[1] + add) >> shift); +#else dst[ 0] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[1][0] - iT[11] * a[8][0] + iT[13] * a[7][0] + iT[24] * a[4][5] - iT[1] * a[8][5] + iT[10] * a[1][5] + iT[14] * a[4][0] + iT[23] * a[7][5] + iT[2] * a[1][1] - iT[9] * a[8][1] + iT[15] * a[7][1] + iT[22] * a[4][4] - iT[3] * a[8][4] + iT[8] * a[1][4] + iT[16] * a[4][1] + iT[21] * a[7][4] + iT[4] * a[1][2] - iT[7] * a[8][2] + iT[17] * a[7][2] + iT[20] * a[4][3] - iT[5] * a[8][3] + iT[6] * a[1][3] + iT[18] * a[4][2] + iT[19] * a[7][3] + t[0] + add) >> shift); dst[ 1] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[4][2] - iT[11] * a[6][2] + iT[13] * a[0][3] + iT[24] * a[5][2] + iT[1] * a[2][0] + iT[10] * a[7][0] + iT[14] * a[5][5] - iT[23] * a[9][5] + iT[2] * a[7][2] + iT[9] * a[2][2] - iT[15] * a[9][3] + iT[22] * a[5][3] - iT[3] * a[6][0] - iT[8] * a[4][0] + iT[16] * a[5][0] + iT[21] * a[0][5] - iT[4] * a[4][1] - iT[7] * a[6][1] + iT[17] * a[0][4] + iT[20] * a[5][1] + iT[5] * a[2][1] + iT[6] * a[7][1] + iT[18] * a[5][4] - iT[19] * a[9][4] + t[1] + add) >> shift); dst[ 2] = Clip3(outputMinimum, outputMaximum, (int)(-iT[0] * a[2][4] - iT[11] * a[3][4] + iT[13] * a[0][4] + iT[24] * a[1][4] + iT[1] * a[4][3] + iT[10] * a[7][2] + iT[14] * a[1][2] - iT[23] * a[8][2] + iT[2] * a[3][0] - iT[9] * a[6][5] - iT[15] * a[8][0] + iT[22] * a[9][5] - iT[3] * a[6][4] + iT[8] * a[3][1] + iT[16] * a[9][4] - iT[21] * a[8][1] + iT[4] * a[7][3] + iT[7] * a[4][2] - iT[17] * a[8][3] + iT[20] * a[1][3] - iT[5] * a[3][5] - iT[6] * a[2][5] + iT[18] * a[1][5] + iT[19] * a[0][5] + t[1] + add) >> shift); @@ -1243,7 +1399,7 @@ void fastInverseDST7_B32(const TCoeff *src, TCoeff *dst, int shift, int line, in dst[12] = Clip3(outputMinimum, outputMaximum, (int)(iT[12]*c[0] + iT[25]*c[1] + add) >> shift); dst[25] = Clip3(outputMinimum, outputMaximum, (int)(iT[25]*c[0] - iT[12]*c[1] + add) >> shift); - +#endif src++; dst += 32; } @@ -1265,7 +1421,11 @@ void fastForwardDCT8_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int int rnd_factor = 1 << (shift - 1); const TMatrixCoeff *iT = g_trCoreDCT8P4[TRANSFORM_FORWARD][0]; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + TCoeff c[4]; +#else int c[4]; +#endif TCoeff *pCoeff = dst; const int reducedLine = line - iSkipLine; for (i = 0; i<reducedLine; i++) @@ -1302,7 +1462,11 @@ void fastInverseDCT8_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int const TMatrixCoeff *iT = g_trCoreDCT8P4[TRANSFORM_INVERSE][0]; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + TCoeff c[4]; +#else int c[4]; +#endif const int reducedLine = line - iSkipLine; for (i = 0; i<reducedLine; i++) { @@ -1312,11 +1476,17 @@ void fastInverseDCT8_B4(const TCoeff *src, TCoeff *dst, int shift, int line, int c[2] = src[3 * line] - src[2 * line]; c[3] = iT[1] * src[1 * line]; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + dst[0] = Clip3<TCoeff>(outputMinimum, outputMaximum, (iT[3] * c[0] + iT[2] * c[1] + c[3] + rnd_factor) >> shift); + dst[1] = Clip3<TCoeff>(outputMinimum, outputMaximum, (iT[1] * (src[0 * line] - src[2 * line] - src[3 * line]) + rnd_factor) >> shift); + dst[2] = Clip3<TCoeff>(outputMinimum, outputMaximum, (iT[3] * c[2] + iT[2] * c[0] - c[3] + rnd_factor) >> shift); + dst[3] = Clip3<TCoeff>(outputMinimum, outputMaximum, (iT[3] * c[1] - iT[2] * c[2] - c[3] + rnd_factor) >> shift); +#else dst[0] = Clip3(outputMinimum, outputMaximum, (iT[3] * c[0] + iT[2] * c[1] + c[3] + rnd_factor) >> shift); dst[1] = Clip3(outputMinimum, outputMaximum, (iT[1] * (src[0 * line] - src[2 * line] - src[3 * line]) + rnd_factor) >> shift); dst[2] = Clip3(outputMinimum, outputMaximum, (iT[3] * c[2] + iT[2] * c[0] - c[3] + rnd_factor) >> shift); dst[3] = Clip3(outputMinimum, outputMaximum, (iT[3] * c[1] - iT[2] * c[2] - c[3] + rnd_factor) >> shift); - +#endif dst += 4; src++; } @@ -1430,6 +1600,26 @@ void fastInverseDCT8_B16(const TCoeff *src, TCoeff *dst, int shift, int line, in t = iT[10] * src[5*line]; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + dst[ 1] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( - iT[ 2]*d[0] - iT[ 5]*d[1] - iT[ 8]*d[2] - iT[11]*d[3] - iT[14]*d[4] + add) >> shift); + dst[ 4] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[ 8]*d[0] + iT[14]*d[1] + iT[ 5]*d[2] - iT[ 2]*d[3] - iT[11]*d[4] + add) >> shift); + dst[ 7] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( - iT[14]*d[0] - iT[ 2]*d[1] + iT[11]*d[2] + iT[ 5]*d[3] - iT[ 8]*d[4] + add) >> shift); + dst[10] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[11]*d[0] - iT[ 8]*d[1] - iT[ 2]*d[2] + iT[14]*d[3] - iT[ 5]*d[4] + add) >> shift); + dst[13] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( - iT[ 5]*d[0] + iT[11]*d[1] - iT[14]*d[2] + iT[ 8]*d[3] - iT[ 2]*d[4] + add) >> shift); + + dst[ 5] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( - iT[10] * (src[15 * line] + src[14 * line] - src[12 * line] - src[11 * line] + src[9 * line] + src[8 * line] - src[6 * line] - src[5 * line] + src[3 * line] + src[2 * line] - src[0 * line]) + add) >> shift); + + dst[ 0] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0]*a[0] + iT[9]*b[0] + iT[1]*a[1] + iT[8]*b[1] + iT[2]*a[2] + iT[7]*b[2] + iT[3]*a[3] + iT[6]*b[3] + iT[4]*a[4] + iT[5]*b[4] + t + add ) >> shift ); + dst[ 2] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[4]*c[0] - iT[5]*b[0] + iT[9]*c[1] - iT[0]*b[1] + iT[6]*c[2] + iT[3]*a[2] + iT[1]*c[3] + iT[8]*a[3] + iT[7]*a[4] + iT[2]*b[4] - t + add ) >> shift ); + dst[ 3] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( - iT[6]*a[0] - iT[3]*b[0] - iT[2]*c[1] - iT[7]*a[1] - iT[9]*c[2] - iT[0]*a[2] - iT[4]*c[3] + iT[5]*b[3] + iT[1]*a[4] + iT[8]*b[4] - t + add ) >> shift ); + dst[ 6] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[8]*a[0] + iT[1]*c[0] + iT[6]*c[1] - iT[3]*b[1] - iT[5]*a[2] - iT[4]*b[2] - iT[7]*c[3] - iT[2]*a[3] - iT[0]*c[4] + iT[9]*b[4] + t + add ) >> shift ); + dst[ 8] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[4]*c[0] + iT[5]*a[0] - iT[0]*c[1] + iT[9]*b[1] - iT[3]*c[2] - iT[6]*a[2] + iT[1]*c[3] - iT[8]*b[3] + iT[2]*c[4] + iT[7]*a[4] - t + add ) >> shift ); + dst[ 9] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( - iT[7]*c[0] - iT[2]*a[0] + iT[4]*a[1] + iT[5]*b[1] + iT[8]*c[2] - iT[1]*b[2] - iT[9]*a[3] - iT[0]*b[3] - iT[3]*c[4] + iT[6]*b[4] - t + add ) >> shift ); + dst[11] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( - iT[9]*a[0] - iT[0]*b[0] + iT[8]*c[1] + iT[1]*a[1] - iT[2]*c[2] + iT[7]*b[2] - iT[6]*a[3] - iT[3]*b[3] + iT[5]*c[4] + iT[4]*a[4] + t + add ) >> shift ); + dst[12] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[7]*c[0] - iT[2]*b[0] - iT[5]*c[1] - iT[4]*a[1] + iT[8]*a[2] + iT[1]*b[2] - iT[0]*a[3] - iT[9]*b[3] - iT[6]*c[4] + iT[3]*b[4] + t + add ) >> shift ); + dst[14] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[3]*a[0] + iT[6]*b[0] - iT[7]*a[1] - iT[2]*b[1] + iT[0]*c[2] + iT[9]*a[2] - iT[4]*c[3] - iT[5]*a[3] + iT[8]*c[4] + iT[1]*a[4] - t + add ) >> shift ); + dst[15] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( - iT[1]*c[0] + iT[8]*b[0] + iT[3]*c[1] - iT[6]*b[1] - iT[5]*c[2] + iT[4]*b[2] + iT[7]*c[3] - iT[2]*b[3] - iT[9]*c[4] + iT[0]*b[4] - t + add ) >> shift ); +#else dst[ 1] = Clip3(outputMinimum, outputMaximum, (int)( - iT[ 2]*d[0] - iT[ 5]*d[1] - iT[ 8]*d[2] - iT[11]*d[3] - iT[14]*d[4] + add) >> shift); dst[ 4] = Clip3(outputMinimum, outputMaximum, (int)( iT[ 8]*d[0] + iT[14]*d[1] + iT[ 5]*d[2] - iT[ 2]*d[3] - iT[11]*d[4] + add) >> shift); dst[ 7] = Clip3(outputMinimum, outputMaximum, (int)( - iT[14]*d[0] - iT[ 2]*d[1] + iT[11]*d[2] + iT[ 5]*d[3] - iT[ 8]*d[4] + add) >> shift); @@ -1448,7 +1638,7 @@ void fastInverseDCT8_B16(const TCoeff *src, TCoeff *dst, int shift, int line, in dst[12] = Clip3(outputMinimum, outputMaximum, (int)( iT[7]*c[0] - iT[2]*b[0] - iT[5]*c[1] - iT[4]*a[1] + iT[8]*a[2] + iT[1]*b[2] - iT[0]*a[3] - iT[9]*b[3] - iT[6]*c[4] + iT[3]*b[4] + t + add ) >> shift ); dst[14] = Clip3(outputMinimum, outputMaximum, (int)( iT[3]*a[0] + iT[6]*b[0] - iT[7]*a[1] - iT[2]*b[1] + iT[0]*c[2] + iT[9]*a[2] - iT[4]*c[3] - iT[5]*a[3] + iT[8]*c[4] + iT[1]*a[4] - t + add ) >> shift ); dst[15] = Clip3(outputMinimum, outputMaximum, (int)( - iT[1]*c[0] + iT[8]*b[0] + iT[3]*c[1] - iT[6]*b[1] - iT[5]*c[2] + iT[4]*b[2] + iT[7]*c[3] - iT[2]*b[3] - iT[9]*c[4] + iT[0]*b[4] - t + add ) >> shift ); - +#endif src++; dst += 16; } @@ -1603,6 +1793,42 @@ void fastInverseDCT8_B32(const TCoeff *src, TCoeff *dst, int shift, int line, in t[0] = iT[12] * src[19 * line] + iT[25] * src[ 6 * line]; t[1] = iT[12] * src[ 6 * line] - iT[25] * src[19 * line]; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + dst[ 0] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[3][0] + iT[11] * a[6][5] + iT[13] * a[8][0] + iT[24] * a[9][5] + iT[1] * a[3][1] + iT[10] * a[6][4] + iT[14] * a[8][1] + iT[23] * a[9][4] + iT[2] * a[3][2] + iT[9] * a[6][3] + iT[15] * a[8][2] + iT[22] * a[9][3] + iT[3] * a[3][3] + iT[8] * a[6][2] + iT[16] * a[8][3] + iT[21] * a[9][2] + iT[4] * a[3][4] + iT[7] * a[6][1] + iT[17] * a[8][4] + iT[20] * a[9][1] + iT[5] * a[3][5] + iT[6] * a[6][0] + iT[18] * a[8][5] + iT[19] * a[9][0] + t[0] + add) >> shift); + dst[ 1] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[5][2] - iT[11] * a[0][3] - iT[13] * a[4][2] - iT[24] * a[6][2] - iT[1] * a[9][1] - iT[10] * a[8][4] - iT[14] * a[3][4] - iT[23] * a[6][1] - iT[2] * a[0][0] + iT[9] * a[5][5] - iT[15] * a[6][5] - iT[22] * a[4][5] + iT[3] * a[5][3] - iT[8] * a[0][2] - iT[16] * a[4][3] - iT[21] * a[6][3] - iT[4] * a[9][0] - iT[7] * a[8][5] - iT[17] * a[3][5] - iT[20] * a[6][0] - iT[5] * a[0][1] + iT[6] * a[5][4] - iT[18] * a[6][4] - iT[19] * a[4][4] + t[1] + add) >> shift); + dst[ 3] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[9][4] + iT[11] * a[5][4] - iT[13] * a[2][1] + iT[24] * a[7][1] + iT[1] * a[0][3] + iT[10] * a[1][3] - iT[14] * a[3][3] - iT[23] * a[2][3] - iT[2] * a[8][5] - iT[9] * a[9][0] - iT[15] * a[6][0] - iT[22] * a[3][5] + iT[3] * a[1][4] + iT[8] * a[0][4] - iT[16] * a[2][4] - iT[21] * a[3][4] + iT[4] * a[5][3] + iT[7] * a[9][3] + iT[17] * a[7][2] - iT[20] * a[2][2] - iT[5] * a[8][0] - iT[6] * a[1][0] + iT[18] * a[4][5] + iT[19] * a[7][0] - t[1] + add) >> shift); + dst[ 4] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( - iT[0] * a[3][2] - iT[11] * a[2][2] + iT[13] * a[1][2] + iT[24] * a[0][2] + iT[1] * a[6][0] + iT[10] * a[3][5] + iT[14] * a[9][0] + iT[23] * a[8][5] - iT[2] * a[2][3] - iT[9] * a[3][3] + iT[15] * a[0][3] + iT[22] * a[1][3] - iT[3] * a[7][0] + iT[8] * a[2][0] - iT[16] * a[9][5] - iT[21] * a[5][5] + iT[4] * a[4][4] + iT[7] * a[6][4] + iT[17] * a[0][1] - iT[20] * a[5][4] - iT[5] * a[7][4] - iT[6] * a[4][1] + iT[18] * a[8][4] + iT[19] * a[1][4] - t[0] + add) >> shift); + dst[ 5] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[3][5] + iT[11] * a[6][0] + iT[13] * a[8][5] + iT[24] * a[9][0] - iT[1] * a[6][5] - iT[10] * a[3][0] - iT[14] * a[9][5] - iT[23] * a[8][0] + iT[2] * a[7][4] - iT[9] * a[2][4] + iT[15] * a[9][1] + iT[22] * a[5][1] + iT[3] * a[7][1] + iT[8] * a[4][4] - iT[16] * a[8][1] - iT[21] * a[1][1] - iT[4] * a[6][2] - iT[7] * a[4][2] + iT[17] * a[5][2] - iT[20] * a[0][3] + iT[5] * a[3][2] + iT[6] * a[2][2] - iT[18] * a[1][2] - iT[19] * a[0][2] - t[0] + add) >> shift); + dst[ 8] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[9][3] + iT[11] * a[8][2] + iT[13] * a[3][2] + iT[24] * a[6][3] + iT[1] * a[1][5] + iT[10] * a[0][5] - iT[14] * a[2][5] - iT[23] * a[3][5] - iT[2] * a[1][3] - iT[9] * a[8][3] + iT[15] * a[7][3] + iT[22] * a[4][2] - iT[3] * a[9][5] - iT[8] * a[5][5] + iT[16] * a[2][0] - iT[21] * a[7][0] - iT[4] * a[1][1] - iT[7] * a[0][1] + iT[17] * a[2][1] + iT[20] * a[3][1] + iT[5] * a[5][1] + iT[6] * a[9][1] + iT[18] * a[7][4] - iT[19] * a[2][4] + t[1] + add) >> shift); + dst[ 9] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[2][1] + iT[11] * a[3][1] - iT[13] * a[0][1] - iT[24] * a[1][1] - iT[1] * a[7][3] + iT[10] * a[2][3] - iT[14] * a[9][2] - iT[23] * a[5][2] - iT[2] * a[4][0] - iT[9] * a[7][5] + iT[15] * a[1][5] + iT[22] * a[8][5] - iT[3] * a[3][4] - iT[8] * a[2][4] + iT[16] * a[1][4] + iT[21] * a[0][4] - iT[4] * a[6][3] - iT[7] * a[3][2] - iT[17] * a[9][3] - iT[20] * a[8][2] - iT[5] * a[4][5] - iT[6] * a[6][5] - iT[18] * a[0][0] + iT[19] * a[5][5] + t[0] + add) >> shift); + dst[10] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( - iT[0] * a[6][1] - iT[11] * a[4][1] + iT[13] * a[5][1] - iT[24] * a[0][4] + iT[1] * a[2][2] - iT[10] * a[7][2] - iT[14] * a[5][3] - iT[23] * a[9][3] + iT[2] * a[6][4] + iT[9] * a[4][4] - iT[15] * a[5][4] + iT[22] * a[0][1] - iT[3] * a[2][5] + iT[8] * a[7][5] + iT[16] * a[5][0] + iT[21] * a[9][0] - iT[4] * a[7][0] - iT[7] * a[4][5] + iT[17] * a[8][0] + iT[20] * a[1][0] + iT[5] * a[4][2] + iT[6] * a[7][3] - iT[18] * a[1][3] - iT[19] * a[8][3] + t[0] + add) >> shift); + dst[11] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( - iT[0] * a[1][3] - iT[11] * a[0][3] + iT[13] * a[2][3] + iT[24] * a[3][3] - iT[1] * a[9][1] - iT[10] * a[5][1] + iT[14] * a[2][4] - iT[23] * a[7][4] - iT[2] * a[8][0] - iT[9] * a[9][5] - iT[15] * a[6][5] - iT[22] * a[3][0] + iT[3] * a[0][2] - iT[8] * a[5][3] + iT[16] * a[6][3] + iT[21] * a[4][3] + iT[4] * a[5][0] - iT[7] * a[0][5] - iT[17] * a[4][0] - iT[20] * a[6][0] + iT[5] * a[9][4] + iT[6] * a[5][4] - iT[18] * a[2][1] + iT[19] * a[7][1] + t[1] + add) >> shift); + dst[13] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[0][0] + iT[11] * a[1][0] - iT[13] * a[3][0] - iT[24] * a[2][0] + iT[1] * a[5][4] - iT[10] * a[0][1] - iT[14] * a[4][4] - iT[23] * a[6][4] - iT[2] * a[9][3] - iT[9] * a[5][3] + iT[15] * a[2][2] - iT[22] * a[7][2] + iT[3] * a[8][3] + iT[8] * a[9][2] + iT[16] * a[6][2] + iT[21] * a[3][3] - iT[4] * a[1][4] - iT[7] * a[8][4] + iT[17] * a[7][4] + iT[20] * a[4][1] + iT[5] * a[0][5] + iT[6] * a[1][5] - iT[18] * a[3][5] - iT[19] * a[2][5] - t[1] + add) >> shift); + dst[14] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[4][2] + iT[11] * a[7][3] - iT[13] * a[1][3] - iT[24] * a[8][3] + iT[1] * a[4][1] + iT[10] * a[6][1] + iT[14] * a[0][4] - iT[23] * a[5][1] - iT[2] * a[3][0] - iT[9] * a[2][0] + iT[15] * a[1][0] + iT[22] * a[0][0] - iT[3] * a[6][3] - iT[8] * a[4][3] + iT[16] * a[5][3] - iT[21] * a[0][2] - iT[4] * a[7][5] - iT[7] * a[4][0] + iT[17] * a[8][5] + iT[20] * a[1][5] + iT[5] * a[6][4] + iT[6] * a[3][1] + iT[18] * a[9][4] + iT[19] * a[8][1] - t[0] + add) >> shift); + dst[15] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[7][4] + iT[11] * a[4][1] - iT[13] * a[8][4] - iT[24] * a[1][4] - iT[1] * a[2][2] - iT[10] * a[3][2] + iT[14] * a[0][2] + iT[23] * a[1][2] - iT[2] * a[2][1] + iT[9] * a[7][1] + iT[15] * a[5][4] + iT[22] * a[9][4] + iT[3] * a[7][5] - iT[8] * a[2][5] + iT[16] * a[9][0] + iT[21] * a[5][0] + iT[4] * a[2][0] + iT[7] * a[3][0] - iT[17] * a[0][0] - iT[20] * a[1][0] + iT[5] * a[2][3] - iT[6] * a[7][3] - iT[18] * a[5][2] - iT[19] * a[9][2] - t[0] + add) >> shift); + dst[16] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( - iT[0] * a[0][1] + iT[11] * a[5][4] - iT[13] * a[6][4] - iT[24] * a[4][4] + iT[1] * a[0][3] - iT[10] * a[5][2] + iT[14] * a[6][2] + iT[23] * a[4][2] - iT[2] * a[0][5] + iT[9] * a[5][0] - iT[15] * a[6][0] - iT[22] * a[4][0] - iT[3] * a[0][4] - iT[8] * a[1][4] + iT[16] * a[3][4] + iT[21] * a[2][4] + iT[4] * a[0][2] + iT[7] * a[1][2] - iT[17] * a[3][2] - iT[20] * a[2][2] - iT[5] * a[0][0] - iT[6] * a[1][0] + iT[18] * a[3][0] + iT[19] * a[2][0] - t[1] + add) >> shift); + dst[18] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[0][5] + iT[11] * a[1][5] - iT[13] * a[3][5] - iT[24] * a[2][5] - iT[1] * a[1][0] - iT[10] * a[0][0] + iT[14] * a[2][0] + iT[23] * a[3][0] - iT[2] * a[5][1] + iT[9] * a[0][4] + iT[15] * a[4][1] + iT[22] * a[6][1] - iT[3] * a[8][1] - iT[8] * a[1][1] + iT[16] * a[4][4] + iT[21] * a[7][1] - iT[4] * a[9][2] - iT[7] * a[5][2] + iT[17] * a[2][3] - iT[20] * a[7][3] - iT[5] * a[9][3] - iT[6] * a[8][2] - iT[18] * a[3][2] - iT[19] * a[6][3] + t[1] + add) >> shift); + dst[20] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( - iT[0] * a[4][0] - iT[11] * a[6][0] - iT[13] * a[0][5] + iT[24] * a[5][0] + iT[1] * a[6][5] + iT[10] * a[4][5] - iT[14] * a[5][5] + iT[23] * a[0][0] - iT[2] * a[6][1] - iT[9] * a[3][4] - iT[15] * a[9][1] - iT[22] * a[8][4] + iT[3] * a[4][4] + iT[8] * a[7][1] - iT[16] * a[1][1] - iT[21] * a[8][1] - iT[4] * a[3][3] - iT[7] * a[2][3] + iT[17] * a[1][3] + iT[20] * a[0][3] + iT[5] * a[7][2] - iT[6] * a[2][2] + iT[18] * a[9][3] + iT[19] * a[5][3] + t[0] + add) >> shift); + dst[21] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[1][2] + iT[11] * a[8][2] - iT[13] * a[7][2] - iT[24] * a[4][3] + iT[1] * a[1][5] + iT[10] * a[8][5] - iT[14] * a[7][5] - iT[23] * a[4][0] + iT[2] * a[5][2] + iT[9] * a[9][2] + iT[15] * a[7][3] - iT[22] * a[2][3] + iT[3] * a[5][5] + iT[8] * a[9][5] + iT[16] * a[7][0] - iT[21] * a[2][0] + iT[4] * a[8][1] + iT[7] * a[9][4] + iT[17] * a[6][4] + iT[20] * a[3][1] + iT[5] * a[8][4] + iT[6] * a[9][1] + iT[18] * a[6][1] + iT[19] * a[3][4] + t[1] + add) >> shift); + dst[23] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[8][4] + iT[11] * a[9][1] + iT[13] * a[6][1] + iT[24] * a[3][4] - iT[1] * a[8][2] - iT[10] * a[1][2] + iT[14] * a[4][3] + iT[23] * a[7][2] - iT[2] * a[0][1] - iT[9] * a[1][1] + iT[15] * a[3][1] + iT[22] * a[2][1] + iT[3] * a[5][0] + iT[8] * a[9][0] + iT[16] * a[7][5] - iT[21] * a[2][5] - iT[4] * a[9][5] - iT[7] * a[8][0] - iT[17] * a[3][0] - iT[20] * a[6][5] + iT[5] * a[5][2] - iT[6] * a[0][3] - iT[18] * a[4][2] - iT[19] * a[6][2] - t[1] + add) >> shift); + dst[24] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( - iT[0] * a[2][3] + iT[11] * a[7][3] + iT[13] * a[5][2] + iT[24] * a[9][2] + iT[1] * a[4][1] + iT[10] * a[7][4] - iT[14] * a[1][4] - iT[23] * a[8][4] - iT[2] * a[4][5] - iT[9] * a[7][0] + iT[15] * a[1][0] + iT[22] * a[8][0] + iT[3] * a[4][3] + iT[8] * a[6][3] + iT[16] * a[0][2] - iT[21] * a[5][3] - iT[4] * a[2][5] - iT[7] * a[3][5] + iT[17] * a[0][5] + iT[20] * a[1][5] + iT[5] * a[2][1] + iT[6] * a[3][1] - iT[18] * a[0][1] - iT[19] * a[1][1] - t[0] + add) >> shift); + dst[25] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( - iT[0] * a[4][5] - iT[11] * a[6][5] - iT[13] * a[0][0] + iT[24] * a[5][5] - iT[1] * a[3][1] - iT[10] * a[2][1] + iT[14] * a[1][1] + iT[23] * a[0][1] + iT[2] * a[7][2] + iT[9] * a[4][3] - iT[15] * a[8][2] - iT[22] * a[1][2] + iT[3] * a[6][2] + iT[8] * a[3][3] + iT[16] * a[9][2] + iT[21] * a[8][3] + iT[4] * a[2][4] - iT[7] * a[7][4] - iT[17] * a[5][1] - iT[20] * a[9][1] - iT[5] * a[4][0] - iT[6] * a[6][0] - iT[18] * a[0][5] + iT[19] * a[5][0] - t[0] + add) >> shift); + dst[26] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[8][0] + iT[11] * a[1][0] - iT[13] * a[4][5] - iT[24] * a[7][0] + iT[1] * a[5][4] + iT[10] * a[9][4] + iT[14] * a[7][1] - iT[23] * a[2][1] - iT[2] * a[1][2] - iT[9] * a[0][2] + iT[15] * a[2][2] + iT[22] * a[3][2] - iT[3] * a[9][2] - iT[8] * a[8][3] - iT[16] * a[3][3] - iT[21] * a[6][2] + iT[4] * a[0][4] - iT[7] * a[5][1] + iT[17] * a[6][1] + iT[20] * a[4][1] + iT[5] * a[8][5] + iT[6] * a[1][5] - iT[18] * a[4][0] - iT[19] * a[7][5] - t[1] + add) >> shift); + dst[28] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( - iT[0] * a[5][1] - iT[11] * a[9][1] - iT[13] * a[7][4] + iT[24] * a[2][4] + iT[1] * a[8][2] + iT[10] * a[9][3] + iT[14] * a[6][3] + iT[23] * a[3][2] - iT[2] * a[9][4] - iT[9] * a[8][1] - iT[15] * a[3][1] - iT[22] * a[6][4] + iT[3] * a[9][0] + iT[8] * a[5][0] - iT[16] * a[2][5] + iT[21] * a[7][5] - iT[4] * a[5][5] + iT[7] * a[0][0] + iT[17] * a[4][5] + iT[20] * a[6][5] + iT[5] * a[1][3] + iT[6] * a[0][3] - iT[18] * a[2][3] - iT[19] * a[3][3] + t[1] + add) >> shift); + dst[29] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[6][4] + iT[11] * a[3][1] + iT[13] * a[9][4] + iT[24] * a[8][1] - iT[1] * a[7][3] - iT[10] * a[4][2] + iT[14] * a[8][3] + iT[23] * a[1][3] - iT[2] * a[3][5] - iT[9] * a[2][5] + iT[15] * a[1][5] + iT[22] * a[0][5] + iT[3] * a[2][4] + iT[8] * a[3][4] - iT[16] * a[0][4] - iT[21] * a[1][4] + iT[4] * a[4][3] + iT[7] * a[7][2] - iT[17] * a[1][2] - iT[20] * a[8][2] - iT[5] * a[3][0] - iT[6] * a[6][5] - iT[18] * a[8][0] - iT[19] * a[9][5] + t[0] + add) >> shift); + dst[30] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( - iT[0] * a[7][2] + iT[11] * a[2][2] - iT[13] * a[9][3] - iT[24] * a[5][3] - iT[1] * a[6][0] - iT[10] * a[4][0] + iT[14] * a[5][0] - iT[23] * a[0][5] - iT[2] * a[4][2] - iT[9] * a[6][2] - iT[15] * a[0][3] + iT[22] * a[5][2] + iT[3] * a[2][0] - iT[8] * a[7][0] - iT[16] * a[5][5] - iT[21] * a[9][5] + iT[4] * a[7][1] - iT[7] * a[2][1] + iT[17] * a[9][4] + iT[20] * a[5][4] + iT[5] * a[6][1] + iT[6] * a[4][1] - iT[18] * a[5][1] + iT[19] * a[0][4] + t[0] + add) >> shift); + dst[31] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[0] * a[8][5] + iT[11] * a[1][5] - iT[13] * a[4][0] - iT[24] * a[7][5] - iT[1] * a[1][0] - iT[10] * a[8][0] + iT[14] * a[7][0] + iT[23] * a[4][5] - iT[2] * a[8][4] - iT[9] * a[1][4] + iT[15] * a[4][1] + iT[22] * a[7][4] + iT[3] * a[1][1] + iT[8] * a[8][1] - iT[16] * a[7][1] - iT[21] * a[4][4] + iT[4] * a[8][3] + iT[7] * a[1][3] - iT[17] * a[4][2] - iT[20] * a[7][3] - iT[5] * a[1][2] - iT[6] * a[8][2] + iT[18] * a[7][2] + iT[19] * a[4][3] + t[1] + add) >> shift); + + dst[ 2] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[ 4] * b[0] + iT[ 9] * b[1] + iT[14] * b[2] + iT[19] * b[3] + iT[24] * b[4] + iT[29] * b[5] + add) >> shift); + dst[ 7] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( - iT[14] * b[0] - iT[29] * b[1] - iT[19] * b[2] - iT[ 4] * b[3] + iT[ 9] * b[4] + iT[24] * b[5] + add) >> shift); + dst[12] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[24] * b[0] + iT[14] * b[1] - iT[ 9] * b[2] - iT[29] * b[3] - iT[ 4] * b[4] + iT[19] * b[5] + add) >> shift); + dst[17] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( - iT[29] * b[0] + iT[ 4] * b[1] + iT[24] * b[2] - iT[ 9] * b[3] - iT[19] * b[4] + iT[14] * b[5] + add) >> shift); + dst[22] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[19] * b[0] - iT[24] * b[1] + iT[ 4] * b[2] + iT[14] * b[3] - iT[29] * b[4] + iT[ 9] * b[5] + add) >> shift); + dst[27] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( - iT[ 9] * b[0] + iT[19] * b[1] - iT[29] * b[2] + iT[24] * b[3] - iT[14] * b[4] + iT[ 4] * b[5] + add) >> shift); + + dst[ 6] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( iT[12] * c[0] + iT[25] * c[1] + add) >> shift); + dst[19] = Clip3<TCoeff>(outputMinimum, outputMaximum, ( - iT[25] * c[0] + iT[12] * c[1] + add) >> shift); +#else dst[ 0] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[3][0] + iT[11] * a[6][5] + iT[13] * a[8][0] + iT[24] * a[9][5] + iT[1] * a[3][1] + iT[10] * a[6][4] + iT[14] * a[8][1] + iT[23] * a[9][4] + iT[2] * a[3][2] + iT[9] * a[6][3] + iT[15] * a[8][2] + iT[22] * a[9][3] + iT[3] * a[3][3] + iT[8] * a[6][2] + iT[16] * a[8][3] + iT[21] * a[9][2] + iT[4] * a[3][4] + iT[7] * a[6][1] + iT[17] * a[8][4] + iT[20] * a[9][1] + iT[5] * a[3][5] + iT[6] * a[6][0] + iT[18] * a[8][5] + iT[19] * a[9][0] + t[0] + add) >> shift); dst[ 1] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[5][2] - iT[11] * a[0][3] - iT[13] * a[4][2] - iT[24] * a[6][2] - iT[1] * a[9][1] - iT[10] * a[8][4] - iT[14] * a[3][4] - iT[23] * a[6][1] - iT[2] * a[0][0] + iT[9] * a[5][5] - iT[15] * a[6][5] - iT[22] * a[4][5] + iT[3] * a[5][3] - iT[8] * a[0][2] - iT[16] * a[4][3] - iT[21] * a[6][3] - iT[4] * a[9][0] - iT[7] * a[8][5] - iT[17] * a[3][5] - iT[20] * a[6][0] - iT[5] * a[0][1] + iT[6] * a[5][4] - iT[18] * a[6][4] - iT[19] * a[4][4] + t[1] + add) >> shift); dst[ 3] = Clip3(outputMinimum, outputMaximum, (int)( iT[0] * a[9][4] + iT[11] * a[5][4] - iT[13] * a[2][1] + iT[24] * a[7][1] + iT[1] * a[0][3] + iT[10] * a[1][3] - iT[14] * a[3][3] - iT[23] * a[2][3] - iT[2] * a[8][5] - iT[9] * a[9][0] - iT[15] * a[6][0] - iT[22] * a[3][5] + iT[3] * a[1][4] + iT[8] * a[0][4] - iT[16] * a[2][4] - iT[21] * a[3][4] + iT[4] * a[5][3] + iT[7] * a[9][3] + iT[17] * a[7][2] - iT[20] * a[2][2] - iT[5] * a[8][0] - iT[6] * a[1][0] + iT[18] * a[4][5] + iT[19] * a[7][0] - t[1] + add) >> shift); @@ -1638,6 +1864,7 @@ void fastInverseDCT8_B32(const TCoeff *src, TCoeff *dst, int shift, int line, in dst[ 6] = Clip3(outputMinimum, outputMaximum, (int)( iT[12] * c[0] + iT[25] * c[1] + add) >> shift); dst[19] = Clip3(outputMinimum, outputMaximum, (int)( - iT[25] * c[0] + iT[12] * c[1] + add) >> shift); +#endif src++; dst += 32; } diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h index c82f1006d..c455aec1a 100644 --- a/source/Lib/CommonLib/TypeDef.h +++ b/source/Lib/CommonLib/TypeDef.h @@ -56,6 +56,9 @@ #define RETRAIN_CABAC 1 // CABAC initial values retrained on VTM-9.0rc1 +#define JVET_R0351_HIGH_BIT_DEPTH_SUPPORT 1 // JVET-R0351: high bit depth coding support (syntax changes, no mathematical differences for CTCs) +#define JVET_R0351_HIGH_BIT_DEPTH_ENABLED 0 // JVET-R0351: high bit depth coding enabled (increases accuracies of some calculations, e.g. transforms) + #define JVET_R0058 1 // JVET-R0058: the combination of RPR, subpictures, and scalability #define JVET_R0185_OLS_DPB_CLEANUP 1 // JVET-R0185: Replace if( !vps_all_independent_layers_flag ) condition on vps_num_dpb_params syntax element with if(!each_layer_is_an_ols_flag) @@ -327,8 +330,12 @@ typedef std::pair<int, int> TrCost; // This can be enabled by the makefile #ifndef RExt__HIGH_BIT_DEPTH_SUPPORT +#if JVET_R0351_HIGH_BIT_DEPTH_ENABLED +#define RExt__HIGH_BIT_DEPTH_SUPPORT 1 ///< 0 (default) use data type definitions for 8-10 bit video, 1 = use larger data types to allow for up to 16-bit video (originally developed as part of N0188) +#else #define RExt__HIGH_BIT_DEPTH_SUPPORT 0 ///< 0 (default) use data type definitions for 8-10 bit video, 1 = use larger data types to allow for up to 16-bit video (originally developed as part of N0188) #endif +#endif // SIMD optimizations #define SIMD_ENABLE 1 diff --git a/source/Lib/CommonLib/WeightPrediction.cpp b/source/Lib/CommonLib/WeightPrediction.cpp index cf20eb209..cab07bc6d 100644 --- a/source/Lib/CommonLib/WeightPrediction.cpp +++ b/source/Lib/CommonLib/WeightPrediction.cpp @@ -186,7 +186,11 @@ void WeightPrediction::addWeightBi(const CPelUnitBuf &pcYuvSrc0, const int w0 = wp0[compID].w; const int offset = wp0[compID].offset; const int clipBD = clpRng.bd; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const int shiftNum = IF_INTERNAL_FRAC_BITS(clipBD); +#else const int shiftNum = std::max<int>(2, (IF_INTERNAL_PREC - clipBD)); +#endif const int shift = wp0[compID].shift + shiftNum; const int round = (enableRounding[compID] && (shift > 0)) ? (1 << (shift - 1)) : 0; const int w1 = wp1[compID].w; @@ -243,7 +247,11 @@ void WeightPrediction::addWeightBiComponent(const CPelUnitBuf &pcYuvSrc const int w0 = wp0[compID].w; const int offset = wp0[compID].offset; const int clipBD = clpRng.bd; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const int shiftNum = IF_INTERNAL_FRAC_BITS(clipBD); +#else const int shiftNum = std::max<int>(2, (IF_INTERNAL_PREC - clipBD)); +#endif const int shift = wp0[compID].shift + shiftNum; const int round = (enableRounding[compID] && (shift > 0)) ? (1 << (shift - 1)) : 0; const int w1 = wp1[compID].w; @@ -304,7 +312,11 @@ void WeightPrediction::addWeightUni(const CPelUnitBuf &pcYuvSrc0, const int w0 = wp0[compID].w; const int offset = wp0[compID].offset; const int clipBD = clpRng.bd; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const int shiftNum = IF_INTERNAL_FRAC_BITS(clipBD); +#else const int shiftNum = std::max<int>(2, (IF_INTERNAL_PREC - clipBD)); +#endif const int shift = wp0[compID].shift + shiftNum; const uint32_t iSrc0Stride = pcYuvSrc0.bufs[compID].stride; const uint32_t iDstStride = rpcYuvDst.bufs[compID].stride; diff --git a/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h b/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h index 12c44a2f8..447d02b5c 100644 --- a/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h +++ b/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h @@ -297,7 +297,11 @@ static void simdDeriveClassificationBlk(AlfClassifier **classifier, int **laplac template<X86_VEXT vext> static void simdFilter5x5Blk(AlfClassifier **classifier, const PelUnitBuf &recDst, const CPelUnitBuf &recSrc, const Area &blkDst, const Area &blk, const ComponentID compId, const short *filterSet, +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const Pel *fClipSet, const ClpRng &clpRng, CodingStructure &cs, const int vbCTUHeight, +#else const short *fClipSet, const ClpRng &clpRng, CodingStructure &cs, const int vbCTUHeight, +#endif int vbPos) { @@ -484,7 +488,11 @@ static const uint16_t shuffleTab[4][2][8] = { template<X86_VEXT vext> static void simdFilter7x7Blk(AlfClassifier **classifier, const PelUnitBuf &recDst, const CPelUnitBuf &recSrc, const Area &blkDst, const Area &blk, const ComponentID compId, const short *filterSet, +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const Pel *fClipSet, const ClpRng &clpRng, CodingStructure &cs, const int vbCTUHeight, +#else const short *fClipSet, const ClpRng &clpRng, CodingStructure &cs, const int vbCTUHeight, +#endif int vbPos) { CHECK((vbCTUHeight & (vbCTUHeight - 1)) != 0, "vbCTUHeight must be a power of 2"); diff --git a/source/Lib/CommonLib/x86/InterpolationFilterX86.h b/source/Lib/CommonLib/x86/InterpolationFilterX86.h index c4e093f85..6969ebcea 100644 --- a/source/Lib/CommonLib/x86/InterpolationFilterX86.h +++ b/source/Lib/CommonLib/x86/InterpolationFilterX86.h @@ -68,7 +68,11 @@ static void fullPelCopySSE( const ClpRng& clpRng, const void*_src, int srcStride { Tsrc* src = (Tsrc*)_src; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + int headroom = IF_INTERNAL_FRAC_BITS(clpRng.bd); +#else int headroom = IF_INTERNAL_PREC - clpRng.bd; +#endif int headroom_offset = 1 << ( headroom - 1 ); int offset = IF_INTERNAL_OFFS; __m128i voffset = _mm_set1_epi16( offset ); @@ -131,7 +135,11 @@ static void fullPelCopyAVX2( const ClpRng& clpRng, const void*_src, int srcStrid #ifdef USE_AVX2 Tsrc* src = (Tsrc*)_src; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + int headroom = IF_INTERNAL_FRAC_BITS(clpRng.bd); +#else int headroom = IF_INTERNAL_PREC - clpRng.bd; +#endif int offset = 1 << ( headroom - 1 ); int internal_offset = IF_INTERNAL_OFFS; @@ -1184,7 +1192,11 @@ static void simdFilter( const ClpRng& clpRng, Pel const *src, int srcStride, Pel src -= ( N/2 - 1 ) * cStride; int offset; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + int headRoom = IF_INTERNAL_FRAC_BITS(clpRng.bd); +#else int headRoom = std::max<int>( 2, ( IF_INTERNAL_PREC - clpRng.bd ) ); +#endif int shift = IF_FILTER_PREC; // with the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6), though headroom can be // negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20 @@ -1339,7 +1351,11 @@ void xWeightedGeoBlk_SSE(const PredictionUnit &pu, const uint32_t width, const u const char log2WeightBase = 3; const ClpRng clpRng = pu.cu->slice->clpRngs().comp[compIdx]; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const int32_t shiftWeighted = IF_INTERNAL_FRAC_BITS(clpRng.bd) + log2WeightBase; +#else const int32_t shiftWeighted = std::max<int>(2, (IF_INTERNAL_PREC - clpRng.bd)) + log2WeightBase; +#endif const int32_t offsetWeighted = (1 << (shiftWeighted - 1)) + (IF_INTERNAL_OFFS << log2WeightBase); int16_t wIdx = floorLog2(pu.lwidth()) - GEO_MIN_CU_LOG2; diff --git a/source/Lib/DecoderLib/CABACReader.cpp b/source/Lib/DecoderLib/CABACReader.cpp index 248ca9e3d..c972b82d2 100644 --- a/source/Lib/DecoderLib/CABACReader.cpp +++ b/source/Lib/DecoderLib/CABACReader.cpp @@ -3185,10 +3185,19 @@ int CABACReader::last_sig_coeff( CoeffCodingContext& cctx, TransformUnit& tu, Co return scanPos; } +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT +static void check_coeff_conformance(const CoeffCodingContext& cctx, const TCoeff coeff) +#else static void check_coeff_conformance(TCoeff coeff) +#endif { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + CHECK( coeff < cctx.minCoeff() || coeff > cctx.maxCoeff(), + "TransCoeffLevel outside allowable range" ); +#else CHECK( coeff < COEFF_MIN || coeff > COEFF_MAX, "TransCoeffLevel should be in the range [-32768, 32767]" ); +#endif } void CABACReader::residual_coding_subblock( CoeffCodingContext& cctx, TCoeff* coeff, const int stateTransTable, int& state ) @@ -3339,7 +3348,11 @@ void CABACReader::residual_coding_subblock( CoeffCodingContext& cctx, TCoeff* co sumAbs += AbsCoeff; coeff[ sigBlkPos[k] ] = ( signPattern & ( 1u << 31 ) ? -AbsCoeff : AbsCoeff ); signPattern <<= 1; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + check_coeff_conformance( cctx, coeff[ sigBlkPos[k] ] ); +#else check_coeff_conformance( coeff[ sigBlkPos[k] ] ); +#endif } if( numNonZero > numSigns ) { @@ -3347,7 +3360,11 @@ void CABACReader::residual_coding_subblock( CoeffCodingContext& cctx, TCoeff* co int AbsCoeff = coeff[ sigBlkPos[ k ] ]; sumAbs += AbsCoeff; coeff[ sigBlkPos[k] ] = ( sumAbs & 1 ? -AbsCoeff : AbsCoeff ); +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + check_coeff_conformance( cctx, coeff[ sigBlkPos[k] ] ); +#else check_coeff_conformance( coeff[ sigBlkPos[k] ] ); +#endif } } @@ -3464,7 +3481,11 @@ void CABACReader::residual_coding_subblockTS( CoeffCodingContext& cctx, TCoeff* DTRACE( g_trace_ctx, D_SYNTAX_RESI, "ts_par_flag() bin=%d ctx=%d\n", parFlag, cctx.parityCtxIdAbsTS() ); cctx.decimateNumCtxBins(1); } +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + coeff[ blkPos ] = (sign ? -1 : 1 ) * (TCoeff)(1 + parFlag + gt1Flag); +#else coeff[ blkPos ] = (sign ? -1 : 1 ) * (1 + parFlag + gt1Flag); +#endif } lastScanPosPass1 = nextSigPos; } @@ -3538,7 +3559,11 @@ void CABACReader::residual_coding_subblockTS( CoeffCodingContext& cctx, TCoeff* int AbsCoeff = coeff[ sigBlkPos[ k ] ]; coeff[ sigBlkPos[k] ] = ( signPattern & 1 ? -AbsCoeff : AbsCoeff ); signPattern >>= 1; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + check_coeff_conformance( cctx, coeff[ sigBlkPos[k] ] ); +#else check_coeff_conformance( coeff[ sigBlkPos[k] ] ); +#endif } } diff --git a/source/Lib/DecoderLib/VLCReader.cpp b/source/Lib/DecoderLib/VLCReader.cpp index 518bbb73d..d7d34965b 100644 --- a/source/Lib/DecoderLib/VLCReader.cpp +++ b/source/Lib/DecoderLib/VLCReader.cpp @@ -5614,7 +5614,11 @@ void HLSyntaxReader::alfFilter( AlfParam& alfParam, const bool isChroma, const i AlfFilterShape alfShape( isChroma ? 5 : 7 ); const int numFilters = isChroma ? 1 : alfParam.numLumaFilters; short* coeff = isChroma ? alfParam.chromaCoeff[altIdx] : alfParam.lumaCoeff; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + Pel* clipp = isChroma ? alfParam.chromaClipp[altIdx] : alfParam.lumaClipp; +#else short* clipp = isChroma ? alfParam.chromaClipp[altIdx] : alfParam.lumaClipp; +#endif // Filter coefficients diff --git a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp index f8021943d..8fadc3b33 100644 --- a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp +++ b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp @@ -2255,7 +2255,11 @@ void EncAdaptiveLoopFilter::getBlkStats(AlfCovariance* alfCovariance, const AlfF { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + Pel ELocal[MAX_NUM_ALF_LUMA_COEFF][MaxAlfNumClippingValues]; +#else int ELocal[MAX_NUM_ALF_LUMA_COEFF][MaxAlfNumClippingValues]; +#endif const int numBins = AlfNumClippingValues[channel]; int transposeIdx = 0; @@ -2283,7 +2287,11 @@ void EncAdaptiveLoopFilter::getBlkStats(AlfCovariance* alfCovariance, const AlfF { weight = m_lumaLevelToWeightPLUT[org[j]]; } +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + Intermediate_Int yLocal = org[j] - rec[j]; +#else int yLocal = org[j] - rec[j]; +#endif calcCovariance(ELocal, rec + j, recStride, shape, transposeIdx, channel, vbDistance); for( int k = 0; k < shape.numCoeff; k++ ) { @@ -2295,11 +2303,19 @@ void EncAdaptiveLoopFilter::getBlkStats(AlfCovariance* alfCovariance, const AlfF { if (m_alfWSSD) { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + alfCovariance[classIdx].E[b0][b1][k][l] += weight * (ELocal[k][b0] * (double)ELocal[l][b1]); +#else alfCovariance[classIdx].E[b0][b1][k][l] += weight * (double)(ELocal[k][b0] * ELocal[l][b1]); +#endif } else { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + alfCovariance[classIdx].E[b0][b1][k][l] += ELocal[k][b0] * (double)ELocal[l][b1]; +#else alfCovariance[classIdx].E[b0][b1][k][l] += ELocal[k][b0] * ELocal[l][b1]; +#endif } } } @@ -2308,21 +2324,37 @@ void EncAdaptiveLoopFilter::getBlkStats(AlfCovariance* alfCovariance, const AlfF { if (m_alfWSSD) { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + alfCovariance[classIdx].y[b][k] += weight * (ELocal[k][b] * (double)yLocal); +#else alfCovariance[classIdx].y[b][k] += weight * (double)(ELocal[k][b] * yLocal); +#endif } else { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + alfCovariance[classIdx].y[b][k] += ELocal[k][b] * (double)yLocal; +#else alfCovariance[classIdx].y[b][k] += ELocal[k][b] * yLocal; +#endif } } } if (m_alfWSSD) { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + alfCovariance[classIdx].pixAcc += weight * (yLocal * (double)yLocal); +#else alfCovariance[classIdx].pixAcc += weight * (double)(yLocal * yLocal); +#endif } else { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + alfCovariance[classIdx].pixAcc += yLocal * (double)yLocal; +#else alfCovariance[classIdx].pixAcc += yLocal * yLocal; +#endif } } org += orgStride; @@ -2348,7 +2380,11 @@ void EncAdaptiveLoopFilter::getBlkStats(AlfCovariance* alfCovariance, const AlfF } } +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT +void EncAdaptiveLoopFilter::calcCovariance(Pel ELocal[MAX_NUM_ALF_LUMA_COEFF][MaxAlfNumClippingValues], const Pel *rec, const int stride, const AlfFilterShape& shape, const int transposeIdx, const ChannelType channel, int vbDistance) +#else void EncAdaptiveLoopFilter::calcCovariance(int ELocal[MAX_NUM_ALF_LUMA_COEFF][MaxAlfNumClippingValues], const Pel *rec, const int stride, const AlfFilterShape& shape, const int transposeIdx, const ChannelType channel, int vbDistance) +#endif { int clipTopRow = -4; int clipBotRow = 4; @@ -2369,7 +2405,11 @@ void EncAdaptiveLoopFilter::calcCovariance(int ELocal[MAX_NUM_ALF_LUMA_COEFF][Ma int k = 0; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const Pel curr = rec[0]; +#else const short curr = rec[0]; +#endif if( transposeIdx == 0 ) { @@ -2665,7 +2705,11 @@ void EncAdaptiveLoopFilter::alfEncoderCtb(CodingStructure& cs, AlfParam& alfPar for (int classIdx = 0; classIdx < MAX_NUM_ALF_CLASSES; classIdx++) { short* pCoeff = m_coeffFinal; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + Pel* pClipp = m_clippFinal; +#else short* pClipp = m_clippFinal; +#endif for (int i = 0; i < MAX_NUM_ALF_LUMA_COEFF; i++) { m_filterTmp[i] = pCoeff[classIdx * MAX_NUM_ALF_LUMA_COEFF + i]; @@ -2743,7 +2787,11 @@ void EncAdaptiveLoopFilter::alfEncoderCtb(CodingStructure& cs, AlfParam& alfPar else { short *pCoeff; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + Pel *pClipp; +#else short *pClipp; +#endif if (useNewFilter && filterSetIdx == NUM_FIXED_FILTER_SETS) { pCoeff = m_coeffFinal; @@ -3147,7 +3195,11 @@ void EncAdaptiveLoopFilter::alfReconstructor(CodingStructure& cs, const PelUnitB const Area blkDst(xStart, yStart, w, h); short filterSetIndex = alfCtuFilterIndex[ctuIdx]; short *coeff; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + Pel *clip; +#else short *clip; +#endif if (filterSetIndex >= NUM_FIXED_FILTER_SETS) { coeff = m_coeffApsLuma[filterSetIndex - NUM_FIXED_FILTER_SETS]; @@ -3196,7 +3248,11 @@ void EncAdaptiveLoopFilter::alfReconstructor(CodingStructure& cs, const PelUnitB Area blk(xPos, yPos, width, height); short filterSetIndex = alfCtuFilterIndex[ctuIdx]; short *coeff; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + Pel *clip; +#else short *clip; +#endif if (filterSetIndex >= NUM_FIXED_FILTER_SETS) { coeff = m_coeffApsLuma[filterSetIndex - NUM_FIXED_FILTER_SETS]; @@ -4147,7 +4203,11 @@ void EncAdaptiveLoopFilter::getBlkStatsCcAlf(AlfCovariance &alfCovariance, const vbPos = m_picHeight; } +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + Pel ELocal[MAX_NUM_CC_ALF_CHROMA_COEFF][1]; +#else int ELocal[MAX_NUM_CC_ALF_CHROMA_COEFF][1]; +#endif for (int i = 0; i < compArea.height; i++) { @@ -4167,7 +4227,11 @@ void EncAdaptiveLoopFilter::getBlkStatsCcAlf(AlfCovariance &alfCovariance, const weight = m_lumaLevelToWeightPLUT[org[j]]; } +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + Intermediate_Int yLocal = org[j] - rec[compID][j]; +#else int yLocal = org[j] - rec[compID][j]; +#endif calcCovarianceCcAlf( ELocal, rec[COMPONENT_Y] + ( j << getComponentScaleX(compID, m_chromaFormat)), recStride[COMPONENT_Y], shape, vbDistance ); @@ -4181,11 +4245,19 @@ void EncAdaptiveLoopFilter::getBlkStatsCcAlf(AlfCovariance &alfCovariance, const { if (m_alfWSSD) { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + alfCovariance.E[b0][b1][k][l] += weight * (ELocal[k][b0] * (double)ELocal[l][b1]); +#else alfCovariance.E[b0][b1][k][l] += weight * (double) (ELocal[k][b0] * ELocal[l][b1]); +#endif } else { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + alfCovariance.E[b0][b1][k][l] += ELocal[k][b0] * (double)ELocal[l][b1]; +#else alfCovariance.E[b0][b1][k][l] += ELocal[k][b0] * ELocal[l][b1]; +#endif } } } @@ -4194,21 +4266,37 @@ void EncAdaptiveLoopFilter::getBlkStatsCcAlf(AlfCovariance &alfCovariance, const { if (m_alfWSSD) { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + alfCovariance.y[b][k] += weight * (ELocal[k][b] * (double)yLocal); +#else alfCovariance.y[b][k] += weight * (double) (ELocal[k][b] * yLocal); +#endif } else { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + alfCovariance.y[b][k] += ELocal[k][b] * (double)yLocal; +#else alfCovariance.y[b][k] += ELocal[k][b] * yLocal; +#endif } } } if (m_alfWSSD) { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + alfCovariance.pixAcc += weight * (yLocal * (double)yLocal); +#else alfCovariance.pixAcc += weight * (double) (yLocal * yLocal); +#endif } else { +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + alfCovariance.pixAcc += yLocal * (double)yLocal; +#else alfCovariance.pixAcc += yLocal * yLocal; +#endif } } org += orgStride; @@ -4248,7 +4336,11 @@ void EncAdaptiveLoopFilter::getBlkStatsCcAlf(AlfCovariance &alfCovariance, const } } +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT +void EncAdaptiveLoopFilter::calcCovarianceCcAlf(Pel ELocal[MAX_NUM_CC_ALF_CHROMA_COEFF][1], const Pel *rec, const int stride, const AlfFilterShape& shape, int vbDistance) +#else void EncAdaptiveLoopFilter::calcCovarianceCcAlf(int ELocal[MAX_NUM_CC_ALF_CHROMA_COEFF][1], const Pel *rec, const int stride, const AlfFilterShape& shape, int vbDistance) +#endif { CHECK(shape.filterType != CC_ALF, "Bad CC ALF shape"); diff --git a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.h b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.h index 0aa905998..525aa16c1 100644 --- a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.h +++ b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.h @@ -318,13 +318,21 @@ private: void getFrameStat( AlfCovariance* frameCov, AlfCovariance** ctbCov, uint8_t* ctbEnableFlags, uint8_t* ctbAltIdx, const int numClasses, int altIdx ); void deriveStatsForFiltering( PelUnitBuf& orgYuv, PelUnitBuf& recYuv, CodingStructure& cs ); void getBlkStats(AlfCovariance* alfCovariace, const AlfFilterShape& shape, AlfClassifier** classifier, Pel* org, const int orgStride, Pel* rec, const int recStride, const CompArea& areaDst, const CompArea& area, const ChannelType channel, int vbCTUHeight, int vbPos); +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + void calcCovariance(Pel ELocal[MAX_NUM_ALF_LUMA_COEFF][MaxAlfNumClippingValues], const Pel *rec, const int stride, const AlfFilterShape& shape, const int transposeIdx, const ChannelType channel, int vbDistance); +#else void calcCovariance(int ELocal[MAX_NUM_ALF_LUMA_COEFF][MaxAlfNumClippingValues], const Pel *rec, const int stride, const AlfFilterShape& shape, const int transposeIdx, const ChannelType channel, int vbDistance); +#endif void deriveStatsForCcAlfFiltering(const PelUnitBuf &orgYuv, const PelUnitBuf &recYuv, const int compIdx, const int maskStride, const uint8_t filterIdc, CodingStructure &cs); void getBlkStatsCcAlf(AlfCovariance &alfCovariance, const AlfFilterShape &shape, const PelUnitBuf &orgYuv, const PelUnitBuf &recYuv, const UnitArea &areaDst, const UnitArea &area, const ComponentID compID, const int yPos); +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + void calcCovarianceCcAlf(Pel ELocal[MAX_NUM_CC_ALF_CHROMA_COEFF][1], const Pel* rec, const int stride, const AlfFilterShape& shape, int vbDistance); +#else void calcCovarianceCcAlf(int ELocal[MAX_NUM_CC_ALF_CHROMA_COEFF][1], const Pel* rec, const int stride, const AlfFilterShape& shape, int vbDistance); +#endif void mergeClasses(const AlfFilterShape& alfShape, AlfCovariance* cov, AlfCovariance* covMerged, int clipMerged[MAX_NUM_ALF_CLASSES][MAX_NUM_ALF_CLASSES][MAX_NUM_ALF_LUMA_COEFF], const int numClasses, short filterIndices[MAX_NUM_ALF_CLASSES][MAX_NUM_ALF_CLASSES]); diff --git a/source/Lib/EncoderLib/VLCWriter.cpp b/source/Lib/EncoderLib/VLCWriter.cpp index ec0a1bd15..4db625ea0 100644 --- a/source/Lib/EncoderLib/VLCWriter.cpp +++ b/source/Lib/EncoderLib/VLCWriter.cpp @@ -3291,7 +3291,11 @@ void HLSWriter::alfFilter( const AlfParam& alfParam, const bool isChroma, const { AlfFilterShape alfShape(isChroma ? 5 : 7); const short* coeff = isChroma ? alfParam.chromaCoeff[altIdx] : alfParam.lumaCoeff; +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const Pel* clipp = isChroma ? alfParam.chromaClipp[altIdx] : alfParam.lumaClipp; +#else const short* clipp = isChroma ? alfParam.chromaClipp[altIdx] : alfParam.lumaClipp; +#endif const int numFilters = isChroma ? 1 : alfParam.numLumaFilters; // vlc for all -- GitLab