From 13c29e612bd42b05a91afdae16e2e07666ca8833 Mon Sep 17 00:00:00 2001 From: Jonathan Taquet <jonathan.taquet@crf.canon.fr> Date: Tue, 2 Apr 2019 23:19:25 +0200 Subject: [PATCH] Merge tag 'VTM-4.2' into JVET-N0242 VTM version 4.2 --- doc/software-manual.tex | 12 + source/App/EncoderApp/EncApp.cpp | 4 + source/App/EncoderApp/EncAppCfg.cpp | 8 + source/App/EncoderApp/EncAppCfg.h | 4 + source/Lib/CommonLib/AdaptiveLoopFilter.cpp | 184 +++ source/Lib/CommonLib/AdaptiveLoopFilter.h | 27 + source/Lib/CommonLib/TypeDef.h | 29 + .../Lib/CommonLib/x86/AdaptiveLoopFilterX86.h | 856 ++++++++++++++ source/Lib/DecoderLib/VLCReader.cpp | 87 +- source/Lib/DecoderLib/VLCReader.h | 4 + .../Lib/EncoderLib/EncAdaptiveLoopFilter.cpp | 1018 ++++++++++++++++- source/Lib/EncoderLib/EncAdaptiveLoopFilter.h | 211 ++++ source/Lib/EncoderLib/EncCfg.h | 10 + source/Lib/EncoderLib/EncLib.cpp | 4 + source/Lib/EncoderLib/VLCWriter.cpp | 96 +- source/Lib/EncoderLib/VLCWriter.h | 4 + 16 files changed, 2555 insertions(+), 3 deletions(-) diff --git a/doc/software-manual.tex b/doc/software-manual.tex index 7bf160af9..cbe8e48af 100644 --- a/doc/software-manual.tex +++ b/doc/software-manual.tex @@ -1962,6 +1962,18 @@ luma TUs are also skipped. \par This option has no effect if TransformSkip is disabled. \\ + +\Option{UseNonLinearAlfLuma} & +%\ShortOption{\None} & +\Default{true} & +Enables optimization of non-linear filters for ALF on Luma channel. +\\ + +\Option{UseNonLinearAlfChroma} & +%\ShortOption{\None} & +\Default{true} & +Enables optimization of non-linear filters for ALF on Chroma channels. 
+\\ \end{OptionTableNoShorthand} %% diff --git a/source/App/EncoderApp/EncApp.cpp b/source/App/EncoderApp/EncApp.cpp index 626169456..8ee35e2bf 100644 --- a/source/App/EncoderApp/EncApp.cpp +++ b/source/App/EncoderApp/EncApp.cpp @@ -314,6 +314,10 @@ void EncApp::xInitLibCfg() m_cEncLib.setUseAMaxBT ( m_useAMaxBT ); m_cEncLib.setUseE0023FastEnc ( m_e0023FastEnc ); m_cEncLib.setUseContentBasedFastQtbt ( m_contentBasedFastQtbt ); +#if JVET_N0242_NON_LINEAR_ALF + m_cEncLib.setUseNonLinearAlfLuma ( m_useNonLinearAlfLuma ); + m_cEncLib.setUseNonLinearAlfChroma ( m_useNonLinearAlfChroma ); +#endif m_cEncLib.setCrossComponentPredictionEnabledFlag ( m_crossComponentPredictionEnabledFlag ); m_cEncLib.setUseReconBasedCrossCPredictionEstimate ( m_reconBasedCrossCPredictionEstimate ); m_cEncLib.setLog2SaoOffsetScale ( CHANNEL_TYPE_LUMA , m_log2SaoOffsetScale[CHANNEL_TYPE_LUMA] ); diff --git a/source/App/EncoderApp/EncAppCfg.cpp b/source/App/EncoderApp/EncAppCfg.cpp index 89aaccaa7..7b0368bde 100644 --- a/source/App/EncoderApp/EncAppCfg.cpp +++ b/source/App/EncoderApp/EncAppCfg.cpp @@ -899,6 +899,10 @@ bool EncAppCfg::parseCfg( int argc, char* argv[] ) ("AMaxBT", m_useAMaxBT, false, "Adaptive maximal BT-size") ("E0023FastEnc", m_e0023FastEnc, true, "Fast encoding setting for QTBT (proposal E0023)") ("ContentBasedFastQtbt", m_contentBasedFastQtbt, false, "Signal based QTBT speed-up") +#if JVET_N0242_NON_LINEAR_ALF + ("UseNonLinearAlfLuma", m_useNonLinearAlfLuma, true, "Non-linear adaptive loop filters for Luma Channel") + ("UseNonLinearAlfChroma", m_useNonLinearAlfChroma, true, "Non-linear adaptive loop filters for Chroma Channels") +#endif // Unit definition parameters ("MaxCUWidth", m_uiMaxCUWidth, 64u) ("MaxCUHeight", m_uiMaxCUHeight, 64u) @@ -3194,6 +3198,10 @@ void EncAppCfg::xPrintParameter() msg( VERBOSE, "AMaxBT:%d ", m_useAMaxBT ); msg( VERBOSE, "E0023FastEnc:%d ", m_e0023FastEnc ); msg( VERBOSE, "ContentBasedFastQtbt:%d ", m_contentBasedFastQtbt ); +#if 
JVET_N0242_NON_LINEAR_ALF + msg( VERBOSE, "UseNonLinearALFLuma:%d ", m_useNonLinearAlfLuma ); + msg( VERBOSE, "UseNonLinearALFChroma:%d ", m_useNonLinearAlfChroma ); +#endif msg( VERBOSE, "NumSplitThreads:%d ", m_numSplitThreads ); if( m_numSplitThreads > 1 ) diff --git a/source/App/EncoderApp/EncAppCfg.h b/source/App/EncoderApp/EncAppCfg.h index 94a7e46c5..95724dd1c 100644 --- a/source/App/EncoderApp/EncAppCfg.h +++ b/source/App/EncoderApp/EncAppCfg.h @@ -292,6 +292,10 @@ protected: bool m_useFastMrg; bool m_e0023FastEnc; bool m_contentBasedFastQtbt; +#if JVET_N0242_NON_LINEAR_ALF + bool m_useNonLinearAlfLuma; + bool m_useNonLinearAlfChroma; +#endif int m_numSplitThreads; diff --git a/source/Lib/CommonLib/AdaptiveLoopFilter.cpp b/source/Lib/CommonLib/AdaptiveLoopFilter.cpp index b1aef8429..a3c9e0dac 100644 --- a/source/Lib/CommonLib/AdaptiveLoopFilter.cpp +++ b/source/Lib/CommonLib/AdaptiveLoopFilter.cpp @@ -39,6 +39,14 @@ #include "CodingStructure.h" #include "Picture.h" +#if JVET_N0242_NON_LINEAR_ALF +#include <array> +#include <cmath> +#endif + +#if JVET_N0242_NON_LINEAR_ALF +constexpr int AdaptiveLoopFilter::AlfNumClippingValues[]; +#endif AdaptiveLoopFilter::AdaptiveLoopFilter() : m_classifier( nullptr ) @@ -83,6 +91,9 @@ void AdaptiveLoopFilter::ALFProcess( CodingStructure& cs, AlfSliceParam& alfSlic m_ctuEnableFlag[compIdx] = cs.picture->getAlfCtuEnableFlag( compIdx ); } reconstructCoeff( alfSliceParam, CHANNEL_TYPE_LUMA ); +#if JVET_N0242_NON_LINEAR_ALF + if( alfSliceParam.enabledFlag[COMPONENT_Cb] || alfSliceParam.enabledFlag[COMPONENT_Cr] ) +#endif reconstructCoeff( alfSliceParam, CHANNEL_TYPE_CHROMA ); PelUnitBuf recYuv = cs.getRecoBuf(); @@ -106,7 +117,11 @@ void AdaptiveLoopFilter::ALFProcess( CodingStructure& cs, AlfSliceParam& alfSlic deriveClassification( m_classifier, tmpYuv.get( COMPONENT_Y ), blk ); Area blkPCM(xPos, yPos, width, height); resetPCMBlkClassInfo(cs, m_classifier, tmpYuv.get(COMPONENT_Y), blkPCM); +#if JVET_N0242_NON_LINEAR_ALF + 
m_filter7x7Blk( m_classifier, recYuv, tmpYuv, blk, COMPONENT_Y, m_coeffFinal, m_clippFinal, m_clpRngs.comp[COMPONENT_Y], cs ); +#else m_filter7x7Blk(m_classifier, recYuv, tmpYuv, blk, COMPONENT_Y, m_coeffFinal, m_clpRngs.comp[COMPONENT_Y], cs ); +#endif } for( int compIdx = 1; compIdx < MAX_NUM_COMPONENT; compIdx++ ) @@ -119,7 +134,11 @@ void AdaptiveLoopFilter::ALFProcess( CodingStructure& cs, AlfSliceParam& alfSlic { Area blk( xPos >> chromaScaleX, yPos >> chromaScaleY, width >> chromaScaleX, height >> chromaScaleY ); +#if JVET_N0242_NON_LINEAR_ALF + m_filter5x5Blk( m_classifier, recYuv, tmpYuv, blk, compID, alfSliceParam.chromaCoeff, m_chromaClippFinal, m_clpRngs.comp[compIdx], cs ); +#else m_filter5x5Blk( m_classifier, recYuv, tmpYuv, blk, compID, alfSliceParam.chromaCoeff, m_clpRngs.comp[compIdx], cs ); +#endif } } ctuIdx++; @@ -136,6 +155,9 @@ void AdaptiveLoopFilter::reconstructCoeff( AlfSliceParam& alfSliceParam, Channel int numCoeffMinus1 = numCoeff - 1; int numFilters = isLuma( channel ) ? alfSliceParam.numLumaFilters : 1; short* coeff = isLuma( channel ) ? alfSliceParam.lumaCoeff : alfSliceParam.chromaCoeff; +#if JVET_N0242_NON_LINEAR_ALF + short* clipp = isLuma( channel ) ? 
alfSliceParam.lumaClipp : alfSliceParam.chromaClipp; +#endif if( alfSliceParam.alfLumaCoeffDeltaPredictionFlag && isLuma( channel ) ) { @@ -150,16 +172,26 @@ void AdaptiveLoopFilter::reconstructCoeff( AlfSliceParam& alfSliceParam, Channel for( int filterIdx = 0; filterIdx < numFilters; filterIdx++ ) { +#if JVET_N0242_NON_LINEAR_ALF + coeff[filterIdx* MAX_NUM_ALF_LUMA_COEFF + numCoeffMinus1] = factor; +#else int sum = 0; for( int i = 0; i < numCoeffMinus1; i++ ) { sum += ( coeff[filterIdx* MAX_NUM_ALF_LUMA_COEFF + i] << 1 ); } coeff[filterIdx* MAX_NUM_ALF_LUMA_COEFF + numCoeffMinus1] = factor - sum; +#endif } if( isChroma( channel ) ) { +#if JVET_N0242_NON_LINEAR_ALF + for( int coeffIdx = 0; coeffIdx < numCoeffMinus1; ++coeffIdx ) + { + m_chromaClippFinal[coeffIdx] = alfSliceParam.nonLinearFlag[channel] ? m_alfClippingValues[channel][clipp[coeffIdx]] : m_alfClippingValues[channel][0]; + } +#endif return; } @@ -167,6 +199,12 @@ void AdaptiveLoopFilter::reconstructCoeff( AlfSliceParam& alfSliceParam, Channel { int filterIdx = alfSliceParam.filterCoeffDeltaIdx[classIdx]; memcpy( m_coeffFinal + classIdx * MAX_NUM_ALF_LUMA_COEFF, coeff + filterIdx * MAX_NUM_ALF_LUMA_COEFF, sizeof( short ) * numCoeff ); +#if JVET_N0242_NON_LINEAR_ALF + for( int coeffIdx = 0; coeffIdx < numCoeffMinus1; ++coeffIdx ) + { + (m_clippFinal + classIdx * MAX_NUM_ALF_LUMA_COEFF)[coeffIdx] = alfSliceParam.nonLinearFlag[channel] ? 
m_alfClippingValues[channel][(clipp + filterIdx * MAX_NUM_ALF_LUMA_COEFF)[coeffIdx]] : m_alfClippingValues[channel][0]; + } +#endif } if( bRedo && alfSliceParam.alfLumaCoeffDeltaPredictionFlag ) @@ -197,6 +235,31 @@ void AdaptiveLoopFilter::create( const int picWidth, const int picHeight, const m_filterShapes[CHANNEL_TYPE_LUMA].push_back( AlfFilterShape( 7 ) ); m_filterShapes[CHANNEL_TYPE_CHROMA].push_back( AlfFilterShape( 5 ) ); +#if JVET_N0242_NON_LINEAR_ALF + static_assert( AlfNumClippingValues[CHANNEL_TYPE_LUMA] > 0, "AlfNumClippingValues[CHANNEL_TYPE_LUMA] must be at least one" ); + for( int i = 0; i < AlfNumClippingValues[CHANNEL_TYPE_LUMA]; ++i ) + { + m_alfClippingValues[CHANNEL_TYPE_LUMA][i] = + (Pel) std::round( + std::pow( + 2., + double( m_inputBitDepth[CHANNEL_TYPE_LUMA] * ( AlfNumClippingValues[CHANNEL_TYPE_LUMA] - i ) ) / AlfNumClippingValues[CHANNEL_TYPE_LUMA] + ) ); + } + static_assert( AlfNumClippingValues[CHANNEL_TYPE_CHROMA] > 0, "AlfNumClippingValues[CHANNEL_TYPE_CHROMA] must be at least one" ); + m_alfClippingValues[CHANNEL_TYPE_CHROMA][0] = 1 << m_inputBitDepth[CHANNEL_TYPE_CHROMA]; + for( int i = 1; i < AlfNumClippingValues[CHANNEL_TYPE_CHROMA]; ++i ) + { + m_alfClippingValues[CHANNEL_TYPE_CHROMA][i] = + (Pel) std::round( + std::pow( + 2., + m_inputBitDepth[CHANNEL_TYPE_CHROMA] - 8 + + 8. 
* ( AlfNumClippingValues[CHANNEL_TYPE_CHROMA] - i - 1 ) / ( AlfNumClippingValues[CHANNEL_TYPE_CHROMA] - 1 ) + ) ); + } +#endif + m_tempBuf.destroy(); m_tempBuf.create( format, Area( 0, 0, picWidth, picHeight ), maxCUWidth, MAX_ALF_FILTER_LENGTH >> 1, 0, false ); @@ -496,7 +559,11 @@ void AdaptiveLoopFilter::deriveClassificationBlk( AlfClassifier** classifier, in } template<AlfFilterType filtType> +#if JVET_N0242_NON_LINEAR_ALF +void AdaptiveLoopFilter::filterBlk( AlfClassifier** classifier, const PelUnitBuf &recDst, const CPelUnitBuf& recSrc, const Area& blk, const ComponentID compId, short* filterSet, short* fClipSet, const ClpRng& clpRng, CodingStructure& cs ) +#else void AdaptiveLoopFilter::filterBlk( AlfClassifier** classifier, const PelUnitBuf &recDst, const CPelUnitBuf& recSrc, const Area& blk, const ComponentID compId, short* filterSet, const ClpRng& clpRng, CodingStructure& cs ) +#endif { const bool bChroma = isChroma( compId ); if( bChroma ) @@ -526,6 +593,9 @@ void AdaptiveLoopFilter::filterBlk( AlfClassifier** classifier, const PelUnitBuf const Pel *pImg0, *pImg1, *pImg2, *pImg3, *pImg4, *pImg5, *pImg6; short *coef = filterSet; +#if JVET_N0242_NON_LINEAR_ALF + short *clip = fClipSet; +#endif const int shift = m_NUM_BITS - 1; @@ -547,7 +617,12 @@ void AdaptiveLoopFilter::filterBlk( AlfClassifier** classifier, const PelUnitBuf int dstStride2 = dstStride * clsSizeY; int srcStride2 = srcStride * clsSizeY; +#if JVET_N0242_NON_LINEAR_ALF + std::array<int, MAX_NUM_ALF_LUMA_COEFF> filterCoeff; + std::array<int, MAX_NUM_ALF_LUMA_COEFF> filterClipp; +#else std::vector<Pel> filterCoeff( MAX_NUM_ALF_LUMA_COEFF ); +#endif pImgYPad0 = src + startHeight * srcStride + startWidth; pImgYPad1 = pImgYPad0 + srcStride; @@ -578,6 +653,9 @@ void AdaptiveLoopFilter::filterBlk( AlfClassifier** classifier, const PelUnitBuf continue; } coef = filterSet + cl.classIdx * MAX_NUM_ALF_LUMA_COEFF; +#if JVET_N0242_NON_LINEAR_ALF + clip = fClipSet + cl.classIdx * MAX_NUM_ALF_LUMA_COEFF; 
+#endif } else if( isPCMFilterDisabled ) { @@ -609,18 +687,30 @@ void AdaptiveLoopFilter::filterBlk( AlfClassifier** classifier, const PelUnitBuf if( transposeIdx == 1 ) { filterCoeff = { coef[9], coef[4], coef[10], coef[8], coef[1], coef[5], coef[11], coef[7], coef[3], coef[0], coef[2], coef[6], coef[12] }; +#if JVET_N0242_NON_LINEAR_ALF + filterClipp = { clip[9], clip[4], clip[10], clip[8], clip[1], clip[5], clip[11], clip[7], clip[3], clip[0], clip[2], clip[6], clip[12] }; +#endif } else if( transposeIdx == 2 ) { filterCoeff = { coef[0], coef[3], coef[2], coef[1], coef[8], coef[7], coef[6], coef[5], coef[4], coef[9], coef[10], coef[11], coef[12] }; +#if JVET_N0242_NON_LINEAR_ALF + filterClipp = { clip[0], clip[3], clip[2], clip[1], clip[8], clip[7], clip[6], clip[5], clip[4], clip[9], clip[10], clip[11], clip[12] }; +#endif } else if( transposeIdx == 3 ) { filterCoeff = { coef[9], coef[8], coef[10], coef[4], coef[3], coef[7], coef[11], coef[5], coef[1], coef[0], coef[2], coef[6], coef[12] }; +#if JVET_N0242_NON_LINEAR_ALF + filterClipp = { clip[9], clip[8], clip[10], clip[4], clip[3], clip[7], clip[11], clip[5], clip[1], clip[0], clip[2], clip[6], clip[12] }; +#endif } else { filterCoeff = { coef[0], coef[1], coef[2], coef[3], coef[4], coef[5], coef[6], coef[7], coef[8], coef[9], coef[10], coef[11], coef[12] }; +#if JVET_N0242_NON_LINEAR_ALF + filterClipp = { clip[0], clip[1], clip[2], clip[3], clip[4], clip[5], clip[6], clip[7], clip[8], clip[9], clip[10], clip[11], clip[12] }; +#endif } } else @@ -628,18 +718,30 @@ void AdaptiveLoopFilter::filterBlk( AlfClassifier** classifier, const PelUnitBuf if( transposeIdx == 1 ) { filterCoeff = { coef[4], coef[1], coef[5], coef[3], coef[0], coef[2], coef[6] }; +#if JVET_N0242_NON_LINEAR_ALF + filterClipp = { clip[4], clip[1], clip[5], clip[3], clip[0], clip[2], clip[6] }; +#endif } else if( transposeIdx == 2 ) { filterCoeff = { coef[0], coef[3], coef[2], coef[1], coef[4], coef[5], coef[6] }; +#if 
JVET_N0242_NON_LINEAR_ALF + filterClipp = { clip[0], clip[3], clip[2], clip[1], clip[4], clip[5], clip[6] }; +#endif } else if( transposeIdx == 3 ) { filterCoeff = { coef[4], coef[3], coef[5], coef[1], coef[0], coef[2], coef[6] }; +#if JVET_N0242_NON_LINEAR_ALF + filterClipp = { clip[4], clip[3], clip[5], clip[1], clip[0], clip[2], clip[6] }; +#endif } else { filterCoeff = { coef[0], coef[1], coef[2], coef[3], coef[4], coef[5], coef[6] }; +#if JVET_N0242_NON_LINEAR_ALF + filterClipp = { clip[0], clip[1], clip[2], clip[3], clip[4], clip[5], clip[6] }; +#endif } } @@ -675,39 +777,121 @@ void AdaptiveLoopFilter::filterBlk( AlfClassifier** classifier, const PelUnitBuf } int sum = 0; +#if JVET_N0242_NON_LINEAR_ALF + const Pel curr = pImg0[+0]; +#endif if( filtType == ALF_FILTER_7 ) { +#if !JVET_N0242_NON_LINEAR_ALF sum += filterCoeff[0] * ( pImg5[0] + pImg6[0] ); +#else + sum += filterCoeff[0] * ( clipALF(filterClipp[0], curr, pImg5[+0], pImg6[+0]) ); +#endif +#if !JVET_N0242_NON_LINEAR_ALF sum += filterCoeff[1] * ( pImg3[+1] + pImg4[-1] ); +#else + sum += filterCoeff[1] * ( clipALF(filterClipp[1], curr, pImg3[+1], pImg4[-1]) ); +#endif +#if !JVET_N0242_NON_LINEAR_ALF sum += filterCoeff[2] * ( pImg3[+0] + pImg4[+0] ); +#else + sum += filterCoeff[2] * ( clipALF(filterClipp[2], curr, pImg3[+0], pImg4[+0]) ); +#endif +#if !JVET_N0242_NON_LINEAR_ALF sum += filterCoeff[3] * ( pImg3[-1] + pImg4[+1] ); +#else + sum += filterCoeff[3] * ( clipALF(filterClipp[3], curr, pImg3[-1], pImg4[+1]) ); +#endif +#if !JVET_N0242_NON_LINEAR_ALF sum += filterCoeff[4] * ( pImg1[+2] + pImg2[-2] ); +#else + sum += filterCoeff[4] * ( clipALF(filterClipp[4], curr, pImg1[+2], pImg2[-2]) ); +#endif +#if !JVET_N0242_NON_LINEAR_ALF sum += filterCoeff[5] * ( pImg1[+1] + pImg2[-1] ); +#else + sum += filterCoeff[5] * ( clipALF(filterClipp[5], curr, pImg1[+1], pImg2[-1]) ); +#endif +#if !JVET_N0242_NON_LINEAR_ALF sum += filterCoeff[6] * ( pImg1[+0] + pImg2[+0] ); +#else + sum += filterCoeff[6] * ( 
clipALF(filterClipp[6], curr, pImg1[+0], pImg2[+0]) ); +#endif +#if !JVET_N0242_NON_LINEAR_ALF sum += filterCoeff[7] * ( pImg1[-1] + pImg2[+1] ); +#else + sum += filterCoeff[7] * ( clipALF(filterClipp[7], curr, pImg1[-1], pImg2[+1]) ); +#endif +#if !JVET_N0242_NON_LINEAR_ALF sum += filterCoeff[8] * ( pImg1[-2] + pImg2[+2] ); +#else + sum += filterCoeff[8] * ( clipALF(filterClipp[8], curr, pImg1[-2], pImg2[+2]) ); +#endif +#if !JVET_N0242_NON_LINEAR_ALF sum += filterCoeff[9] * ( pImg0[+3] + pImg0[-3] ); +#else + sum += filterCoeff[9] * ( clipALF(filterClipp[9], curr, pImg0[+3], pImg0[-3]) ); +#endif +#if !JVET_N0242_NON_LINEAR_ALF sum += filterCoeff[10] * ( pImg0[+2] + pImg0[-2] ); +#else + sum += filterCoeff[10] * ( clipALF(filterClipp[10], curr, pImg0[+2], pImg0[-2]) ); +#endif +#if !JVET_N0242_NON_LINEAR_ALF sum += filterCoeff[11] * ( pImg0[+1] + pImg0[-1] ); +#else + sum += filterCoeff[11] * ( clipALF(filterClipp[11], curr, pImg0[+1], pImg0[-1]) ); +#endif +#if !JVET_N0242_NON_LINEAR_ALF sum += filterCoeff[12] * ( pImg0[+0] ); +#endif } else { +#if !JVET_N0242_NON_LINEAR_ALF sum += filterCoeff[0] * ( pImg3[+0] + pImg4[+0] ); +#else + sum += filterCoeff[0] * ( clipALF(filterClipp[0], curr, pImg3[+0], pImg4[+0]) ); +#endif +#if !JVET_N0242_NON_LINEAR_ALF sum += filterCoeff[1] * ( pImg1[+1] + pImg2[-1] ); +#else + sum += filterCoeff[1] * ( clipALF(filterClipp[1], curr, pImg1[+1], pImg2[-1]) ); +#endif +#if !JVET_N0242_NON_LINEAR_ALF sum += filterCoeff[2] * ( pImg1[+0] + pImg2[+0] ); +#else + sum += filterCoeff[2] * ( clipALF(filterClipp[2], curr, pImg1[+0], pImg2[+0]) ); +#endif +#if !JVET_N0242_NON_LINEAR_ALF sum += filterCoeff[3] * ( pImg1[-1] + pImg2[+1] ); +#else + sum += filterCoeff[3] * ( clipALF(filterClipp[3], curr, pImg1[-1], pImg2[+1]) ); +#endif +#if !JVET_N0242_NON_LINEAR_ALF sum += filterCoeff[4] * ( pImg0[+2] + pImg0[-2] ); +#else + sum += filterCoeff[4] * ( clipALF(filterClipp[4], curr, pImg0[+2], pImg0[-2]) ); +#endif +#if !JVET_N0242_NON_LINEAR_ALF 
sum += filterCoeff[5] * ( pImg0[+1] + pImg0[-1] ); +#else + sum += filterCoeff[5] * ( clipALF(filterClipp[5], curr, pImg0[+1], pImg0[-1]) ); +#endif +#if !JVET_N0242_NON_LINEAR_ALF sum += filterCoeff[6] * ( pImg0[+0] ); +#endif } sum = ( sum + offset ) >> shift; +#if JVET_N0242_NON_LINEAR_ALF + sum += curr; +#endif pRec1[jj] = ClipPel( sum, clpRng ); pImg0++; diff --git a/source/Lib/CommonLib/AdaptiveLoopFilter.h b/source/Lib/CommonLib/AdaptiveLoopFilter.h index 92928fee0..5d1da4788 100644 --- a/source/Lib/CommonLib/AdaptiveLoopFilter.h +++ b/source/Lib/CommonLib/AdaptiveLoopFilter.h @@ -42,6 +42,7 @@ #include "Unit.h" #include "UnitTools.h" + struct AlfClassifier { AlfClassifier() {} @@ -66,6 +67,16 @@ enum Direction class AdaptiveLoopFilter { public: +#if JVET_N0242_NON_LINEAR_ALF + static inline int clipALF(const int clip, const short ref, const short val0, const short val1) + { + return Clip3<int>(-clip, +clip, val0-ref) + Clip3<int>(-clip, +clip, val1-ref); + } + + static constexpr int AlfNumClippingValues[MAX_NUM_CHANNEL_TYPE] = { 4, 4 }; + static constexpr int MaxAlfNumClippingValues = 4; + +#endif static constexpr int m_NUM_BITS = 8; static constexpr int m_CLASSIFICATION_BLK_SIZE = 32; //non-normative, local buffer size static constexpr int m_ALF_UNUSED_CLASSIDX = 255; @@ -82,15 +93,24 @@ public: void deriveClassification( AlfClassifier** classifier, const CPelBuf& srcLuma, const Area& blk ); void resetPCMBlkClassInfo(CodingStructure & cs, AlfClassifier** classifier, const CPelBuf& srcLuma, const Area& blk); template<AlfFilterType filtType> +#if JVET_N0242_NON_LINEAR_ALF + static void filterBlk( AlfClassifier** classifier, const PelUnitBuf &recDst, const CPelUnitBuf& recSrc, const Area& blk, const ComponentID compId, short* filterSet, short* fClipSet, const ClpRng& clpRng, CodingStructure& cs ); +#else static void filterBlk( AlfClassifier** classifier, const PelUnitBuf &recDst, const CPelUnitBuf& recSrc, const Area& blk, const ComponentID compId, short* 
filterSet, const ClpRng& clpRng, CodingStructure& cs ); +#endif inline static int getMaxGolombIdx( AlfFilterType filterType ) { return filterType == ALF_FILTER_5 ? 2 : 3; } void( *m_deriveClassificationBlk )( AlfClassifier** classifier, int** laplacian[NUM_DIRECTIONS], const CPelBuf& srcLuma, const Area& blk, const int shift ); +#if JVET_N0242_NON_LINEAR_ALF + void( *m_filter5x5Blk )( AlfClassifier** classifier, const PelUnitBuf &recDst, const CPelUnitBuf& recSrc, const Area& blk, const ComponentID compId, short* filterSet, short* fClipSet, const ClpRng& clpRng, CodingStructure& cs ); + void( *m_filter7x7Blk )( AlfClassifier** classifier, const PelUnitBuf &recDst, const CPelUnitBuf& recSrc, const Area& blk, const ComponentID compId, short* filterSet, short* fClipSet, const ClpRng& clpRng, CodingStructure& cs ); +#else void( *m_filter5x5Blk )( AlfClassifier** classifier, const PelUnitBuf &recDst, const CPelUnitBuf& recSrc, const Area& blk, const ComponentID compId, short* filterSet, const ClpRng& clpRng, CodingStructure& cs ); void( *m_filter7x7Blk )( AlfClassifier** classifier, const PelUnitBuf &recDst, const CPelUnitBuf& recSrc, const Area& blk, const ComponentID compId, short* filterSet, const ClpRng& clpRng, CodingStructure& cs ); +#endif #ifdef TARGET_SIMD_X86 void initAdaptiveLoopFilterX86(); @@ -99,9 +119,16 @@ public: #endif protected: +#if JVET_N0242_NON_LINEAR_ALF + Pel m_alfClippingValues[MAX_NUM_CHANNEL_TYPE][MaxAlfNumClippingValues]; +#endif std::vector<AlfFilterShape> m_filterShapes[MAX_NUM_CHANNEL_TYPE]; AlfClassifier** m_classifier; short m_coeffFinal[MAX_NUM_ALF_CLASSES * MAX_NUM_ALF_LUMA_COEFF]; +#if JVET_N0242_NON_LINEAR_ALF + short m_clippFinal[MAX_NUM_ALF_CLASSES * MAX_NUM_ALF_LUMA_COEFF]; + short m_chromaClippFinal[MAX_NUM_ALF_LUMA_COEFF]; +#endif int** m_laplacian[NUM_DIRECTIONS]; uint8_t* m_ctuEnableFlag[MAX_NUM_COMPONENT]; PelStorage m_tempBuf; diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h index 
043b21889..f96bea9de 100644 --- a/source/Lib/CommonLib/TypeDef.h +++ b/source/Lib/CommonLib/TypeDef.h @@ -50,6 +50,8 @@ #include <assert.h> #include <cassert> +#define JVET_N0242_NON_LINEAR_ALF 1 // enable CE5-3.2, Non-linear ALF based on clipping function + #define JVET_N0449_MMVD_SIMP 1 // Configurable number of mmvd distance entries used #define JVET_N0137_DUALTREE_CHROMA_SIZE 1 @@ -1555,8 +1557,17 @@ struct AlfFilterShape struct AlfSliceParam { bool enabledFlag[MAX_NUM_COMPONENT]; // alf_slice_enable_flag, alf_chroma_idc +#if JVET_N0242_NON_LINEAR_ALF + bool nonLinearFlag[MAX_NUM_CHANNEL_TYPE]; // alf_nonlinear_enable_flag[Luma/Chroma] +#endif short lumaCoeff[MAX_NUM_ALF_CLASSES * MAX_NUM_ALF_LUMA_COEFF]; // alf_coeff_luma_delta[i][j] +#if JVET_N0242_NON_LINEAR_ALF + short lumaClipp[MAX_NUM_ALF_CLASSES * MAX_NUM_ALF_LUMA_COEFF]; // alf_clipp_luma_[i][j] +#endif short chromaCoeff[MAX_NUM_ALF_CHROMA_COEFF]; // alf_coeff_chroma[i] +#if JVET_N0242_NON_LINEAR_ALF + short chromaClipp[MAX_NUM_ALF_CHROMA_COEFF]; // alf_clipp_chroma[i] +#endif short filterCoeffDeltaIdx[MAX_NUM_ALF_CLASSES]; // filter_coeff_delta[i] bool alfLumaCoeffFlag[MAX_NUM_ALF_CLASSES]; // alf_luma_coeff_flag[i] int numLumaFilters; // number_of_filters_minus1 + 1 @@ -1572,8 +1583,17 @@ struct AlfSliceParam void reset() { std::memset( enabledFlag, false, sizeof( enabledFlag ) ); +#if JVET_N0242_NON_LINEAR_ALF + std::memset( nonLinearFlag, false, sizeof( nonLinearFlag ) ); +#endif std::memset( lumaCoeff, 0, sizeof( lumaCoeff ) ); +#if JVET_N0242_NON_LINEAR_ALF + std::memset( lumaClipp, 0, sizeof( lumaClipp ) ); +#endif std::memset( chromaCoeff, 0, sizeof( chromaCoeff ) ); +#if JVET_N0242_NON_LINEAR_ALF + std::memset( chromaClipp, 0, sizeof( chromaClipp ) ); +#endif std::memset( filterCoeffDeltaIdx, 0, sizeof( filterCoeffDeltaIdx ) ); std::memset( alfLumaCoeffFlag, true, sizeof( alfLumaCoeffFlag ) ); numLumaFilters = 1; @@ -1584,8 +1604,17 @@ struct AlfSliceParam const AlfSliceParam& operator = ( 
const AlfSliceParam& src ) { std::memcpy( enabledFlag, src.enabledFlag, sizeof( enabledFlag ) ); +#if JVET_N0242_NON_LINEAR_ALF + std::memcpy( nonLinearFlag, src.nonLinearFlag, sizeof( nonLinearFlag ) ); +#endif std::memcpy( lumaCoeff, src.lumaCoeff, sizeof( lumaCoeff ) ); +#if JVET_N0242_NON_LINEAR_ALF + std::memcpy( lumaClipp, src.lumaClipp, sizeof( lumaClipp ) ); +#endif std::memcpy( chromaCoeff, src.chromaCoeff, sizeof( chromaCoeff ) ); +#if JVET_N0242_NON_LINEAR_ALF + std::memcpy( chromaClipp, src.chromaClipp, sizeof( chromaClipp ) ); +#endif std::memcpy( filterCoeffDeltaIdx, src.filterCoeffDeltaIdx, sizeof( filterCoeffDeltaIdx ) ); std::memcpy( alfLumaCoeffFlag, src.alfLumaCoeffFlag, sizeof( alfLumaCoeffFlag ) ); numLumaFilters = src.numLumaFilters; diff --git a/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h b/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h index ef368d78f..756e92a38 100644 --- a/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h +++ b/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h @@ -317,12 +317,18 @@ static void simdDeriveClassificationBlk( AlfClassifier** classifier, int** lapla } template<X86_VEXT vext> +#if JVET_N0242_NON_LINEAR_ALF +static void simdFilter5x5Blk( AlfClassifier** classifier, const PelUnitBuf &recDst, const CPelUnitBuf& recSrc, const Area& blk, const ComponentID compId, short* filterSet, short* fClipSet, const ClpRng& clpRng, CodingStructure& cs ) +#else static void simdFilter5x5Blk( AlfClassifier** classifier, const PelUnitBuf &recDst, const CPelUnitBuf& recSrc, const Area& blk, const ComponentID compId, short* filterSet, const ClpRng& clpRng, CodingStructure& cs ) +#endif { +#if !JVET_N0242_NON_LINEAR_ALF static const unsigned char mask05[16] = { 8, 9, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; static const unsigned char mask03[16] = { 4, 5, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; static const unsigned char mask_c[16] = { 0, 1, 8, 9, 4, 5, 14, 15, 2, 3, 10, 11, 12, 13, 6, 7 }; +#endif const bool bChroma = 
isChroma( compId ); const SPS* sps = cs.slice->getSPS(); @@ -336,6 +342,7 @@ static void simdFilter5x5Blk( AlfClassifier** classifier, const PelUnitBuf &recD const int srcStride = srcLuma.stride; const int dstStride = dstLuma.stride; +#if !JVET_N0242_NON_LINEAR_ALF const Pel* srcExt = srcLuma.buf; Pel* dst = dstLuma.buf; @@ -344,6 +351,7 @@ static void simdFilter5x5Blk( AlfClassifier** classifier, const PelUnitBuf &recD short *coef = filterSet; const Pel *pImg0, *pImg1, *pImg2, *pImg3, *pImg4, *pImg5; +#endif const int numBitsMinus1 = AdaptiveLoopFilter::m_NUM_BITS - 1; const int offset = ( 1 << ( AdaptiveLoopFilter::m_NUM_BITS - 2 ) ); @@ -352,56 +360,162 @@ static void simdFilter5x5Blk( AlfClassifier** classifier, const PelUnitBuf &recD const int startWidth = blk.x; const int endWidth = blk.x + blk.width; +#if JVET_N0242_NON_LINEAR_ALF + const Pel* src = srcLuma.buf; + Pel* dst = dstLuma.buf + startHeight * dstStride; + + const Pel *pImgYPad0, *pImgYPad1, *pImgYPad2, *pImgYPad3, *pImgYPad4; + const Pel *pImg0, *pImg1, *pImg2, *pImg3, *pImg4; + + short *coef[2] = { filterSet, filterSet }; + short *clip[2] = { fClipSet, fClipSet }; + + int transposeIdx[2] = {0, 0}; +#else Pel* imgYRecPost = dst; imgYRecPost += startHeight * dstStride; int transposeIdx = 0; +#endif const int clsSizeY = 4; const int clsSizeX = 4; +#if JVET_N0242_NON_LINEAR_ALF + bool pcmFlags2x2[8] = {0,0,0,0,0,0,0,0}; + Pel pcmRec2x2[32]; +#else bool pcmFlags2x2[4] = {0,0,0,0}; Pel pcmRec2x2[16]; +#endif CHECK( startHeight % clsSizeY, "Wrong startHeight in filtering" ); CHECK( startWidth % clsSizeX, "Wrong startWidth in filtering" ); CHECK( ( endHeight - startHeight ) % clsSizeY, "Wrong endHeight in filtering" ); CHECK( ( endWidth - startWidth ) % clsSizeX, "Wrong endWidth in filtering" ); +#if !JVET_N0242_NON_LINEAR_ALF const Pel* imgYRec = srcExt; Pel *pRec; +#endif AlfClassifier *pClass = nullptr; +#if JVET_N0242_NON_LINEAR_ALF + int dstStride2 = dstStride * clsSizeY; +#endif int srcStride2 = 
srcStride * clsSizeY; const __m128i mmOffset = _mm_set1_epi32( offset ); +#if JVET_N0242_NON_LINEAR_ALF + const __m128i mmMin = _mm_set1_epi16( clpRng.min ); + const __m128i mmMax = _mm_set1_epi16( clpRng.max ); +#else const __m128i mmMin = _mm_set1_epi32( clpRng.min ); const __m128i mmMax = _mm_set1_epi32( clpRng.max ); +#endif + +#if JVET_N0242_NON_LINEAR_ALF + const unsigned char *filterCoeffIdx[2]; + Pel filterCoeff[MAX_NUM_ALF_LUMA_COEFF][2]; + Pel filterClipp[MAX_NUM_ALF_LUMA_COEFF][2]; + pImgYPad0 = src + startHeight * srcStride + startWidth; +#else const __m128i xmm10 = _mm_loadu_si128( ( __m128i* )mask03 ); const __m128i mm_mask05 = _mm_loadu_si128( ( __m128i* )mask05 ); pImgYPad0 = imgYRec + startHeight * srcStride + startWidth; +#endif pImgYPad1 = pImgYPad0 + srcStride; pImgYPad2 = pImgYPad0 - srcStride; pImgYPad3 = pImgYPad1 + srcStride; pImgYPad4 = pImgYPad2 - srcStride; +#if !JVET_N0242_NON_LINEAR_ALF pImgYPad5 = pImgYPad3 + srcStride; +#endif +#if JVET_N0242_NON_LINEAR_ALF + Pel* pRec0 = dst + startWidth; + Pel* pRec1; +#else pRec = imgYRecPost + startWidth; +#endif +#if JVET_N0242_NON_LINEAR_ALF + for( int i = 0; i < endHeight - startHeight; i += clsSizeY ) +#else for( int i = 0; i < endHeight - startHeight; i += 4 ) +#endif { +#if !JVET_N0242_NON_LINEAR_ALF pRec = imgYRecPost + startWidth + i * dstStride; +#endif if( !bChroma ) { pClass = classifier[startHeight + i] + startWidth; } +#if JVET_N0242_NON_LINEAR_ALF + for( int j = 0; j < endWidth - startWidth; j += 8 ) +#else for( int j = 0; j < endWidth - startWidth; j += 4 ) +#endif { +#if JVET_N0242_NON_LINEAR_ALF + for( int k = 0; k < 2; ++k ) + { + if( !bChroma ) + { + const AlfClassifier& cl = pClass[j+4*k]; + transposeIdx[k] = cl.transposeIdx; + coef[k] = filterSet + cl.classIdx * MAX_NUM_ALF_LUMA_COEFF; + clip[k] = fClipSet + cl.classIdx * MAX_NUM_ALF_LUMA_COEFF; + if ( isPCMFilterDisabled && cl.classIdx == AdaptiveLoopFilter::m_ALF_UNUSED_CLASSIDX && transposeIdx[k] == 
AdaptiveLoopFilter::m_ALF_UNUSED_TRANSPOSIDX ) + { + // Note that last one (i.e. filterCoeff[6][k]) is not used with JVET_N0242_NON_LINEAR_ALF; could be simplified + static const unsigned char _filterCoeffIdx[7] = { 0, 0, 0, 0, 0, 0, 0 }; + static short _identityFilterCoeff[] = { 0 }; + static short _identityFilterClipp[] = { 0 }; + filterCoeffIdx[k] = _filterCoeffIdx; + coef[k] = _identityFilterCoeff; + clip[k] = _identityFilterClipp; + } + else if( transposeIdx[k] == 1 ) + { + static const unsigned char _filterCoeffIdx[7] = { 4, 1, 5, 3, 0, 2, 6 }; + filterCoeffIdx[k] = _filterCoeffIdx; + } + else if( transposeIdx[k] == 2 ) + { + static const unsigned char _filterCoeffIdx[7] = { 0, 3, 2, 1, 4, 5, 6 }; + filterCoeffIdx[k] = _filterCoeffIdx; + } + else if( transposeIdx[k] == 3 ) + { + static const unsigned char _filterCoeffIdx[7] = { 4, 3, 5, 1, 0, 2, 6 }; + filterCoeffIdx[k] = _filterCoeffIdx; + } + else + { + static const unsigned char _filterCoeffIdx[7] = { 0, 1, 2, 3, 4, 5, 6 }; + filterCoeffIdx[k] = _filterCoeffIdx; + } + } + else + { + static const unsigned char _filterCoeffIdx[7] = { 0, 1, 2, 3, 4, 5, 6 }; + filterCoeffIdx[k] = _filterCoeffIdx; + } + + for ( int i=0; i < 7; ++i ) + { + filterCoeff[i][k] = coef[k][filterCoeffIdx[k][i]]; + filterClipp[i][k] = clip[k][filterCoeffIdx[k][i]]; + } + } +#else if( !bChroma ) { AlfClassifier& cl = pClass[j]; @@ -464,13 +578,223 @@ static void simdFilter5x5Blk( AlfClassifier** classifier, const PelUnitBuf &recD c0 = _mm_shuffle_epi8( c0, xmm10 ); } +#endif pImg0 = pImgYPad0 + j; pImg1 = pImgYPad1 + j; pImg2 = pImgYPad2 + j; pImg3 = pImgYPad3 + j; pImg4 = pImgYPad4 + j; +#if !JVET_N0242_NON_LINEAR_ALF pImg5 = pImgYPad5 + j; +#endif + +#if JVET_N0242_NON_LINEAR_ALF + pRec1 = pRec0 + j; + + if ( bChroma && isPCMFilterDisabled ) + { + int blkX, blkY; + bool *flags = pcmFlags2x2; + Pel *pcmRec = pcmRec2x2; + + // check which chroma 2x2 blocks use PCM + // chroma PCM may not be aligned with 4x4 ALF processing grid + for( 
blkY=0; blkY<4; blkY+=2 ) + { + for( blkX=0; blkX<8; blkX+=2 ) + { + Position pos(j+startWidth+blkX, i+startHeight+blkY); + CodingUnit* cu = isDualTree ? cs.getCU(pos, CH_C) : cs.getCU(recalcPosition(nChromaFormat, CH_C, CH_L, pos), CH_L); + *flags++ = cu->ipcm ? 1 : 0; + // save original samples from 2x2 PCM blocks + if( cu->ipcm ) + { + *pcmRec++ = pRec1[(blkY+0)*dstStride + (blkX+0)]; + *pcmRec++ = pRec1[(blkY+0)*dstStride + (blkX+1)]; + *pcmRec++ = pRec1[(blkY+1)*dstStride + (blkX+0)]; + *pcmRec++ = pRec1[(blkY+1)*dstStride + (blkX+1)]; + } + } + } + } + + __m128i xmmNull = _mm_setzero_si128(); + + for( int ii = 0; ii < clsSizeY; ii++ ) + { + __m128i clipp, clipm; + __m128i coeffa, coeffb; + __m128i xmmCur = _mm_lddqu_si128( ( __m128i* ) ( pImg0 + 0 ) ); + + // coeff 0 and 1 + __m128i xmm00 = _mm_lddqu_si128( ( __m128i* ) ( pImg3 + 0 ) ); + xmm00 = _mm_sub_epi16( xmm00, xmmCur ); + clipp = _mm_setr_epi16( filterClipp[0][0], filterClipp[0][0], filterClipp[0][0], filterClipp[0][0], + filterClipp[0][1], filterClipp[0][1], filterClipp[0][1], filterClipp[0][1] ); + clipm = _mm_sub_epi16( xmmNull, clipp ); + xmm00 = _mm_min_epi16( xmm00, clipp ); + xmm00 = _mm_max_epi16( xmm00, clipm ); + + __m128i xmm01 = _mm_lddqu_si128( ( __m128i* ) ( pImg4 + 0 ) ); + xmm01 = _mm_sub_epi16( xmm01, xmmCur ); + xmm01 = _mm_min_epi16( xmm01, clipp ); + xmm01 = _mm_max_epi16( xmm01, clipm ); + + xmm00 = _mm_add_epi16( xmm00, xmm01 ); + + __m128i xmm10 = _mm_lddqu_si128( ( __m128i* ) ( pImg1 + 1 ) ); + xmm10 = _mm_sub_epi16( xmm10, xmmCur ); + clipp = _mm_setr_epi16( filterClipp[1][0], filterClipp[1][0], filterClipp[1][0], filterClipp[1][0], + filterClipp[1][1], filterClipp[1][1], filterClipp[1][1], filterClipp[1][1] ); + clipm = _mm_sub_epi16( xmmNull, clipp ); + xmm10 = _mm_min_epi16( xmm10, clipp ); + xmm10 = _mm_max_epi16( xmm10, clipm ); + + __m128i xmm11 = _mm_lddqu_si128( ( __m128i* ) ( pImg2 - 1 ) ); + xmm11 = _mm_sub_epi16( xmm11, xmmCur ); + xmm11 = _mm_min_epi16( xmm11, 
clipp ); + xmm11 = _mm_max_epi16( xmm11, clipm ); + + xmm10 = _mm_add_epi16( xmm10, xmm11 ); + + // 4 first samples + coeffa = _mm_set1_epi16( filterCoeff[0][0] ); + coeffb = _mm_set1_epi16( filterCoeff[1][0] ); + __m128i xmm0 = _mm_unpacklo_epi16( xmm00, xmm10 ); + __m128i xmmS0 = _mm_madd_epi16( xmm0, _mm_unpackhi_epi16( coeffa, coeffb ) ); + + // 4 next samples + coeffa = _mm_set1_epi16( filterCoeff[0][1] ); + coeffb = _mm_set1_epi16( filterCoeff[1][1] ); + __m128i xmm1 = _mm_unpackhi_epi16( xmm00, xmm10 ); + __m128i xmmS1 = _mm_madd_epi16( xmm1, _mm_unpackhi_epi16( coeffa, coeffb ) ); + + // coeff 2 and 3 + __m128i xmm20 = _mm_lddqu_si128( ( __m128i* ) ( pImg1 + 0 ) ); + xmm20 = _mm_sub_epi16( xmm20, xmmCur ); + clipp = _mm_setr_epi16( filterClipp[2][0], filterClipp[2][0], filterClipp[2][0], filterClipp[2][0], + filterClipp[2][1], filterClipp[2][1], filterClipp[2][1], filterClipp[2][1] ); + clipm = _mm_sub_epi16( xmmNull, clipp ); + xmm20 = _mm_min_epi16( xmm20, clipp ); + xmm20 = _mm_max_epi16( xmm20, clipm ); + + __m128i xmm21 = _mm_lddqu_si128( ( __m128i* ) ( pImg2 + 0 ) ); + xmm21 = _mm_sub_epi16( xmm21, xmmCur ); + xmm21 = _mm_min_epi16( xmm21, clipp ); + xmm21 = _mm_max_epi16( xmm21, clipm ); + + xmm20 = _mm_add_epi16( xmm20, xmm21 ); + + __m128i xmm30 = _mm_lddqu_si128( ( __m128i* ) ( pImg1 - 1 ) ); + xmm30 = _mm_sub_epi16( xmm30, xmmCur ); + clipp = _mm_setr_epi16( filterClipp[3][0], filterClipp[3][0], filterClipp[3][0], filterClipp[3][0], + filterClipp[3][1], filterClipp[3][1], filterClipp[3][1], filterClipp[3][1] ); + clipm = _mm_sub_epi16( xmmNull, clipp ); + xmm30 = _mm_min_epi16( xmm30, clipp ); + xmm30 = _mm_max_epi16( xmm30, clipm ); + + __m128i xmm31 = _mm_lddqu_si128( ( __m128i* ) ( pImg2 + 1 ) ); + xmm31 = _mm_sub_epi16( xmm31, xmmCur ); + xmm31 = _mm_min_epi16( xmm31, clipp ); + xmm31 = _mm_max_epi16( xmm31, clipm ); + + xmm30 = _mm_add_epi16( xmm30, xmm31 ); + + // 4 first samples + coeffa = _mm_set1_epi16( filterCoeff[2][0] ); + coeffb = 
_mm_set1_epi16( filterCoeff[3][0] ); + xmm0 = _mm_unpacklo_epi16( xmm20, xmm30 ); + __m128i xmmSt = _mm_madd_epi16( xmm0, _mm_unpackhi_epi16( coeffa, coeffb ) ); + + xmmS0 = _mm_add_epi32(xmmS0, xmmSt); + + // 4 next samples + coeffa = _mm_set1_epi16( filterCoeff[2][1] ); + coeffb = _mm_set1_epi16( filterCoeff[3][1] ); + xmm1 = _mm_unpackhi_epi16( xmm20, xmm30 ); + xmmSt = _mm_madd_epi16( xmm1, _mm_unpackhi_epi16( coeffa, coeffb ) ); + + xmmS1 = _mm_add_epi32(xmmS1, xmmSt); + + // coeff 4 and 5 + __m128i xmm40 = _mm_lddqu_si128( ( __m128i* ) ( pImg0 + 2 ) ); + xmm40 = _mm_sub_epi16( xmm40, xmmCur ); + clipp = _mm_setr_epi16( filterClipp[4][0], filterClipp[4][0], filterClipp[4][0], filterClipp[4][0], + filterClipp[4][1], filterClipp[4][1], filterClipp[4][1], filterClipp[4][1] ); + clipm = _mm_sub_epi16( xmmNull, clipp ); + xmm40 = _mm_min_epi16( xmm40, clipp ); + xmm40 = _mm_max_epi16( xmm40, clipm ); + + __m128i xmm41 = _mm_lddqu_si128( ( __m128i* ) ( pImg0 - 2 ) ); + xmm41 = _mm_sub_epi16( xmm41, xmmCur ); + xmm41 = _mm_min_epi16( xmm41, clipp ); + xmm41 = _mm_max_epi16( xmm41, clipm ); + + xmm40 = _mm_add_epi16( xmm40, xmm41 ); + + __m128i xmm50 = _mm_lddqu_si128( ( __m128i* ) ( pImg0 + 1 ) ); + xmm50 = _mm_sub_epi16( xmm50, xmmCur ); + clipp = _mm_setr_epi16( filterClipp[5][0], filterClipp[5][0], filterClipp[5][0], filterClipp[5][0], + filterClipp[5][1], filterClipp[5][1], filterClipp[5][1], filterClipp[5][1] ); + clipm = _mm_sub_epi16( xmmNull, clipp ); + xmm50 = _mm_min_epi16( xmm50, clipp ); + xmm50 = _mm_max_epi16( xmm50, clipm ); + + __m128i xmm51 = _mm_lddqu_si128( ( __m128i* ) ( pImg0 - 1 ) ); + xmm51 = _mm_sub_epi16( xmm51, xmmCur ); + xmm51 = _mm_min_epi16( xmm51, clipp ); + xmm51 = _mm_max_epi16( xmm51, clipm ); + + xmm50 = _mm_add_epi16( xmm50, xmm51 ); + + // 4 first samples + coeffa = _mm_set1_epi16( filterCoeff[4][0] ); + coeffb = _mm_set1_epi16( filterCoeff[5][0] ); + xmm0 = _mm_unpacklo_epi16( xmm40, xmm50 ); + xmmSt = _mm_madd_epi16( xmm0, 
_mm_unpackhi_epi16( coeffa, coeffb ) ); + + xmmS0 = _mm_add_epi32(xmmS0, xmmSt); + + // 4 next samples + coeffa = _mm_set1_epi16( filterCoeff[4][1] ); + coeffb = _mm_set1_epi16( filterCoeff[5][1] ); + xmm1 = _mm_unpackhi_epi16( xmm40, xmm50 ); + xmmSt = _mm_madd_epi16( xmm1, _mm_unpackhi_epi16( coeffa, coeffb ) ); + + xmmS1 = _mm_add_epi32(xmmS1, xmmSt); + + // finish + xmmS0 = _mm_add_epi32( xmmS0, mmOffset ); + xmmS0 = _mm_srai_epi32( xmmS0, numBitsMinus1 ); + xmmS1 = _mm_add_epi32( xmmS1, mmOffset ); + xmmS1 = _mm_srai_epi32( xmmS1, numBitsMinus1 ); + + xmmS0 = _mm_packs_epi32( xmmS0, xmmS1 ); + // coeff 6 + xmmS0 = _mm_add_epi16(xmmS0, xmmCur); + xmmS0 = _mm_min_epi16( mmMax, _mm_max_epi16( xmmS0, mmMin ) ); + + if( j + 8 <= endWidth - startWidth ) + { + _mm_storeu_si128( ( __m128i* )( pRec1 ), xmmS0 ); + } + else if( j + 6 == endWidth - startWidth ) + { + xmmS0 = _mm_blend_epi16( xmmS0, xmmCur, 0xC0 ); + _mm_storeu_si128( ( __m128i* )( pRec1 ), xmmS0 ); + } + else if( j + 4 == endWidth - startWidth ) + { + xmmS0 = _mm_blend_epi16( xmmS0, xmmCur, 0xF0 ); + _mm_storeu_si128( ( __m128i* )( pRec1 ), xmmS0 ); + } + else + { + xmmS0 = _mm_blend_epi16( xmmS0, xmmCur, 0xFC ); + _mm_storeu_si128( ( __m128i* )( pRec1 ), xmmS0 ); + } +#else for( int k = 0; k < 4; k++ ) { __m128i xmm4 = _mm_lddqu_si128( ( __m128i* ) ( pImg4 ) ); @@ -552,14 +876,43 @@ static void simdFilter5x5Blk( AlfClassifier** classifier, const PelUnitBuf &recD _mm_storel_epi64( ( __m128i* )( pRec ), xmm12 ); pRec += dstStride; +#endif pImg0 += srcStride; pImg1 += srcStride; pImg2 += srcStride; pImg3 += srcStride; pImg4 += srcStride; +#if !JVET_N0242_NON_LINEAR_ALF pImg5 += srcStride; +#endif + +#if JVET_N0242_NON_LINEAR_ALF + pRec1 += dstStride; + } + pRec1 -= dstStride2; + // restore 2x2 PCM chroma blocks + if( bChroma && isPCMFilterDisabled ) + { + int blkX, blkY; + bool *flags = pcmFlags2x2; + Pel *pcmRec = pcmRec2x2; + for( blkY=0; blkY<4; blkY+=2 ) + { + for( blkX=0; blkX<8; blkX+=2 ) + { + if( 
*flags++ ) + { + pRec1[(blkY+0)*dstStride + (blkX+0)] = *pcmRec++; + pRec1[(blkY+0)*dstStride + (blkX+1)] = *pcmRec++; + pRec1[(blkY+1)*dstStride + (blkX+0)] = *pcmRec++; + pRec1[(blkY+1)*dstStride + (blkX+1)] = *pcmRec++; + } + } + } + } +#else } //<-- end of k-loop pRec -= ( 4 * dstStride ); @@ -586,22 +939,34 @@ static void simdFilter5x5Blk( AlfClassifier** classifier, const PelUnitBuf &recD } pRec += 4; +#endif } +#if JVET_N0242_NON_LINEAR_ALF + pRec0 += dstStride2; +#else pRec += 4 * dstStride; +#endif pImgYPad0 += srcStride2; pImgYPad1 += srcStride2; pImgYPad2 += srcStride2; pImgYPad3 += srcStride2; pImgYPad4 += srcStride2; +#if !JVET_N0242_NON_LINEAR_ALF pImgYPad5 += srcStride2; +#endif } } template<X86_VEXT vext> +#if JVET_N0242_NON_LINEAR_ALF +static void simdFilter7x7Blk( AlfClassifier** classifier, const PelUnitBuf &recDst, const CPelUnitBuf& recSrc, const Area& blk, const ComponentID compId, short* filterSet, short* fClipSet, const ClpRng& clpRng, CodingStructure& cs ) +#else static void simdFilter7x7Blk( AlfClassifier** classifier, const PelUnitBuf &recDst, const CPelUnitBuf& recSrc, const Area& blk, const ComponentID compId, short* filterSet, const ClpRng& clpRng, CodingStructure& cs ) +#endif { +#if !JVET_N0242_NON_LINEAR_ALF static const unsigned char mask0[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 7, 4, 5, 2, 3 }; static const unsigned char mask00[16] = { 2, 3, 0, 1, 0, 0, 0, 0, 8, 9, 0, 0, 0, 0, 0, 1 }; static const unsigned char mask02[16] = { 0, 0, 0, 0, 2, 3, 10, 11, 0, 0, 10, 11, 2, 3, 0, 0 }; @@ -609,12 +974,15 @@ static void simdFilter7x7Blk( AlfClassifier** classifier, const PelUnitBuf &recD static const unsigned char mask22[16] = { 14, 15, 0, 0, 6, 7, 4, 5, 12, 13, 0, 0, 8, 9, 0, 1 }; static const unsigned char mask35[16] = { 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7 }; +#endif const bool bChroma = isChroma( compId ); +#if !JVET_N0242_NON_LINEAR_ALF if( bChroma ) { CHECK( 0, "Chroma doesn't support 7x7" ); } +#endif const SPS* 
sps = cs.slice->getSPS(); bool isDualTree = CS::isDualITree(cs); bool isPCMFilterDisabled = sps->getPCMFilterDisableFlag(); @@ -625,6 +993,7 @@ static void simdFilter7x7Blk( AlfClassifier** classifier, const PelUnitBuf &recD const int srcStride = srcLuma.stride; const int dstStride = dstLuma.stride; +#if !JVET_N0242_NON_LINEAR_ALF const Pel* srcExt = srcLuma.buf; Pel* dst = dstLuma.buf; @@ -634,6 +1003,7 @@ static void simdFilter7x7Blk( AlfClassifier** classifier, const PelUnitBuf &recD const Pel *pImg0, *pImg1, *pImg2, *pImg3, *pImg4; const Pel *pImg5, *pImg6; +#endif const int numBitsMinus1 = AdaptiveLoopFilter::m_NUM_BITS - 1; const int offset = ( 1 << ( AdaptiveLoopFilter::m_NUM_BITS - 2 ) ); @@ -642,37 +1012,70 @@ static void simdFilter7x7Blk( AlfClassifier** classifier, const PelUnitBuf &recD const int startWidth = blk.x; const int endWidth = blk.x + blk.width; +#if JVET_N0242_NON_LINEAR_ALF + const Pel* src = srcLuma.buf; + Pel* dst = dstLuma.buf + startHeight * dstStride; + + const Pel *pImgYPad0, *pImgYPad1, *pImgYPad2, *pImgYPad3, *pImgYPad4, *pImgYPad5, *pImgYPad6; + const Pel *pImg0, *pImg1, *pImg2, *pImg3, *pImg4, *pImg5, *pImg6; + + short *coef[2] = { filterSet, filterSet }; + short *clip[2] = { fClipSet, fClipSet }; + + int transposeIdx[2] = {0, 0}; +#else Pel* imgYRecPost = dst; imgYRecPost += startHeight * dstStride; int transposeIdx = 0; +#endif const int clsSizeY = 4; const int clsSizeX = 4; +#if JVET_N0242_NON_LINEAR_ALF + bool pcmFlags2x2[8] = {0,0,0,0,0,0,0,0}; + Pel pcmRec2x2[32]; +#else bool pcmFlags2x2[4] = {0,0,0,0}; Pel pcmRec2x2[16]; +#endif CHECK( startHeight % clsSizeY, "Wrong startHeight in filtering" ); CHECK( startWidth % clsSizeX, "Wrong startWidth in filtering" ); CHECK( ( endHeight - startHeight ) % clsSizeY, "Wrong endHeight in filtering" ); CHECK( ( endWidth - startWidth ) % clsSizeX, "Wrong endWidth in filtering" ); +#if !JVET_N0242_NON_LINEAR_ALF const Pel* imgYRec = srcExt; Pel *pRec; +#endif AlfClassifier *pClass = nullptr; 
int dstStride2 = dstStride * clsSizeY; int srcStride2 = srcStride * clsSizeY; const __m128i mmOffset = _mm_set1_epi32( offset ); +#if JVET_N0242_NON_LINEAR_ALF + const __m128i mmMin = _mm_set1_epi16( clpRng.min ); + const __m128i mmMax = _mm_set1_epi16( clpRng.max ); +#else const __m128i mmMin = _mm_set1_epi32( clpRng.min ); const __m128i mmMax = _mm_set1_epi32( clpRng.max ); +#endif +#if JVET_N0242_NON_LINEAR_ALF + const unsigned char *filterCoeffIdx[2]; + Pel filterCoeff[MAX_NUM_ALF_LUMA_COEFF][2]; + Pel filterClipp[MAX_NUM_ALF_LUMA_COEFF][2]; + + pImgYPad0 = src + startHeight * srcStride + startWidth; +#else const __m128i xmm10 = _mm_loadu_si128( ( __m128i* )mask35 ); pImgYPad0 = imgYRec + startHeight * srcStride + startWidth; +#endif pImgYPad1 = pImgYPad0 + srcStride; pImgYPad2 = pImgYPad0 - srcStride; pImgYPad3 = pImgYPad1 + srcStride; @@ -680,19 +1083,87 @@ static void simdFilter7x7Blk( AlfClassifier** classifier, const PelUnitBuf &recD pImgYPad5 = pImgYPad3 + srcStride; pImgYPad6 = pImgYPad4 - srcStride; +#if JVET_N0242_NON_LINEAR_ALF + Pel* pRec0 = dst + startWidth; + Pel* pRec1; +#else pRec = imgYRecPost + startWidth; +#endif +#if JVET_N0242_NON_LINEAR_ALF + for( int i = 0; i < endHeight - startHeight; i += clsSizeY ) +#else for( int i = 0; i < endHeight - startHeight; i += 4 ) +#endif { +#if !JVET_N0242_NON_LINEAR_ALF pRec = imgYRecPost + startWidth + i * dstStride; +#endif if( !bChroma ) { pClass = classifier[startHeight + i] + startWidth; } +#if JVET_N0242_NON_LINEAR_ALF + for( int j = 0; j < endWidth - startWidth; j += 8 ) +#else for( int j = 0; j < endWidth - startWidth; j += 4 ) +#endif { +#if JVET_N0242_NON_LINEAR_ALF + for( int k = 0; k < 2; ++k ) + { + if( !bChroma ) + { + const AlfClassifier& cl = pClass[j+4*k]; + transposeIdx[k] = cl.transposeIdx; + coef[k] = filterSet + cl.classIdx * MAX_NUM_ALF_LUMA_COEFF; + clip[k] = fClipSet + cl.classIdx * MAX_NUM_ALF_LUMA_COEFF; + if ( isPCMFilterDisabled && cl.classIdx == 
AdaptiveLoopFilter::m_ALF_UNUSED_CLASSIDX && transposeIdx[k] == AdaptiveLoopFilter::m_ALF_UNUSED_TRANSPOSIDX ) + { + // Note that last one (i.e. filterCoeff[12][k]) is not unused with JVET_N0242_NON_LINEAR_ALF; could be simplified + static const unsigned char _filterCoeffIdx[13] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + static short _identityFilterCoeff[] = { 0 }; + static short _identityFilterClipp[] = { 0 }; + filterCoeffIdx[k] = _filterCoeffIdx; + coef[k] = _identityFilterCoeff; + clip[k] = _identityFilterClipp; + } + else if( transposeIdx[k] == 1 ) + { + static const unsigned char _filterCoeffIdx[13] = { 9, 4, 10, 8, 1, 5, 11, 7, 3, 0, 2, 6, 12 }; + filterCoeffIdx[k] = _filterCoeffIdx; + } + else if( transposeIdx[k] == 2 ) + { + static const unsigned char _filterCoeffIdx[13] = { 0, 3, 2, 1, 8, 7, 6, 5, 4, 9, 10, 11, 12 }; + filterCoeffIdx[k] = _filterCoeffIdx; + } + else if( transposeIdx[k] == 3 ) + { + static const unsigned char _filterCoeffIdx[13] = { 9, 8, 10, 4, 3, 7, 11, 5, 1, 0, 2, 6, 12 }; + filterCoeffIdx[k] = _filterCoeffIdx; + } + else + { + static const unsigned char _filterCoeffIdx[13] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }; + filterCoeffIdx[k] = _filterCoeffIdx; + } + } + else + { + static const unsigned char _filterCoeffIdx[13] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }; + filterCoeffIdx[k] = _filterCoeffIdx; + } + + for ( int i=0; i < 13; ++i ) + { + filterCoeff[i][k] = coef[k][filterCoeffIdx[k][i]]; + filterClipp[i][k] = clip[k][filterCoeffIdx[k][i]]; + } + } +#else if( !bChroma ) { AlfClassifier& cl = pClass[j]; @@ -773,6 +1244,7 @@ static void simdFilter7x7Blk( AlfClassifier** classifier, const PelUnitBuf &recD { c2 = _mm_shuffle_epi8( c2, xmm10 ); } + #endif pImg0 = pImgYPad0 + j; pImg1 = pImgYPad1 + j; @@ -782,6 +1254,357 @@ static void simdFilter7x7Blk( AlfClassifier** classifier, const PelUnitBuf &recD pImg5 = pImgYPad5 + j; pImg6 = pImgYPad6 + j; +#if JVET_N0242_NON_LINEAR_ALF + pRec1 = pRec0 + j; + + if ( bChroma && 
isPCMFilterDisabled ) + { + int blkX, blkY; + bool *flags = pcmFlags2x2; + Pel *pcmRec = pcmRec2x2; + + // check which chroma 2x2 blocks use PCM + // chroma PCM may not be aligned with 4x4 ALF processing grid + for( blkY=0; blkY<4; blkY+=2 ) + { + for( blkX=0; blkX<8; blkX+=2 ) + { + Position pos(j+startWidth+blkX, i+startHeight+blkY); + CodingUnit* cu = isDualTree ? cs.getCU(pos, CH_C) : cs.getCU(recalcPosition(nChromaFormat, CH_C, CH_L, pos), CH_L); + *flags++ = cu->ipcm ? 1 : 0; + + // save original samples from 2x2 PCM blocks + if( cu->ipcm ) + { + *pcmRec++ = pRec1[(blkY+0)*dstStride + (blkX+0)]; + *pcmRec++ = pRec1[(blkY+0)*dstStride + (blkX+1)]; + *pcmRec++ = pRec1[(blkY+1)*dstStride + (blkX+0)]; + *pcmRec++ = pRec1[(blkY+1)*dstStride + (blkX+1)]; + } + } + } + } + + __m128i xmmNull = _mm_setzero_si128(); + + for( int ii = 0; ii < clsSizeY; ii++ ) + { + __m128i clipp, clipm; + __m128i coeffa, coeffb; + __m128i xmmCur = _mm_lddqu_si128( ( __m128i* ) ( pImg0 + 0 ) ); + + // coeff 0 and 1 + __m128i xmm00 = _mm_lddqu_si128( ( __m128i* ) ( pImg5 + 0 ) ); + xmm00 = _mm_sub_epi16( xmm00, xmmCur ); + clipp = _mm_setr_epi16( filterClipp[0][0], filterClipp[0][0], filterClipp[0][0], filterClipp[0][0], + filterClipp[0][1], filterClipp[0][1], filterClipp[0][1], filterClipp[0][1] ); + clipm = _mm_sub_epi16( xmmNull, clipp ); + xmm00 = _mm_min_epi16( xmm00, clipp ); + xmm00 = _mm_max_epi16( xmm00, clipm ); + + __m128i xmm01 = _mm_lddqu_si128( ( __m128i* ) ( pImg6 + 0 ) ); + xmm01 = _mm_sub_epi16( xmm01, xmmCur ); + xmm01 = _mm_min_epi16( xmm01, clipp ); + xmm01 = _mm_max_epi16( xmm01, clipm ); + + xmm00 = _mm_add_epi16( xmm00, xmm01 ); + + __m128i xmm10 = _mm_lddqu_si128( ( __m128i* ) ( pImg3 + 1 ) ); + xmm10 = _mm_sub_epi16( xmm10, xmmCur ); + clipp = _mm_setr_epi16( filterClipp[1][0], filterClipp[1][0], filterClipp[1][0], filterClipp[1][0], + filterClipp[1][1], filterClipp[1][1], filterClipp[1][1], filterClipp[1][1] ); + clipm = _mm_sub_epi16( xmmNull, clipp ); + xmm10 = 
_mm_min_epi16( xmm10, clipp ); + xmm10 = _mm_max_epi16( xmm10, clipm ); + + __m128i xmm11 = _mm_lddqu_si128( ( __m128i* ) ( pImg4 - 1 ) ); + xmm11 = _mm_sub_epi16( xmm11, xmmCur ); + xmm11 = _mm_min_epi16( xmm11, clipp ); + xmm11 = _mm_max_epi16( xmm11, clipm ); + + xmm10 = _mm_add_epi16( xmm10, xmm11 ); + + // 4 first samples + coeffa = _mm_set1_epi16( filterCoeff[0][0] ); + coeffb = _mm_set1_epi16( filterCoeff[1][0] ); + __m128i xmm0 = _mm_unpacklo_epi16( xmm00, xmm10 ); + __m128i xmmS0 = _mm_madd_epi16( xmm0, _mm_unpackhi_epi16( coeffa, coeffb ) ); + + // 4 next samples + coeffa = _mm_set1_epi16( filterCoeff[0][1] ); + coeffb = _mm_set1_epi16( filterCoeff[1][1] ); + __m128i xmm1 = _mm_unpackhi_epi16( xmm00, xmm10 ); + __m128i xmmS1 = _mm_madd_epi16( xmm1, _mm_unpackhi_epi16( coeffa, coeffb ) ); + + // coeff 2 and 3 + __m128i xmm20 = _mm_lddqu_si128( ( __m128i* ) ( pImg3 + 0 ) ); + xmm20 = _mm_sub_epi16( xmm20, xmmCur ); + clipp = _mm_setr_epi16( filterClipp[2][0], filterClipp[2][0], filterClipp[2][0], filterClipp[2][0], + filterClipp[2][1], filterClipp[2][1], filterClipp[2][1], filterClipp[2][1] ); + clipm = _mm_sub_epi16( xmmNull, clipp ); + xmm20 = _mm_min_epi16( xmm20, clipp ); + xmm20 = _mm_max_epi16( xmm20, clipm ); + + __m128i xmm21 = _mm_lddqu_si128( ( __m128i* ) ( pImg4 + 0 ) ); + xmm21 = _mm_sub_epi16( xmm21, xmmCur ); + xmm21 = _mm_min_epi16( xmm21, clipp ); + xmm21 = _mm_max_epi16( xmm21, clipm ); + + xmm20 = _mm_add_epi16( xmm20, xmm21 ); + + __m128i xmm30 = _mm_lddqu_si128( ( __m128i* ) ( pImg3 - 1 ) ); + xmm30 = _mm_sub_epi16( xmm30, xmmCur ); + clipp = _mm_setr_epi16( filterClipp[3][0], filterClipp[3][0], filterClipp[3][0], filterClipp[3][0], + filterClipp[3][1], filterClipp[3][1], filterClipp[3][1], filterClipp[3][1] ); + clipm = _mm_sub_epi16( xmmNull, clipp ); + xmm30 = _mm_min_epi16( xmm30, clipp ); + xmm30 = _mm_max_epi16( xmm30, clipm ); + + __m128i xmm31 = _mm_lddqu_si128( ( __m128i* ) ( pImg4 + 1 ) ); + xmm31 = _mm_sub_epi16( xmm31, xmmCur 
); + xmm31 = _mm_min_epi16( xmm31, clipp ); + xmm31 = _mm_max_epi16( xmm31, clipm ); + + xmm30 = _mm_add_epi16( xmm30, xmm31 ); + + // 4 first samples + coeffa = _mm_set1_epi16( filterCoeff[2][0] ); + coeffb = _mm_set1_epi16( filterCoeff[3][0] ); + xmm0 = _mm_unpacklo_epi16( xmm20, xmm30 ); + __m128i xmmSt = _mm_madd_epi16( xmm0, _mm_unpackhi_epi16( coeffa, coeffb ) ); + + xmmS0 = _mm_add_epi32(xmmS0, xmmSt); + + // 4 next samples + coeffa = _mm_set1_epi16( filterCoeff[2][1] ); + coeffb = _mm_set1_epi16( filterCoeff[3][1] ); + xmm1 = _mm_unpackhi_epi16( xmm20, xmm30 ); + xmmSt = _mm_madd_epi16( xmm1, _mm_unpackhi_epi16( coeffa, coeffb ) ); + + xmmS1 = _mm_add_epi32(xmmS1, xmmSt); + + // coeff 4 and 5 + __m128i xmm40 = _mm_lddqu_si128( ( __m128i* ) ( pImg1 + 2 ) ); + xmm40 = _mm_sub_epi16( xmm40, xmmCur ); + clipp = _mm_setr_epi16( filterClipp[4][0], filterClipp[4][0], filterClipp[4][0], filterClipp[4][0], + filterClipp[4][1], filterClipp[4][1], filterClipp[4][1], filterClipp[4][1] ); + clipm = _mm_sub_epi16( xmmNull, clipp ); + xmm40 = _mm_min_epi16( xmm40, clipp ); + xmm40 = _mm_max_epi16( xmm40, clipm ); + + __m128i xmm41 = _mm_lddqu_si128( ( __m128i* ) ( pImg2 - 2 ) ); + xmm41 = _mm_sub_epi16( xmm41, xmmCur ); + xmm41 = _mm_min_epi16( xmm41, clipp ); + xmm41 = _mm_max_epi16( xmm41, clipm ); + + xmm40 = _mm_add_epi16( xmm40, xmm41 ); + + __m128i xmm50 = _mm_lddqu_si128( ( __m128i* ) ( pImg1 + 1 ) ); + xmm50 = _mm_sub_epi16( xmm50, xmmCur ); + clipp = _mm_setr_epi16( filterClipp[5][0], filterClipp[5][0], filterClipp[5][0], filterClipp[5][0], + filterClipp[5][1], filterClipp[5][1], filterClipp[5][1], filterClipp[5][1] ); + clipm = _mm_sub_epi16( xmmNull, clipp ); + xmm50 = _mm_min_epi16( xmm50, clipp ); + xmm50 = _mm_max_epi16( xmm50, clipm ); + + __m128i xmm51 = _mm_lddqu_si128( ( __m128i* ) ( pImg2 - 1 ) ); + xmm51 = _mm_sub_epi16( xmm51, xmmCur ); + xmm51 = _mm_min_epi16( xmm51, clipp ); + xmm51 = _mm_max_epi16( xmm51, clipm ); + + xmm50 = _mm_add_epi16( xmm50, 
xmm51 ); + + // 4 first samples + coeffa = _mm_set1_epi16( filterCoeff[4][0] ); + coeffb = _mm_set1_epi16( filterCoeff[5][0] ); + xmm0 = _mm_unpacklo_epi16( xmm40, xmm50 ); + xmmSt = _mm_madd_epi16( xmm0, _mm_unpackhi_epi16( coeffa, coeffb ) ); + + xmmS0 = _mm_add_epi32(xmmS0, xmmSt); + + // 4 next samples + coeffa = _mm_set1_epi16( filterCoeff[4][1] ); + coeffb = _mm_set1_epi16( filterCoeff[5][1] ); + xmm1 = _mm_unpackhi_epi16( xmm40, xmm50 ); + xmmSt = _mm_madd_epi16( xmm1, _mm_unpackhi_epi16( coeffa, coeffb ) ); + + xmmS1 = _mm_add_epi32(xmmS1, xmmSt); + + + // coeff 6 and 7 + __m128i xmm60 = _mm_lddqu_si128( ( __m128i* ) ( pImg1 + 0 ) ); + xmm60 = _mm_sub_epi16( xmm60, xmmCur ); + clipp = _mm_setr_epi16( filterClipp[6][0], filterClipp[6][0], filterClipp[6][0], filterClipp[6][0], + filterClipp[6][1], filterClipp[6][1], filterClipp[6][1], filterClipp[6][1] ); + clipm = _mm_sub_epi16( xmmNull, clipp ); + xmm60 = _mm_min_epi16( xmm60, clipp ); + xmm60 = _mm_max_epi16( xmm60, clipm ); + + __m128i xmm61 = _mm_lddqu_si128( ( __m128i* ) ( pImg2 + 0 ) ); + xmm61 = _mm_sub_epi16( xmm61, xmmCur ); + xmm61 = _mm_min_epi16( xmm61, clipp ); + xmm61 = _mm_max_epi16( xmm61, clipm ); + + xmm60 = _mm_add_epi16( xmm60, xmm61 ); + + __m128i xmm70 = _mm_lddqu_si128( ( __m128i* ) ( pImg1 - 1 ) ); + xmm70 = _mm_sub_epi16( xmm70, xmmCur ); + clipp = _mm_setr_epi16( filterClipp[7][0], filterClipp[7][0], filterClipp[7][0], filterClipp[7][0], + filterClipp[7][1], filterClipp[7][1], filterClipp[7][1], filterClipp[7][1] ); + clipm = _mm_sub_epi16( xmmNull, clipp ); + xmm70 = _mm_min_epi16( xmm70, clipp ); + xmm70 = _mm_max_epi16( xmm70, clipm ); + + __m128i xmm71 = _mm_lddqu_si128( ( __m128i* ) ( pImg2 + 1 ) ); + xmm71 = _mm_sub_epi16( xmm71, xmmCur ); + xmm71 = _mm_min_epi16( xmm71, clipp ); + xmm71 = _mm_max_epi16( xmm71, clipm ); + + xmm70 = _mm_add_epi16( xmm70, xmm71 ); + + // 4 first samples + coeffa = _mm_set1_epi16( filterCoeff[6][0] ); + coeffb = _mm_set1_epi16( filterCoeff[7][0] 
); + xmm0 = _mm_unpacklo_epi16( xmm60, xmm70 ); + xmmSt = _mm_madd_epi16( xmm0, _mm_unpackhi_epi16( coeffa, coeffb ) ); + + xmmS0 = _mm_add_epi32(xmmS0, xmmSt); + + // 4 next samples + coeffa = _mm_set1_epi16( filterCoeff[6][1] ); + coeffb = _mm_set1_epi16( filterCoeff[7][1] ); + xmm1 = _mm_unpackhi_epi16( xmm60, xmm70 ); + xmmSt = _mm_madd_epi16( xmm1, _mm_unpackhi_epi16( coeffa, coeffb ) ); + + xmmS1 = _mm_add_epi32(xmmS1, xmmSt); + + + // coeff 8 and 9 + __m128i xmm80 = _mm_lddqu_si128( ( __m128i* ) ( pImg1 - 2 ) ); + xmm80 = _mm_sub_epi16( xmm80, xmmCur ); + clipp = _mm_setr_epi16( filterClipp[8][0], filterClipp[8][0], filterClipp[8][0], filterClipp[8][0], + filterClipp[8][1], filterClipp[8][1], filterClipp[8][1], filterClipp[8][1] ); + clipm = _mm_sub_epi16( xmmNull, clipp ); + xmm80 = _mm_min_epi16( xmm80, clipp ); + xmm80 = _mm_max_epi16( xmm80, clipm ); + + __m128i xmm81 = _mm_lddqu_si128( ( __m128i* ) ( pImg2 + 2 ) ); + xmm81 = _mm_sub_epi16( xmm81, xmmCur ); + xmm81 = _mm_min_epi16( xmm81, clipp ); + xmm81 = _mm_max_epi16( xmm81, clipm ); + + xmm80 = _mm_add_epi16( xmm80, xmm81 ); + + __m128i xmm90 = _mm_lddqu_si128( ( __m128i* ) ( pImg0 + 3 ) ); + xmm90 = _mm_sub_epi16( xmm90, xmmCur ); + clipp = _mm_setr_epi16( filterClipp[9][0], filterClipp[9][0], filterClipp[9][0], filterClipp[9][0], + filterClipp[9][1], filterClipp[9][1], filterClipp[9][1], filterClipp[9][1] ); + clipm = _mm_sub_epi16( xmmNull, clipp ); + xmm90 = _mm_min_epi16( xmm90, clipp ); + xmm90 = _mm_max_epi16( xmm90, clipm ); + + __m128i xmm91 = _mm_lddqu_si128( ( __m128i* ) ( pImg0 - 3 ) ); + xmm91 = _mm_sub_epi16( xmm91, xmmCur ); + xmm91 = _mm_min_epi16( xmm91, clipp ); + xmm91 = _mm_max_epi16( xmm91, clipm ); + + xmm90 = _mm_add_epi16( xmm90, xmm91 ); + + // 4 first samples + coeffa = _mm_set1_epi16( filterCoeff[8][0] ); + coeffb = _mm_set1_epi16( filterCoeff[9][0] ); + xmm0 = _mm_unpacklo_epi16( xmm80, xmm90 ); + xmmSt = _mm_madd_epi16( xmm0, _mm_unpackhi_epi16( coeffa, coeffb ) ); + + 
xmmS0 = _mm_add_epi32(xmmS0, xmmSt); + + // 4 next samples + coeffa = _mm_set1_epi16( filterCoeff[8][1] ); + coeffb = _mm_set1_epi16( filterCoeff[9][1] ); + xmm1 = _mm_unpackhi_epi16( xmm80, xmm90 ); + xmmSt = _mm_madd_epi16( xmm1, _mm_unpackhi_epi16( coeffa, coeffb ) ); + + xmmS1 = _mm_add_epi32(xmmS1, xmmSt); + + + // coeff 10 and 11 + __m128i xmm100 = _mm_lddqu_si128( ( __m128i* ) ( pImg0 + 2 ) ); + xmm100 = _mm_sub_epi16( xmm100, xmmCur ); + clipp = _mm_setr_epi16( filterClipp[10][0], filterClipp[10][0], filterClipp[10][0], filterClipp[10][0], + filterClipp[10][1], filterClipp[10][1], filterClipp[10][1], filterClipp[10][1] ); + clipm = _mm_sub_epi16( xmmNull, clipp ); + xmm100 = _mm_min_epi16( xmm100, clipp ); + xmm100 = _mm_max_epi16( xmm100, clipm ); + + __m128i xmm101 = _mm_lddqu_si128( ( __m128i* ) ( pImg0 - 2 ) ); + xmm101 = _mm_sub_epi16( xmm101, xmmCur ); + xmm101 = _mm_min_epi16( xmm101, clipp ); + xmm101 = _mm_max_epi16( xmm101, clipm ); + + xmm100 = _mm_add_epi16( xmm100, xmm101 ); + + __m128i xmm110 = _mm_lddqu_si128( ( __m128i* ) ( pImg0 + 1 ) ); + xmm110 = _mm_sub_epi16( xmm110, xmmCur ); + clipp = _mm_setr_epi16( filterClipp[11][0], filterClipp[11][0], filterClipp[11][0], filterClipp[11][0], + filterClipp[11][1], filterClipp[11][1], filterClipp[11][1], filterClipp[11][1] ); + clipm = _mm_sub_epi16( xmmNull, clipp ); + xmm110 = _mm_min_epi16( xmm110, clipp ); + xmm110 = _mm_max_epi16( xmm110, clipm ); + + __m128i xmm111 = _mm_lddqu_si128( ( __m128i* ) ( pImg0 - 1 ) ); + xmm111 = _mm_sub_epi16( xmm111, xmmCur ); + xmm111 = _mm_min_epi16( xmm111, clipp ); + xmm111 = _mm_max_epi16( xmm111, clipm ); + + xmm110 = _mm_add_epi16( xmm110, xmm111 ); + + // 4 first samples + coeffa = _mm_set1_epi16( filterCoeff[10][0] ); + coeffb = _mm_set1_epi16( filterCoeff[11][0] ); + xmm0 = _mm_unpacklo_epi16( xmm100, xmm110 ); + xmmSt = _mm_madd_epi16( xmm0, _mm_unpackhi_epi16( coeffa, coeffb ) ); + + xmmS0 = _mm_add_epi32(xmmS0, xmmSt); + + // 4 next samples + coeffa = 
_mm_set1_epi16( filterCoeff[10][1] ); + coeffb = _mm_set1_epi16( filterCoeff[11][1] ); + xmm1 = _mm_unpackhi_epi16( xmm100, xmm110 ); + xmmSt = _mm_madd_epi16( xmm1, _mm_unpackhi_epi16( coeffa, coeffb ) ); + + xmmS1 = _mm_add_epi32(xmmS1, xmmSt); + + // finish + xmmS0 = _mm_add_epi32( xmmS0, mmOffset ); + xmmS0 = _mm_srai_epi32( xmmS0, numBitsMinus1 ); + xmmS1 = _mm_add_epi32( xmmS1, mmOffset ); + xmmS1 = _mm_srai_epi32( xmmS1, numBitsMinus1 ); + + xmmS0 = _mm_packs_epi32( xmmS0, xmmS1 ); + // coeff 12 + xmmS0 = _mm_add_epi16(xmmS0, xmmCur); + xmmS0 = _mm_min_epi16( mmMax, _mm_max_epi16( xmmS0, mmMin ) ); + + if( j + 8 <= endWidth - startWidth ) + { + _mm_storeu_si128( ( __m128i* )( pRec1 ), xmmS0 ); + } + else if( j + 6 == endWidth - startWidth ) + { + xmmS0 = _mm_blend_epi16( xmmS0, xmmCur, 0xC0 ); + _mm_storeu_si128( ( __m128i* )( pRec1 ), xmmS0 ); + } + else if( j + 4 == endWidth - startWidth ) + { + xmmS0 = _mm_blend_epi16( xmmS0, xmmCur, 0xF0 ); + _mm_storeu_si128( ( __m128i* )( pRec1 ), xmmS0 ); + } + else + { + xmmS0 = _mm_blend_epi16( xmmS0, xmmCur, 0xFC ); + _mm_storeu_si128( ( __m128i* )( pRec1 ), xmmS0 ); + } +#else for( int k = 0; k < 4; k++ ) { __m128i xmm6 = _mm_lddqu_si128( ( __m128i* ) pImg6 ); @@ -861,6 +1684,7 @@ static void simdFilter7x7Blk( AlfClassifier** classifier, const PelUnitBuf &recD _mm_storel_epi64( ( __m128i* )( pRec ), xmm12 ); pRec += dstStride; +#endif pImg0 += srcStride; pImg1 += srcStride; @@ -869,6 +1693,33 @@ static void simdFilter7x7Blk( AlfClassifier** classifier, const PelUnitBuf &recD pImg4 += srcStride; pImg5 += srcStride; pImg6 += srcStride; +#if JVET_N0242_NON_LINEAR_ALF + + pRec1 += dstStride; + } + pRec1 -= dstStride2; + // restore 2x2 PCM chroma blocks + if( bChroma && isPCMFilterDisabled ) + { + int blkX, blkY; + bool *flags = pcmFlags2x2; + Pel *pcmRec = pcmRec2x2; + for( blkY=0; blkY<4; blkY+=2 ) + { + for( blkX=0; blkX<8; blkX+=2 ) + { + if( *flags++ ) + { + pRec1[(blkY+0)*dstStride + (blkX+0)] = *pcmRec++; + 
pRec1[(blkY+0)*dstStride + (blkX+1)] = *pcmRec++; + pRec1[(blkY+1)*dstStride + (blkX+0)] = *pcmRec++; + pRec1[(blkY+1)*dstStride + (blkX+1)] = *pcmRec++; + } + } + } + } + +#else } pRec -= ( 4 * dstStride ); @@ -895,9 +1746,14 @@ static void simdFilter7x7Blk( AlfClassifier** classifier, const PelUnitBuf &recD } pRec += 4; +#endif } +#if JVET_N0242_NON_LINEAR_ALF + pRec0 += dstStride2; +#else pRec += dstStride2; +#endif pImgYPad0 += srcStride2; pImgYPad1 += srcStride2; diff --git a/source/Lib/DecoderLib/VLCReader.cpp b/source/Lib/DecoderLib/VLCReader.cpp index 896ca3944..b7549a782 100644 --- a/source/Lib/DecoderLib/VLCReader.cpp +++ b/source/Lib/DecoderLib/VLCReader.cpp @@ -625,6 +625,17 @@ void HLSyntaxReader::parseAPS(APS* aps) param.enabledFlag[COMPONENT_Cb] = alfChromaIdc >> 1; param.enabledFlag[COMPONENT_Cr] = alfChromaIdc & 1; +#if JVET_N0242_NON_LINEAR_ALF + READ_FLAG( code, "alf_luma_clip" ); + param.nonLinearFlag[CHANNEL_TYPE_LUMA] = code ? true : false; + + if( alfChromaIdc ) + { + READ_FLAG( code, "alf_chroma_clip" ); + param.nonLinearFlag[CHANNEL_TYPE_CHROMA] = code ? 
true : false; + } +#endif + xReadTruncBinCode(code, MAX_NUM_ALF_CLASSES); //number_of_filters_minus1 param.numLumaFilters = code + 1; if (param.numLumaFilters > 1) @@ -2522,8 +2533,11 @@ bool HLSyntaxReader::xMoreRbspData() return (cnt>0); } - +#if JVET_N0242_NON_LINEAR_ALF +int HLSyntaxReader::alfGolombDecode( const int k, const bool signed_val ) +#else int HLSyntaxReader::alfGolombDecode( const int k ) +#endif { uint32_t uiSymbol; int q = -1; @@ -2555,7 +2569,11 @@ int HLSyntaxReader::alfGolombDecode( const int k ) } } nr += q * m; // add the bits and the multiple of M +#if JVET_N0242_NON_LINEAR_ALF + if( signed_val && nr != 0 ) +#else if( nr != 0 ) +#endif { #if RExt__DECODER_DEBUG_BIT_STATISTICS xReadFlag( uiSymbol, "" ); @@ -2604,6 +2622,9 @@ void HLSyntaxReader::alfFilter( AlfSliceParam& alfSliceParam, const bool isChrom static int kMinTab[MAX_NUM_ALF_COEFF]; const int numFilters = isChroma ? 1 : alfSliceParam.numLumaFilters; short* coeff = isChroma ? alfSliceParam.chromaCoeff : alfSliceParam.lumaCoeff; +#if JVET_N0242_NON_LINEAR_ALF + short* clipp = isChroma ? 
alfSliceParam.chromaClipp : alfSliceParam.lumaClipp; +#endif for( int idx = 0; idx < maxGolombIdx; idx++ ) { @@ -2639,6 +2660,70 @@ void HLSyntaxReader::alfFilter( AlfSliceParam& alfSliceParam, const bool isChrom coeff[ind * MAX_NUM_ALF_LUMA_COEFF + i] = alfGolombDecode( kMinTab[alfShape.golombIdx[i]] ); } } +#if JVET_N0242_NON_LINEAR_ALF + + // Clipping values coding + if ( alfSliceParam.nonLinearFlag[isChroma] ) + { + READ_UVLC( code, "clip_min_golomb_order" ); + + kMin = code + 1; + + for( int idx = 0; idx < maxGolombIdx; idx++ ) + { + READ_FLAG( code, "clip_golomb_order_increase_flag" ); + CHECK( code > 1, "Wrong golomb_order_increase_flag" ); + kMinTab[idx] = kMin + code; + kMin = kMinTab[idx]; + } + + short recCoeff[MAX_NUM_ALF_CLASSES * MAX_NUM_ALF_LUMA_COEFF]; + if( isChroma ) + { + memcpy( recCoeff, coeff, sizeof(short) * MAX_NUM_ALF_CHROMA_COEFF ); + } + else + { + memcpy( recCoeff, coeff, sizeof(short) * numFilters * MAX_NUM_ALF_LUMA_COEFF ); + + if( alfSliceParam.alfLumaCoeffDeltaPredictionFlag ) + { + for( int i = 1; i < numFilters; i++ ) + { + for( int j = 0; j < alfShape.numCoeff - 1; j++ ) + { + recCoeff[i * MAX_NUM_ALF_LUMA_COEFF + j] += recCoeff[( i - 1 ) * MAX_NUM_ALF_LUMA_COEFF + j]; + } + } + } + } + + // Filter coefficients + for( int ind = 0; ind < numFilters; ++ind ) + { + if( !isChroma && !alfSliceParam.alfLumaCoeffFlag[ind] && alfSliceParam.alfLumaCoeffDeltaFlag ) + { + std::fill_n( clipp + ind * MAX_NUM_ALF_LUMA_COEFF, alfShape.numCoeff, 0 ); + continue; + } + + for( int i = 0; i < alfShape.numCoeff - 1; i++ ) + { + if( recCoeff[ind * MAX_NUM_ALF_LUMA_COEFF + i] ) + clipp[ind * MAX_NUM_ALF_LUMA_COEFF + i] = alfGolombDecode( kMinTab[alfShape.golombIdx[i]], false ); + else + clipp[ind * MAX_NUM_ALF_LUMA_COEFF + i] = 0; + } + } + } + else + { + for( int ind = 0; ind < numFilters; ++ind ) + { + std::fill_n( clipp + ind * MAX_NUM_ALF_LUMA_COEFF, alfShape.numCoeff, 0 ); + } + } +#endif } int HLSyntaxReader::truncatedUnaryEqProb( const int 
maxSymbol ) diff --git a/source/Lib/DecoderLib/VLCReader.h b/source/Lib/DecoderLib/VLCReader.h index 01117b9f8..cb4908840 100644 --- a/source/Lib/DecoderLib/VLCReader.h +++ b/source/Lib/DecoderLib/VLCReader.h @@ -177,7 +177,11 @@ public: private: int truncatedUnaryEqProb( const int maxSymbol ); void xReadTruncBinCode( uint32_t& ruiSymbol, const int uiMaxSymbol ); +#if JVET_N0242_NON_LINEAR_ALF + int alfGolombDecode( const int k, const bool signed_val=true ); +#else int alfGolombDecode( const int k ); +#endif protected: bool xMoreRbspData(); diff --git a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp index 8259f1758..c96fd1f23 100644 --- a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp +++ b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp @@ -42,6 +42,370 @@ #define AlfCtx(c) SubCtx( Ctx::ctbAlfFlag, c ) std::vector<double> EncAdaptiveLoopFilter::m_lumaLevelToWeightPLUT; +#if JVET_N0242_NON_LINEAR_ALF +void AlfCovariance::getClipMax(const AlfFilterShape& alfShape, int *clip_max) const +{ + for( int k = 0; k < numCoeff-1; ++k ) + { + clip_max[k] = 0; + + bool inc = true; + while (clip_max[k]+1 < numBins && y[clip_max[k]+1][k] == y[clip_max[k]][k]) + { + for (int l = 0; l < numCoeff; ++l) + if (E[clip_max[k]][0][k][l] != E[clip_max[k]+1][0][k][l]) + { + inc = false; + break; + } + if (!inc) + { + break; + } + ++clip_max[k]; + } + } + clip_max[numCoeff-1] = 0; +} + +void AlfCovariance::reduceClipCost(const AlfFilterShape& alfShape, int *clip) const +{ + for( int k = 0; k < numCoeff-1; ++k ) + { + bool dec = true; + while (clip[k] > 0 && y[clip[k]-1][k] == y[clip[k]][k]) + { + for (int l=0; l<numCoeff; ++l) + if (E[clip[k]][clip[l]][k][l] != E[clip[k]-1][clip[l]][k][l]) + { + dec = false; + break; + } + if (!dec) + { + break; + } + --clip[k]; + } + } +} + +double AlfCovariance::optimizeFilter(const AlfFilterShape& alfShape, int* clip, double *f, bool optimize_clip) const +{ + const int size = alfShape.numCoeff; + int 
clip_max[MAX_NUM_ALF_LUMA_COEFF]; + + double err_best, err_last; + + TE kE; + Ty ky; + + if( optimize_clip ) + { + // Start by looking for min clipping that has no impact => max_clipping + getClipMax(alfShape, clip_max); + for (int k=0; k<size; ++k) + { + clip[k] = std::max(clip_max[k], clip[k]); + clip[k] = std::min(clip[k], numBins-1); + } + } + + setEyFromClip( clip, kE, ky, size ); + + gnsSolveByChol( kE, ky, f, size ); + err_best = calculateError( clip, f, size ); + + int step = optimize_clip ? (numBins+1)/2 : 0; + + while( step > 0 ) + { + double err_min = err_best; + int idx_min = -1; + int inc_min = 0; + + for( int k = 0; k < size-1; ++k ) + { + if( clip[k] - step >= clip_max[k] ) + { + clip[k] -= step; + ky[k] = y[clip[k]][k]; + for( int l = 0; l < size; l++ ) + { + kE[k][l] = E[clip[k]][clip[l]][k][l]; + kE[l][k] = E[clip[l]][clip[k]][l][k]; + } + + gnsSolveByChol( kE, ky, f, size ); + err_last = calculateError( clip, f, size ); + + if( err_last < err_min ) + { + err_min = err_last; + idx_min = k; + inc_min = -step; + } + clip[k] += step; + } + if( clip[k] + step < numBins ) + { + clip[k] += step; + ky[k] = y[clip[k]][k]; + for( int l = 0; l < size; l++ ) + { + kE[k][l] = E[clip[k]][clip[l]][k][l]; + kE[l][k] = E[clip[l]][clip[k]][l][k]; + } + + gnsSolveByChol( kE, ky, f, size ); + err_last = calculateError( clip, f, size ); + + if( err_last < err_min ) + { + err_min = err_last; + idx_min = k; + inc_min = step; + } + clip[k] -= step; + + } + ky[k] = y[clip[k]][k]; + for( int l = 0; l < size; l++ ) + { + kE[k][l] = E[clip[k]][clip[l]][k][l]; + kE[l][k] = E[clip[l]][clip[k]][l][k]; + } + } + + if( idx_min >= 0 ) + { + err_best = err_min; + clip[idx_min] += inc_min; + ky[idx_min] = y[clip[idx_min]][idx_min]; + for( int l = 0; l < size; l++ ) + { + kE[idx_min][l] = E[clip[idx_min]][clip[l]][idx_min][l]; + kE[l][idx_min] = E[clip[l]][clip[idx_min]][l][idx_min]; + } + } + else + { + --step; + } + } + + if( optimize_clip ) { + // test all max + for( int k = 0; k 
< size-1; ++k ) + { + clip_max[k] = 0; + } + TE kE_max; + Ty ky_max; + setEyFromClip( clip_max, kE_max, ky_max, size ); + + gnsSolveByChol( kE_max, ky_max, f, size ); + err_last = calculateError( clip_max, f, size ); + if( err_last < err_best ) + { + err_best = err_last; + for (int k=0; k<size; ++k) + { + clip[k] = clip_max[k]; + } + } + else + { + // update clip to reduce coding cost + reduceClipCost(alfShape, clip); + + // update f with best solution + gnsSolveByChol( kE, ky, f, size ); + } + } + + return err_best; +} + +double AlfCovariance::calcErrorForCoeffs( const int *clip, const int *coeff, const int numCoeff, const int bitDepth ) const +{ + double factor = 1 << ( bitDepth - 1 ); + double error = 0; + + for( int i = 0; i < numCoeff; i++ ) //diagonal + { + double sum = 0; + for( int j = i + 1; j < numCoeff; j++ ) + { + // E[j][i] = E[i][j], sum will be multiplied by 2 later + sum += E[clip[i]][clip[j]][i][j] * coeff[j]; + } + error += ( ( E[clip[i]][clip[i]][i][i] * coeff[i] + sum * 2 ) / factor - 2 * y[clip[i]][i] ) * coeff[i]; + } + + return error / factor; +} + +double AlfCovariance::calculateError( const int *clip, const double *coeff, const int numCoeff ) const +{ + double sum = 0; + for( int i = 0; i < numCoeff; i++ ) + { + sum += coeff[i] * y[clip[i]][i]; + } + + return pixAcc - sum; +} + +double AlfCovariance::calculateError( const int *clip ) const +{ + Ty c; + + return optimizeFilter( clip, c, numCoeff ); +} +//******************************** +// Cholesky decomposition +//******************************** + +#define ROUND(a) (((a) < 0)? 
(int)((a) - 0.5) : (int)((a) + 0.5)) +#define REG 0.0001 +#define REG_SQR 0.0000001 + +//Find filter coeff related +int AlfCovariance::gnsCholeskyDec( TE inpMatr, TE outMatr, int numEq ) const +{ + Ty invDiag; /* Vector of the inverse of diagonal entries of outMatr */ + + for( int i = 0; i < numEq; i++ ) + { + for( int j = i; j < numEq; j++ ) + { + /* Compute the scaling factor */ + double scale = inpMatr[i][j]; + if( i > 0 ) + { + for( int k = i - 1; k >= 0; k-- ) + { + scale -= outMatr[k][j] * outMatr[k][i]; + } + } + + /* Compute i'th row of outMatr */ + if( i == j ) + { + if( scale <= REG_SQR ) // if(scale <= 0 ) /* If inpMatr is singular */ + { + return 0; + } + else /* Normal operation */ + invDiag[i] = 1.0 / ( outMatr[i][i] = sqrt( scale ) ); + } + else + { + outMatr[i][j] = scale * invDiag[i]; /* Upper triangular part */ + outMatr[j][i] = 0.0; /* Lower triangular part set to 0 */ + } + } + } + return 1; /* Signal that Cholesky factorization is successfully performed */ +} + +void AlfCovariance::gnsTransposeBacksubstitution( TE U, double* rhs, double* x, int order ) const +{ + /* Backsubstitution starts */ + x[0] = rhs[0] / U[0][0]; /* First row of U' */ + for( int i = 1; i < order; i++ ) + { /* For the rows 1..order-1 */ + + double sum = 0; //Holds backsubstitution from already handled rows + + for( int j = 0; j < i; j++ ) /* Backsubst already solved unknowns */ + { + sum += x[j] * U[j][i]; + } + + x[i] = ( rhs[i] - sum ) / U[i][i]; /* i'th component of solution vect. 
*/ + } +} + +void AlfCovariance::gnsBacksubstitution( TE R, double* z, int size, double* A ) const +{ + size--; + A[size] = z[size] / R[size][size]; + + for( int i = size - 1; i >= 0; i-- ) + { + double sum = 0; + + for( int j = i + 1; j <= size; j++ ) + { + sum += R[i][j] * A[j]; + } + + A[i] = ( z[i] - sum ) / R[i][i]; + } +} + +int AlfCovariance::gnsSolveByChol( const int *clip, double *x, int numEq ) const +{ + TE LHS; + Ty rhs; + + setEyFromClip( clip, LHS, rhs, numEq ); + return gnsSolveByChol( LHS, rhs, x, numEq ); +} + +int AlfCovariance::gnsSolveByChol( TE LHS, double* rhs, double *x, int numEq ) const +{ + Ty aux; /* Auxiliary vector */ + TE U; /* Upper triangular Cholesky factor of LHS */ + + int res = 1; // Signal that Cholesky factorization is successfully performed + + /* The equation to be solved is LHSx = rhs */ + + /* Compute upper triangular U such that U'*U = LHS */ + if( gnsCholeskyDec( LHS, U, numEq ) ) /* If Cholesky decomposition has been successful */ + { + /* Now, the equation is U'*U*x = rhs, where U is upper triangular + * Solve U'*aux = rhs for aux + */ + gnsTransposeBacksubstitution( U, rhs, aux, numEq ); + + /* The equation is now U*x = aux, solve it for x (new motion coefficients) */ + gnsBacksubstitution( U, aux, numEq, x ); + + } + else /* LHS was singular */ + { + res = 0; + + /* Regularize LHS */ + for( int i = 0; i < numEq; i++ ) + { + LHS[i][i] += REG; + } + + /* Compute upper triangular U such that U'*U = regularized LHS */ + res = gnsCholeskyDec( LHS, U, numEq ); + + if( !res ) + { + std::memset( x, 0, sizeof( double )*numEq ); + return 0; + } + + /* Solve U'*aux = rhs for aux */ + gnsTransposeBacksubstitution( U, rhs, aux, numEq ); + + /* Solve U*x = aux for x */ + gnsBacksubstitution( U, aux, numEq, x ); + } + return res; +} +////////////////////////////////////////////////////////////////////////////////////////// + +#endif EncAdaptiveLoopFilter::EncAdaptiveLoopFilter() : m_CABACEstimator( nullptr ) { @@ -53,16 +417,29 @@ 
EncAdaptiveLoopFilter::EncAdaptiveLoopFilter() { m_alfCovarianceFrame[i] = nullptr; } +#if !JVET_N0242_NON_LINEAR_ALF m_filterCoeffQuant = nullptr; +#endif m_filterCoeffSet = nullptr; +#if JVET_N0242_NON_LINEAR_ALF + m_filterClippSet = nullptr; +#endif m_diffFilterCoeff = nullptr; m_alfWSSD = 0; } +#if JVET_N0242_NON_LINEAR_ALF +void EncAdaptiveLoopFilter::create( const EncCfg* encCfg, const int picWidth, const int picHeight, const ChromaFormat chromaFormatIDC, const int maxCUWidth, const int maxCUHeight, const int maxCUDepth, const int inputBitDepth[MAX_NUM_CHANNEL_TYPE], const int internalBitDepth[MAX_NUM_CHANNEL_TYPE] ) +#else void EncAdaptiveLoopFilter::create( const int picWidth, const int picHeight, const ChromaFormat chromaFormatIDC, const int maxCUWidth, const int maxCUHeight, const int maxCUDepth, const int inputBitDepth[MAX_NUM_CHANNEL_TYPE], const int internalBitDepth[MAX_NUM_CHANNEL_TYPE] ) +#endif { AdaptiveLoopFilter::create( picWidth, picHeight, chromaFormatIDC, maxCUWidth, maxCUHeight, maxCUDepth, inputBitDepth ); +#if JVET_N0242_NON_LINEAR_ALF + CHECK( encCfg == nullptr, "encCfg must not be null" ); + m_encCfg = encCfg; +#endif for( int channelIdx = 0; channelIdx < MAX_NUM_CHANNEL_TYPE; channelIdx++ ) { @@ -109,13 +486,21 @@ void EncAdaptiveLoopFilter::create( const int picWidth, const int picHeight, con } } +#if !JVET_N0242_NON_LINEAR_ALF m_filterCoeffQuant = new int[MAX_NUM_ALF_LUMA_COEFF]; +#endif m_filterCoeffSet = new int*[MAX_NUM_ALF_CLASSES]; +#if JVET_N0242_NON_LINEAR_ALF + m_filterClippSet = new int*[MAX_NUM_ALF_CLASSES]; +#endif m_diffFilterCoeff = new int*[MAX_NUM_ALF_CLASSES]; for( int i = 0; i < MAX_NUM_ALF_CLASSES; i++ ) { m_filterCoeffSet[i] = new int[MAX_NUM_ALF_LUMA_COEFF]; +#if JVET_N0242_NON_LINEAR_ALF + m_filterClippSet[i] = new int[MAX_NUM_ALF_LUMA_COEFF]; +#endif m_diffFilterCoeff[i] = new int[MAX_NUM_ALF_LUMA_COEFF]; } } @@ -195,6 +580,19 @@ void EncAdaptiveLoopFilter::destroy() m_filterCoeffSet = nullptr; } +#if 
JVET_N0242_NON_LINEAR_ALF + if( m_filterClippSet ) + { + for( int i = 0; i < MAX_NUM_ALF_CLASSES; i++ ) + { + delete[] m_filterClippSet[i]; + m_filterClippSet[i] = nullptr; + } + delete[] m_filterClippSet; + m_filterClippSet = nullptr; + } + +#endif if( m_diffFilterCoeff ) { for( int i = 0; i < MAX_NUM_ALF_CLASSES; i++ ) @@ -206,9 +604,11 @@ void EncAdaptiveLoopFilter::destroy() m_diffFilterCoeff = nullptr; } +#if !JVET_N0242_NON_LINEAR_ALF delete[] m_filterCoeffQuant; m_filterCoeffQuant = nullptr; +#endif AdaptiveLoopFilter::destroy(); } @@ -382,11 +782,26 @@ void EncAdaptiveLoopFilter::alfEncoder( CodingStructure& cs, AlfSliceParam& alfS setCtuEnableFlag( m_ctuEnableFlagTmp, channel, 0 ); } +#if JVET_N0242_NON_LINEAR_ALF + const int nonLinearFlagMax = + ( isLuma( channel ) ? m_encCfg->getUseNonLinearAlfLuma() : m_encCfg->getUseNonLinearAlfChroma() ) + ? 2 : 1; + + for( int nonLinearFlag = 0; nonLinearFlag < nonLinearFlagMax; nonLinearFlag++ ) + { +#endif //2. all CTUs are on setEnableFlag( m_alfSliceParamTemp, channel, true ); +#if JVET_N0242_NON_LINEAR_ALF + m_alfSliceParamTemp.nonLinearFlag[channel] = nonLinearFlag; +#endif m_CABACEstimator->getCtx() = AlfCtx( ctxStart ); setCtuEnableFlag( m_ctuEnableFlag, channel, 1 ); +#if JVET_N0242_NON_LINEAR_ALF + cost = getFilterCoeffAndCost( cs, 0, channel, nonLinearFlag != 0, iShapeIdx, uiCoeffBits ); +#else cost = getFilterCoeffAndCost( cs, 0, channel, false, iShapeIdx, uiCoeffBits ); +#endif if( cost < costMin ) { @@ -425,6 +840,9 @@ void EncAdaptiveLoopFilter::alfEncoder( CodingStructure& cs, AlfSliceParam& alfS cost = getFilterCoeffAndCost(cs, distUnfilter, channel, true, iShapeIdx, uiCoeffBits); } }//for iter +#if JVET_N0242_NON_LINEAR_ALF + }// for nonLineaFlag +#endif }//for shapeIdx m_CABACEstimator->getCtx() = AlfCtx( ctxBest ); copyCtuEnableFlag( m_ctuEnableFlag, m_ctuEnableFlagTmp, channel ); @@ -443,6 +861,9 @@ void EncAdaptiveLoopFilter::alfEncoder( CodingStructure& cs, AlfSliceParam& alfS const int 
chromaScaleY = getComponentScaleY( compID, recBuf.chromaFormat ); AlfFilterType filterType = isLuma( compID ) ? ALF_FILTER_7 : ALF_FILTER_5; short* coeff = isLuma( compID ) ? m_coeffFinal : alfSliceParam.chromaCoeff; +#if JVET_N0242_NON_LINEAR_ALF + short* clipp = isLuma( compID ) ? m_clippFinal : m_chromaClippFinal; //alfSliceParam.chromaClipp; +#endif for( int yPos = 0; yPos < pcv.lumaHeight; yPos += pcv.maxCUHeight ) { @@ -456,11 +877,19 @@ void EncAdaptiveLoopFilter::alfEncoder( CodingStructure& cs, AlfSliceParam& alfS { if( filterType == ALF_FILTER_5 ) { +#if JVET_N0242_NON_LINEAR_ALF + m_filter5x5Blk( m_classifier, recBuf, recExtBuf, blk, compID, coeff, clipp, m_clpRngs.comp[compIdx], cs ); +#else m_filter5x5Blk( m_classifier, recBuf, recExtBuf, blk, compID, coeff, m_clpRngs.comp[compIdx], cs ); +#endif } else if( filterType == ALF_FILTER_7 ) { +#if JVET_N0242_NON_LINEAR_ALF + m_filter7x7Blk( m_classifier, recBuf, recExtBuf, blk, compID, coeff, clipp, m_clpRngs.comp[compIdx], cs ); +#else m_filter7x7Blk( m_classifier, recBuf, recExtBuf, blk, compID, coeff, m_clpRngs.comp[compIdx], cs ); +#endif } else { @@ -482,9 +911,15 @@ void EncAdaptiveLoopFilter::copyAlfSliceParam( AlfSliceParam& alfSliceParamDst, } else { +#if JVET_N0242_NON_LINEAR_ALF + alfSliceParamDst.nonLinearFlag[channel] = alfSliceParamSrc.nonLinearFlag[channel]; +#endif alfSliceParamDst.enabledFlag[COMPONENT_Cb] = alfSliceParamSrc.enabledFlag[COMPONENT_Cb]; alfSliceParamDst.enabledFlag[COMPONENT_Cr] = alfSliceParamSrc.enabledFlag[COMPONENT_Cr]; memcpy( alfSliceParamDst.chromaCoeff, alfSliceParamSrc.chromaCoeff, sizeof( alfSliceParamDst.chromaCoeff ) ); +#if JVET_N0242_NON_LINEAR_ALF + memcpy( alfSliceParamDst.chromaClipp, alfSliceParamSrc.chromaClipp, sizeof( alfSliceParamDst.chromaClipp ) ); +#endif } } double EncAdaptiveLoopFilter::getFilterCoeffAndCost( CodingStructure& cs, double distUnfilter, ChannelType channel, bool bReCollectStat, int iShapeIdx, int& uiCoeffBits ) @@ -502,19 +937,37 @@ 
double EncAdaptiveLoopFilter::getFilterCoeffAndCost( CodingStructure& cs, double //get filter coeff if( isLuma( channel ) ) { +#if JVET_N0242_NON_LINEAR_ALF + std::fill_n(m_alfClipMerged[iShapeIdx][0][0], MAX_NUM_ALF_LUMA_COEFF*MAX_NUM_ALF_CLASSES*MAX_NUM_ALF_CLASSES, m_alfSliceParamTemp.nonLinearFlag[channel] ? AlfNumClippingValues[CHANNEL_TYPE_LUMA] / 2 : 0); + // Reset Merge Tmp Cov + m_alfCovarianceMerged[iShapeIdx][MAX_NUM_ALF_CLASSES].reset(AlfNumClippingValues[channel]); //distortion + dist += mergeFiltersAndCost( m_alfSliceParamTemp, alfFilterShape, m_alfCovarianceFrame[channel][iShapeIdx], m_alfCovarianceMerged[iShapeIdx], m_alfClipMerged[iShapeIdx], uiCoeffBits ); +#else dist += mergeFiltersAndCost( m_alfSliceParamTemp, alfFilterShape, m_alfCovarianceFrame[channel][iShapeIdx], m_alfCovarianceMerged[iShapeIdx], uiCoeffBits ); +#endif } else { //distortion +#if JVET_N0242_NON_LINEAR_ALF + assert(alfFilterShape.numCoeff == m_alfCovarianceFrame[channel][iShapeIdx][0].numCoeff); + std::fill_n(m_filterClippSet[0], MAX_NUM_ALF_CHROMA_COEFF, m_alfSliceParamTemp.nonLinearFlag[channel] ? 
AlfNumClippingValues[CHANNEL_TYPE_CHROMA] / 2 : 0); + dist += m_alfCovarianceFrame[channel][iShapeIdx][0].pixAcc + deriveCoeffQuant( m_filterClippSet[0], m_filterCoeffSet[0], m_alfCovarianceFrame[channel][iShapeIdx][0], alfFilterShape, m_NUM_BITS, m_alfSliceParamTemp.nonLinearFlag[channel] ); +#else dist += m_alfCovarianceFrame[channel][iShapeIdx][0].pixAcc + deriveCoeffQuant( m_filterCoeffQuant, m_alfCovarianceFrame[channel][iShapeIdx][0].E, m_alfCovarianceFrame[channel][iShapeIdx][0].y, alfFilterShape.numCoeff, alfFilterShape.weights, m_NUM_BITS, true ); memcpy( m_filterCoeffSet[0], m_filterCoeffQuant, sizeof( *m_filterCoeffQuant ) * alfFilterShape.numCoeff ); +#endif //setEnableFlag( m_alfSliceParamTemp, channel, m_ctuEnableFlag ); const int alfChromaIdc = m_alfSliceParamTemp.enabledFlag[COMPONENT_Cb] * 2 + m_alfSliceParamTemp.enabledFlag[COMPONENT_Cr]; for( int i = 0; i < MAX_NUM_ALF_CHROMA_COEFF; i++ ) { +#if JVET_N0242_NON_LINEAR_ALF + m_alfSliceParamTemp.chromaCoeff[i] = m_filterCoeffSet[0][i]; + m_alfSliceParamTemp.chromaClipp[i] = m_filterClippSet[0][i]; +#else m_alfSliceParamTemp.chromaCoeff[i] = m_filterCoeffQuant[i]; +#endif } uiCoeffBits += getCoeffRate( m_alfSliceParamTemp, true ); uiSliceFlag = lengthTruncatedUnary(alfChromaIdc, 3); @@ -530,6 +983,9 @@ double EncAdaptiveLoopFilter::getFilterCoeffAndCost( CodingStructure& cs, double int EncAdaptiveLoopFilter::getCoeffRate( AlfSliceParam& alfSliceParam, bool isChroma ) { int iBits = 0; +#if JVET_N0242_NON_LINEAR_ALF + assert( isChroma ); +#else if( !isChroma ) { iBits++; // alf_coefficients_delta_flag @@ -541,10 +997,29 @@ int EncAdaptiveLoopFilter::getCoeffRate( AlfSliceParam& alfSliceParam, bool isCh } } } +#endif memset( m_bitsCoeffScan, 0, sizeof( m_bitsCoeffScan ) ); +#if JVET_N0242_NON_LINEAR_ALF + AlfFilterShape alfShape( 5 ); +#else AlfFilterShape alfShape( isChroma ? 
5 : 7 ); +#endif const int maxGolombIdx = AdaptiveLoopFilter::getMaxGolombIdx( alfShape.filterType ); +#if JVET_N0242_NON_LINEAR_ALF + const int numFilters = 1; + + // vlc for all + for( int i = 0; i < alfShape.numCoeff - 1; i++ ) + { + int coeffVal = abs( alfSliceParam.chromaCoeff[i] ); + + for( int k = 1; k < 15; k++ ) + { + m_bitsCoeffScan[alfShape.golombIdx[i]][k] += lengthGolomb( coeffVal, k ); + } + } +#else const short* coeff = isChroma ? alfSliceParam.chromaCoeff : alfSliceParam.lumaCoeff; const int numFilters = isChroma ? 1 : alfSliceParam.numLumaFilters; @@ -564,6 +1039,7 @@ int EncAdaptiveLoopFilter::getCoeffRate( AlfSliceParam& alfSliceParam, bool isCh } } } +#endif int kMin = getGolombKMin( alfShape, numFilters, m_kMinTab, m_bitsCoeffScan ); @@ -579,6 +1055,13 @@ int EncAdaptiveLoopFilter::getCoeffRate( AlfSliceParam& alfSliceParam, bool isCh kMin = m_kMinTab[idx]; } +#if JVET_N0242_NON_LINEAR_ALF + // Filter coefficients + for( int i = 0; i < alfShape.numCoeff - 1; i++ ) + { + iBits += lengthGolomb( alfSliceParam.chromaCoeff[i], m_kMinTab[alfShape.golombIdx[i]] ); // alf_coeff_chroma[i], alf_coeff_luma_delta[i][j] + } +#else if( !isChroma ) { if( alfSliceParam.alfLumaCoeffDeltaFlag ) @@ -600,6 +1083,48 @@ int EncAdaptiveLoopFilter::getCoeffRate( AlfSliceParam& alfSliceParam, bool isCh iBits += lengthGolomb( coeff[ind* MAX_NUM_ALF_LUMA_COEFF + i], m_kMinTab[alfShape.golombIdx[i]] ); // alf_coeff_chroma[i], alf_coeff_luma_delta[i][j] } } +#endif + +#if JVET_N0242_NON_LINEAR_ALF + if( m_alfSliceParamTemp.nonLinearFlag[isChroma] ) + { + memset( m_bitsCoeffScan, 0, sizeof( m_bitsCoeffScan ) ); + // vlc for all + for( int i = 0; i < alfShape.numCoeff - 1; i++ ) + { + if( !abs( alfSliceParam.chromaCoeff[i] ) ) + continue; + int coeffVal = abs( alfSliceParam.chromaClipp[i] ); + + for( int k = 1; k < 15; k++ ) + { + m_bitsCoeffScan[alfShape.golombIdx[i]][k] += lengthGolomb( coeffVal, k, false ); + } + } + + kMin = getGolombKMin( alfShape, numFilters, 
m_kMinTab, m_bitsCoeffScan ); + + // Golomb parameters + iBits += lengthUvlc( kMin - 1 ); // "min_golomb_order" + golombOrderIncreaseFlag = 0; + + for( int idx = 0; idx < maxGolombIdx; idx++ ) + { + golombOrderIncreaseFlag = ( m_kMinTab[idx] != kMin ) ? 1 : 0; + CHECK( !( m_kMinTab[idx] <= kMin + 1 ), "ALF Golomb parameter not consistent" ); + iBits += golombOrderIncreaseFlag; //golomb_order_increase_flag + kMin = m_kMinTab[idx]; + } + + // Filter coefficients + for( int i = 0; i < alfShape.numCoeff - 1; i++ ) + { + if( !abs( alfSliceParam.chromaCoeff[i] ) ) + continue; + iBits += lengthGolomb( alfSliceParam.chromaClipp[i], m_kMinTab[alfShape.golombIdx[i]], false ); // alf_coeff_chroma[i], alf_coeff_luma_delta[i][j] + } + } +#endif return iBits; } @@ -634,13 +1159,21 @@ double EncAdaptiveLoopFilter::getFilteredDistortion( AlfCovariance* cov, const i for( int classIdx = 0; classIdx < numClasses; classIdx++ ) { int filterIdx = numClasses == 1 ? 0 : m_filterIndices[numFiltersMinus1][classIdx]; +#if JVET_N0242_NON_LINEAR_ALF + dist += cov[classIdx].calcErrorForCoeffs( m_filterClippSet[filterIdx], m_filterCoeffSet[filterIdx], numCoeff, m_NUM_BITS ); +#else dist += calcErrorForCoeffs( cov[classIdx].E, cov[classIdx].y, m_filterCoeffSet[filterIdx], numCoeff, m_NUM_BITS ); +#endif } return dist; } +#if JVET_N0242_NON_LINEAR_ALF +double EncAdaptiveLoopFilter::mergeFiltersAndCost( AlfSliceParam& alfSliceParam, AlfFilterShape& alfShape, AlfCovariance* covFrame, AlfCovariance* covMerged, int clipMerged[MAX_NUM_ALF_CLASSES][MAX_NUM_ALF_CLASSES][MAX_NUM_ALF_LUMA_COEFF], int& uiCoeffBits ) +#else double EncAdaptiveLoopFilter::mergeFiltersAndCost( AlfSliceParam& alfSliceParam, AlfFilterShape& alfShape, AlfCovariance* covFrame, AlfCovariance* covMerged, int& uiCoeffBits ) +#endif { int numFiltersBest = 0; int numFilters = MAX_NUM_ALF_CLASSES; @@ -650,11 +1183,19 @@ double EncAdaptiveLoopFilter::mergeFiltersAndCost( AlfSliceParam& alfSliceParam, double cost, cost0, dist, distForce0, 
costMin = MAX_DOUBLE; int predMode = 0, bestPredMode = 0, coeffBits, coeffBitsForce0; +#if JVET_N0242_NON_LINEAR_ALF + mergeClasses( alfShape, covFrame, covMerged, clipMerged, MAX_NUM_ALF_CLASSES, m_filterIndices ); +#else mergeClasses( covFrame, covMerged, MAX_NUM_ALF_CLASSES, m_filterIndices ); +#endif while( numFilters >= 1 ) { +#if JVET_N0242_NON_LINEAR_ALF + dist = deriveFilterCoeffs( covFrame, covMerged, clipMerged, alfShape, m_filterIndices[numFilters - 1], numFilters, errorForce0CoeffTab ); +#else dist = deriveFilterCoeffs( covFrame, covMerged, alfShape, m_filterIndices[numFilters - 1], numFilters, errorForce0CoeffTab ); +#endif // filter coeffs are stored in m_filterCoeffSet distForce0 = getDistForce0( alfShape, numFilters, errorForce0CoeffTab, codedVarBins ); coeffBits = deriveFilterCoefficientsPredictionMode( alfShape, m_filterCoeffSet, m_diffFilterCoeff, numFilters, predMode ); @@ -677,7 +1218,11 @@ double EncAdaptiveLoopFilter::mergeFiltersAndCost( AlfSliceParam& alfSliceParam, numFilters--; } +#if JVET_N0242_NON_LINEAR_ALF + dist = deriveFilterCoeffs( covFrame, covMerged, clipMerged, alfShape, m_filterIndices[numFiltersBest - 1], numFiltersBest, errorForce0CoeffTab ); +#else dist = deriveFilterCoeffs( covFrame, covMerged, alfShape, m_filterIndices[numFiltersBest - 1], numFiltersBest, errorForce0CoeffTab ); +#endif coeffBits = deriveFilterCoefficientsPredictionMode( alfShape, m_filterCoeffSet, m_diffFilterCoeff, numFiltersBest, predMode ); distForce0 = getDistForce0( alfShape, numFiltersBest, errorForce0CoeffTab, codedVarBins ); coeffBitsForce0 = getCostFilterCoeffForce0( alfShape, m_filterCoeffSet, numFiltersBest, codedVarBins ); @@ -707,6 +1252,9 @@ double EncAdaptiveLoopFilter::mergeFiltersAndCost( AlfSliceParam& alfSliceParam, if( codedVarBins[varInd] == 0 ) { memset( m_filterCoeffSet[varInd], 0, sizeof( int )*MAX_NUM_ALF_LUMA_COEFF ); +#if JVET_N0242_NON_LINEAR_ALF + memset( m_filterClippSet[varInd], 0, sizeof( int )*MAX_NUM_ALF_LUMA_COEFF ); 
+#endif } } } @@ -723,6 +1271,9 @@ double EncAdaptiveLoopFilter::mergeFiltersAndCost( AlfSliceParam& alfSliceParam, { alfSliceParam.lumaCoeff[ind * MAX_NUM_ALF_LUMA_COEFF + i] = m_filterCoeffSet[ind][i]; } +#if JVET_N0242_NON_LINEAR_ALF + alfSliceParam.lumaClipp[ind * MAX_NUM_ALF_LUMA_COEFF + i] = m_filterClippSet[ind][i]; +#endif } } @@ -847,6 +1398,52 @@ int EncAdaptiveLoopFilter::getCostFilterCoeffForce0( AlfFilterShape& alfShape, i } } +#if JVET_N0242_NON_LINEAR_ALF + if( m_alfSliceParamTemp.nonLinearFlag[CHANNEL_TYPE_LUMA] ) + { + memset( m_bitsCoeffScan, 0, sizeof( m_bitsCoeffScan ) ); + + for( int ind = 0; ind < numFilters; ++ind ) + { + if( !codedVarBins[ind] ) + { + continue; + } + for( int i = 0; i < alfShape.numCoeff - 1; i++ ) + { + if( !abs( pDiffQFilterCoeffIntPP[ind][i] ) ) + continue; + int coeffVal = abs( m_filterClippSet[ind][i] ); + for( int k = 1; k < 15; k++ ) + { + m_bitsCoeffScan[alfShape.golombIdx[i]][k] += lengthGolomb( coeffVal, k, false ); + } + } + } + + kMin = getGolombKMin( alfShape, numFilters, m_kMinTab, m_bitsCoeffScan ); + + // Coding parameters + len += kMin //min_golomb_order + + maxGolombIdx //golomb_order_increase_flag + ; + + // Filter coefficients + for( int ind = 0; ind < numFilters; ++ind ) + { + if( codedVarBins[ind] ) + { + for( int i = 0; i < alfShape.numCoeff - 1; i++ ) + { + if( !abs( pDiffQFilterCoeffIntPP[ind][i] ) ) + continue; + len += lengthGolomb( abs( m_filterClippSet[ind][i] ), m_kMinTab[alfShape.golombIdx[i]], false ); // alf_coeff_luma_delta[i][j] + } + } + } + } + +#endif return len; } @@ -873,8 +1470,16 @@ int EncAdaptiveLoopFilter::deriveFilterCoefficientsPredictionMode( AlfFilterShap predMode = ( ratePredMode1 < ratePredMode0 && numFilters > 1 ) ? 1 : 0; +#if JVET_N0242_NON_LINEAR_ALF + int rateClipp = m_alfSliceParamTemp.nonLinearFlag[CHANNEL_TYPE_LUMA] ? getCostFilterClipp( alfShape, filterSet, numFilters ) : 0; + + return ( numFilters > 1 ? 
1 : 0 ) // coeff_delta_pred_mode_flag + + rateClipp + + ( predMode ? ratePredMode1 : ratePredMode0 ); // min_golomb_order, golomb_order_increase_flag, alf_coeff_luma_delta +#else return ( numFilters > 1 ? 1 : 0 ) // coeff_delta_pred_mode_flag + ( predMode ? ratePredMode1 : ratePredMode0 ); // min_golomb_order, golomb_order_increase_flag, alf_coeff_luma_delta +#endif } int EncAdaptiveLoopFilter::getCostFilterCoeff( AlfFilterShape& alfShape, int **pDiffQFilterCoeffIntPP, const int numFilters ) @@ -907,6 +1512,30 @@ int EncAdaptiveLoopFilter::getCostFilterCoeff( AlfFilterShape& alfShape, int **p return len; } +#if JVET_N0242_NON_LINEAR_ALF +int EncAdaptiveLoopFilter::getCostFilterClipp( AlfFilterShape& alfShape, int **pDiffQFilterCoeffIntPP, const int numFilters ) +{ + memset( m_bitsCoeffScan, 0, sizeof( m_bitsCoeffScan ) ); + for( int filterIdx = 0; filterIdx < numFilters; ++filterIdx ) + { + for( int i = 0; i < alfShape.numCoeff - 1; i++ ) + { + if( !abs( pDiffQFilterCoeffIntPP[filterIdx][i] ) ) + continue; + int clippVal = abs( m_filterClippSet[filterIdx][i] ); + for( int k = 1; k < 15; k++ ) + { + m_bitsCoeffScan[alfShape.golombIdx[i]][k] += lengthGolomb( clippVal, k ); + } + } + } + int len = getGolombKMin( alfShape, numFilters, m_kMinTab, m_bitsCoeffScan ); + return len //min_golomb_order + + getMaxGolombIdx( alfShape.filterType ) //golomb_order_increase_flag + + lengthFilterClipps( alfShape, numFilters, pDiffQFilterCoeffIntPP, m_kMinTab ); // Filter clippings +} + +#endif int EncAdaptiveLoopFilter::lengthFilterCoeffs( AlfFilterShape& alfShape, const int numFilters, int **FilterCoeff, int* kMinTab ) { int bitCnt = 0; @@ -921,6 +1550,24 @@ int EncAdaptiveLoopFilter::lengthFilterCoeffs( AlfFilterShape& alfShape, const i return bitCnt; } +#if JVET_N0242_NON_LINEAR_ALF +int EncAdaptiveLoopFilter::lengthFilterClipps( AlfFilterShape& alfShape, const int numFilters, int **FilterCoeff, int* kMinTab ) +{ + int bitCnt = 0; + + for( int ind = 0; ind < numFilters; ++ind ) + 
{ + for( int i = 0; i < alfShape.numCoeff - 1; i++ ) + { + if( !abs( FilterCoeff[ind][i] ) ) + continue; + bitCnt += lengthGolomb( abs( m_filterClippSet[ind][i] ), kMinTab[alfShape.golombIdx[i]], false ); + } + } + return bitCnt; +} + +#endif double EncAdaptiveLoopFilter::getDistForce0( AlfFilterShape& alfShape, const int numFilters, double errorTabForce0Coeff[MAX_NUM_ALF_CLASSES][2], bool* codedVarBins ) { static int bitsVarBin[MAX_NUM_ALF_CLASSES]; @@ -949,6 +1596,38 @@ double EncAdaptiveLoopFilter::getDistForce0( AlfFilterShape& alfShape, const int } } +#if JVET_N0242_NON_LINEAR_ALF + if( m_alfSliceParamTemp.nonLinearFlag[CHANNEL_TYPE_LUMA] ) + { + memset( m_bitsCoeffScan, 0, sizeof( m_bitsCoeffScan ) ); + for( int ind = 0; ind < numFilters; ++ind ) + { + for( int i = 0; i < alfShape.numCoeff - 1; i++ ) + { + if( !abs( m_filterCoeffSet[ind][i] ) ) + continue; + int coeffVal = abs( m_filterClippSet[ind][i] ); + for( int k = 1; k < 15; k++ ) + { + m_bitsCoeffScan[alfShape.golombIdx[i]][k] += lengthGolomb( coeffVal, k, false ); + } + } + } + + getGolombKMin( alfShape, numFilters, m_kMinTab, m_bitsCoeffScan ); + + for( int ind = 0; ind < numFilters; ++ind ) + { + for( int i = 0; i < alfShape.numCoeff - 1; i++ ) + { + if( !abs( m_filterCoeffSet[ind][i] ) ) + continue; + bitsVarBin[ind] += lengthGolomb( abs( m_filterClippSet[ind][i] ), m_kMinTab[alfShape.golombIdx[i]], false ); + } + } + } + +#endif double distForce0 = getDistCoeffForce0( codedVarBins, errorTabForce0Coeff, bitsVarBin, numFilters ); return distForce0; @@ -1035,11 +1714,19 @@ int EncAdaptiveLoopFilter::lengthUvlc( int uiCode ) return ( uiLength >> 1 ) + ( ( uiLength + 1 ) >> 1 ); } +#if JVET_N0242_NON_LINEAR_ALF +int EncAdaptiveLoopFilter::lengthGolomb( int coeffVal, int k, bool signed_coeff ) +#else int EncAdaptiveLoopFilter::lengthGolomb( int coeffVal, int k ) +#endif { int m = 2 << ( k - 1 ); int q = coeffVal / m; +#if JVET_N0242_NON_LINEAR_ALF + if( signed_coeff && coeffVal != 0 ) +#else if( 
coeffVal != 0 ) +#endif { return q + 2 + k; } @@ -1049,47 +1736,127 @@ int EncAdaptiveLoopFilter::lengthGolomb( int coeffVal, int k ) } } +#if JVET_N0242_NON_LINEAR_ALF +double EncAdaptiveLoopFilter::deriveFilterCoeffs( AlfCovariance* cov, AlfCovariance* covMerged, int clipMerged[MAX_NUM_ALF_CLASSES][MAX_NUM_ALF_CLASSES][MAX_NUM_ALF_LUMA_COEFF], AlfFilterShape& alfShape, short* filterIndices, int numFilters, double errorTabForce0Coeff[MAX_NUM_ALF_CLASSES][2] ) +#else double EncAdaptiveLoopFilter::deriveFilterCoeffs( AlfCovariance* cov, AlfCovariance* covMerged, AlfFilterShape& alfShape, short* filterIndices, int numFilters, double errorTabForce0Coeff[MAX_NUM_ALF_CLASSES][2] ) +#endif { double error = 0.0; AlfCovariance& tmpCov = covMerged[MAX_NUM_ALF_CLASSES]; for( int filtIdx = 0; filtIdx < numFilters; filtIdx++ ) { tmpCov.reset(); +#if JVET_N0242_NON_LINEAR_ALF + bool found_clip = false; +#endif for( int classIdx = 0; classIdx < MAX_NUM_ALF_CLASSES; classIdx++ ) { if( filterIndices[classIdx] == filtIdx ) { tmpCov += cov[classIdx]; +#if JVET_N0242_NON_LINEAR_ALF + if( !found_clip ) + { + found_clip = true; // clip should be at the adress of shortest one + memcpy(m_filterClippSet[filtIdx], clipMerged[numFilters-1][classIdx], sizeof(int[MAX_NUM_ALF_LUMA_COEFF])); + } +#endif } } // Find coeffcients +#if JVET_N0242_NON_LINEAR_ALF + assert(alfShape.numCoeff == tmpCov.numCoeff); + errorTabForce0Coeff[filtIdx][1] = tmpCov.pixAcc + deriveCoeffQuant( m_filterClippSet[filtIdx], m_filterCoeffSet[filtIdx], tmpCov, alfShape, m_NUM_BITS, false ); +#else errorTabForce0Coeff[filtIdx][1] = tmpCov.pixAcc + deriveCoeffQuant( m_filterCoeffQuant, tmpCov.E, tmpCov.y, alfShape.numCoeff, alfShape.weights, m_NUM_BITS ); +#endif errorTabForce0Coeff[filtIdx][0] = tmpCov.pixAcc; error += errorTabForce0Coeff[filtIdx][1]; +#if !JVET_N0242_NON_LINEAR_ALF // store coeff memcpy( m_filterCoeffSet[filtIdx], m_filterCoeffQuant, sizeof( int )*alfShape.numCoeff ); +#endif } return error; } +#if 
JVET_N0242_NON_LINEAR_ALF +double EncAdaptiveLoopFilter::deriveCoeffQuant( int *filterClipp, int *filterCoeffQuant, const AlfCovariance& cov, const AlfFilterShape& shape, const int bitDepth, const bool optimizeClip ) +#else double EncAdaptiveLoopFilter::deriveCoeffQuant( int *filterCoeffQuant, double **E, double *y, const int numCoeff, std::vector<int>& weights, const int bitDepth, const bool bChroma ) +#endif { const int factor = 1 << ( bitDepth - 1 ); +#if JVET_N0242_NON_LINEAR_ALF +const int numCoeff = shape.numCoeff; +#else static int filterCoeffQuantMod[MAX_NUM_ALF_LUMA_COEFF]; +#endif static double filterCoeff[MAX_NUM_ALF_LUMA_COEFF]; +#if JVET_N0242_NON_LINEAR_ALF + cov.optimizeFilter( shape, filterClipp, filterCoeff, optimizeClip ); +#else gnsSolveByChol( E, y, filterCoeff, numCoeff ); +#endif roundFiltCoeff( filterCoeffQuant, filterCoeff, numCoeff, factor ); +#if JVET_N0242_NON_LINEAR_ALF + + const int max_value = factor - 1; + const int min_value = -factor; + + for ( int i = 0; i < numCoeff - 1; i++ ) + { + filterCoeffQuant[i] = std::min( max_value, std::max( min_value, filterCoeffQuant[i] ) ); + } + filterCoeffQuant[numCoeff - 1] = 0; + + int modified=1; + + double errRef=cov.calcErrorForCoeffs( filterClipp, filterCoeffQuant, numCoeff, bitDepth ); + while( modified ) + { + modified=0; + for( int sign: {1, -1} ) + { + double errMin = MAX_DOUBLE; + int minInd = -1; + + for( int k = 0; k < numCoeff-1; k++ ) + { + if( filterCoeffQuant[k] - sign > max_value || filterCoeffQuant[k] - sign < min_value ) + continue; + + filterCoeffQuant[k] -= sign; + + double error = cov.calcErrorForCoeffs( filterClipp, filterCoeffQuant, numCoeff, bitDepth ); + if( error < errMin ) + { + errMin = error; + minInd = k; + } + filterCoeffQuant[k] += sign; + } + if( errMin < errRef ) + { + filterCoeffQuant[minInd] -= sign; + modified++; + errRef = errMin; + } + } + } + + return errRef; +#else const int targetCoeffSumInt = 0; int quantCoeffSum = 0; for( int i = 0; i < numCoeff; i++ ) { 
quantCoeffSum += weights[i] * filterCoeffQuant[i]; } - int count = 0; while( quantCoeffSum != targetCoeffSumInt && count < 10 ) { @@ -1219,8 +1986,10 @@ double EncAdaptiveLoopFilter::deriveCoeffQuant( int *filterCoeffQuant, double ** double error = calcErrorForCoeffs( E, y, filterCoeffQuant, numCoeff, bitDepth ); return error; +#endif } +#if !JVET_N0242_NON_LINEAR_ALF double EncAdaptiveLoopFilter::calcErrorForCoeffs( double **E, double *y, int *coeff, const int numCoeff, const int bitDepth ) { double factor = 1 << ( bitDepth - 1 ); @@ -1240,6 +2009,7 @@ double EncAdaptiveLoopFilter::calcErrorForCoeffs( double **E, double *y, int *co return error / factor; } +#endif void EncAdaptiveLoopFilter::roundFiltCoeff( int *filterCoeffQuant, double *filterCoeff, const int numCoeff, const int factor ) { for( int i = 0; i < numCoeff; i++ ) @@ -1249,8 +2019,18 @@ void EncAdaptiveLoopFilter::roundFiltCoeff( int *filterCoeffQuant, double *filte } } +#if JVET_N0242_NON_LINEAR_ALF +void EncAdaptiveLoopFilter::mergeClasses( const AlfFilterShape& alfShape, AlfCovariance* cov, AlfCovariance* covMerged, int clipMerged[MAX_NUM_ALF_CLASSES][MAX_NUM_ALF_CLASSES][MAX_NUM_ALF_LUMA_COEFF], const int numClasses, short filterIndices[MAX_NUM_ALF_CLASSES][MAX_NUM_ALF_CLASSES] ) +#else void EncAdaptiveLoopFilter::mergeClasses( AlfCovariance* cov, AlfCovariance* covMerged, const int numClasses, short filterIndices[MAX_NUM_ALF_CLASSES][MAX_NUM_ALF_CLASSES] ) +#endif { +#if JVET_N0242_NON_LINEAR_ALF + static int tmpClip[MAX_NUM_ALF_LUMA_COEFF]; + static int bestMergeClip[MAX_NUM_ALF_LUMA_COEFF]; + static double err[MAX_NUM_ALF_CLASSES]; + static double bestMergeErr; +#endif static bool availableClass[MAX_NUM_ALF_CLASSES]; static uint8_t indexList[MAX_NUM_ALF_CLASSES]; static uint8_t indexListTemp[MAX_NUM_ALF_CLASSES]; @@ -1264,14 +2044,38 @@ void EncAdaptiveLoopFilter::mergeClasses( AlfCovariance* cov, AlfCovariance* cov indexList[i] = i; availableClass[i] = true; covMerged[i] = cov[i]; +#if 
JVET_N0242_NON_LINEAR_ALF + covMerged[i].numBins = m_alfSliceParamTemp.nonLinearFlag[CHANNEL_TYPE_LUMA] ? AlfNumClippingValues[COMPONENT_Y] : 1; +#endif } // Try merging different covariance matrices // temporal AlfCovariance structure is allocated as the last element in covMerged array, the size of covMerged is MAX_NUM_ALF_CLASSES + 1 AlfCovariance& tmpCov = covMerged[MAX_NUM_ALF_CLASSES]; +#if JVET_N0242_NON_LINEAR_ALF + tmpCov.numBins = m_alfSliceParamTemp.nonLinearFlag[CHANNEL_TYPE_LUMA] ? AlfNumClippingValues[COMPONENT_Y] : 1; + + // init Clip + for( int i = 0; i < numClasses; i++ ) + { + std::fill_n(clipMerged[numRemaining-1][i], MAX_NUM_ALF_LUMA_COEFF, m_alfSliceParamTemp.nonLinearFlag[CHANNEL_TYPE_LUMA] ? AlfNumClippingValues[CHANNEL_TYPE_LUMA] / 2 : 0); + if ( m_alfSliceParamTemp.nonLinearFlag[CHANNEL_TYPE_LUMA] ) + { + err[i] = covMerged[i].optimizeFilterClip( alfShape, clipMerged[numRemaining-1][i] ); + } + else + { + err[i] = covMerged[i].calculateError( clipMerged[numRemaining-1][i] ); + } + } +#endif +#if JVET_N0242_NON_LINEAR_ALF + while( numRemaining >= 2 ) +#else while( numRemaining > 2 ) +#endif { double errorMin = std::numeric_limits<double>::max(); int bestToMergeIdx1 = 0, bestToMergeIdx2 = 1; @@ -1284,14 +2088,32 @@ void EncAdaptiveLoopFilter::mergeClasses( AlfCovariance* cov, AlfCovariance* cov { if( availableClass[j] ) { +#if JVET_N0242_NON_LINEAR_ALF + double error1 = err[i]; + double error2 = err[j]; +#else double error1 = calculateError( covMerged[i] ); double error2 = calculateError( covMerged[j] ); +#endif tmpCov.add( covMerged[i], covMerged[j] ); +#if JVET_N0242_NON_LINEAR_ALF + for( int l = 0; l < MAX_NUM_ALF_LUMA_COEFF; ++l ) + { + tmpClip[l] = (clipMerged[numRemaining-1][i][l] + clipMerged[numRemaining-1][j][l] + 1 ) >> 1; + } + double errorMerged = m_alfSliceParamTemp.nonLinearFlag[CHANNEL_TYPE_LUMA] ? 
tmpCov.optimizeFilterClip( alfShape, tmpClip ) : tmpCov.calculateError( tmpClip ); + double error = errorMerged - error1 - error2; +#else double error = calculateError( tmpCov ) - error1 - error2; +#endif if( error < errorMin ) { +#if JVET_N0242_NON_LINEAR_ALF + bestMergeErr = errorMerged; + memcpy(bestMergeClip, tmpClip, sizeof(bestMergeClip)); +#endif errorMin = error; bestToMergeIdx1 = i; bestToMergeIdx2 = j; @@ -1302,6 +2124,11 @@ void EncAdaptiveLoopFilter::mergeClasses( AlfCovariance* cov, AlfCovariance* cov } covMerged[bestToMergeIdx1] += covMerged[bestToMergeIdx2]; +#if JVET_N0242_NON_LINEAR_ALF + memcpy(clipMerged[numRemaining-2], clipMerged[numRemaining-1], sizeof(int[MAX_NUM_ALF_CLASSES][MAX_NUM_ALF_LUMA_COEFF])); + memcpy(clipMerged[numRemaining-2][bestToMergeIdx1], bestMergeClip, sizeof(bestMergeClip)); + err[bestToMergeIdx1] = bestMergeErr; +#endif availableClass[bestToMergeIdx2] = false; for( int i = 0; i < numClasses; i++ ) @@ -1354,7 +2181,11 @@ void EncAdaptiveLoopFilter::getFrameStats( ChannelType channel, int iShapeIdx ) int numClasses = isLuma( channel ) ? 
MAX_NUM_ALF_CLASSES : 1; for( int i = 0; i < numClasses; i++ ) { +#if JVET_N0242_NON_LINEAR_ALF + m_alfCovarianceFrame[channel][iShapeIdx][i].reset(AlfNumClippingValues[channel]); +#else m_alfCovarianceFrame[channel][iShapeIdx][i].reset(); +#endif } if( isLuma( channel ) ) { @@ -1398,7 +2229,11 @@ void EncAdaptiveLoopFilter::deriveStatsForFiltering( PelUnitBuf& orgYuv, PelUnit { for( int ctuIdx = 0; ctuIdx < m_numCTUsInPic; ctuIdx++ ) { +#if JVET_N0242_NON_LINEAR_ALF + m_alfCovariance[compIdx][shape][ctuIdx][classIdx].reset(AlfNumClippingValues[toChannelType( compID )]); +#else m_alfCovariance[compIdx][shape][ctuIdx][classIdx].reset(); +#endif } } } @@ -1415,7 +2250,11 @@ void EncAdaptiveLoopFilter::deriveStatsForFiltering( PelUnitBuf& orgYuv, PelUnit { for( int classIdx = 0; classIdx < numClasses; classIdx++ ) { +#if JVET_N0242_NON_LINEAR_ALF + m_alfCovarianceFrame[channelIdx][shape][classIdx].reset(AlfNumClippingValues[channelID]); +#else m_alfCovarianceFrame[channelIdx][shape][classIdx].reset(); +#endif } } } @@ -1443,7 +2282,11 @@ void EncAdaptiveLoopFilter::deriveStatsForFiltering( PelUnitBuf& orgYuv, PelUnit for( int shape = 0; shape != m_filterShapes[chType].size(); shape++ ) { +#if JVET_N0242_NON_LINEAR_ALF + getBlkStats( m_alfCovariance[compIdx][shape][ctuRsAddr], m_filterShapes[chType][shape], compIdx ? nullptr : m_classifier, org, orgStride, rec, recStride, compArea, chType ); +#else getBlkStats( m_alfCovariance[compIdx][shape][ctuRsAddr], m_filterShapes[chType][shape], compIdx ? nullptr : m_classifier, org, orgStride, rec, recStride, compArea ); +#endif const int numClasses = isLuma( compID ) ? 
MAX_NUM_ALF_CLASSES : 1; @@ -1458,9 +2301,19 @@ void EncAdaptiveLoopFilter::deriveStatsForFiltering( PelUnitBuf& orgYuv, PelUnit } } +#if JVET_N0242_NON_LINEAR_ALF +void EncAdaptiveLoopFilter::getBlkStats( AlfCovariance* alfCovariance, const AlfFilterShape& shape, AlfClassifier** classifier, Pel* org, const int orgStride, Pel* rec, const int recStride, const CompArea& area, const ChannelType channel ) +#else void EncAdaptiveLoopFilter::getBlkStats( AlfCovariance* alfCovariace, const AlfFilterShape& shape, AlfClassifier** classifier, Pel* org, const int orgStride, Pel* rec, const int recStride, const CompArea& area ) +#endif { +#if JVET_N0242_NON_LINEAR_ALF + static int ELocal[MAX_NUM_ALF_LUMA_COEFF][MaxAlfNumClippingValues]; + + const int numBins = AlfNumClippingValues[channel]; +#else static int ELocal[MAX_NUM_ALF_LUMA_COEFF]; +#endif int transposeIdx = 0; int classIdx = 0; @@ -1473,7 +2326,11 @@ void EncAdaptiveLoopFilter::getBlkStats( AlfCovariance* alfCovariace, const AlfF { continue; } +#if JVET_N0242_NON_LINEAR_ALF + std::memset( ELocal, 0, sizeof( ELocal ) ); +#else std::memset( ELocal, 0, shape.numCoeff * sizeof( int ) ); +#endif if( classifier ) { AlfClassifier& cl = classifier[area.y + i][area.x + j]; @@ -1487,31 +2344,76 @@ void EncAdaptiveLoopFilter::getBlkStats( AlfCovariance* alfCovariace, const AlfF weight = m_lumaLevelToWeightPLUT[org[j]]; } int yLocal = org[j] - rec[j]; +#if JVET_N0242_NON_LINEAR_ALF + calcCovariance( ELocal, rec + j, recStride, shape, transposeIdx, channel ); +#else calcCovariance( ELocal, rec + j, recStride, shape.pattern.data(), shape.filterLength >> 1, transposeIdx ); +#endif for( int k = 0; k < shape.numCoeff; k++ ) { for( int l = k; l < shape.numCoeff; l++ ) { +#if JVET_N0242_NON_LINEAR_ALF + for( int b0 = 0; b0 < numBins; b0++ ) + { + for( int b1 = 0; b1 < numBins; b1++ ) + { + if (m_alfWSSD) + { + alfCovariance[classIdx].E[b0][b1][k][l] += weight * (double)(ELocal[k][b0] * ELocal[l][b1]); + } + else + { + 
alfCovariance[classIdx].E[b0][b1][k][l] += ELocal[k][b0] * ELocal[l][b1]; + } + } + } +#else if (m_alfWSSD) { alfCovariace[classIdx].E[k][l] += weight * (double)(ELocal[k] * ELocal[l]); } else alfCovariace[classIdx].E[k][l] += ELocal[k] * ELocal[l]; +#endif + } +#if JVET_N0242_NON_LINEAR_ALF + for( int b = 0; b < numBins; b++ ) + { + if (m_alfWSSD) + { + alfCovariance[classIdx].y[b][k] += weight * (double)(ELocal[k][b] * yLocal); + } + else + { + alfCovariance[classIdx].y[b][k] += ELocal[k][b] * yLocal; + } } +#else if (m_alfWSSD) { alfCovariace[classIdx].y[k] += weight * (double)(ELocal[k] * yLocal); } else alfCovariace[classIdx].y[k] += ELocal[k] * yLocal; +#endif } if (m_alfWSSD) { +#if JVET_N0242_NON_LINEAR_ALF + alfCovariance[classIdx].pixAcc += weight * (double)(yLocal * yLocal); +#else alfCovariace[classIdx].pixAcc += weight * (double)(yLocal * yLocal); +#endif } else +#if JVET_N0242_NON_LINEAR_ALF + { + alfCovariance[classIdx].pixAcc += yLocal * yLocal; + } +#else alfCovariace[classIdx].pixAcc += yLocal * yLocal; +#endif } org += orgStride; rec += recStride; @@ -1524,16 +2426,41 @@ void EncAdaptiveLoopFilter::getBlkStats( AlfCovariance* alfCovariace, const AlfF { for( int l = 0; l < k; l++ ) { +#if JVET_N0242_NON_LINEAR_ALF + for( int b0 = 0; b0 < numBins; b0++ ) + { + for( int b1 = 0; b1 < numBins; b1++ ) + { + alfCovariance[classIdx].E[b0][b1][k][l] = alfCovariance[classIdx].E[b1][b0][l][k]; + } + } +#else alfCovariace[classIdx].E[k][l] = alfCovariace[classIdx].E[l][k]; +#endif } } } } +#if JVET_N0242_NON_LINEAR_ALF +void EncAdaptiveLoopFilter::calcCovariance( int ELocal[MAX_NUM_ALF_LUMA_COEFF][MaxAlfNumClippingValues], const Pel *rec, const int stride, const AlfFilterShape& shape, const int transposeIdx, const ChannelType channel ) +#else void EncAdaptiveLoopFilter::calcCovariance( int *ELocal, const Pel *rec, const int stride, const int *filterPattern, const int halfFilterLength, const int transposeIdx ) +#endif { +#if JVET_N0242_NON_LINEAR_ALF + const 
int *filterPattern = shape.pattern.data(); + const int halfFilterLength = shape.filterLength >> 1; + const Pel* clip = m_alfClippingValues[channel]; + const int numBins = AlfNumClippingValues[channel]; + +#endif int k = 0; +#if JVET_N0242_NON_LINEAR_ALF + const short curr = rec[0]; +#endif + if( transposeIdx == 0 ) { for( int i = -halfFilterLength; i < 0; i++ ) @@ -1541,15 +2468,35 @@ void EncAdaptiveLoopFilter::calcCovariance( int *ELocal, const Pel *rec, const i const Pel* rec0 = rec + i * stride; const Pel* rec1 = rec - i * stride; +#if JVET_N0242_NON_LINEAR_ALF + for( int j = -halfFilterLength - i; j <= halfFilterLength + i; j++, k++ ) + { + for( int b = 0; b < numBins; b++ ) + { + ELocal[filterPattern[k]][b] += clipALF(clip[b], curr, rec0[j], rec1[-j]); + } + } +#else for( int j = -halfFilterLength - i; j <= halfFilterLength + i; j++ ) { ELocal[filterPattern[k++]] += rec0[j] + rec1[-j]; } +#endif + } +#if JVET_N0242_NON_LINEAR_ALF + for( int j = -halfFilterLength; j < 0; j++, k++ ) + { + for( int b = 0; b < numBins; b++ ) + { + ELocal[filterPattern[k]][b] += clipALF(clip[b], curr, rec[j], rec[-j]); + } } +#else for( int j = -halfFilterLength; j < 0; j++ ) { ELocal[filterPattern[k++]] += rec[j] + rec[-j]; } +#endif } else if( transposeIdx == 1 ) { @@ -1558,15 +2505,35 @@ void EncAdaptiveLoopFilter::calcCovariance( int *ELocal, const Pel *rec, const i const Pel* rec0 = rec + j; const Pel* rec1 = rec - j; +#if JVET_N0242_NON_LINEAR_ALF + for( int i = -halfFilterLength - j; i <= halfFilterLength + j; i++, k++ ) + { + for( int b = 0; b < numBins; b++ ) + { + ELocal[filterPattern[k]][b] += clipALF(clip[b], curr, rec0[i * stride], rec1[-i * stride]); + } + } +#else for( int i = -halfFilterLength - j; i <= halfFilterLength + j; i++ ) { ELocal[filterPattern[k++]] += rec0[i * stride] + rec1[-i * stride]; } +#endif + } +#if JVET_N0242_NON_LINEAR_ALF + for( int i = -halfFilterLength; i < 0; i++, k++ ) + { + for( int b = 0; b < numBins; b++ ) + { + 
ELocal[filterPattern[k]][b] += clipALF(clip[b], curr, rec[i*stride], rec[-i * stride]); + } } +#else for( int i = -halfFilterLength; i < 0; i++ ) { ELocal[filterPattern[k++]] += rec[i*stride] + rec[-i * stride]; } +#endif } else if( transposeIdx == 2 ) { @@ -1575,15 +2542,35 @@ void EncAdaptiveLoopFilter::calcCovariance( int *ELocal, const Pel *rec, const i const Pel* rec0 = rec + i * stride; const Pel* rec1 = rec - i * stride; +#if JVET_N0242_NON_LINEAR_ALF + for( int j = halfFilterLength + i; j >= -halfFilterLength - i; j--, k++ ) + { + for( int b = 0; b < numBins; b++ ) + { + ELocal[filterPattern[k]][b] += clipALF(clip[b], curr, rec0[j], rec1[-j]); + } + } +#else for( int j = halfFilterLength + i; j >= -halfFilterLength - i; j-- ) { ELocal[filterPattern[k++]] += rec0[j] + rec1[-j]; } +#endif + } +#if JVET_N0242_NON_LINEAR_ALF + for( int j = -halfFilterLength; j < 0; j++, k++ ) + { + for( int b = 0; b < numBins; b++ ) + { + ELocal[filterPattern[k]][b] += clipALF(clip[b], curr, rec[j], rec[-j]); + } } +#else for( int j = -halfFilterLength; j < 0; j++ ) { ELocal[filterPattern[k++]] += rec[j] + rec[-j]; } +#endif } else { @@ -1592,21 +2579,49 @@ void EncAdaptiveLoopFilter::calcCovariance( int *ELocal, const Pel *rec, const i const Pel* rec0 = rec + j; const Pel* rec1 = rec - j; +#if JVET_N0242_NON_LINEAR_ALF + for( int i = halfFilterLength + j; i >= -halfFilterLength - j; i--, k++ ) + { + for( int b = 0; b < numBins; b++ ) + { + ELocal[filterPattern[k]][b] += clipALF(clip[b], curr, rec0[i * stride], rec1[-i * stride]); + } + } +#else for( int i = halfFilterLength + j; i >= -halfFilterLength - j; i-- ) { ELocal[filterPattern[k++]] += rec0[i * stride] + rec1[-i * stride]; } +#endif + } +#if JVET_N0242_NON_LINEAR_ALF + for( int i = -halfFilterLength; i < 0; i++, k++ ) + { + for( int b = 0; b < numBins; b++ ) + { + ELocal[filterPattern[k]][b] += clipALF(clip[b], curr, rec[i*stride], rec[-i * stride]); + } } +#else for( int i = -halfFilterLength; i < 0; i++ ) { 
ELocal[filterPattern[k++]] += rec[i*stride] + rec[-i * stride]; } +#endif + } +#if JVET_N0242_NON_LINEAR_ALF + for( int b = 0; b < numBins; b++ ) + { + ELocal[filterPattern[k]][b] += curr; } +#else ELocal[filterPattern[k++]] += rec[0]; +#endif } +#if !JVET_N0242_NON_LINEAR_ALF double EncAdaptiveLoopFilter::calculateError( AlfCovariance& cov ) { static double c[MAX_NUM_ALF_COEFF]; @@ -1753,6 +2768,7 @@ int EncAdaptiveLoopFilter::gnsSolveByChol( double **LHS, double *rhs, double *x, return res; } ////////////////////////////////////////////////////////////////////////////////////////// +#endif void EncAdaptiveLoopFilter::setEnableFlag( AlfSliceParam& alfSlicePara, ChannelType channel, bool val ) { if( channel == CHANNEL_TYPE_LUMA ) diff --git a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.h b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.h index d2b02d902..f8b766729 100644 --- a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.h +++ b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.h @@ -41,20 +41,46 @@ #include "CommonLib/AdaptiveLoopFilter.h" #include "CABACWriter.h" +#if JVET_N0242_NON_LINEAR_ALF +#include "EncCfg.h" +#endif struct AlfCovariance { +#if JVET_N0242_NON_LINEAR_ALF + static constexpr int MaxAlfNumClippingValues = AdaptiveLoopFilter::MaxAlfNumClippingValues; + using TE = double[MAX_NUM_ALF_LUMA_COEFF][MAX_NUM_ALF_LUMA_COEFF]; + using Ty = double[MAX_NUM_ALF_LUMA_COEFF]; + using TKE = TE[AdaptiveLoopFilter::MaxAlfNumClippingValues][AdaptiveLoopFilter::MaxAlfNumClippingValues]; + using TKy = Ty[AdaptiveLoopFilter::MaxAlfNumClippingValues]; +#endif + int numCoeff; +#if JVET_N0242_NON_LINEAR_ALF + int numBins; + TKy y; + TKE E; +#else double *y; double **E; +#endif double pixAcc; AlfCovariance() {} ~AlfCovariance() {} +#if JVET_N0242_NON_LINEAR_ALF + void create( int size, int num_bins = MaxAlfNumClippingValues ) +#else void create( int size ) +#endif { numCoeff = size; +#if JVET_N0242_NON_LINEAR_ALF + numBins = num_bins; + std::memset( y, 0, sizeof( y ) ); + 
std::memset( E, 0, sizeof( E ) ); +#else y = new double[numCoeff]; E = new double*[numCoeff]; @@ -63,10 +89,12 @@ struct AlfCovariance { E[i] = new double[numCoeff]; } +#endif } void destroy() { +#if !JVET_N0242_NON_LINEAR_ALF for( int i = 0; i < numCoeff; i++ ) { delete[] E[i]; @@ -78,25 +106,46 @@ struct AlfCovariance delete[] y; y = nullptr; +#endif } +#if JVET_N0242_NON_LINEAR_ALF + void reset( int num_bins = -1 ) +#else void reset() +#endif { +#if JVET_N0242_NON_LINEAR_ALF + if ( num_bins > 0 ) + numBins = num_bins; +#endif pixAcc = 0; +#if JVET_N0242_NON_LINEAR_ALF + std::memset( y, 0, sizeof( y ) ); + std::memset( E, 0, sizeof( E ) ); +#else std::memset( y, 0, sizeof( *y ) * numCoeff ); for( int i = 0; i < numCoeff; i++ ) { std::memset( E[i], 0, sizeof( *E[i] ) * numCoeff ); } +#endif } const AlfCovariance& operator=( const AlfCovariance& src ) { +#if JVET_N0242_NON_LINEAR_ALF + numCoeff = src.numCoeff; + numBins = src.numBins; + std::memcpy( E, src.E, sizeof( E ) ); + std::memcpy( y, src.y, sizeof( y ) ); +#else for( int i = 0; i < numCoeff; i++ ) { std::memcpy( E[i], src.E[i], sizeof( *E[i] ) * numCoeff ); } std::memcpy( y, src.y, sizeof( *y ) * numCoeff ); +#endif pixAcc = src.pixAcc; return *this; @@ -104,6 +153,30 @@ struct AlfCovariance void add( const AlfCovariance& lhs, const AlfCovariance& rhs ) { +#if JVET_N0242_NON_LINEAR_ALF + numCoeff = lhs.numCoeff; + numBins = lhs.numBins; + for( int b0 = 0; b0 < numBins; b0++ ) + { + for( int b1 = 0; b1 < numBins; b1++ ) + { + for( int j = 0; j < numCoeff; j++ ) + { + for( int i = 0; i < numCoeff; i++ ) + { + E[b0][b1][j][i] = lhs.E[b0][b1][j][i] + rhs.E[b0][b1][j][i]; + } + } + } + } + for( int b = 0; b < numBins; b++ ) + { + for( int j = 0; j < numCoeff; j++ ) + { + y[b][j] = lhs.y[b][j] + rhs.y[b][j]; + } + } +#else for( int j = 0; j < numCoeff; j++ ) { for( int i = 0; i < numCoeff; i++ ) @@ -112,11 +185,34 @@ struct AlfCovariance } y[j] = lhs.y[j] + rhs.y[j]; } +#endif pixAcc = lhs.pixAcc + rhs.pixAcc; } 
const AlfCovariance& operator+= ( const AlfCovariance& src ) { +#if JVET_N0242_NON_LINEAR_ALF + for( int b0 = 0; b0 < numBins; b0++ ) + { + for( int b1 = 0; b1 < numBins; b1++ ) + { + for( int j = 0; j < numCoeff; j++ ) + { + for( int i = 0; i < numCoeff; i++ ) + { + E[b0][b1][j][i] += src.E[b0][b1][j][i]; + } + } + } + } + for( int b = 0; b < numBins; b++ ) + { + for( int j = 0; j < numCoeff; j++ ) + { + y[b][j] += src.y[b][j]; + } + } +#else for( int j = 0; j < numCoeff; j++ ) { for( int i = 0; i < numCoeff; i++ ) @@ -125,6 +221,7 @@ struct AlfCovariance } y[j] += src.y[j]; } +#endif pixAcc += src.pixAcc; return *this; @@ -132,6 +229,28 @@ struct AlfCovariance const AlfCovariance& operator-= ( const AlfCovariance& src ) { +#if JVET_N0242_NON_LINEAR_ALF + for( int b0 = 0; b0 < numBins; b0++ ) + { + for( int b1 = 0; b1 < numBins; b1++ ) + { + for( int j = 0; j < numCoeff; j++ ) + { + for( int i = 0; i < numCoeff; i++ ) + { + E[b0][b1][j][i] -= src.E[b0][b1][j][i]; + } + } + } + } + for( int b = 0; b < numBins; b++ ) + { + for( int j = 0; j < numCoeff; j++ ) + { + y[b][j] -= src.y[b][j]; + } + } +#else for( int j = 0; j < numCoeff; j++ ) { for( int i = 0; i < numCoeff; i++ ) @@ -140,10 +259,55 @@ struct AlfCovariance } y[j] -= src.y[j]; } +#endif pixAcc -= src.pixAcc; return *this; } + +#if JVET_N0242_NON_LINEAR_ALF + void setEyFromClip(const int* clip, TE _E, Ty _y, int size) const + { + for (int k=0; k<size; k++) + { + _y[k] = y[clip[k]][k]; + for (int l=0; l<size; l++) + { + _E[k][l] = E[clip[k]][clip[l]][k][l]; + } + } + } + + double optimizeFilter(const int* clip, double *f, int size) const + { + gnsSolveByChol( clip, f, size ); + return calculateError( clip, f ); + } + + double optimizeFilter(const AlfFilterShape& alfShape, int* clip, double *f, bool optimize_clip) const; + double optimizeFilterClip(const AlfFilterShape& alfShape, int* clip) const + { + Ty f; + return optimizeFilter(alfShape, clip, f, true); + } + + double calculateError( const int *clip ) 
const; + double calculateError( const int *clip, const double *coeff ) const { return calculateError(clip, coeff, numCoeff); } + double calculateError( const int *clip, const double *coeff, const int numCoeff ) const; + double calcErrorForCoeffs( const int *clip, const int *coeff, const int numCoeff, const int bitDepth ) const; + + void getClipMax(const AlfFilterShape& alfShape, int *clip_max) const; + void reduceClipCost(const AlfFilterShape& alfShape, int *clip) const; + +private: + // Cholesky decomposition + + int gnsSolveByChol( const int *clip, double *x, int numEq ) const; + int gnsSolveByChol( TE LHS, double* rhs, double *x, int numEq ) const; + void gnsBacksubstitution( TE R, double* z, int size, double* A ) const; + void gnsTransposeBacksubstitution( TE U, double* rhs, double* x, int order ) const; + int gnsCholeskyDec( TE inpMatr, TE outMatr, int numEq ) const; +#endif }; class EncAdaptiveLoopFilter : public AdaptiveLoopFilter @@ -157,6 +321,9 @@ public: inline std::vector<double>& getLumaLevelWeightTable() { return m_lumaLevelToWeightPLUT; } private: +#if JVET_N0242_NON_LINEAR_ALF + const EncCfg* m_encCfg; +#endif AlfCovariance*** m_alfCovariance[MAX_NUM_COMPONENT]; // [compIdx][shapeIdx][ctbAddr][classIdx] AlfCovariance** m_alfCovarianceFrame[MAX_NUM_CHANNEL_TYPE]; // [CHANNEL][shapeIdx][classIdx] uint8_t* m_ctuEnableFlagTmp[MAX_NUM_COMPONENT]; @@ -164,13 +331,21 @@ private: //for RDO AlfSliceParam m_alfSliceParamTemp; AlfCovariance m_alfCovarianceMerged[ALF_NUM_OF_FILTER_TYPES][MAX_NUM_ALF_CLASSES + 1]; +#if JVET_N0242_NON_LINEAR_ALF + int m_alfClipMerged[ALF_NUM_OF_FILTER_TYPES][MAX_NUM_ALF_CLASSES][MAX_NUM_ALF_CLASSES][MAX_NUM_ALF_LUMA_COEFF]; +#endif CABACWriter* m_CABACEstimator; CtxCache* m_CtxCache; double m_lambda[MAX_NUM_COMPONENT]; const double FracBitsScale = 1.0 / double( 1 << SCALE_BITS ); +#if !JVET_N0242_NON_LINEAR_ALF int* m_filterCoeffQuant; +#endif int** m_filterCoeffSet; +#if JVET_N0242_NON_LINEAR_ALF + int** m_filterClippSet; 
+#endif int** m_diffFilterCoeff; int m_kMinTab[MAX_NUM_ALF_LUMA_COEFF]; int m_bitsCoeffScan[m_MAX_SCAN_VAL][m_MAX_EXP_GOLOMB]; @@ -186,9 +361,17 @@ public: #endif AlfSliceParam& alfSliceParam ); void initCABACEstimator( CABACEncoder* cabacEncoder, CtxCache* ctxCache, Slice* pcSlice ); +#if JVET_N0242_NON_LINEAR_ALF + void create( const EncCfg* encCfg, const int picWidth, const int picHeight, const ChromaFormat chromaFormatIDC, const int maxCUWidth, const int maxCUHeight, const int maxCUDepth, const int inputBitDepth[MAX_NUM_CHANNEL_TYPE], const int internalBitDepth[MAX_NUM_CHANNEL_TYPE] ); +#else void create( const int picWidth, const int picHeight, const ChromaFormat chromaFormatIDC, const int maxCUWidth, const int maxCUHeight, const int maxCUDepth, const int inputBitDepth[MAX_NUM_CHANNEL_TYPE], const int internalBitDepth[MAX_NUM_CHANNEL_TYPE] ); +#endif void destroy(); +#if JVET_N0242_NON_LINEAR_ALF + static int lengthGolomb( int coeffVal, int k, bool signed_coeff=true ); +#else static int lengthGolomb( int coeffVal, int k ); +#endif static int getGolombKMin( AlfFilterShape& alfShape, const int numFilters, int kMinTab[MAX_NUM_ALF_LUMA_COEFF], int bitsCoeffScan[m_MAX_SCAN_VAL][m_MAX_EXP_GOLOMB] ); private: @@ -199,21 +382,41 @@ private: ); void copyAlfSliceParam( AlfSliceParam& alfSliceParamDst, AlfSliceParam& alfSliceParamSrc, ChannelType channel ); +#if JVET_N0242_NON_LINEAR_ALF + double mergeFiltersAndCost( AlfSliceParam& alfSliceParam, AlfFilterShape& alfShape, AlfCovariance* covFrame, AlfCovariance* covMerged, int clipMerged[MAX_NUM_ALF_CLASSES][MAX_NUM_ALF_CLASSES][MAX_NUM_ALF_LUMA_COEFF], int& uiCoeffBits ); +#else double mergeFiltersAndCost( AlfSliceParam& alfSliceParam, AlfFilterShape& alfShape, AlfCovariance* covFrame, AlfCovariance* covMerged, int& uiCoeffBits ); +#endif void getFrameStats( ChannelType channel, int iShapeIdx ); void getFrameStat( AlfCovariance* frameCov, AlfCovariance** ctbCov, uint8_t* ctbEnableFlags, const int numClasses ); void 
deriveStatsForFiltering( PelUnitBuf& orgYuv, PelUnitBuf& recYuv ); +#if JVET_N0242_NON_LINEAR_ALF + void getBlkStats( AlfCovariance* alfCovariace, const AlfFilterShape& shape, AlfClassifier** classifier, Pel* org, const int orgStride, Pel* rec, const int recStride, const CompArea& area, const ChannelType channel ); + void calcCovariance( int ELocal[MAX_NUM_ALF_LUMA_COEFF][MaxAlfNumClippingValues], const Pel *rec, const int stride, const AlfFilterShape& shape, const int transposeIdx, const ChannelType channel ); + void mergeClasses( const AlfFilterShape& alfShape, AlfCovariance* cov, AlfCovariance* covMerged, int clipMerged[MAX_NUM_ALF_CLASSES][MAX_NUM_ALF_CLASSES][MAX_NUM_ALF_LUMA_COEFF], const int numClasses, short filterIndices[MAX_NUM_ALF_CLASSES][MAX_NUM_ALF_CLASSES] ); +#else void getBlkStats( AlfCovariance* alfCovariace, const AlfFilterShape& shape, AlfClassifier** classifier, Pel* org, const int orgStride, Pel* rec, const int recStride, const CompArea& area ); void calcCovariance( int *ELocal, const Pel *rec, const int stride, const int *filterPattern, const int halfFilterLength, const int transposeIdx ); void mergeClasses( AlfCovariance* cov, AlfCovariance* covMerged, const int numClasses, short filterIndices[MAX_NUM_ALF_CLASSES][MAX_NUM_ALF_CLASSES] ); +#endif +#if !JVET_N0242_NON_LINEAR_ALF double calculateError( AlfCovariance& cov ); double calcErrorForCoeffs( double **E, double *y, int *coeff, const int numCoeff, const int bitDepth ); +#endif double getFilterCoeffAndCost( CodingStructure& cs, double distUnfilter, ChannelType channel, bool bReCollectStat, int iShapeIdx, int& uiCoeffBits ); +#if JVET_N0242_NON_LINEAR_ALF + double deriveFilterCoeffs( AlfCovariance* cov, AlfCovariance* covMerged, int clipMerged[MAX_NUM_ALF_CLASSES][MAX_NUM_ALF_CLASSES][MAX_NUM_ALF_LUMA_COEFF], AlfFilterShape& alfShape, short* filterIndices, int numFilters, double errorTabForce0Coeff[MAX_NUM_ALF_CLASSES][2] ); +#else double deriveFilterCoeffs( AlfCovariance* cov, 
AlfCovariance* covMerged, AlfFilterShape& alfShape, short* filterIndices, int numFilters, double errorTabForce0Coeff[MAX_NUM_ALF_CLASSES][2] ); +#endif int deriveFilterCoefficientsPredictionMode( AlfFilterShape& alfShape, int **filterSet, int** filterCoeffDiff, const int numFilters, int& predMode ); +#if JVET_N0242_NON_LINEAR_ALF + double deriveCoeffQuant( int *filterClipp, int *filterCoeffQuant, const AlfCovariance& cov, const AlfFilterShape& shape, const int bitDepth, const bool optimizeClip ); +#else double deriveCoeffQuant( int *filterCoeffQuant, double **E, double *y, const int numCoeff, std::vector<int>& weights, const int bitDepth, const bool bChroma = false ); +#endif double deriveCtbAlfEnableFlags( CodingStructure& cs, const int iShapeIdx, ChannelType channel, #if ENABLE_QPA const double chromaWeight, @@ -229,7 +432,13 @@ private: int getCostFilterCoeffForce0( AlfFilterShape& alfShape, int **pDiffQFilterCoeffIntPP, const int numFilters, bool* codedVarBins ); int getCostFilterCoeff( AlfFilterShape& alfShape, int **pDiffQFilterCoeffIntPP, const int numFilters ); +#if JVET_N0242_NON_LINEAR_ALF + int getCostFilterClipp( AlfFilterShape& alfShape, int **pDiffQFilterCoeffIntPP, const int numFilters ); +#endif int lengthFilterCoeffs( AlfFilterShape& alfShape, const int numFilters, int **FilterCoeff, int* kMinTab ); +#if JVET_N0242_NON_LINEAR_ALF + int lengthFilterClipps( AlfFilterShape& alfShape, const int numFilters, int **FilterCoeff, int* kMinTab ); +#endif double getDistForce0( AlfFilterShape& alfShape, const int numFilters, double errorTabForce0Coeff[MAX_NUM_ALF_CLASSES][2], bool* codedVarBins ); int getCoeffRate( AlfSliceParam& alfSliceParam, bool isChroma ); @@ -237,12 +446,14 @@ private: double getUnfilteredDistortion( AlfCovariance* cov, const int numClasses ); double getFilteredDistortion( AlfCovariance* cov, const int numClasses, const int numFiltersMinus1, const int numCoeff ); +#if !JVET_N0242_NON_LINEAR_ALF // Cholesky decomposition int 
gnsSolveByChol( double **LHS, double *rhs, double *x, int numEq ); void gnsBacksubstitution( double R[MAX_NUM_ALF_COEFF][MAX_NUM_ALF_COEFF], double* z, int size, double* A ); void gnsTransposeBacksubstitution( double U[MAX_NUM_ALF_COEFF][MAX_NUM_ALF_COEFF], double* rhs, double* x, int order ); int gnsCholeskyDec( double **inpMatr, double outMatr[MAX_NUM_ALF_COEFF][MAX_NUM_ALF_COEFF], int numEq ); +#endif void setEnableFlag( AlfSliceParam& alfSlicePara, ChannelType channel, bool val ); void setEnableFlag( AlfSliceParam& alfSlicePara, ChannelType channel, uint8_t** ctuFlags ); void setCtuEnableFlag( uint8_t** ctuFlags, ChannelType channel, uint8_t val ); diff --git a/source/Lib/EncoderLib/EncCfg.h b/source/Lib/EncoderLib/EncCfg.h index ceb945663..f1ece8c0e 100644 --- a/source/Lib/EncoderLib/EncCfg.h +++ b/source/Lib/EncoderLib/EncCfg.h @@ -266,6 +266,10 @@ protected: bool m_useAMaxBT; bool m_e0023FastEnc; bool m_contentBasedFastQtbt; +#if JVET_N0242_NON_LINEAR_ALF + bool m_useNonLinearAlfLuma; + bool m_useNonLinearAlfChroma; +#endif #if MAX_TB_SIZE_SIGNALLING uint32_t m_log2MaxTbSize; @@ -822,6 +826,12 @@ public: bool getUseE0023FastEnc () const { return m_e0023FastEnc; } void setUseContentBasedFastQtbt ( bool b ) { m_contentBasedFastQtbt = b; } bool getUseContentBasedFastQtbt () const { return m_contentBasedFastQtbt; } +#if JVET_N0242_NON_LINEAR_ALF + void setUseNonLinearAlfLuma ( bool b ) { m_useNonLinearAlfLuma = b; } + bool getUseNonLinearAlfLuma () const { return m_useNonLinearAlfLuma; } + void setUseNonLinearAlfChroma ( bool b ) { m_useNonLinearAlfChroma = b; } + bool getUseNonLinearAlfChroma () const { return m_useNonLinearAlfChroma; } +#endif #if MAX_TB_SIZE_SIGNALLING void setLog2MaxTbSize ( uint32_t u ) { m_log2MaxTbSize = u; } diff --git a/source/Lib/EncoderLib/EncLib.cpp b/source/Lib/EncoderLib/EncLib.cpp index dcc383dc2..fbb1edeb1 100644 --- a/source/Lib/EncoderLib/EncLib.cpp +++ b/source/Lib/EncoderLib/EncLib.cpp @@ -135,7 +135,11 @@ void EncLib::create 
() } if( m_alf ) { +#if JVET_N0242_NON_LINEAR_ALF + m_cEncALF.create( this, getSourceWidth(), getSourceHeight(), m_chromaFormatIDC, m_maxCUWidth, m_maxCUHeight, m_maxTotalCUDepth, m_bitDepth, m_inputBitDepth ); +#else m_cEncALF.create( getSourceWidth(), getSourceHeight(), m_chromaFormatIDC, m_maxCUWidth, m_maxCUHeight, m_maxTotalCUDepth, m_bitDepth, m_inputBitDepth ); +#endif } #if ENABLE_SPLIT_PARALLELISM || ENABLE_WPP_PARALLELISM diff --git a/source/Lib/EncoderLib/VLCWriter.cpp b/source/Lib/EncoderLib/VLCWriter.cpp index ae81af9f9..57b20baad 100644 --- a/source/Lib/EncoderLib/VLCWriter.cpp +++ b/source/Lib/EncoderLib/VLCWriter.cpp @@ -377,6 +377,14 @@ void HLSWriter::codeAPS( APS* pcAPS) const int alfChromaIdc = param.enabledFlag[COMPONENT_Cb] * 2 + param.enabledFlag[COMPONENT_Cr]; truncatedUnaryEqProb(alfChromaIdc, 3); // alf_chroma_idc +#if JVET_N0242_NON_LINEAR_ALF + WRITE_FLAG( param.nonLinearFlag[CHANNEL_TYPE_LUMA], "alf_luma_clip" ); + if( alfChromaIdc ) + { + WRITE_FLAG( param.nonLinearFlag[CHANNEL_TYPE_CHROMA], "alf_chroma_clip" ); + } +#endif + xWriteTruncBinCode(param.numLumaFilters - 1, MAX_NUM_ALF_CLASSES); //number_of_filters_minus1 if (param.numLumaFilters > 1) { @@ -1793,8 +1801,11 @@ bool HLSWriter::xFindMatchingLTRP(Slice* pcSlice, uint32_t *ltrpsIndex, int ltrp return false; } - +#if JVET_N0242_NON_LINEAR_ALF +void HLSWriter::alfGolombEncode( int coeff, int k, const bool signed_coeff ) +#else void HLSWriter::alfGolombEncode( int coeff, int k ) +#endif { int symbol = abs( coeff ); @@ -1814,7 +1825,11 @@ void HLSWriter::alfGolombEncode( int coeff, int k ) symbol >>= 1; } +#if JVET_N0242_NON_LINEAR_ALF + if( signed_coeff && coeff != 0 ) +#else if( coeff != 0 ) +#endif { int sign = ( coeff > 0 ) ? 1 : 0; xWriteFlag( sign ); @@ -1840,6 +1855,9 @@ void HLSWriter::alfFilter( const AlfSliceParam& alfSliceParam, const bool isChro AlfFilterShape alfShape( isChroma ? 
5 : 7 ); const int maxGolombIdx = AdaptiveLoopFilter::getMaxGolombIdx( alfShape.filterType ); const short* coeff = isChroma ? alfSliceParam.chromaCoeff : alfSliceParam.lumaCoeff; +#if JVET_N0242_NON_LINEAR_ALF + const short* clipp = isChroma ? alfSliceParam.chromaClipp : alfSliceParam.lumaClipp; +#endif const int numFilters = isChroma ? 1 : alfSliceParam.numLumaFilters; // vlc for all @@ -1897,6 +1915,82 @@ void HLSWriter::alfFilter( const AlfSliceParam& alfSliceParam, const bool isChro alfGolombEncode( coeff[ind* MAX_NUM_ALF_LUMA_COEFF + i], kMinTab[alfShape.golombIdx[i]] ); // alf_coeff_chroma[i], alf_coeff_luma_delta[i][j] } } +#if JVET_N0242_NON_LINEAR_ALF + + // Clipping values coding + if( alfSliceParam.nonLinearFlag[isChroma] ) + { + memset( bitsCoeffScan, 0, sizeof( bitsCoeffScan ) ); + + short recCoeff[MAX_NUM_ALF_CLASSES * MAX_NUM_ALF_LUMA_COEFF]; + if( isChroma ) + { + memcpy( recCoeff, coeff, sizeof(short) * MAX_NUM_ALF_CHROMA_COEFF ); + } + else + { + memcpy( recCoeff, coeff, sizeof(short) * numFilters * MAX_NUM_ALF_LUMA_COEFF ); + + if( alfSliceParam.alfLumaCoeffDeltaPredictionFlag ) + { + for( int i = 1; i < numFilters; i++ ) + { + for( int j = 0; j < alfShape.numCoeff - 1; j++ ) + { + recCoeff[i * MAX_NUM_ALF_LUMA_COEFF + j] += recCoeff[( i - 1 ) * MAX_NUM_ALF_LUMA_COEFF + j]; + } + } + } + } + // vlc for all + for( int ind = 0; ind < numFilters; ++ind ) + { + if( isChroma || !alfSliceParam.alfLumaCoeffDeltaFlag || alfSliceParam.alfLumaCoeffFlag[ind] ) + { + for( int i = 0; i < alfShape.numCoeff - 1; i++ ) + { + if( !abs( recCoeff[ind * MAX_NUM_ALF_LUMA_COEFF + i] ) ) + continue; + int coeffVal = abs( clipp[ind * MAX_NUM_ALF_LUMA_COEFF + i] ); + + for( int k = 1; k < 15; k++ ) + { + bitsCoeffScan[alfShape.golombIdx[i]][k] += EncAdaptiveLoopFilter::lengthGolomb( coeffVal, k, false ); + } + } + } + } + + kMin = EncAdaptiveLoopFilter::getGolombKMin( alfShape, numFilters, kMinTab, bitsCoeffScan ); + + // Golomb parameters + WRITE_UVLC( kMin - 1, 
"clip_min_golomb_order" ); + + for( int idx = 0; idx < maxGolombIdx; idx++ ) + { + bool golombOrderIncreaseFlag = ( kMinTab[idx] != kMin ) ? true : false; + CHECK( !( kMinTab[idx] <= kMin + 1 ), "ALF Golomb parameter not consistent" ); + WRITE_FLAG( golombOrderIncreaseFlag, "clip_golomb_order_increase_flag" ); + kMin = kMinTab[idx]; + } + + // Filter coefficients + for( int ind = 0; ind < numFilters; ++ind ) + { + if( !isChroma && !alfSliceParam.alfLumaCoeffFlag[ind] && alfSliceParam.alfLumaCoeffDeltaFlag ) + { + continue; + } + + for( int i = 0; i < alfShape.numCoeff - 1; i++ ) + { + if( !abs( recCoeff[ind * MAX_NUM_ALF_LUMA_COEFF + i] ) ) + continue; + alfGolombEncode( clipp[ind* MAX_NUM_ALF_LUMA_COEFF + i], kMinTab[alfShape.golombIdx[i]], false ); // alf_coeff_chroma[i], alf_coeff_luma_delta[i][j] + } + } + } +#endif } void HLSWriter::xWriteTruncBinCode( uint32_t uiSymbol, const int uiMaxSymbol ) diff --git a/source/Lib/EncoderLib/VLCWriter.h b/source/Lib/EncoderLib/VLCWriter.h index 2ec729bde..b4ddd26eb 100644 --- a/source/Lib/EncoderLib/VLCWriter.h +++ b/source/Lib/EncoderLib/VLCWriter.h @@ -149,7 +149,11 @@ public: private: void xWriteTruncBinCode( uint32_t uiSymbol, const int uiMaxSymbol ); +#if JVET_N0242_NON_LINEAR_ALF + void alfGolombEncode( const int coeff, const int k, const bool signed_coeff=true ); +#else void alfGolombEncode( const int coeff, const int k ); +#endif void truncatedUnaryEqProb( int symbol, int maxSymbol ); void codeReshaper ( const SliceReshapeInfo& pSliceReshaperInfo, const SPS* pcSPS, const bool isIntra); -- GitLab