diff --git a/cfg/encoder_intra_vtm.cfg b/cfg/encoder_intra_vtm.cfg index aec0126b24df4e2ce43ef34f33b95d097d846aa8..12a31a22940fb9d31224d058afcf6b9d181ba124 100644 --- a/cfg/encoder_intra_vtm.cfg +++ b/cfg/encoder_intra_vtm.cfg @@ -100,6 +100,7 @@ MRL : 1 MIP : 1 JointCbCr : 1 # joint coding of chroma residuals (if available): 0: disable, 1: enable ChromaTS : 1 +CCSAO : 1 # Cross-component sample adaptive offset: 0: disable, 1: enable # Fast tools PBIntraFast : 1 diff --git a/cfg/encoder_lowdelay_P_vtm.cfg b/cfg/encoder_lowdelay_P_vtm.cfg index 9152b7877f65b00dc1c7c4966d3fb4c12890da32..f2f8c9217dd40d3f956b29f39f64de32cc707c36 100644 --- a/cfg/encoder_lowdelay_P_vtm.cfg +++ b/cfg/encoder_lowdelay_P_vtm.cfg @@ -128,6 +128,7 @@ ChromaTS : 1 AffineMMVD : 1 AdditionalInterHyps : 0 BIF : 1 # Bilateral filter: 0: disable, 1: enable +CCSAO : 1 # Cross-component sample adaptive offset: 0: disable, 1: enable # Fast tools PBIntraFast : 1 diff --git a/cfg/encoder_lowdelay_vtm.cfg b/cfg/encoder_lowdelay_vtm.cfg index 5a2784aa4183f7de899090f29823da08fcda9c65..f22d99fc2c914898be63ad9a36f9ef2b4538b79f 100644 --- a/cfg/encoder_lowdelay_vtm.cfg +++ b/cfg/encoder_lowdelay_vtm.cfg @@ -132,6 +132,7 @@ ChromaTS : 1 AffineMMVD : 1 AdditionalInterHyps : 2 BIF : 1 # Bilateral filter: 0: disable, 1: enable +CCSAO : 1 # Cross-component sample adaptive offset: 0: disable, 1: enable # Fast tools PBIntraFast : 1 diff --git a/cfg/encoder_randomaccess_vtm.cfg b/cfg/encoder_randomaccess_vtm.cfg index a732648effd0a4316c2f852f2d53fb6980c4ac6d..ffc4d09115e6486bd7110da3cc58b4f73892a7c5 100644 --- a/cfg/encoder_randomaccess_vtm.cfg +++ b/cfg/encoder_randomaccess_vtm.cfg @@ -162,6 +162,7 @@ PROF : 1 AffineMMVD : 1 AdditionalInterHyps : 2 BIF : 1 # Bilateral filter: 0: disable, 1: enable +CCSAO : 1 # Cross-component sample adaptive offset: 0: disable, 1: enable # Fast tools PBIntraFast : 1 diff --git a/cfg/encoder_randomaccess_vtm_gop16.cfg b/cfg/encoder_randomaccess_vtm_gop16.cfg index 
68f3d7214907d5c0ea6bf5229a9dbf527105bd01..076459ff3c577c39ae8ccfe62d5556a94fa2b257 100644 --- a/cfg/encoder_randomaccess_vtm_gop16.cfg +++ b/cfg/encoder_randomaccess_vtm_gop16.cfg @@ -146,7 +146,7 @@ PROF : 1 AffineMMVD : 1 AdditionalInterHyps : 2 BIF : 1 # Bilateral filter: 0: disable, 1: enable - +CCSAO : 1 # Cross-component sample adaptive offset: 0: disable, 1: enable # Fast tools PBIntraFast : 1 diff --git a/source/App/EncoderApp/EncApp.cpp b/source/App/EncoderApp/EncApp.cpp index 34cd83ca8e9b0858bd7ec57dd6b35c6ef4f9a5f4..a8b96db85b4114c3e1df21fee7d71cf7b3b994a9 100644 --- a/source/App/EncoderApp/EncApp.cpp +++ b/source/App/EncoderApp/EncApp.cpp @@ -346,6 +346,11 @@ void EncApp::xInitLibCfg() m_cEncLib.setNoSaoConstraintFlag(m_noSaoConstraintFlag); CHECK(m_noSaoConstraintFlag && m_bUseSAO, "SAO shall be deactivated when m_bNoSaoConstraintFlag is equal to 1"); +#if JVET_W0066_CCSAO + m_cEncLib.setNoCCSaoConstraintFlag(m_noCCSaoConstraintFlag); + CHECK(m_noCCSaoConstraintFlag && m_CCSAO, "CCSAO shall be deactivated when m_noCCSaoConstraintFlag is equal to 1"); +#endif + m_cEncLib.setNoAlfConstraintFlag(m_noAlfConstraintFlag); CHECK(m_noAlfConstraintFlag && m_alf, "ALF shall be deactivated when m_bNoAlfConstraintFlag is equal to 1"); @@ -516,6 +521,9 @@ void EncApp::xInitLibCfg() m_cEncLib.setNoQtbttDualTreeIntraConstraintFlag(false); m_cEncLib.setNoPartitionConstraintsOverrideConstraintFlag(false); m_cEncLib.setNoSaoConstraintFlag(false); +#if JVET_W0066_CCSAO + m_cEncLib.setNoCCSaoConstraintFlag(false); +#endif m_cEncLib.setNoAlfConstraintFlag(false); m_cEncLib.setNoCCAlfConstraintFlag(false); #if JVET_S0058_GCI @@ -919,6 +927,9 @@ void EncApp::xInitLibCfg() //====== Sub-picture and Slices ======== m_cEncLib.setSingleSlicePerSubPicFlagFlag ( m_singleSlicePerSubPicFlag ); m_cEncLib.setUseSAO ( m_bUseSAO ); +#if JVET_W0066_CCSAO + m_cEncLib.setUseCCSAO ( m_CCSAO ); +#endif m_cEncLib.setTestSAODisableAtPictureLevel ( m_bTestSAODisableAtPictureLevel ); 
m_cEncLib.setSaoEncodingRate ( m_saoEncodingRate ); m_cEncLib.setSaoEncodingRateChroma ( m_saoEncodingRateChroma ); diff --git a/source/App/EncoderApp/EncAppCfg.cpp b/source/App/EncoderApp/EncAppCfg.cpp index f684965eb91ce3b440f30ff157fb102e75212873..79ce1652c9dcad9698d241e4d88c353157a7577e 100644 --- a/source/App/EncoderApp/EncAppCfg.cpp +++ b/source/App/EncoderApp/EncAppCfg.cpp @@ -914,6 +914,9 @@ bool EncAppCfg::parseCfg( int argc, char* argv[] ) ("NoSignDataHidingConstraintFlag", m_noSignDataHidingConstraintFlag, false, "Indicate that SDH is deactivated") ("NoQpDeltaConstraintFlag", m_noQpDeltaConstraintFlag, false, "Indicate that QPdelta is deactivated") ("NoSaoConstraintFlag", m_noSaoConstraintFlag, false, "Indicate that SAO is deactivated") +#if JVET_W0066_CCSAO + ("NoCCSaoConstraintFlag", m_noCCSaoConstraintFlag, false, "Indicate that CCSAO is deactivated") +#endif ("NoAlfConstraintFlag", m_noAlfConstraintFlag, false, "Indicate that ALF is deactivated") ("NoCCAlfConstraintFlag", m_noCCAlfConstraintFlag, false, "Indicate that CCALF is deactivated") ("NoLmcsConstraintFlag", m_noLmcsConstraintFlag, false, "Indicate that LMCS is deactivated") @@ -1240,6 +1243,9 @@ bool EncAppCfg::parseCfg( int argc, char* argv[] ) ("GolombRiceParameterAdaptation", m_persistentRiceAdaptationEnabledFlag, false, "Enable the adaptation of the Golomb-Rice parameter over the course of each slice") ("AlignCABACBeforeBypass", m_cabacBypassAlignmentEnabledFlag, false, "Align the CABAC engine to a defined fraction of a bit prior to coding bypass data. 
Must be 1 in high bit rate profile, 0 otherwise") ("SAO", m_bUseSAO, true, "Enable Sample Adaptive Offset") +#if JVET_W0066_CCSAO + ("CCSAO", m_CCSAO, true, "Cross-component Sample Adaptive Offset" ) +#endif ("TestSAODisableAtPictureLevel", m_bTestSAODisableAtPictureLevel, false, "Enables the testing of disabling SAO at the picture level after having analysed all blocks") ("SaoEncodingRate", m_saoEncodingRate, 0.75, "When >0 SAO early picture termination is enabled for luma and chroma") ("SaoEncodingRateChroma", m_saoEncodingRateChroma, 0.5, "The SAO early picture termination rate to use for chroma (when m_SaoEncodingRate is >0). If <=0, use results for luma") @@ -4233,6 +4239,9 @@ void EncAppCfg::xPrintParameter() msg( VERBOSE, "Slices: %d ", m_numSlicesInPic); msg( VERBOSE, "MCTS:%d ", m_MCTSEncConstraint ); msg( VERBOSE, "SAO:%d ", (m_bUseSAO)?(1):(0)); +#if JVET_W0066_CCSAO + msg( VERBOSE, "CCSAO:%d ", m_CCSAO ? 1 : 0 ); +#endif msg( VERBOSE, "ALF:%d ", m_alf ? 1 : 0 ); msg( VERBOSE, "CCALF:%d ", m_ccalf ? 1 : 0 ); diff --git a/source/App/EncoderApp/EncAppCfg.h b/source/App/EncoderApp/EncAppCfg.h index 3a6a3d1b4ddef2838bf9177945a8060638beb078..c07a0dc8749b37b4d251bc8501c476facf844a21 100644 --- a/source/App/EncoderApp/EncAppCfg.h +++ b/source/App/EncoderApp/EncAppCfg.h @@ -167,6 +167,9 @@ protected: #endif bool m_noPartitionConstraintsOverrideConstraintFlag; bool m_noSaoConstraintFlag; +#if JVET_W0066_CCSAO + bool m_noCCSaoConstraintFlag; +#endif bool m_noAlfConstraintFlag; bool m_noCCAlfConstraintFlag; #if JVET_S0058_GCI @@ -490,6 +493,9 @@ protected: // coding tool (SAO) bool m_bUseSAO; +#if JVET_W0066_CCSAO + bool m_CCSAO; +#endif bool m_bTestSAODisableAtPictureLevel; double m_saoEncodingRate; ///< When >0 SAO early picture termination is enabled for luma and chroma double m_saoEncodingRateChroma; ///< The SAO early picture termination rate to use for chroma (when m_SaoEncodingRate is >0). If <=0, use results for luma. 
diff --git a/source/Lib/CommonLib/AlfParameters.h b/source/Lib/CommonLib/AlfParameters.h index 3c1e7141fa9e7ff418ab348ff307dcd748444fc4..cb18d7991dc380605eb6a6179fe0041fce240940 100644 --- a/source/Lib/CommonLib/AlfParameters.h +++ b/source/Lib/CommonLib/AlfParameters.h @@ -424,6 +424,58 @@ struct CcAlfFilterParam return *this; } }; + +#if JVET_W0066_CCSAO +// Cross-component SAO (CCSAO) parameters, indexed by colour component. Every member is a fixed-size array of scalars, so reset() and operator= work on raw bytes. +struct CcSaoComParam +{ + bool enabled [MAX_NUM_COMPONENT]; + uint8_t setNum [MAX_NUM_COMPONENT]; + bool setEnabled[MAX_NUM_COMPONENT][MAX_CCSAO_SET_NUM]; + uint16_t candPos [MAX_NUM_COMPONENT][MAX_CCSAO_SET_NUM][MAX_NUM_LUMA_COMP]; + uint16_t bandNum [MAX_NUM_COMPONENT][MAX_CCSAO_SET_NUM][MAX_NUM_COMPONENT]; + short offset [MAX_NUM_COMPONENT][MAX_CCSAO_SET_NUM][MAX_CCSAO_CLASS_NUM]; + CcSaoComParam() + { + reset(); + } + // Zeroes the parameters of all components. + void reset() + { + std::memset( enabled, false, sizeof( enabled ) ); + std::memset( setNum, 0, sizeof( setNum ) ); + std::memset( setEnabled, false, sizeof( setEnabled ) ); + std::memset( candPos, 0, sizeof( candPos ) ); + std::memset( bandNum, 0, sizeof( bandNum ) ); + std::memset( offset, 0, sizeof( offset ) ); + } + // Zeroes the parameters of compID only; the other components are left untouched. + void reset(ComponentID compID) + { + enabled[compID] = false; + setNum [compID] = 0; + std::memset( setEnabled[compID], false, sizeof( setEnabled[compID]) ); + std::memset( candPos [compID], 0, sizeof( candPos [compID]) ); + std::memset( bandNum [compID], 0, sizeof( bandNum [compID]) ); + std::memset( offset [compID], 0, sizeof( offset [compID]) ); + } + // Bytewise copy of every member from src. + const CcSaoComParam& operator = ( const CcSaoComParam& src ) + { + if( this == &src ) // std::memcpy requires non-overlapping regions, so self-assignment must not reach the copies below + { + return *this; + } + std::memcpy( enabled, src.enabled, sizeof( enabled ) ); + std::memcpy( setNum, src.setNum, sizeof( setNum ) ); + std::memcpy( setEnabled, src.setEnabled, sizeof( setEnabled ) ); + std::memcpy( candPos, src.candPos, sizeof( candPos ) ); + std::memcpy( bandNum, src.bandNum, sizeof( bandNum ) ); + std::memcpy( offset, src.offset, sizeof( offset ) ); + return *this; + } +}; +#endif //!
\} #endif // end of #ifndef __ALFPARAMETERS__ diff --git a/source/Lib/CommonLib/BilateralFilter.cpp b/source/Lib/CommonLib/BilateralFilter.cpp index e9d485661c408c68db36bc0b18d1f73231f82155..f654e028325e48b5e88297c2e135488aefc33616 100755 --- a/source/Lib/CommonLib/BilateralFilter.cpp +++ b/source/Lib/CommonLib/BilateralFilter.cpp @@ -51,6 +51,9 @@ BilateralFilter::BilateralFilter() { m_bilateralFilterDiamond5x5 = blockBilateralFilterDiamond5x5; +#if JVET_W0066_CCSAO // the NoClip member and kernel are only declared under this macro + m_bilateralFilterDiamond5x5NoClip = blockBilateralFilterDiamond5x5NoClip; +#endif #if ENABLE_SIMD_BILATERAL_FILTER #ifdef TARGET_SIMD_X86 @@ -117,6 +120,241 @@ const char* BilateralFilter::getFilterLutParameters( const int size, const PredM return m_wBIF[sqp - 17]; } +#if JVET_W0066_CCSAO +void BilateralFilter::blockBilateralFilterDiamond5x5NoClip(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO, const char* LUTrowPtr) +{ // Diamond 5x5 bilateral kernel: for each sample of the padded block, sums LUT-weighted signed differences to its 12 diamond neighbours, stores the scaled and shifted sum in blkFilt, then applies it to recPtr; the non-RDO path adds it without clipping + int pad = 2; + + int padwidth = iWidthExtSIMD; + int downbuffer[64]; // indexed by x below uiWidth, so uiWidth must not exceed 64 + int downleftbuffer[65]; + int downrightbuffer[2][65]; + int Shift, sg0, v0, idx, w0; + Shift = sizeof(int) * 8 - 1; + downbuffer[0] = 0; + + for (int x = 0; x < uiWidth; x++) + { + int pixel = block[(-1 + pad) * padwidth + x + pad]; + int below = block[(-1 + pad + 1) * padwidth + x + pad]; + int diff = below - pixel; + sg0 = diff >> Shift; + v0 = (diff + sg0) ^ sg0; + v0 = (v0 + 4) >> 3; + idx = 15 + ((v0 - 15) & ((v0 - 15) >> Shift)); + w0 = LUTrowPtr[idx]; + int mod = (w0 + sg0) ^ sg0; + downbuffer[x] = mod; + + int belowright = block[(-1 + pad + 1) * padwidth + x + pad + 1]; + diff = belowright - pixel; + sg0 = diff >> Shift; + v0 = (diff + sg0) ^ sg0; + v0 = (v0 + 4) >> 3; + idx = 15 + ((v0 - 15) & ((v0 - 15) >> Shift)); + w0 = LUTrowPtr[idx] >> 1; + mod = (w0 + sg0) ^ sg0; + downrightbuffer[1][x + 1] = mod; + + int belowleft = block[(-1 + pad + 1) * padwidth + x + pad - 1]; + diff = 
belowleft - pixel; + sg0 = diff >> Shift; + v0 = (diff + sg0) ^ sg0; + v0 = (v0 + 4) >> 3; + idx = 15 + ((v0 - 15) & ((v0 - 15) >> Shift)); + w0 = LUTrowPtr[idx] >> 1; + mod = (w0 + sg0) ^ sg0; + downleftbuffer[x] = mod; + } + int width = uiWidth; + for (int y = 0; y < uiHeight; y++) + { + int diff; + + int16_t* rowStart = &block[(y + pad) * padwidth + pad]; + + int pixel = rowStart[-1]; + + int right = rowStart[0]; + diff = right - pixel; + sg0 = diff >> Shift; + v0 = (diff + sg0) ^ sg0; + v0 = (v0 + 4) >> 3; + idx = 15 + ((v0 - 15) & ((v0 - 15) >> Shift)); + w0 = LUTrowPtr[idx]; + int mod = (w0 + sg0) ^ sg0; + int rightmod = mod; + + pixel = rowStart[-padwidth - 1]; + int belowright = right; + diff = belowright - pixel; + sg0 = diff >> Shift; + v0 = (diff + sg0) ^ sg0; + v0 = (v0 + 4) >> 3; + idx = 15 + ((v0 - 15) & ((v0 - 15) >> Shift)); + w0 = LUTrowPtr[idx] >> 1; + mod = (w0 + sg0) ^ sg0; + downrightbuffer[(y + 1) % 2][0] = mod; + + pixel = rowStart[-padwidth + width]; + int belowleft = rowStart[width - 1]; + diff = belowleft - pixel; + sg0 = diff >> Shift; + v0 = (diff + sg0) ^ sg0; + v0 = (v0 + 4) >> 3; + idx = 15 + ((v0 - 15) & ((v0 - 15) >> Shift)); + w0 = LUTrowPtr[idx] >> 1; + mod = (w0 + sg0) ^ sg0; + downleftbuffer[width] = mod; + + for (int x = 0; x < uiWidth; x++) + { + pixel = rowStart[x]; + int modsum = 0; + + int abovemod = -downbuffer[x]; + modsum += abovemod; + + int leftmod = -rightmod; + modsum += leftmod; + + right = rowStart[x + 1]; + diff = right - pixel; + sg0 = diff >> Shift; + v0 = (diff + sg0) ^ sg0; + v0 = (v0 + 4) >> 3; + idx = 15 + ((v0 - 15) & ((v0 - 15) >> Shift)); + w0 = LUTrowPtr[idx]; + mod = (w0 + sg0) ^ sg0; + + modsum += mod; + rightmod = mod; + + int below = rowStart[x + padwidth]; + diff = below - pixel; + sg0 = diff >> Shift; + v0 = (diff + sg0) ^ sg0; + v0 = (v0 + 4) >> 3; + idx = 15 + ((v0 - 15) & ((v0 - 15) >> Shift)); + w0 = LUTrowPtr[idx]; + mod = (w0 + sg0) ^ sg0; + modsum += mod; + downbuffer[x] = mod; + + int 
aboverightmod = -downleftbuffer[x + 1]; + // disabled: modsum += ((int16_t)((uint16_t)((aboverightmod) >> 1))); the buffered value was already computed from LUTrowPtr[idx] >> 1 + modsum += aboverightmod; + + int aboveleftmod = -downrightbuffer[(y + 1) % 2][x]; + // disabled: modsum += ((int16_t)((uint16_t)((aboveleftmod) >> 1))); the buffered value was already computed from LUTrowPtr[idx] >> 1 + modsum += aboveleftmod; + + int belowleft = rowStart[x + padwidth - 1]; + diff = belowleft - pixel; + sg0 = diff >> Shift; + v0 = (diff + sg0) ^ sg0; + v0 = (v0 + 4) >> 3; + idx = 15 + ((v0 - 15) & ((v0 - 15) >> Shift)); + w0 = LUTrowPtr[idx] >> 1; + mod = (w0 + sg0) ^ sg0; + // disabled: modsum += ((int16_t)((uint16_t)((mod) >> 1))); the weight is already halved two statements above + modsum += mod; + downleftbuffer[x] = mod; + + int belowright = rowStart[x + padwidth + 1]; + diff = belowright - pixel; + sg0 = diff >> Shift; + v0 = (diff + sg0) ^ sg0; + v0 = (v0 + 4) >> 3; + idx = 15 + ((v0 - 15) & ((v0 - 15) >> Shift)); + w0 = LUTrowPtr[idx] >> 1; + mod = (w0 + sg0) ^ sg0; + // disabled: modsum += ((int16_t)((uint16_t)((mod) >> 1))); the weight is already halved two statements above + modsum += mod; + downrightbuffer[y % 2][x + 1] = mod; + + // For samples two pixels out, we do not reuse previously calculated + // values even though that is possible. Doing so would likely increase + // speed when SIMD is turned off. 
+ + int above = rowStart[x - 2 * padwidth]; + diff = above - pixel; + sg0 = diff >> Shift; + v0 = (diff + sg0) ^ sg0; + v0 = (v0 + 4) >> 3; + idx = 15 + ((v0 - 15) & ((v0 - 15) >> Shift)); + w0 = LUTrowPtr[idx] >> 1; + mod = (w0 + sg0) ^ sg0; + modsum += mod; + + below = rowStart[x + 2 * padwidth]; + diff = below - pixel; + sg0 = diff >> Shift; + v0 = (diff + sg0) ^ sg0; + v0 = (v0 + 4) >> 3; + idx = 15 + ((v0 - 15) & ((v0 - 15) >> Shift)); + w0 = LUTrowPtr[idx] >> 1; + mod = (w0 + sg0) ^ sg0; + modsum += mod; + + int left = rowStart[x - 2]; + diff = left - pixel; + sg0 = diff >> Shift; + v0 = (diff + sg0) ^ sg0; + v0 = (v0 + 4) >> 3; + idx = 15 + ((v0 - 15) & ((v0 - 15) >> Shift)); + w0 = LUTrowPtr[idx] >> 1; + mod = (w0 + sg0) ^ sg0; + modsum += mod; + + right = rowStart[x + 2]; + diff = right - pixel; + sg0 = diff >> Shift; + v0 = (diff + sg0) ^ sg0; + v0 = (v0 + 4) >> 3; + idx = 15 + ((v0 - 15) & ((v0 - 15) >> Shift)); + w0 = LUTrowPtr[idx] >> 1; + mod = (w0 + sg0) ^ sg0; + modsum += mod; + + blkFilt[(y + pad) * (padwidth + 4) + x + pad] = ((int16_t)((uint16_t)((modsum * bfac + bif_round_add) >> bif_round_shift))); + } + } + + // Copy back: blkFilt is read with a row stride of padwidth + 4, skipping two rows and two columns of border; the RDO path writes the clipped sum of source sample and filter delta, the other path adds the delta to recPtr unclipped + Pel* tempBlockPtr = (short*)blkFilt + (((padwidth + 4) << 1) + 2); + int tempBlockStride = padwidth + 4; + if (isRDO) + { + Pel* srcBlockPtr = (short*)block + (((padwidth) << 1) + 2); + int srcBlockStride = padwidth; + for (uint32_t yy = 0; yy < uiHeight; yy++) + { + for (uint32_t xx = 0; xx < uiWidth; xx++) + { + recPtr[xx] = ClipPel(srcBlockPtr[xx] + tempBlockPtr[xx], clpRng); + } + recPtr += recStride; + tempBlockPtr += tempBlockStride; + srcBlockPtr += srcBlockStride; + } + } + else + { + for (uint32_t yy = 0; yy < uiHeight; yy++) + { + for (uint32_t xx = 0; xx < uiWidth; xx++) + { + // new result = old result (which is SAO-treated already) + diff due to bilateral filtering + //recPtr[xx] = ClipPel<int>(recPtr[xx] + tempBlockPtr[xx], clpRng); + recPtr[xx] = recPtr[xx] + tempBlockPtr[xx]; // clipping is done jointly for 
SAO/BIF/CCSAO + } + recPtr += recStride; + tempBlockPtr += tempBlockStride; + } + } +} +#endif void BilateralFilter::blockBilateralFilterDiamond5x5( uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO, const char* LUTrowPtr ) { int pad = 2; @@ -559,6 +795,234 @@ void BilateralFilter::bilateralFilterRDOdiamond5x5(PelBuf& resiBuf, const CPelBu } } +#if JVET_W0066_CCSAO +void BilateralFilter::bilateralFilterDiamond5x5NoClip(const CPelUnitBuf& src, PelUnitBuf& rec, int32_t qp, const ClpRng& clpRng, TransformUnit& currTU) +{ /* Luma only: copies the TU from src into tempblock with a border of NUMBER_PADDED_SAMPLES samples (taken from src where the neighbouring samples lie inside the picture, otherwise replicated from the block itself), then runs the kernel stored in m_bilateralFilterDiamond5x5NoClip on rec with isRDO set to false. */ + CompArea& compArea = currTU.block(COMPONENT_Y); + + const unsigned uiWidth = compArea.width; + const unsigned uiHeight = compArea.height; + + bool topAltAvailable; + bool leftAltAvailable; + + int srcStride = src.get(COMPONENT_Y).stride; + const Pel* srcPtr = src.get(COMPONENT_Y).bufAt(compArea); + const Pel* srcPtrTemp = srcPtr; + + int recStride = rec.get(COMPONENT_Y).stride; + Pel* recPtr = rec.get(COMPONENT_Y).bufAt(compArea); + + int bfac = 1; + const char* LUTrowPtr = getFilterLutParameters(std::min(uiWidth, uiHeight), currTU.cu->predMode, qp + currTU.cs->pps->getBIFQPOffset(), bfac); + + int bif_round_add = (BIF_ROUND_ADD) >> (currTU.cs->pps->getBIFStrength()); + int bif_round_shift = (BIF_ROUND_SHIFT)-(currTU.cs->pps->getBIFStrength()); + + const CompArea& myArea = currTU.blocks[COMPONENT_Y]; + topAltAvailable = myArea.y - 2 >= 0; + leftAltAvailable = myArea.x - 2 >= 0; + bool bottomAltAvailable = myArea.y + myArea.height + 1 < currTU.cu->slice->getSPS()->getMaxPicHeightInLumaSamples(); + bool rightAltAvailable = myArea.x + myArea.width + 1 < currTU.cu->slice->getSPS()->getMaxPicWidthInLumaSamples(); + + uint32_t uiWidthExt = uiWidth + (NUMBER_PADDED_SAMPLES << 1); + uint32_t uiHeightExt = uiHeight + (NUMBER_PADDED_SAMPLES << 1); + + int iWidthExtSIMD = uiWidthExt; + if (uiWidth < 8) + { + 
iWidthExtSIMD = 8 + (NUMBER_PADDED_SAMPLES << 1); + } + + Pel* tempBlockPtr; + + bool allAvail = topAltAvailable && bottomAltAvailable && leftAltAvailable && rightAltAvailable; + + memset(tempblock, 0, iWidthExtSIMD * uiHeightExt * sizeof(short)); + + if (allAvail) + { + // set pointer two rows up and two pixels to the left from the start of the block + tempBlockPtr = tempblock; + + // same with image data + srcPtr = srcPtr - 2 * srcStride - 2; + + //// Move block to temporary block + + // Check if the block is the top block of a CTU. + bool isCTUboundary = myArea.y % currTU.cs->slice->getSPS()->getCTUSize() == 0; + if (isCTUboundary) + { + // The samples two lines up are out of bounds. (One line above the CTU is OK, since SAO uses that line.) + // Hence the top line of tempblock is unavailable if the block is the top block of a CTU. + // Therefore, copy samples from one line up instead of from two lines up by updating srcPtr *before* copy. + srcPtr += srcStride; + std::memcpy(tempBlockPtr, srcPtr, (uiWidthExt) * sizeof(Pel)); + } + else + { + std::memcpy(tempBlockPtr, srcPtr, (uiWidthExt) * sizeof(Pel)); + srcPtr += srcStride; + } + tempBlockPtr += iWidthExtSIMD; + // Copy samples that are not out of bounds. + for (uint32_t uiY = 1; uiY < uiHeightExt - 1; ++uiY) + { + std::memcpy(tempBlockPtr, srcPtr, (uiWidthExt) * sizeof(Pel)); + srcPtr += srcStride; + tempBlockPtr += iWidthExtSIMD; + } + // Check if the block is a bottom block of a CTU. + isCTUboundary = (myArea.y + uiHeight) % currTU.cs->slice->getSPS()->getCTUSize() == 0; + if (isCTUboundary) + { + // The samples two lines down are out of bounds. (One line below the CTU is OK, since SAO uses that line.) + // Hence the bottom line of tempblock is unavailable if the block is at the bottom of a CTU. + // Therefore, copy samples from the second to last line instead of the last line by stepping srcPtr back one row before copy. 
+ srcPtr -= srcStride; + std::memcpy(tempBlockPtr, srcPtr, (uiWidthExt) * sizeof(Pel)); + } + else + { + std::memcpy(tempBlockPtr, srcPtr, (uiWidthExt) * sizeof(Pel)); + } + return m_bilateralFilterDiamond5x5NoClip(uiWidth, uiHeight, tempblock, tempblockFiltered, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bif_round_add, bif_round_shift, false, LUTrowPtr); + } + else + { + tempBlockPtr = tempblock + (NUMBER_PADDED_SAMPLES)*iWidthExtSIMD + NUMBER_PADDED_SAMPLES; + + //// Move block to temporary block + for (uint32_t uiY = 0; uiY < uiHeight; ++uiY) + { + std::memcpy(tempBlockPtr, srcPtr, uiWidth * sizeof(Pel)); + srcPtr += srcStride; + tempBlockPtr += iWidthExtSIMD; + } + srcPtr = srcPtrTemp; + + if (topAltAvailable) + { + std::copy(srcPtr - 2 * srcStride, srcPtr - 2 * srcStride + uiWidth, tempblock + 2); + std::copy(srcPtr - srcStride, srcPtr - srcStride + uiWidth, tempblock + iWidthExtSIMD + 2); + } + if (bottomAltAvailable) + { + std::copy(srcPtr + (uiHeight + 1) * srcStride, srcPtr + (uiHeight + 1) * srcStride + uiWidth, tempblock + (uiHeightExt - 1) * iWidthExtSIMD + 2); + std::copy(srcPtr + uiHeight * srcStride, srcPtr + uiHeight * srcStride + uiWidth, tempblock + (uiHeightExt - 2) * iWidthExtSIMD + 2); + } + if (leftAltAvailable) + { + for (int yy = 0; yy < uiHeight; yy++) + { + tempblock[(iWidthExtSIMD << 1) + yy * iWidthExtSIMD + 0] = *(srcPtr + yy * srcStride - 2); + tempblock[(iWidthExtSIMD << 1) + yy * iWidthExtSIMD + 1] = *(srcPtr + yy * srcStride - 1); + } + } + if (rightAltAvailable) + { + for (int yy = 0; yy < uiHeight; yy++) + { + tempblock[(iWidthExtSIMD << 1) + uiWidthExt - 1 + yy * iWidthExtSIMD] = *(srcPtr + uiWidth + yy * srcStride + 1); + tempblock[(iWidthExtSIMD << 1) + uiWidthExt - 2 + yy * iWidthExtSIMD] = *(srcPtr + uiWidth + yy * srcStride); + } + } + + // if not all available, copy from inside tempbuffer + if (!topAltAvailable) + { + std::copy(tempblock + iWidthExtSIMD * 2 + 2, tempblock + iWidthExtSIMD * 2 + 2 + uiWidth, tempblock + 
2); + std::copy(tempblock + iWidthExtSIMD * 2 + 2, tempblock + iWidthExtSIMD * 2 + 2 + uiWidth, tempblock + iWidthExtSIMD + 2); + } + if (!bottomAltAvailable) + { + std::copy(tempblock + (uiHeightExt - 3) * iWidthExtSIMD + 2, tempblock + (uiHeightExt - 3) * iWidthExtSIMD + 2 + uiWidth, tempblock + (uiHeightExt - 2) * iWidthExtSIMD + 2); + std::copy(tempblock + (uiHeightExt - 3) * iWidthExtSIMD + 2, tempblock + (uiHeightExt - 3) * iWidthExtSIMD + 2 + uiWidth, tempblock + (uiHeightExt - 1) * iWidthExtSIMD + 2); + } + if (!leftAltAvailable) + { + for (int yy = 0; yy < uiHeight; yy++) + { + tempblock[(iWidthExtSIMD << 1) + yy * iWidthExtSIMD + 0] = tempblock[(iWidthExtSIMD << 1) + yy * iWidthExtSIMD + 2]; + tempblock[(iWidthExtSIMD << 1) + yy * iWidthExtSIMD + 1] = tempblock[(iWidthExtSIMD << 1) + yy * iWidthExtSIMD + 2]; + } + } + if (!rightAltAvailable) + { + for (int yy = 0; yy < uiHeight; yy++) + { + tempblock[(iWidthExtSIMD << 1) + uiWidthExt - 2 + yy * iWidthExtSIMD] = tempblock[(iWidthExtSIMD << 1) + uiWidthExt - 2 + yy * iWidthExtSIMD - 1]; + tempblock[(iWidthExtSIMD << 1) + uiWidthExt - 1 + yy * iWidthExtSIMD] = tempblock[(iWidthExtSIMD << 1) + uiWidthExt - 2 + yy * iWidthExtSIMD - 1]; + } + } + + // Corner samples: copied from src when both adjacent sides are available, otherwise replicated from the nearest block sample. 
+ if (topAltAvailable && leftAltAvailable) + { + tempblock[0] = *(srcPtr - 2 * srcStride - 2); // a top left corner + tempblock[1] = *(srcPtr - 2 * srcStride - 1); // b a b|x x + tempblock[iWidthExtSIMD + 0] = *(srcPtr - srcStride - 2); // c c d|x x + tempblock[iWidthExtSIMD + 1] = *(srcPtr - srcStride - 1); // d ------- + } + else + { + tempblock[0] = tempblock[iWidthExtSIMD * 2 + 2]; // extend top left + tempblock[1] = tempblock[iWidthExtSIMD * 2 + 2]; // extend top left + tempblock[iWidthExtSIMD + 0] = tempblock[iWidthExtSIMD * 2 + 2]; // extend top left + tempblock[iWidthExtSIMD + 1] = tempblock[iWidthExtSIMD * 2 + 2]; // extend top left + } + + if (topAltAvailable && rightAltAvailable) + { + tempblock[iWidthExtSIMD - 2] = *(srcPtr - 2 * srcStride + uiWidth); // a NOTE(review): row 0 is indexed with iWidthExtSIMD - 2 while row 1 uses uiWidthExt - 2; the two differ when uiWidth < 8 - confirm this is intended + tempblock[iWidthExtSIMD - 1] = *(srcPtr - 2 * srcStride + uiWidth + 1); // b + tempblock[iWidthExtSIMD + uiWidthExt - 2] = *(srcPtr - srcStride + uiWidth); // c + tempblock[iWidthExtSIMD + uiWidthExt - 1] = *(srcPtr - srcStride + uiWidth + 1); // d + } + else + { + tempblock[iWidthExtSIMD - 2] = tempblock[iWidthExtSIMD * 2 + uiWidthExt - 3]; // extend top right + tempblock[iWidthExtSIMD - 1] = tempblock[iWidthExtSIMD * 2 + uiWidthExt - 3]; // extend top right + tempblock[iWidthExtSIMD + uiWidthExt - 2] = tempblock[iWidthExtSIMD * 2 + uiWidthExt - 3]; // extend top right + tempblock[iWidthExtSIMD + uiWidthExt - 1] = tempblock[iWidthExtSIMD * 2 + uiWidthExt - 3]; // extend top right + } + + if (bottomAltAvailable && leftAltAvailable) + { + tempblock[iWidthExtSIMD * (uiHeightExt - 2) + 0] = *(srcPtr + uiHeight * srcStride - 2); // a + tempblock[iWidthExtSIMD * (uiHeightExt - 2) + 1] = *(srcPtr + uiHeight * srcStride - 1); // b + tempblock[iWidthExtSIMD * (uiHeightExt - 1) + 0] = *(srcPtr + (uiHeight + 1) * srcStride - 2); // c + tempblock[iWidthExtSIMD * (uiHeightExt - 1) + 1] = *(srcPtr + (uiHeight + 1) * srcStride - 1); // d + } + else + { + tempblock[iWidthExtSIMD * (uiHeightExt - 2) + 0] = 
tempblock[iWidthExtSIMD * (uiHeightExt - 3) + 2]; // extend bottom left + tempblock[iWidthExtSIMD * (uiHeightExt - 2) + 1] = tempblock[iWidthExtSIMD * (uiHeightExt - 3) + 2]; // extend bottom left + tempblock[iWidthExtSIMD * (uiHeightExt - 1) + 0] = tempblock[iWidthExtSIMD * (uiHeightExt - 3) + 2]; // extend bottom left + tempblock[iWidthExtSIMD * (uiHeightExt - 1) + 1] = tempblock[iWidthExtSIMD * (uiHeightExt - 3) + 2]; // extend bottom left + } + + if (bottomAltAvailable && rightAltAvailable) + { + tempblock[iWidthExtSIMD * (uiHeightExt - 2) + uiWidthExt - 2] = *(srcPtr + uiHeight * srcStride + uiWidth); // a + tempblock[iWidthExtSIMD * (uiHeightExt - 2) + uiWidthExt - 1] = *(srcPtr + uiHeight * srcStride + uiWidth + 1); // b + tempblock[iWidthExtSIMD * (uiHeightExt - 1) + uiWidthExt - 2] = *(srcPtr + (uiHeight + 1) * srcStride + uiWidth); // c + tempblock[iWidthExtSIMD * (uiHeightExt - 1) + uiWidthExt - 1] = *(srcPtr + (uiHeight + 1) * srcStride + uiWidth + 1); // d + } + else + { + tempblock[iWidthExtSIMD * (uiHeightExt - 2) + uiWidthExt - 2] = tempblock[iWidthExtSIMD * (uiHeightExt - 3) + uiWidthExt - 3]; // extend bottom right + tempblock[iWidthExtSIMD * (uiHeightExt - 2) + uiWidthExt - 1] = tempblock[iWidthExtSIMD * (uiHeightExt - 3) + uiWidthExt - 3]; // extend bottom right + tempblock[iWidthExtSIMD * (uiHeightExt - 1) + uiWidthExt - 2] = tempblock[iWidthExtSIMD * (uiHeightExt - 3) + uiWidthExt - 3]; // extend bottom right + tempblock[iWidthExtSIMD * (uiHeightExt - 1) + uiWidthExt - 1] = tempblock[iWidthExtSIMD * (uiHeightExt - 3) + uiWidthExt - 3]; // extend bottom right + } + } + + m_bilateralFilterDiamond5x5NoClip(uiWidth, uiHeight, tempblock, tempblockFiltered, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bif_round_add, bif_round_shift, false, LUTrowPtr); +} +#endif + void BilateralFilter::bilateralFilterDiamond5x5(const CPelUnitBuf& src, PelUnitBuf& rec, int32_t qp, const ClpRng& clpRng, TransformUnit & currTU) { CompArea &compArea = currTU.block(COMPONENT_Y); diff --git 
a/source/Lib/CommonLib/BilateralFilter.h b/source/Lib/CommonLib/BilateralFilter.h index 4cbd41be7cb19150bafba652de4e71334c4ec209..a77d6d62e9d085d5a39aa5514e1dbcad69d358c4 100755 --- a/source/Lib/CommonLib/BilateralFilter.h +++ b/source/Lib/CommonLib/BilateralFilter.h @@ -65,7 +65,13 @@ private: short *tempblockFiltered = &tempblockFilteredTemp[-2]; void (*m_bilateralFilterDiamond5x5)( uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO, const char* LUTrowPtr); +#if JVET_W0066_CCSAO + void (*m_bilateralFilterDiamond5x5NoClip)(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO, const char* LUTrowPtr); +#endif static void blockBilateralFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO, const char* LUTrowPtr ); +#if JVET_W0066_CCSAO + static void blockBilateralFilterDiamond5x5NoClip(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO, const char* LUTrowPtr); +#endif char m_wBIF[26][16] = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, @@ -108,6 +114,9 @@ public: void bilateralFilterPicRDOperCTU(CodingStructure& cs, PelUnitBuf& src,BIFCabacEst* BifCABACEstimator); void bilateralFilterDiamond5x5(const CPelUnitBuf& src, PelUnitBuf& rec, int32_t qp, const ClpRng& clpRng, TransformUnit & currTU); +#if JVET_W0066_CCSAO + void bilateralFilterDiamond5x5NoClip(const CPelUnitBuf& src, PelUnitBuf& rec, int32_t qp, const ClpRng& clpRng, TransformUnit& currTU); 
+#endif void clipNotBilaterallyFilteredBlocks(const CPelUnitBuf& src, PelUnitBuf& rec, const ClpRng& clpRng, TransformUnit & currTU); const char* getFilterLutParameters( const int size, const PredMode predMode, const int qp, int& bfac ); @@ -116,6 +125,10 @@ public: #ifdef TARGET_SIMD_X86 template<X86_VEXT vext> static void simdFilterDiamond5x5( uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO, const char* LUTrowPtr ); +#if JVET_W0066_CCSAO + template<X86_VEXT vext> + static void simdFilterDiamond5x5NoClip(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO, const char* LUTrowPtr); +#endif void initBilateralFilterX86(); template <X86_VEXT vext> diff --git a/source/Lib/CommonLib/CommonDef.h b/source/Lib/CommonLib/CommonDef.h index 75e0312e21f6bf7c7f8b94c320bb9337c8e7ffc7..d76bf13ea7dc8f5706e3c92a9e8ee6541774c357 100644 --- a/source/Lib/CommonLib/CommonDef.h +++ b/source/Lib/CommonLib/CommonDef.h @@ -207,6 +207,21 @@ static const int MIP_MAX_WIDTH = MAX_TB_SIZEY; static const int MIP_MAX_HEIGHT = MAX_TB_SIZEY; #endif +#if JVET_W0066_CCSAO +#define MAX_CCSAO_SET_NUM 4 +static const int MAX_CCSAO_CAND_POS_Y = 9; +static const int MAX_CCSAO_CAND_POS_Y_BITS = 4; +static const int MAX_CCSAO_BAND_NUM_Y = 16; +static const int MAX_CCSAO_BAND_NUM_Y_BITS = 4; +static const int MAX_CCSAO_BAND_NUM_U = 4; +static const int MAX_CCSAO_BAND_NUM_U_BITS = 2; +static const int MAX_CCSAO_BAND_NUM_V = 4; +static const int MAX_CCSAO_BAND_NUM_V_BITS = 2; +static const int MAX_CCSAO_CLASS_NUM = 64; +static const int MAX_CCSAO_OFFSET_THR = 15; +static const int MAX_CCSAO_FILTER_LENGTH = 3; +#endif + static const int MAX_NUM_ALF_ALTERNATIVES_CHROMA = 8; static const int MAX_NUM_ALF_CLASSES = 
25; #if ALF_IMPROVEMENT diff --git a/source/Lib/CommonLib/Contexts.cpp b/source/Lib/CommonLib/Contexts.cpp index ba7ebd86a7001e609c06728ae006a014ae70ff1f..6f4111ae4af9a1c114d6255bb92fb8e73dfef1e6 100644 --- a/source/Lib/CommonLib/Contexts.cpp +++ b/source/Lib/CommonLib/Contexts.cpp @@ -1515,6 +1515,16 @@ const CtxSet ContextSetCfg::BifCtrlFlags = ContextSetCfg::addCtxSet }); #endif +#if JVET_W0066_CCSAO +const CtxSet ContextSetCfg::CcSaoControlIdc = ContextSetCfg::addCtxSet +({ + { CNU, CNU, CNU, CNU, CNU, CNU, CNU, CNU, CNU, }, + { CNU, CNU, CNU, CNU, CNU, CNU, CNU, CNU, CNU, }, + { CNU, CNU, CNU, CNU, CNU, CNU, CNU, CNU, CNU, }, + { DWS, DWS, DWS, DWS, DWS, DWS, DWS, DWS, DWS, }, +}); +#endif + const CtxSet ContextSetCfg::LFNSTIdx = ContextSetCfg::addCtxSet ({ #if EXTENDED_LFNST @@ -2567,6 +2577,16 @@ const CtxSet ContextSetCfg::BifCtrlFlags = ContextSetCfg::addCtxSet }); #endif +#if JVET_W0066_CCSAO +const CtxSet ContextSetCfg::CcSaoControlIdc = ContextSetCfg::addCtxSet +({ + { CNU, CNU, CNU, CNU, CNU, CNU, CNU, CNU, CNU, }, + { CNU, CNU, CNU, CNU, CNU, CNU, CNU, CNU, CNU, }, + { CNU, CNU, CNU, CNU, CNU, CNU, CNU, CNU, CNU, }, + { DWS, DWS, DWS, DWS, DWS, DWS, DWS, DWS, DWS, }, +}); +#endif + const CtxSet ContextSetCfg::LFNSTIdx = ContextSetCfg::addCtxSet ({ #if EXTENDED_LFNST diff --git a/source/Lib/CommonLib/Contexts.h b/source/Lib/CommonLib/Contexts.h index 4f49ae343125ef4139c5b641f35622375a1ac2b2..0b7f343b3a1f562fafede39eff05afeedc741a47 100644 --- a/source/Lib/CommonLib/Contexts.h +++ b/source/Lib/CommonLib/Contexts.h @@ -300,6 +300,9 @@ public: static const CtxSet SaoTypeIdx; #if JVET_V0094_BILATERAL_FILTER static const CtxSet BifCtrlFlags; +#endif +#if JVET_W0066_CCSAO + static const CtxSet CcSaoControlIdc; #endif static const CtxSet TransformSkipFlag; static const CtxSet MTSIdx; diff --git a/source/Lib/CommonLib/Rom.cpp b/source/Lib/CommonLib/Rom.cpp index c7351e229cb679571f6a6b3f777f2a1ec47445c9..69a74632d372d20a80ccaec8546401869cc06573 100644 --- 
a/source/Lib/CommonLib/Rom.cpp +++ b/source/Lib/CommonLib/Rom.cpp @@ -4831,4 +4831,8 @@ uint8_t g_geoTmShape[2][GEO_NUM_ANGLES] = { GEO_TM_SHAPE_L, 0, 0, GEO_TM_SHAPE_L, GEO_TM_SHAPE_AL, GEO_TM_SHAPE_AL, GEO_TM_SHAPE_AL, 0, } }; #endif +#if JVET_W0066_CCSAO +const int8_t g_ccSaoCandPosX[MAX_NUM_LUMA_COMP][MAX_CCSAO_CAND_POS_Y] = { {-1, 0, 1, -1, 0, 1, -1, 0, 1} }; +const int8_t g_ccSaoCandPosY[MAX_NUM_LUMA_COMP][MAX_CCSAO_CAND_POS_Y] = { {-1, -1, -1, 0, 0, 0, 1, 1, 1} }; +#endif //! \} diff --git a/source/Lib/CommonLib/Rom.h b/source/Lib/CommonLib/Rom.h index 763a3b24cb76cc12b154919954af64adcb161946..4eae2a15c273a1bf6de5f56ecf5ad3a07857c1ec 100644 --- a/source/Lib/CommonLib/Rom.h +++ b/source/Lib/CommonLib/Rom.h @@ -324,5 +324,9 @@ extern uint8_t g_geoTmShape[2][GEO_NUM_ANGLES]; #if MULTI_HYP_PRED extern const int g_addHypWeight[MULTI_HYP_PRED_NUM_WEIGHTS]; #endif +#if JVET_W0066_CCSAO +extern const int8_t g_ccSaoCandPosX[MAX_NUM_LUMA_COMP][MAX_CCSAO_CAND_POS_Y]; +extern const int8_t g_ccSaoCandPosY[MAX_NUM_LUMA_COMP][MAX_CCSAO_CAND_POS_Y]; +#endif #endif //__TCOMROM__ diff --git a/source/Lib/CommonLib/SampleAdaptiveOffset.cpp b/source/Lib/CommonLib/SampleAdaptiveOffset.cpp index 0c27da182295adb09e461343865b5b6b26844256..97e03f888da00e9d7bae35a98fffd500838850c1 100644 --- a/source/Lib/CommonLib/SampleAdaptiveOffset.cpp +++ b/source/Lib/CommonLib/SampleAdaptiveOffset.cpp @@ -138,11 +138,56 @@ void SampleAdaptiveOffset::create( int picWidth, int picHeight, ChromaFormat for m_offsetStepLog2 [compIdx] = isLuma(ComponentID(compIdx))? 
lumaBitShift : chromaBitShift; } m_numberOfComponents = getNumberValidComponents(format); + +#if JVET_W0066_CCSAO + if (m_created) + { + return; + } + m_created = true; + + m_ccSaoBuf.destroy(); + m_ccSaoBuf.create(format, Area(0, 0, picWidth, picHeight), maxCUWidth, MAX_CCSAO_FILTER_LENGTH >> 1, 0, false); + + m_picWidth = picWidth; + m_picHeight = picHeight; + m_maxCUWidth = maxCUWidth; + m_maxCUHeight = maxCUHeight; + + m_numCTUsInWidth = ( m_picWidth / m_maxCUWidth ) + ( ( m_picWidth % m_maxCUWidth ) ? 1 : 0 ); + m_numCTUsInHeight = ( m_picHeight / m_maxCUHeight ) + ( ( m_picHeight % m_maxCUHeight ) ? 1 : 0 ); + m_numCTUsInPic = m_numCTUsInHeight * m_numCTUsInWidth; + + for (int compIdx = 0; compIdx < MAX_NUM_COMPONENT; compIdx++) + { + m_ccSaoControl[compIdx] = new uint8_t[m_numCTUsInPic]; + ::memset(m_ccSaoControl[compIdx], 0, sizeof(uint8_t) * m_numCTUsInPic); + } +#endif } void SampleAdaptiveOffset::destroy() { m_tempBuf.destroy(); + +#if JVET_W0066_CCSAO + if (!m_created) + { + return; + } + m_created = false; + + m_ccSaoBuf.destroy(); + + for (int compIdx = 0; compIdx < MAX_NUM_COMPONENT; compIdx++) + { + if (m_ccSaoControl[compIdx]) + { + delete [] m_ccSaoControl[compIdx]; + m_ccSaoControl[compIdx] = nullptr; + } + } +#endif } void SampleAdaptiveOffset::invertQuantOffsets(ComponentID compIdx, int typeIdc, int typeAuxInfo, int* dstOffsets, int* srcOffsets) @@ -864,12 +909,14 @@ void SampleAdaptiveOffset::offsetCTUnoClip( const UnitArea& area, const CPelUnit m_signLineBuf2.resize(lineBufferSize); } +#if !JVET_W0066_CCSAO int numHorVirBndry = 0, numVerVirBndry = 0; int horVirBndryPos[] = { -1,-1,-1 }; int verVirBndryPos[] = { -1,-1,-1 }; int horVirBndryPosComp[] = { -1,-1,-1 }; int verVirBndryPosComp[] = { -1,-1,-1 }; bool isCtuCrossedByVirtualBoundaries = isCrossedByVirtualBoundaries(area.Y().x, area.Y().y, area.Y().width, area.Y().height, numHorVirBndry, numVerVirBndry, horVirBndryPos, verVirBndryPos, cs.picHeader); +#endif for(int compIdx = 0; compIdx < 
numberOfComponents; compIdx++) { const ComponentID compID = ComponentID(compIdx); @@ -882,6 +929,7 @@ void SampleAdaptiveOffset::offsetCTUnoClip( const UnitArea& area, const CPelUnit const Pel* srcBlk = src.get(compID).bufAt(compArea); int resStride = res.get(compID).stride; Pel* resBlk = res.get(compID).bufAt(compArea); +#if !JVET_W0066_CCSAO for (int i = 0; i < numHorVirBndry; i++) { horVirBndryPosComp[i] = (horVirBndryPos[i] >> ::getComponentScaleY(compID, area.chromaFormat)) - compArea.y; @@ -890,7 +938,21 @@ void SampleAdaptiveOffset::offsetCTUnoClip( const UnitArea& area, const CPelUnit { verVirBndryPosComp[i] = (verVirBndryPos[i] >> ::getComponentScaleX(compID, area.chromaFormat)) - compArea.x; } +#endif +#if JVET_W0066_CCSAO + // Do not clip the final output for both luma and chroma. Clipping is done jointly for SAO/BIF/CCSAO. + offsetBlockNoClip(cs.sps->getBitDepth(toChannelType(compID)), + cs.slice->clpRng(compID), + ctbOffset.typeIdc, ctbOffset.offset + , srcBlk, resBlk, srcStride, resStride, compArea.width, compArea.height + , isLeftAvail, isRightAvail + , isAboveAvail, isBelowAvail + , isAboveLeftAvail, isAboveRightAvail + , isBelowLeftAvail, isBelowRightAvail + // , isCtuCrossedByVirtualBoundaries, horVirBndryPosComp, verVirBndryPosComp, numHorVirBndry, numVerVirBndry + ); +#else if(compID == COMPONENT_Y) { // If it is luma we should not clip, since we will clip @@ -924,6 +986,7 @@ void SampleAdaptiveOffset::offsetCTUnoClip( const UnitArea& area, const CPelUnit } +#endif } } //compIdx } @@ -994,6 +1057,27 @@ void SampleAdaptiveOffset::SAOProcess( CodingStructure& cs, SAOBlkParam* saoBlkP const uint32_t width = (xPos + pcv.maxCUWidth > pcv.lumaWidth) ? (pcv.lumaWidth - xPos) : pcv.maxCUWidth; const uint32_t height = (yPos + pcv.maxCUHeight > pcv.lumaHeight) ?
(pcv.lumaHeight - yPos) : pcv.maxCUHeight; const UnitArea area( cs.area.chromaFormat, Area(xPos , yPos, width, height) ); + +#if JVET_W0066_CCSAO + // Always do non-clipped version for SAO/BIF, the clipping is done jointly after CCSAO is also applied + if (!bAllDisabled) + offsetCTUnoClip(area, m_tempBuf, rec, cs.picture->getSAO()[ctuRsAddr], cs); + + BifParams& bifParams = cs.picture->getBifParam(); + + // And now we traverse the CTU to do BIF + for (auto& currCU : cs.traverseCUs(CS::getArea(cs, area, CH_L), CH_L)) + { + for (auto& currTU : CU::traverseTUs(currCU)) + { + bool isInter = (currCU.predMode == MODE_INTER) ? true : false; + if (bifParams.ctuOn[ctuRsAddr] && ((TU::getCbf(currTU, COMPONENT_Y) || isInter == false) && (currTU.cu->qp > 17)) && (128 > std::max(currTU.lumaSize().width, currTU.lumaSize().height)) && ((isInter == false) || (32 > std::min(currTU.lumaSize().width, currTU.lumaSize().height)))) + { + m_bilateralFilter.bilateralFilterDiamond5x5NoClip(m_tempBuf, rec, currTU.cu->qp, cs.slice->clpRng(COMPONENT_Y), currTU); + } + } + } +#else #if JVET_V0094_BILATERAL_FILTER if(cs.pps->getUseBIF()) { @@ -1038,6 +1122,7 @@ void SampleAdaptiveOffset::SAOProcess( CodingStructure& cs, SAOBlkParam* saoBlkP } #else offsetCTU( area, m_tempBuf, rec, cs.picture->getSAO()[ctuRsAddr], cs); +#endif #endif ctuRsAddr++; } @@ -1053,6 +1138,405 @@ void SampleAdaptiveOffset::SAOProcess( CodingStructure& cs, SAOBlkParam* saoBlkP } +#if JVET_W0066_CCSAO +void SampleAdaptiveOffset::CCSAOProcess(CodingStructure& cs) +{ + const uint32_t numberOfComponents = getNumberValidComponents(cs.area.chromaFormat); + bool bAllDisabled = true; + for (uint32_t compIdx = 0; compIdx < numberOfComponents; compIdx++) + { + if (m_ccSaoComParam.enabled[compIdx]) + { + bAllDisabled = false; + } + } + if (bAllDisabled) + { + return; + } + + const PreCalcValues& pcv = *cs.pcv; + PelUnitBuf dstYuv = cs.getRecoBuf(); + PelUnitBuf srcYuv = m_ccSaoBuf.getBuf( cs.area ); + srcYuv.extendBorderPel( 
MAX_CCSAO_FILTER_LENGTH >> 1 ); + + applyCcSao(cs, pcv, srcYuv, dstYuv); +} + +void SampleAdaptiveOffset::applyCcSao(CodingStructure &cs, const PreCalcValues& pcv, const CPelUnitBuf& srcYuv, PelUnitBuf& dstYuv) +{ + int ctuRsAddr = 0; + for (uint32_t yPos = 0; yPos < pcv.lumaHeight; yPos += pcv.maxCUHeight) + { + for (uint32_t xPos = 0; xPos < pcv.lumaWidth; xPos += pcv.maxCUWidth) + { + const uint32_t width = (xPos + pcv.maxCUWidth > pcv.lumaWidth ) ? (pcv.lumaWidth - xPos) : pcv.maxCUWidth; + const uint32_t height = (yPos + pcv.maxCUHeight > pcv.lumaHeight) ? (pcv.lumaHeight - yPos) : pcv.maxCUHeight; + const UnitArea area(cs.area.chromaFormat, Area(xPos, yPos, width, height)); + + offsetCTUCcSaoNoClip(cs, area, srcYuv, dstYuv, ctuRsAddr); + ctuRsAddr++; + } + } +} + +void SampleAdaptiveOffset::jointClipSaoBifCcSao(CodingStructure& cs) +{ + if (!cs.sps->getSAOEnabledFlag() && !cs.pps->getUseBIF() && !cs.sps->getCCSAOEnabledFlag()) + return; + + const PreCalcValues& pcv = *cs.pcv; + PelUnitBuf dstYuv = cs.getRecoBuf(); + + // Iterate all CTUs and check if any of the filters is on for a given component + int ctuRsAddr = 0; + for (uint32_t yPos = 0; yPos < pcv.lumaHeight; yPos += pcv.maxCUHeight) + { + for (uint32_t xPos = 0; xPos < pcv.lumaWidth; xPos += pcv.maxCUWidth) + { + const uint32_t width = (xPos + pcv.maxCUWidth > pcv.lumaWidth) ? (pcv.lumaWidth - xPos) : pcv.maxCUWidth; + const uint32_t height = (yPos + pcv.maxCUHeight > pcv.lumaHeight) ? 
(pcv.lumaHeight - yPos) : pcv.maxCUHeight; + const UnitArea area(cs.area.chromaFormat, Area(xPos, yPos, width, height)); + const uint32_t numberOfComponents = getNumberValidComponents(area.chromaFormat); + SAOBlkParam mySAOblkParam = cs.picture->getSAO()[ctuRsAddr]; + + for (int compIdx = 0; compIdx < numberOfComponents; compIdx++) + { + const int setIdc = m_ccSaoControl[compIdx][ctuRsAddr]; + SAOOffset& myCtbOffset = mySAOblkParam[compIdx]; + if ((m_ccSaoComParam.enabled[compIdx] && setIdc != 0) || (myCtbOffset.modeIdc != SAO_MODE_OFF)) + { + // We definitely need to clip if either SAO or CCSAO is on for the given component of the CTU + clipCTU(cs, dstYuv, area, ComponentID(compIdx)); + } + else + { + // When BIF is on, the luma component might need to be clipped + if (cs.pps->getUseBIF()) + { + if (compIdx == COMPONENT_Y) + { + BifParams& bifParams = cs.picture->getBifParam(); + + // And now we traverse the CTU to do clipping + for (auto& currCU : cs.traverseCUs(CS::getArea(cs, area, CH_L), CH_L)) + { + for (auto& currTU : CU::traverseTUs(currCU)) + { + bool isInter = (currCU.predMode == MODE_INTER) ? 
true : false; + if (bifParams.ctuOn[ctuRsAddr] && ((TU::getCbf(currTU, COMPONENT_Y) || isInter == false) && (currTU.cu->qp > 17)) && (128 > std::max(currTU.lumaSize().width, currTU.lumaSize().height)) && ((isInter == false) || (32 > std::min(currTU.lumaSize().width, currTU.lumaSize().height)))) + { + m_bilateralFilter.clipNotBilaterallyFilteredBlocks(m_tempBuf, dstYuv, cs.slice->clpRng(COMPONENT_Y), currTU); + } + } + } + } + } + } + } + ctuRsAddr++; + } + } +} + +void SampleAdaptiveOffset::clipCTU(CodingStructure& cs, PelUnitBuf& dstYuv, const UnitArea& area, const ComponentID compID) +{ + const CompArea &compArea = area.block(compID); + const uint32_t height = compArea.height; + const uint32_t width = compArea.width; + Pel *dst = dstYuv.get(compID).bufAt(area.block(compID)); + int dstStride = dstYuv.get(compID).stride; + + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + // new result = old result (which is SAO-treated already) + clipping + dst[x] = ClipPel<int>(dst[x], cs.slice->clpRng(compID)); + } + dst += dstStride; + } +} + +void SampleAdaptiveOffset::offsetCTUCcSaoNoClip(CodingStructure& cs, const UnitArea& area, const CPelUnitBuf& srcYuv, PelUnitBuf& dstYuv, const int ctuRsAddr) +{ + const uint32_t numberOfComponents = getNumberValidComponents(area.chromaFormat); + bool bAllOff = true; + for (uint32_t compIdx = 0; compIdx < numberOfComponents; compIdx++) + { + if (m_ccSaoComParam.enabled[compIdx]) + { + bAllOff = false; + } + } + if (bAllOff) + { + return; + } + + bool isLeftAvail, isRightAvail, isAboveAvail, isBelowAvail, isAboveLeftAvail, isAboveRightAvail, isBelowLeftAvail, isBelowRightAvail; + deriveLoopFilterBoundaryAvailibility(cs, area.Y(), isLeftAvail, isRightAvail, isAboveAvail, isBelowAvail, isAboveLeftAvail, isAboveRightAvail, isBelowLeftAvail, isBelowRightAvail); + + for (int compIdx = 0; compIdx < numberOfComponents; compIdx++) + { + if (m_ccSaoComParam.enabled[compIdx]) + { + const int setIdc = 
m_ccSaoControl[compIdx][ctuRsAddr]; + + if (setIdc != 0) + { + const ComponentID compID = ComponentID(compIdx); + const CompArea &compArea = area.block(compID); + const int srcStrideY = srcYuv.get(COMPONENT_Y ).stride; + const int srcStrideU = srcYuv.get(COMPONENT_Cb).stride; + const int srcStrideV = srcYuv.get(COMPONENT_Cr).stride; + const int dstStride = dstYuv.get(compID ).stride; + const Pel *srcBlkY = srcYuv.get(COMPONENT_Y ).bufAt(area.block(COMPONENT_Y )); + const Pel *srcBlkU = srcYuv.get(COMPONENT_Cb).bufAt(area.block(COMPONENT_Cb)); + const Pel *srcBlkV = srcYuv.get(COMPONENT_Cr).bufAt(area.block(COMPONENT_Cr)); + Pel *dstBlk = dstYuv.get(compID ).bufAt(compArea); + + const uint16_t candPosY = m_ccSaoComParam.candPos[compIdx][setIdc - 1][COMPONENT_Y ]; + const uint16_t bandNumY = m_ccSaoComParam.bandNum[compIdx][setIdc - 1][COMPONENT_Y ]; + const uint16_t bandNumU = m_ccSaoComParam.bandNum[compIdx][setIdc - 1][COMPONENT_Cb]; + const uint16_t bandNumV = m_ccSaoComParam.bandNum[compIdx][setIdc - 1][COMPONENT_Cr]; + const short *offset = m_ccSaoComParam.offset [compIdx][setIdc - 1]; + + offsetBlockCcSaoNoClip(compID, cs.sps->getBitDepth(toChannelType(compID)), cs.slice->clpRng(compID) + , candPosY, bandNumY, bandNumU, bandNumV + , offset + , srcBlkY, srcBlkU, srcBlkV, dstBlk + , srcStrideY, srcStrideU, srcStrideV, dstStride + , compArea.width, compArea.height + , isLeftAvail, isRightAvail + , isAboveAvail, isBelowAvail + , isAboveLeftAvail, isAboveRightAvail + , isBelowLeftAvail, isBelowRightAvail + ); + } + } + } +} + +void SampleAdaptiveOffset::offsetCTUCcSao(CodingStructure& cs, const UnitArea& area, const CPelUnitBuf& srcYuv, PelUnitBuf& dstYuv, const int ctuRsAddr) +{ + const uint32_t numberOfComponents = getNumberValidComponents( area.chromaFormat ); + bool bAllOff = true; + for( uint32_t compIdx = 0; compIdx < numberOfComponents; compIdx++) + { + if (m_ccSaoComParam.enabled[compIdx]) + { + bAllOff = false; + } + } + if (bAllOff) + { + return; + } + + 
bool isLeftAvail, isRightAvail, isAboveAvail, isBelowAvail, isAboveLeftAvail, isAboveRightAvail, isBelowLeftAvail, isBelowRightAvail; + deriveLoopFilterBoundaryAvailibility(cs, area.Y(), isLeftAvail,isRightAvail,isAboveAvail,isBelowAvail,isAboveLeftAvail,isAboveRightAvail,isBelowLeftAvail,isBelowRightAvail); + + for(int compIdx = 0; compIdx < numberOfComponents; compIdx++) + { + if(m_ccSaoComParam.enabled[compIdx]) + { + const int setIdc = m_ccSaoControl[compIdx][ctuRsAddr]; + + if (setIdc != 0) + { + const ComponentID compID = ComponentID(compIdx); + const CompArea &compArea = area.block(compID); + const int srcStrideY = srcYuv.get(COMPONENT_Y ).stride; + const int srcStrideU = srcYuv.get(COMPONENT_Cb).stride; + const int srcStrideV = srcYuv.get(COMPONENT_Cr).stride; + const int dstStride = dstYuv.get(compID ).stride; + const Pel *srcBlkY = srcYuv.get(COMPONENT_Y ).bufAt(area.block(COMPONENT_Y )); + const Pel *srcBlkU = srcYuv.get(COMPONENT_Cb).bufAt(area.block(COMPONENT_Cb)); + const Pel *srcBlkV = srcYuv.get(COMPONENT_Cr).bufAt(area.block(COMPONENT_Cr)); + Pel *dstBlk = dstYuv.get(compID ).bufAt(compArea); + + const uint16_t candPosY = m_ccSaoComParam.candPos[compIdx][setIdc - 1][COMPONENT_Y ]; + const uint16_t bandNumY = m_ccSaoComParam.bandNum[compIdx][setIdc - 1][COMPONENT_Y ]; + const uint16_t bandNumU = m_ccSaoComParam.bandNum[compIdx][setIdc - 1][COMPONENT_Cb]; + const uint16_t bandNumV = m_ccSaoComParam.bandNum[compIdx][setIdc - 1][COMPONENT_Cr]; + const short *offset = m_ccSaoComParam.offset [compIdx][setIdc - 1]; + + offsetBlockCcSao( compID, cs.sps->getBitDepth(toChannelType(compID)), cs.slice->clpRng(compID) + , candPosY, bandNumY, bandNumU, bandNumV + , offset + , srcBlkY, srcBlkU, srcBlkV, dstBlk + , srcStrideY, srcStrideU, srcStrideV, dstStride + , compArea.width, compArea.height + , isLeftAvail, isRightAvail + , isAboveAvail, isBelowAvail + , isAboveLeftAvail, isAboveRightAvail + , isBelowLeftAvail, isBelowRightAvail + ); + } + } + } +} + +void 
SampleAdaptiveOffset::offsetBlockCcSaoNoClip(const ComponentID compID, const int bitDepth, const ClpRng& clpRng + , const uint16_t candPosY + , const uint16_t bandNumY, const uint16_t bandNumU, const uint16_t bandNumV + , const short* offset + , const Pel* srcY, const Pel* srcU, const Pel* srcV, Pel* dst + , const int srcStrideY, const int srcStrideU, const int srcStrideV, const int dstStride + , const int width, const int height + , bool isLeftAvail, bool isRightAvail, bool isAboveAvail, bool isBelowAvail, bool isAboveLeftAvail, bool isAboveRightAvail, bool isBelowLeftAvail, bool isBelowRightAvail + ) +{ + const int candPosYX = g_ccSaoCandPosX[COMPONENT_Y][candPosY]; + const int candPosYY = g_ccSaoCandPosY[COMPONENT_Y][candPosY]; + + switch (compID) + { + case COMPONENT_Y: + { + for (int y = 0; y < height; y++) + { + for (int x = 0; x < width; x++) + { + const Pel* colY = srcY + x + srcStrideY * candPosYY + candPosYX; + const Pel* colU = srcU + (x >> 1); + const Pel* colV = srcV + (x >> 1); + + const int bandY = (*colY * bandNumY) >> bitDepth; + const int bandU = (*colU * bandNumU) >> bitDepth; + const int bandV = (*colV * bandNumV) >> bitDepth; + const int bandIdx = bandY * bandNumU * bandNumV + + bandU * bandNumV + + bandV; + const int classIdx = bandIdx; + + //dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); + dst[x] = dst[x] + offset[classIdx]; + } + + srcY += srcStrideY; + srcU += srcStrideU * (y & 0x1); + srcV += srcStrideV * (y & 0x1); + dst += dstStride; + } + } + break; + case COMPONENT_Cb: + case COMPONENT_Cr: + { + for (int y = 0; y < height; y++) + { + for (int x = 0; x < width; x++) + { + const Pel* colY = srcY + (x << 1) + srcStrideY * candPosYY + candPosYX; + const Pel* colU = srcU + x; + const Pel* colV = srcV + x; + + const int bandY = (*colY * bandNumY) >> bitDepth; + const int bandU = (*colU * bandNumU) >> bitDepth; + const int bandV = (*colV * bandNumV) >> bitDepth; + const int bandIdx = bandY * bandNumU * bandNumV + + bandU * bandNumV 
+ + bandV; + const int classIdx = bandIdx; + + //dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); + dst[x] = dst[x] + offset[classIdx]; + } + srcY += srcStrideY << 1; + srcU += srcStrideU; + srcV += srcStrideV; + dst += dstStride; + } + } + break; + default: + { + THROW("Not a supported CCSAO compID\n"); + } + } +} + +void SampleAdaptiveOffset::offsetBlockCcSao(const ComponentID compID, const int bitDepth, const ClpRng& clpRng + , const uint16_t candPosY + , const uint16_t bandNumY, const uint16_t bandNumU, const uint16_t bandNumV + , const short* offset + , const Pel* srcY, const Pel* srcU, const Pel* srcV, Pel* dst + , const int srcStrideY, const int srcStrideU, const int srcStrideV, const int dstStride + , const int width, const int height + , bool isLeftAvail, bool isRightAvail, bool isAboveAvail, bool isBelowAvail, bool isAboveLeftAvail, bool isAboveRightAvail, bool isBelowLeftAvail, bool isBelowRightAvail + ) +{ + const int candPosYX = g_ccSaoCandPosX[COMPONENT_Y][candPosY]; + const int candPosYY = g_ccSaoCandPosY[COMPONENT_Y][candPosY]; + + switch(compID) + { + case COMPONENT_Y: + { + for (int y = 0; y < height; y++) + { + for (int x = 0; x < width; x++) + { + const Pel *colY = srcY + x + srcStrideY * candPosYY + candPosYX; + const Pel *colU = srcU + (x >> 1); + const Pel *colV = srcV + (x >> 1); + + const int bandY = (*colY * bandNumY) >> bitDepth; + const int bandU = (*colU * bandNumU) >> bitDepth; + const int bandV = (*colV * bandNumV) >> bitDepth; + const int bandIdx = bandY * bandNumU * bandNumV + + bandU * bandNumV + + bandV; + const int classIdx = bandIdx; + + dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); + } + + srcY += srcStrideY; + srcU += srcStrideU * (y & 0x1); + srcV += srcStrideV * (y & 0x1); + dst += dstStride; + } + } + break; + case COMPONENT_Cb: + case COMPONENT_Cr: + { + for (int y = 0; y < height; y++) + { + for (int x = 0; x < width; x++) + { + const Pel *colY = srcY + (x << 1) + srcStrideY * candPosYY + candPosYX; + 
const Pel *colU = srcU + x; + const Pel *colV = srcV + x; + + const int bandY = (*colY * bandNumY) >> bitDepth; + const int bandU = (*colU * bandNumU) >> bitDepth; + const int bandV = (*colV * bandNumV) >> bitDepth; + const int bandIdx = bandY * bandNumU * bandNumV + + bandU * bandNumV + + bandV; + const int classIdx = bandIdx; + + dst[x] = ClipPel<int>(dst[x] + offset[classIdx], clpRng); + } + + srcY += srcStrideY << 1; + srcU += srcStrideU; + srcV += srcStrideV; + dst += dstStride; + } + } + break; + default: + { + THROW("Not a supported CCSAO compID\n"); + } + } +} +#endif void SampleAdaptiveOffset::deriveLoopFilterBoundaryAvailibility(CodingStructure& cs, const Position &pos, bool& isLeftAvail, diff --git a/source/Lib/CommonLib/SampleAdaptiveOffset.h b/source/Lib/CommonLib/SampleAdaptiveOffset.h index 9fe135edb403f5e506420eedfd12311e8af17edb..1a0970122d487ccce978c33a34556e73e954e153 100644 --- a/source/Lib/CommonLib/SampleAdaptiveOffset.h +++ b/source/Lib/CommonLib/SampleAdaptiveOffset.h @@ -77,6 +77,13 @@ public: #if JVET_V0094_BILATERAL_FILTER BilateralFilter m_bilateralFilter; #endif +#if JVET_W0066_CCSAO + void CCSAOProcess(CodingStructure& cs); + CcSaoComParam& getCcSaoComParam() { return m_ccSaoComParam; } + uint8_t* getCcSaoControlIdc(const ComponentID compID) { return m_ccSaoControl[compID]; } + PelStorage& getCcSaoBuf() { return m_ccSaoBuf; } + void jointClipSaoBifCcSao(CodingStructure& cs); +#endif protected: void deriveLoopFilterBoundaryAvailibility(CodingStructure& cs, const Position &pos, bool& isLeftAvail, @@ -106,6 +113,9 @@ protected: void offsetCTU(const UnitArea& area, const CPelUnitBuf& src, PelUnitBuf& res, SAOBlkParam& saoblkParam, CodingStructure& cs); #if JVET_V0094_BILATERAL_FILTER void offsetCTUnoClip( const UnitArea& area, const CPelUnitBuf& src, PelUnitBuf& res, SAOBlkParam& saoblkParam, CodingStructure& cs); +#if JVET_W0066_CCSAO + void clipCTU(CodingStructure& cs, PelUnitBuf& dstYuv, const UnitArea& area, const ComponentID compID); 
+#endif void offsetCTUonlySAO(const UnitArea& area, const CPelUnitBuf& src, PelUnitBuf& res, SAOBlkParam& saoblkParam, CodingStructure& cs); #endif void xReconstructBlkSAOParams(CodingStructure& cs, SAOBlkParam* saoBlkParams); @@ -132,6 +142,29 @@ protected: return bDisabledFlag; } Reshape* m_pcReshape; +#if JVET_W0066_CCSAO + void applyCcSao(CodingStructure &cs, const PreCalcValues& pcv, const CPelUnitBuf& srcYuv, PelUnitBuf& dstYuv); + void offsetCTUCcSao(CodingStructure& cs, const UnitArea& area, const CPelUnitBuf& srcYuv, PelUnitBuf& dstYuv, const int ctuRsAddr); + void offsetCTUCcSaoNoClip(CodingStructure& cs, const UnitArea& area, const CPelUnitBuf& srcYuv, PelUnitBuf& dstYuv, const int ctuRsAddr); + void offsetBlockCcSao(const ComponentID compID, const int bitDepth, const ClpRng& clpRng + , const uint16_t candPosY + , const uint16_t bandNumY, const uint16_t bandNumU, const uint16_t bandNumV + , const short* offset + , const Pel* srcY, const Pel* srcU, const Pel* srcV, Pel* dst + , const int srcStrideY, const int srcStrideU, const int srcStrideV, const int dstStride + , const int width, const int height + , bool isLeftAvail, bool isRightAvail, bool isAboveAvail, bool isBelowAvail, bool isAboveLeftAvail, bool isAboveRightAvail, bool isBelowLeftAvail, bool isBelowRightAvail + ); + void offsetBlockCcSaoNoClip(const ComponentID compID, const int bitDepth, const ClpRng& clpRng + , const uint16_t candPosY + , const uint16_t bandNumY, const uint16_t bandNumU, const uint16_t bandNumV + , const short* offset + , const Pel* srcY, const Pel* srcU, const Pel* srcV, Pel* dst + , const int srcStrideY, const int srcStrideU, const int srcStrideV, const int dstStride + , const int width, const int height + , bool isLeftAvail, bool isRightAvail, bool isAboveAvail, bool isBelowAvail, bool isAboveLeftAvail, bool isAboveRightAvail, bool isBelowLeftAvail, bool isBelowRightAvail + ); +#endif protected: uint32_t m_offsetStepLog2[MAX_NUM_COMPONENT]; //offset step PelStorage 
m_tempBuf; @@ -139,6 +172,20 @@ protected: std::vector<int8_t> m_signLineBuf1; std::vector<int8_t> m_signLineBuf2; +#if JVET_W0066_CCSAO + bool m_created = false; + PelStorage m_ccSaoBuf; + int m_picWidth; + int m_picHeight; + int m_maxCUWidth; + int m_maxCUHeight; + int m_numCTUsInWidth; + int m_numCTUsInHeight; + int m_numCTUsInPic; + + CcSaoComParam m_ccSaoComParam; + uint8_t* m_ccSaoControl[MAX_NUM_COMPONENT]; +#endif private: bool m_picSAOEnabled[MAX_NUM_COMPONENT]; }; diff --git a/source/Lib/CommonLib/Slice.cpp b/source/Lib/CommonLib/Slice.cpp index 69b158ee2b1f1c9bfb4a591a7f993469a0257f13..3981c07929ce7159b1014885f0a6ad83b0b7bea4 100644 --- a/source/Lib/CommonLib/Slice.cpp +++ b/source/Lib/CommonLib/Slice.cpp @@ -153,6 +153,10 @@ Slice::Slice() { m_saoEnabledFlag[ch] = false; } +#if JVET_W0066_CCSAO + m_ccSaoComParam.reset(); + resetCcSaoEnabledFlag(); +#endif memset(m_alfApss, 0, sizeof(m_alfApss)); m_ccAlfFilterParam.reset(); @@ -209,6 +213,10 @@ void Slice::initSlice() m_useLTforDRAP = false; m_isDRAP = false; m_latestDRAPPOC = MAX_INT; +#if JVET_W0066_CCSAO + m_ccSaoComParam.reset(); + resetCcSaoEnabledFlag(); +#endif resetTileGroupAlfEnabledFlag(); m_ccAlfFilterParam.reset(); m_tileGroupCcAlfCbEnabledFlag = 0; @@ -264,6 +272,14 @@ void Slice::inheritFromPicHeader( PicHeader *picHeader, const PPS *pps, const SP setSaoEnabledFlag(CHANNEL_TYPE_LUMA, picHeader->getSaoEnabledFlag(CHANNEL_TYPE_LUMA)); setSaoEnabledFlag(CHANNEL_TYPE_CHROMA, picHeader->getSaoEnabledFlag(CHANNEL_TYPE_CHROMA)); +#if JVET_W0066_CCSAO + setCcSaoEnabledFlag(COMPONENT_Y, picHeader->getCcSaoEnabledFlag(COMPONENT_Y)); + setCcSaoEnabledFlag(COMPONENT_Cb, picHeader->getCcSaoEnabledFlag(COMPONENT_Cb)); + setCcSaoEnabledFlag(COMPONENT_Cr, picHeader->getCcSaoEnabledFlag(COMPONENT_Cr)); + m_ccSaoComParam.enabled[COMPONENT_Y ] = picHeader->getCcSaoEnabledFlag(COMPONENT_Y); + m_ccSaoComParam.enabled[COMPONENT_Cb] = picHeader->getCcSaoEnabledFlag(COMPONENT_Cb); + 
m_ccSaoComParam.enabled[COMPONENT_Cr] = picHeader->getCcSaoEnabledFlag(COMPONENT_Cr); +#endif setTileGroupAlfEnabledFlag(COMPONENT_Y, picHeader->getAlfEnabledFlag(COMPONENT_Y)); setTileGroupAlfEnabledFlag(COMPONENT_Cb, picHeader->getAlfEnabledFlag(COMPONENT_Cb)); @@ -1219,6 +1235,15 @@ void Slice::copySliceInfo(Slice *pSrc, bool cpyAlmostAll) { m_saoEnabledFlag[ch] = pSrc->m_saoEnabledFlag[ch]; } +#if JVET_W0066_CCSAO + m_ccSaoComParam = pSrc->m_ccSaoComParam; + m_ccSaoControl [COMPONENT_Y ] = pSrc->m_ccSaoControl [COMPONENT_Y ]; + m_ccSaoControl [COMPONENT_Cb] = pSrc->m_ccSaoControl [COMPONENT_Cb]; + m_ccSaoControl [COMPONENT_Cr] = pSrc->m_ccSaoControl [COMPONENT_Cr]; + m_ccSaoEnabledFlag[COMPONENT_Y ] = pSrc->m_ccSaoEnabledFlag[COMPONENT_Y ]; + m_ccSaoEnabledFlag[COMPONENT_Cb] = pSrc->m_ccSaoEnabledFlag[COMPONENT_Cb]; + m_ccSaoEnabledFlag[COMPONENT_Cr] = pSrc->m_ccSaoEnabledFlag[COMPONENT_Cr]; +#endif m_cabacInitFlag = pSrc->m_cabacInitFlag; memcpy(m_alfApss, pSrc->m_alfApss, sizeof(m_alfApss)); // this might be quite unsafe @@ -4760,6 +4785,9 @@ bool operator == (const ConstraintInfo& op1, const ConstraintInfo& o if( op1.m_noQtbttDualTreeIntraConstraintFlag != op2.m_noQtbttDualTreeIntraConstraintFlag ) return false; if( op1.m_noPartitionConstraintsOverrideConstraintFlag != op2.m_noPartitionConstraintsOverrideConstraintFlag ) return false; if( op1.m_noSaoConstraintFlag != op2.m_noSaoConstraintFlag ) return false; +#if JVET_W0066_CCSAO + if( op1.m_noCCSaoConstraintFlag != op2.m_noCCSaoConstraintFlag ) return false; +#endif if( op1.m_noAlfConstraintFlag != op2.m_noAlfConstraintFlag ) return false; if( op1.m_noCCAlfConstraintFlag != op2.m_noCCAlfConstraintFlag ) return false; #if JVET_S0058_GCI diff --git a/source/Lib/CommonLib/Slice.h b/source/Lib/CommonLib/Slice.h index 43eb1c0c2189a9f302607f2eb9e4d32048ae0bb2..246c7cfab59512afdb3caa79d739dc73b89a31cc 100644 --- a/source/Lib/CommonLib/Slice.h +++ b/source/Lib/CommonLib/Slice.h @@ -299,6 +299,9 @@ class 
ConstraintInfo #endif bool m_noPartitionConstraintsOverrideConstraintFlag; bool m_noSaoConstraintFlag; +#if JVET_W0066_CCSAO + bool m_noCCSaoConstraintFlag; +#endif bool m_noAlfConstraintFlag; bool m_noCCAlfConstraintFlag; #if JVET_S0058_GCI @@ -417,6 +420,9 @@ public: #endif , m_noPartitionConstraintsOverrideConstraintFlag(false) , m_noSaoConstraintFlag (false) +#if JVET_W0066_CCSAO + , m_noCCSaoConstraintFlag (false) +#endif , m_noAlfConstraintFlag (false) , m_noCCAlfConstraintFlag (false) #if JVET_S0058_GCI @@ -583,6 +589,10 @@ public: void setNoPartitionConstraintsOverrideConstraintFlag(bool bVal) { m_noPartitionConstraintsOverrideConstraintFlag = bVal; } bool getNoSaoConstraintFlag() const { return m_noSaoConstraintFlag; } void setNoSaoConstraintFlag(bool bVal) { m_noSaoConstraintFlag = bVal; } +#if JVET_W0066_CCSAO + bool getNoCCSaoConstraintFlag() const { return m_noCCSaoConstraintFlag; } + void setNoCCSaoConstraintFlag(bool val) { m_noCCSaoConstraintFlag = val; } +#endif bool getNoAlfConstraintFlag() const { return m_noAlfConstraintFlag; } void setNoAlfConstraintFlag(bool bVal) { m_noAlfConstraintFlag = bVal; } bool getNoCCAlfConstraintFlag() const { return m_noCCAlfConstraintFlag; } @@ -1604,6 +1614,9 @@ private: bool m_useWeightedBiPred; //!< Use of Weighting Bi-Prediction (B_SLICE) bool m_saoEnabledFlag; +#if JVET_W0066_CCSAO + bool m_ccSaoEnabledFlag; +#endif bool m_bTemporalIdNestingFlag; // temporal_id_nesting_flag @@ -1923,6 +1936,10 @@ public: void setSAOEnabledFlag(bool bVal) { m_saoEnabledFlag = bVal; } bool getSAOEnabledFlag() const { return m_saoEnabledFlag; } +#if JVET_W0066_CCSAO + bool getCCSAOEnabledFlag() const { return m_ccSaoEnabledFlag; } + void setCCSAOEnabledFlag( bool b ) { m_ccSaoEnabledFlag = b; } +#endif bool getALFEnabledFlag() const { return m_alfEnabledFlag; } void setALFEnabledFlag( bool b ) { m_alfEnabledFlag = b; } @@ -2670,6 +2687,9 @@ private: #endif int m_qpDelta; //!< value of Qp delta bool 
m_saoEnabledFlag[MAX_NUM_CHANNEL_TYPE]; //!< sao enabled flags for each channel +#if JVET_W0066_CCSAO + bool m_ccSaoEnabledFlag[MAX_NUM_COMPONENT]; +#endif #if ALF_IMPROVEMENT int m_alfFixedFilterSetIdx; #endif @@ -2804,6 +2824,10 @@ public: int getQpDelta() const { return m_qpDelta; } void setSaoEnabledFlag(ChannelType chType, bool b) { m_saoEnabledFlag[chType] = b; } bool getSaoEnabledFlag(ChannelType chType) const { return m_saoEnabledFlag[chType]; } +#if JVET_W0066_CCSAO + void setCcSaoEnabledFlag(ComponentID compId, bool b) { m_ccSaoEnabledFlag[compId] = b; } + bool getCcSaoEnabledFlag(ComponentID compId) const { return m_ccSaoEnabledFlag[compId]; } +#endif #if ALF_IMPROVEMENT void setAlfFixedFilterSetIdx(int i) { m_alfFixedFilterSetIdx = i; } int getAlfFixedFilterSetIdx() const { return m_alfFixedFilterSetIdx; } @@ -2916,6 +2940,9 @@ class Slice private: // Bitstream writing bool m_saoEnabledFlag[MAX_NUM_CHANNEL_TYPE]; +#if JVET_W0066_CCSAO + bool m_ccSaoEnabledFlag[MAX_NUM_COMPONENT]; +#endif int m_iPOC; int m_iLastIDR; int m_prevGDRInSameLayerPOC; //< the previous GDR in the same layer @@ -3078,6 +3105,11 @@ public: APS** getAlfAPSs() { return m_alfApss; } void setSaoEnabledFlag(ChannelType chType, bool s) {m_saoEnabledFlag[chType] =s; } bool getSaoEnabledFlag(ChannelType chType) const { return m_saoEnabledFlag[chType]; } +#if JVET_W0066_CCSAO + void resetCcSaoEnabledFlag() { memset(m_ccSaoEnabledFlag, 0, sizeof(m_ccSaoEnabledFlag)); } + void setCcSaoEnabledFlag(ComponentID compID, bool b) { m_ccSaoEnabledFlag[compID] = b; } + bool getCcSaoEnabledFlag(ComponentID compID) { return m_ccSaoEnabledFlag[compID]; } +#endif void setRPL0(const ReferencePictureList *pcRPL) { m_pRPL0 = pcRPL; } void setRPL1(const ReferencePictureList *pcRPL) { m_pRPL1 = pcRPL; } const ReferencePictureList* getRPL0() { return m_pRPL0; } @@ -3389,6 +3421,10 @@ public: bool isLastSliceInSubpic(); #endif +#if JVET_W0066_CCSAO + CcSaoComParam m_ccSaoComParam; + uint8_t* 
m_ccSaoControl[MAX_NUM_COMPONENT]; +#endif CcAlfFilterParam m_ccAlfFilterParam; uint8_t* m_ccAlfFilterControl[2]; diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h index 947606520e257054a436f11c83e7caee40715a8f..202a7b51e06f61c3d6de3bdbdb3e7c71e0d61295 100644 --- a/source/Lib/CommonLib/TypeDef.h +++ b/source/Lib/CommonLib/TypeDef.h @@ -146,6 +146,7 @@ #define ALF_IMPROVEMENT 1 // ALF improvement #define EMBEDDED_APS 1 // Embed APS into picture header #define JVET_V0094_BILATERAL_FILTER 1 // Bilateral filter +#define JVET_W0066_CCSAO 1 // JVET-W0066: Cross-component sample adaptive offset // SIMD optimizations #if IF_12TAP @@ -671,6 +672,9 @@ enum ModeType enum ComponentID { COMPONENT_Y = 0, +#if JVET_W0066_CCSAO + MAX_NUM_LUMA_COMP = 1, +#endif COMPONENT_Cb = 1, COMPONENT_Cr = 2, MAX_NUM_COMPONENT = 3, diff --git a/source/Lib/CommonLib/x86/BilateralFilterX86.h b/source/Lib/CommonLib/x86/BilateralFilterX86.h index 7b429788c893020b664b40609c7fdec0893f5ba6..7af1e6828bc63391de20274f0ecda1513d01549f 100644 --- a/source/Lib/CommonLib/x86/BilateralFilterX86.h +++ b/source/Lib/CommonLib/x86/BilateralFilterX86.h @@ -44,6 +44,267 @@ #endif #if ENABLE_SIMD_BILATERAL_FILTER +#if JVET_W0066_CCSAO +template<X86_VEXT vext> +void BilateralFilter::simdFilterDiamond5x5NoClip(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO, const char* LUTrowPtr) +{ + //if( uiWidth < 4 || ( uiWidth < 8 && isRDO ) ) + if (uiWidth < 4) + { + return blockBilateralFilterDiamond5x5NoClip(uiWidth, uiHeight, block, blkFilt, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bif_round_add, bif_round_shift, isRDO, LUTrowPtr); + } + + int pad = 2; + int padwidth = iWidthExtSIMD; + + __m128i center, left, right, up, down, lu, ld, ru, rd, diffabs, four, fifteen, lut, acc, temp, round_add, clipmin, clipmax, inputVals; + __m128i ll, rr, 
uu, dd; + + four = _mm_set1_epi16(4); + fifteen = _mm_set1_epi16(15); + round_add = _mm_set1_epi16(bif_round_add); + clipmin = _mm_set1_epi16(clpRng.min); + clipmax = _mm_set1_epi16(clpRng.max); + + lut = _mm_loadu_si128((__m128i*)(LUTrowPtr)); + acc = _mm_set1_epi32(0); + + // Copy back parameters + Pel* tempBlockPtr = (short*)blkFilt + (((padwidth + 4) << 1) + 2); + int tempBlockStride = padwidth + 4; + + for (int col = 0; col < uiWidth; col += 8) + { + for (int row = 0; row < uiHeight; row++) + { + acc = _mm_set1_epi32(0); + int16_t* point = &block[(row + pad) * padwidth + pad + col]; + + center = _mm_loadu_si128((__m128i*)(point)); + + //load neighbours + left = _mm_loadu_si128((__m128i*)(point - 1)); + right = _mm_loadu_si128((__m128i*)(point + 1)); + up = _mm_loadu_si128((__m128i*)(point - padwidth)); + down = _mm_loadu_si128((__m128i*)(point + padwidth)); + + lu = _mm_loadu_si128((__m128i*)(point - 1 - padwidth)); + ld = _mm_loadu_si128((__m128i*)(point - 1 + padwidth)); + ru = _mm_loadu_si128((__m128i*)(point + 1 - padwidth)); + rd = _mm_loadu_si128((__m128i*)(point + 1 + padwidth)); + + ll = _mm_loadu_si128((__m128i*)(point - 2)); + rr = _mm_loadu_si128((__m128i*)(point + 2)); + uu = _mm_loadu_si128((__m128i*)(point - 2 * padwidth)); + dd = _mm_loadu_si128((__m128i*)(point + 2 * padwidth)); + + //calculate diffs + left = _mm_sub_epi16(left, center); + right = _mm_sub_epi16(right, center); + up = _mm_sub_epi16(up, center); + down = _mm_sub_epi16(down, center); + + lu = _mm_sub_epi16(lu, center); + ld = _mm_sub_epi16(ld, center); + ru = _mm_sub_epi16(ru, center); + rd = _mm_sub_epi16(rd, center); + + ll = _mm_sub_epi16(ll, center); + rr = _mm_sub_epi16(rr, center); + uu = _mm_sub_epi16(uu, center); + dd = _mm_sub_epi16(dd, center); + + //LEFT! 
+ //calculate abs + diffabs = _mm_abs_epi16(left); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_sign_epi16(diffabs, left);//fix sign! + acc = _mm_add_epi16(diffabs, acc); //add to acc + //RIGHT! + //calculate abs + diffabs = _mm_abs_epi16(right); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_sign_epi16(diffabs, right);//fix sign! + acc = _mm_add_epi16(diffabs, acc); //add to acc + //UP! + //calculate abs + diffabs = _mm_abs_epi16(up); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_sign_epi16(diffabs, up);//fix sign! + acc = _mm_add_epi16(diffabs, acc); //add to acc + + //DOWN! + //calculate abs + diffabs = _mm_abs_epi16(down); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_sign_epi16(diffabs, down);//fix sign! + acc = _mm_add_epi16(diffabs, acc); //add to acc + + //lu! 
+ //calculate abs + diffabs = _mm_abs_epi16(lu); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! + diffabs = _mm_sign_epi16(diffabs, lu);//fix sign! + acc = _mm_add_epi16(diffabs, acc); //add to acc + //ld! + //calculate abs + diffabs = _mm_abs_epi16(ld); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! + diffabs = _mm_sign_epi16(diffabs, ld);//fix sign! + acc = _mm_add_epi16(diffabs, acc); //add to acc + //ru! + //calculate abs + diffabs = _mm_abs_epi16(ru); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! + diffabs = _mm_sign_epi16(diffabs, ru);//fix sign! + acc = _mm_add_epi16(diffabs, acc); //add to acc + //rd! 
+ //calculate abs + diffabs = _mm_abs_epi16(rd); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! + diffabs = _mm_sign_epi16(diffabs, rd);//fix sign! + acc = _mm_add_epi16(diffabs, acc); //add to acc + + //ll! + //calculate abs + diffabs = _mm_abs_epi16(ll); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! + diffabs = _mm_sign_epi16(diffabs, ll);//fix sign! + acc = _mm_add_epi16(diffabs, acc); //add to acc + //rr! + //calculate abs + diffabs = _mm_abs_epi16(rr); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! + diffabs = _mm_sign_epi16(diffabs, rr);//fix sign! + acc = _mm_add_epi16(diffabs, acc); //add to acc + //uu! 
+ //calculate abs + diffabs = _mm_abs_epi16(uu); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! + diffabs = _mm_sign_epi16(diffabs, uu);//fix sign! + acc = _mm_add_epi16(diffabs, acc); //add to acc + //dd! + //calculate abs + diffabs = _mm_abs_epi16(dd); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! + diffabs = _mm_sign_epi16(diffabs, dd);//fix sign! 
+ acc = _mm_add_epi16(diffabs, acc); //add to acc + + if (bfac == 2) + { + acc = _mm_slli_epi16(acc, 1); // Shift left to get 2* + } + else if (bfac == 3) + { + temp = _mm_slli_epi16(acc, 1); // Multiply by two by shifting left + acc = _mm_add_epi16(acc, temp); // Add original value to get 3* + } + + // Add 16 and shift 5 + acc = _mm_add_epi16(acc, round_add); + acc = _mm_srai_epi16(acc, bif_round_shift); + + // Instead we add our input values to the delta + if (isRDO) + { + acc = _mm_add_epi16(acc, center); + } + else + { + int16_t* recpoint = &recPtr[row * recStride + col]; + inputVals = _mm_loadu_si128((__m128i*)(recpoint)); + acc = _mm_add_epi16(acc, inputVals); + } + + // Clip + if (isRDO) + { + acc = _mm_max_epi16(acc, clipmin); // No clipping applied in this function, will be clipped later on in CCSAO + acc = _mm_min_epi16(acc, clipmax); + } + + _mm_store_si128((__m128i*)(blkFilt + (row + pad) * (padwidth + 4) + col + pad), acc); + } + } + + // Copy back from tempbufFilter to recBuf + int onerow = uiWidth * sizeof(Pel); + for (uint32_t yy = 0; yy < uiHeight; yy++) + { + std::memcpy(recPtr, tempBlockPtr, onerow); + recPtr += recStride; + tempBlockPtr += tempBlockStride; + } +} +#endif template<X86_VEXT vext> void BilateralFilter::simdFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO, const char* LUTrowPtr ) { @@ -304,6 +565,9 @@ void BilateralFilter::simdFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, template <X86_VEXT vext> void BilateralFilter::_initBilateralFilterX86() { +#if JVET_W0066_CCSAO + m_bilateralFilterDiamond5x5NoClip = simdFilterDiamond5x5NoClip<vext>; +#endif m_bilateralFilterDiamond5x5 = simdFilterDiamond5x5<vext>; } diff --git a/source/Lib/DecoderLib/CABACReader.cpp b/source/Lib/DecoderLib/CABACReader.cpp index 
77e09ae8cfcc6d2ef1d8e28bb6802a0fcf8d4211..b689abdc7d6764fca5ec7785670d9987fb4a4f7e 100644 --- a/source/Lib/DecoderLib/CABACReader.cpp +++ b/source/Lib/DecoderLib/CABACReader.cpp @@ -145,6 +145,24 @@ void CABACReader::coding_tree_unit( CodingStructure& cs, const UnitArea& area, i #endif sao( cs, ctuRsAddr ); +#if JVET_W0066_CCSAO + if (cs.sps->getCCSAOEnabledFlag()) + { + for ( int compIdx = 0; compIdx < getNumberValidComponents( cs.pcv->chrFormat ); compIdx++ ) + { + if (cs.slice->m_ccSaoComParam.enabled[compIdx]) + { + const int setNum = cs.slice->m_ccSaoComParam.setNum[compIdx]; + + const int ry = ctuRsAddr / cs.pcv->widthInCtus; + const int rx = ctuRsAddr % cs.pcv->widthInCtus; + const Position lumaPos(rx * cs.pcv->maxCUWidth, ry * cs.pcv->maxCUHeight); + + ccSaoControlIdc(cs, ComponentID(compIdx), ctuRsAddr, cs.slice->m_ccSaoControl[compIdx], lumaPos, setNum); + } + } + } +#endif if (cs.sps->getALFEnabledFlag() && (cs.slice->getTileGroupAlfEnabledFlag(COMPONENT_Y))) { const PreCalcValues& pcv = *cs.pcv; @@ -562,6 +580,46 @@ void CABACReader::bif(CodingStructure& cs, unsigned ctuRsAddr) } #endif +#if JVET_W0066_CCSAO +void CABACReader::ccSaoControlIdc(CodingStructure &cs, const ComponentID compID, const int curIdx, + uint8_t *controlIdc, Position lumaPos, int setNum) +{ + //RExt__DECODER_DEBUG_BIT_STATISTICS_CREATE_SET( STATS__CABAC_BITS__CROSS_COMPONENT_SAO_BLOCK_LEVEL_IDC ); + + Position leftLumaPos = lumaPos.offset(-(int)cs.pcv->maxCUWidth, 0); + Position aboveLumaPos = lumaPos.offset(0, -(int)cs.pcv->maxCUWidth); + const uint32_t curSliceIdx = cs.slice->getIndependentSliceIdx(); + const uint32_t curTileIdx = cs.pps->getTileIdx( lumaPos ); + bool leftAvail = cs.getCURestricted( leftLumaPos, lumaPos, curSliceIdx, curTileIdx, CH_L ) ? true : false; + bool aboveAvail = cs.getCURestricted( aboveLumaPos, lumaPos, curSliceIdx, curTileIdx, CH_L ) ? true : false; + int ctxt = 0; + + if (leftAvail) + { + ctxt += ( controlIdc[curIdx - 1] ) ? 
1 : 0; + } + if (aboveAvail) + { + ctxt += ( controlIdc[curIdx - cs.pcv->widthInCtus] ) ? 1 : 0; + } + ctxt += ( compID == COMPONENT_Y ) ? 0 + : ( compID == COMPONENT_Cb ) ? 3 : 6; + + int idcVal = m_BinDecoder.decodeBin( Ctx::CcSaoControlIdc( ctxt ) ); + if ( idcVal ) + { + while ( ( idcVal != setNum ) && m_BinDecoder.decodeBinEP() ) + { + idcVal++; + } + } + controlIdc[curIdx] = idcVal; + + DTRACE(g_trace_ctx, D_SYNTAX, "ccSaoControlIdc() compID=%d pos=(%d,%d) ctxt=%d, setNum=%d, idcVal=%d\n", + compID, lumaPos.x, lumaPos.y, ctxt, setNum, idcVal); +} +#endif + //================================================================================ // clause 7.3.8.4 //-------------------------------------------------------------------------------- diff --git a/source/Lib/DecoderLib/CABACReader.h b/source/Lib/DecoderLib/CABACReader.h index 5d0e3329a7da3950f28179e76ac6bf89b09a03f5..7fe2facef5db65f2b0847a4dc38cbff880e91eb2 100644 --- a/source/Lib/DecoderLib/CABACReader.h +++ b/source/Lib/DecoderLib/CABACReader.h @@ -73,6 +73,10 @@ public: void bif (CodingStructure& cs, unsigned ctuRsAddr); #endif +#if JVET_W0066_CCSAO + void ccSaoControlIdc ( CodingStructure &cs, const ComponentID compID, const int curIdx, uint8_t *controlIdc, Position lumaPos, int setNum ); +#endif + void readAlfCtuFilterIndex(CodingStructure& cs, unsigned ctuRsAddr); void ccAlfFilterControlIdc(CodingStructure &cs, const ComponentID compID, const int curIdx, uint8_t *filterControlIdc, diff --git a/source/Lib/DecoderLib/DecLib.cpp b/source/Lib/DecoderLib/DecLib.cpp index ec27df33991409c01a52574c12d9c264bff541b0..351ec69ba9a10b2d37668c8f22184a48dfc9b785 100644 --- a/source/Lib/DecoderLib/DecLib.cpp +++ b/source/Lib/DecoderLib/DecLib.cpp @@ -203,6 +203,18 @@ bool tryDecodePicture( Picture* pcEncPic, const int expectedPoc, const std::stri pcEncPic->copySAO( *pic, 0 ); } +#if JVET_W0066_CCSAO + if ( pic->cs->sps->getCCSAOEnabledFlag() ) + { + for (int i = 0; i < pic->slices.size(); i++) + { + 
pcEncPic->slices[i]->setCcSaoEnabledFlag(COMPONENT_Y, pic->slices[i]->getCcSaoEnabledFlag(COMPONENT_Y)); + pcEncPic->slices[i]->setCcSaoEnabledFlag(COMPONENT_Cb, pic->slices[i]->getCcSaoEnabledFlag(COMPONENT_Cb)); + pcEncPic->slices[i]->setCcSaoEnabledFlag(COMPONENT_Cr, pic->slices[i]->getCcSaoEnabledFlag(COMPONENT_Cr)); + } + } +#endif + if( pic->cs->sps->getALFEnabledFlag() ) { std::copy(pic->getAlfCtbFilterIndexVec().begin(), pic->getAlfCtbFilterIndexVec().end(), pcEncPic->getAlfCtbFilterIndexVec().begin()); @@ -681,6 +693,13 @@ void DecLib::executeLoopFilters() CS::setRefinedMotionField(cs); #endif +#if JVET_W0066_CCSAO + if (cs.sps->getCCSAOEnabledFlag()) + { + m_cSAO.getCcSaoBuf().copyFrom( cs.getRecoBuf() ); + } +#endif + #if JVET_V0094_BILATERAL_FILTER if( cs.sps->getSAOEnabledFlag() || cs.pps->getUseBIF()) #else @@ -690,6 +709,15 @@ void DecLib::executeLoopFilters() m_cSAO.SAOProcess( cs, cs.picture->getSAO() ); } +#if JVET_W0066_CCSAO + if (cs.sps->getCCSAOEnabledFlag()) + { + m_cSAO.getCcSaoComParam() = cs.slice->m_ccSaoComParam; + m_cSAO.CCSAOProcess( cs ); + } + m_cSAO.jointClipSaoBifCcSao( cs ); +#endif + if( cs.sps->getALFEnabledFlag() ) { m_cALF.getCcAlfFilterParam() = cs.slice->m_ccAlfFilterParam; @@ -1641,6 +1669,11 @@ void DecLib::xActivateParameterSets( const InputNALUnit nalu ) sps->getMaxCUWidth(), sps->getMaxCUHeight(), maxDepth, log2SaoOffsetScaleLuma, log2SaoOffsetScaleChroma ); +#if JVET_W0066_CCSAO + pSlice->m_ccSaoControl[COMPONENT_Y ] = m_cSAO.getCcSaoControlIdc(COMPONENT_Y); + pSlice->m_ccSaoControl[COMPONENT_Cb] = m_cSAO.getCcSaoControlIdc(COMPONENT_Cb); + pSlice->m_ccSaoControl[COMPONENT_Cr] = m_cSAO.getCcSaoControlIdc(COMPONENT_Cr); +#endif m_cLoopFilter.create(maxDepth); m_cIntraPred.init( sps->getChromaFormatIdc(), sps->getBitDepth( CHANNEL_TYPE_LUMA ) ); #if INTER_LIC || (TM_AMVP || TM_MRG) || ARMC_TM @@ -2131,6 +2164,9 @@ bool DecLib::xDecodeSlice(InputNALUnit &nalu, int &iSkipFrame, int iPOCLastDispl } m_HLSReader.setBitstream( 
&nalu.getBitstream() ); +#if JVET_W0066_CCSAO + m_apcSlicePilot->m_ccSaoComParam = m_cSAO.getCcSaoComParam(); +#endif m_apcSlicePilot->m_ccAlfFilterParam = m_cALF.getCcAlfFilterParam(); #if EMBEDDED_APS diff --git a/source/Lib/DecoderLib/VLCReader.cpp b/source/Lib/DecoderLib/VLCReader.cpp index 90002f86004ac2cf251621915f2061884bba74bc..59cc7257929a175cbac23316a573b991c79c5478 100644 --- a/source/Lib/DecoderLib/VLCReader.cpp +++ b/source/Lib/DecoderLib/VLCReader.cpp @@ -2040,6 +2040,9 @@ void HLSyntaxReader::parseSPS(SPS* pcSPS) READ_FLAG( uiCode, "sps_sao_enabled_flag" ); pcSPS->setSAOEnabledFlag ( uiCode ? true : false ); +#if JVET_W0066_CCSAO + READ_FLAG( uiCode, "sps_ccsao_enabled_flag" ); pcSPS->setCCSAOEnabledFlag ( uiCode ? true : false ); +#endif READ_FLAG( uiCode, "sps_alf_enabled_flag" ); pcSPS->setALFEnabledFlag ( uiCode ? true : false ); if (pcSPS->getALFEnabledFlag() && pcSPS->getChromaFormatIdc() != CHROMA_400) { @@ -3713,7 +3716,18 @@ void HLSyntaxReader::parsePictureHeader( PicHeader* picHeader, ParameterSetManag picHeader->setSaoEnabledFlag(CHANNEL_TYPE_CHROMA, false); } +#if JVET_W0066_CCSAO + picHeader->setCcSaoEnabledFlag(COMPONENT_Y, sps->getCCSAOEnabledFlag()); + picHeader->setCcSaoEnabledFlag(COMPONENT_Cb, sps->getCCSAOEnabledFlag()); + picHeader->setCcSaoEnabledFlag(COMPONENT_Cr, sps->getCCSAOEnabledFlag()); + if (sps->getCCSAOEnabledFlag() && pps->getSaoInfoInPhFlag()) + { + READ_FLAG(uiCode, "ph_cc_sao_y_enabled_flag"); picHeader->setCcSaoEnabledFlag(COMPONENT_Y, uiCode != 0); + READ_FLAG(uiCode, "ph_cc_sao_cb_enabled_flag"); picHeader->setCcSaoEnabledFlag(COMPONENT_Cb, uiCode != 0); + READ_FLAG(uiCode, "ph_cc_sao_cr_enabled_flag"); picHeader->setCcSaoEnabledFlag(COMPONENT_Cr, uiCode != 0); + } +#endif // deblocking filter controls if (pps->getDeblockingFilterControlPresentFlag()) @@ -4662,6 +4676,9 @@ void HLSyntaxReader::parseSliceHeader (Slice* pcSlice, PicHeader* picHeader, Par } } +#if JVET_W0066_CCSAO + parseCcSao(pcSlice, picHeader, 
sps, pcSlice->m_ccSaoComParam); +#endif if (pps->getDeblockingFilterControlPresentFlag()) { @@ -5065,6 +5082,9 @@ void HLSyntaxReader::parseConstraintInfo(ConstraintInfo *cinfo) /* loop filter */ READ_FLAG(symbol, "gci_no_sao_constraint_flag"); cinfo->setNoSaoConstraintFlag(symbol > 0 ? true : false); +#if JVET_W0066_CCSAO + READ_FLAG(symbol, "gci_no_ccsao_constraint_flag"); cinfo->setNoCCSaoConstraintFlag(symbol > 0 ? true : false); +#endif READ_FLAG(symbol, "gci_no_alf_constraint_flag"); cinfo->setNoAlfConstraintFlag(symbol > 0 ? true : false); READ_FLAG(symbol, "gci_no_ccalf_constraint_flag"); cinfo->setNoCCAlfConstraintFlag(symbol > 0 ? true : false); READ_FLAG(symbol, "gci_no_lmcs_constraint_flag"); cinfo->setNoLmcsConstraintFlag(symbol > 0 ? true : false); @@ -5878,6 +5898,65 @@ bool HLSyntaxReader::xMoreRbspData() return (cnt>0); } +#if JVET_W0066_CCSAO +void HLSyntaxReader::parseCcSao( Slice* pcSlice, PicHeader* picHeader, const SPS* sps, CcSaoComParam& ccSaoParam ) +{ + ccSaoParam.reset(); + + uint32_t uiCode; + if (sps->getCCSAOEnabledFlag()) + { + READ_FLAG(uiCode, "slice_ccsao_y_enabled_flag" ); pcSlice->setCcSaoEnabledFlag(COMPONENT_Y, uiCode); ccSaoParam.enabled[COMPONENT_Y ] = uiCode; + READ_FLAG(uiCode, "slice_ccsao_cb_enabled_flag"); pcSlice->setCcSaoEnabledFlag(COMPONENT_Cb, uiCode); ccSaoParam.enabled[COMPONENT_Cb] = uiCode; + READ_FLAG(uiCode, "slice_ccsao_cr_enabled_flag"); pcSlice->setCcSaoEnabledFlag(COMPONENT_Cr, uiCode); ccSaoParam.enabled[COMPONENT_Cr] = uiCode; + } + else + { + ccSaoParam.enabled[COMPONENT_Y ] = false; + ccSaoParam.enabled[COMPONENT_Cb] = false; + ccSaoParam.enabled[COMPONENT_Cr] = false; + } + + for (int compIdx = 0; compIdx < MAX_NUM_COMPONENT; compIdx++) + { + if (ccSaoParam.enabled[compIdx]) + { + READ_UVLC(uiCode, "ccsao_set_num"); ccSaoParam.setNum[compIdx] = uiCode + 1; + + for (int setIdx = 0; setIdx < ccSaoParam.setNum[compIdx]; setIdx++) + { + ccSaoParam.setEnabled[compIdx][setIdx] = true; + + 
READ_CODE(MAX_CCSAO_CAND_POS_Y_BITS, uiCode, "ccsao_cand_pos_y"); ccSaoParam.candPos[compIdx][setIdx][COMPONENT_Y ] = uiCode; + READ_CODE(MAX_CCSAO_BAND_NUM_Y_BITS, uiCode, "ccsao_band_num_y"); ccSaoParam.bandNum[compIdx][setIdx][COMPONENT_Y ] = uiCode + 1; + READ_CODE(MAX_CCSAO_BAND_NUM_U_BITS, uiCode, "ccsao_band_num_u"); ccSaoParam.bandNum[compIdx][setIdx][COMPONENT_Cb] = uiCode + 1; + READ_CODE(MAX_CCSAO_BAND_NUM_V_BITS, uiCode, "ccsao_band_num_v"); ccSaoParam.bandNum[compIdx][setIdx][COMPONENT_Cr] = uiCode + 1; + + short *offset = ccSaoParam.offset [compIdx][setIdx]; + int classNum = ccSaoParam.bandNum[compIdx][setIdx][COMPONENT_Y ] + * ccSaoParam.bandNum[compIdx][setIdx][COMPONENT_Cb] + * ccSaoParam.bandNum[compIdx][setIdx][COMPONENT_Cr]; + for (int i = 0; i < classNum; i++) + { + READ_UVLC(uiCode, "ccsao_offset_abs"); offset[i] = uiCode; + if(offset[i] != 0 ) + { + READ_FLAG(uiCode, "ccsao_offset_sign"); offset[i] = uiCode ? -offset[i] : offset[i]; + } + } + + DTRACE(g_trace_ctx, D_SYNTAX, "offset setIdx %d: ", setIdx); + for (int i = 0; i < classNum; i++) + { + DTRACE(g_trace_ctx, D_SYNTAX, "%d ", offset[i]); + } + DTRACE(g_trace_ctx, D_SYNTAX, "\n"); + } + } + } +} +#endif + #if ALF_IMPROVEMENT int HLSyntaxReader::alfGolombDecode(const int k, const bool signed_val) { diff --git a/source/Lib/DecoderLib/VLCReader.h b/source/Lib/DecoderLib/VLCReader.h index 9d413bad11cb544c2e706929ee3895b37873dd0b..5e055709a3fa5910142adb22666362eb775de727 100644 --- a/source/Lib/DecoderLib/VLCReader.h +++ b/source/Lib/DecoderLib/VLCReader.h @@ -201,6 +201,9 @@ public: #endif void decodeScalingList ( ScalingList *scalingList, uint32_t scalingListId, bool isPredictor); void parseReshaper ( SliceReshapeInfo& sliceReshaperInfo, const SPS* pcSPS, const bool isIntra ); +#if JVET_W0066_CCSAO + void parseCcSao ( Slice* pcSlice, PicHeader* picHeader, const SPS* sps, CcSaoComParam& ccSaoParam ); +#endif #if ALF_IMPROVEMENT int alfGolombDecode( const int k, const bool signed_val = true 
); void alfFilter( AlfParam& alfParam, const bool isChroma, const int altIdx, int order0, int order1 ); diff --git a/source/Lib/EncoderLib/CABACWriter.cpp b/source/Lib/EncoderLib/CABACWriter.cpp index 9e98305fc06b56f66ab07ee291ed3a7cef0633f1..2d8f20f5e7daeabefcf8d7e01f3b3582317d38d9 100644 --- a/source/Lib/EncoderLib/CABACWriter.cpp +++ b/source/Lib/EncoderLib/CABACWriter.cpp @@ -198,6 +198,26 @@ void CABACWriter::coding_tree_unit( CodingStructure& cs, const UnitArea& area, i sao( *cs.slice, ctuRsAddr ); } +#if JVET_W0066_CCSAO + if ( !skipSao ) + { + for ( int compIdx = 0; compIdx < getNumberValidComponents( cs.pcv->chrFormat ); compIdx++ ) + { + if (cs.slice->m_ccSaoComParam.enabled[compIdx]) + { + const int setNum = cs.slice->m_ccSaoComParam.setNum[compIdx]; + + const int ry = ctuRsAddr / cs.pcv->widthInCtus; + const int rx = ctuRsAddr % cs.pcv->widthInCtus; + const Position lumaPos(rx * cs.pcv->maxCUWidth, ry * cs.pcv->maxCUHeight); + + codeCcSaoControlIdc(cs.slice->m_ccSaoControl[compIdx][ctuRsAddr], cs, ComponentID(compIdx), + ctuRsAddr, cs.slice->m_ccSaoControl[compIdx], lumaPos, setNum); + } + } + } +#endif + if (!skipAlf) { for (int compIdx = 0; compIdx < MAX_NUM_COMPONENT; compIdx++) @@ -427,7 +447,49 @@ void CABACWriter::sao_offset_pars( const SAOOffset& ctbPars, ComponentID compID, } } +#if JVET_W0066_CCSAO +void CABACWriter::codeCcSaoControlIdc(uint8_t idcVal, CodingStructure &cs, const ComponentID compID, + const int curIdx, const uint8_t *controlIdc, Position lumaPos, + const int setNum) +{ + CHECK(idcVal > setNum, "Set index is too large"); + + const uint32_t curSliceIdx = cs.slice->getIndependentSliceIdx(); + const uint32_t curTileIdx = cs.pps->getTileIdx( lumaPos ); + Position leftLumaPos = lumaPos.offset(-(int)cs.pcv->maxCUWidth, 0); + Position aboveLumaPos = lumaPos.offset(0, -(int)cs.pcv->maxCUWidth); + bool leftAvail = cs.getCURestricted( leftLumaPos, lumaPos, curSliceIdx, curTileIdx, CH_L ) ? 
true : false; + bool aboveAvail = cs.getCURestricted( aboveLumaPos, lumaPos, curSliceIdx, curTileIdx, CH_L ) ? true : false; + int ctxt = 0; + if (leftAvail) + { + ctxt += ( controlIdc[curIdx - 1]) ? 1 : 0; + } + if (aboveAvail) + { + ctxt += (controlIdc[curIdx - cs.pcv->widthInCtus]) ? 1 : 0; + } + ctxt += ( compID == COMPONENT_Y ) ? 0 + : ( compID == COMPONENT_Cb ) ? 3 : 6; + + m_BinEncoder.encodeBin( ( idcVal == 0 ) ? 0 : 1, Ctx::CcSaoControlIdc( ctxt ) ); // ON/OFF flag is context coded + if ( idcVal > 0 ) + { + int val = (idcVal - 1); + while ( val ) + { + m_BinEncoder.encodeBinEP( 1 ); + val--; + } + if ( idcVal < setNum ) + { + m_BinEncoder.encodeBinEP( 0 ); + } + } + DTRACE( g_trace_ctx, D_SYNTAX, "ccSaoControlIdc() compID=%d pos=(%d,%d) ctxt=%d, setNum=%d, idcVal=%d\n", compID, lumaPos.x, lumaPos.y, ctxt, setNum, idcVal ); +} +#endif //================================================================================ // clause 7.3.8.4 diff --git a/source/Lib/EncoderLib/CABACWriter.h b/source/Lib/EncoderLib/CABACWriter.h index 0b55a2d36badfc6df380e1141818d84fc389b7e6..2f3113d30b29b75541c64bfe082534fff0fa06be 100644 --- a/source/Lib/EncoderLib/CABACWriter.h +++ b/source/Lib/EncoderLib/CABACWriter.h @@ -83,6 +83,10 @@ public: #if JVET_V0094_BILATERAL_FILTER void bif (const Slice& slice, const BifParams& BifParams); void bif (const Slice& slice, const BifParams& BifParams, unsigned ctuRsAddr); +#endif +#if JVET_W0066_CCSAO + void codeCcSaoControlIdc ( uint8_t idcVal, CodingStructure &cs, const ComponentID compID, const int curIdx, + const uint8_t *controlIdc, Position lumaPos, const int setNum ); #endif // coding (quad)tree (clause 7.3.8.4) void coding_tree ( const CodingStructure& cs, Partitioner& pm, CUCtx& cuCtx, Partitioner* pPartitionerChroma = nullptr, CUCtx* pCuCtxChroma = nullptr); diff --git a/source/Lib/EncoderLib/EncCfg.h b/source/Lib/EncoderLib/EncCfg.h index 84929d8a066564622400a086a33adf84a04c27a1..22bdb7b64a9c6284b0872d2ca7f248c6a4a1ea8b 100644 
--- a/source/Lib/EncoderLib/EncCfg.h +++ b/source/Lib/EncoderLib/EncCfg.h @@ -212,6 +212,9 @@ protected: #endif bool m_noPartitionConstraintsOverrideConstraintFlag; bool m_noSaoConstraintFlag; +#if JVET_W0066_CCSAO + bool m_noCCSaoConstraintFlag; +#endif bool m_noAlfConstraintFlag; bool m_noCCAlfConstraintFlag; #if JVET_S0058_GCI @@ -481,6 +484,9 @@ protected: bool m_DeblockingFilterMetric; #endif bool m_bUseSAO; +#if JVET_W0066_CCSAO + bool m_CCSAO; +#endif bool m_bTestSAODisableAtPictureLevel; double m_saoEncodingRate; // When non-0 SAO early picture termination is enabled for luma and chroma double m_saoEncodingRateChroma; // The SAO early picture termination rate to use for chroma (when m_SaoEncodingRate is >0). If <=0, use results for luma. @@ -949,6 +955,10 @@ public: void setNoPartitionConstraintsOverrideConstraintFlag(bool val) { m_noPartitionConstraintsOverrideConstraintFlag = val; } bool getNoSaoConstraintFlag() const { return m_noSaoConstraintFlag; } void setNoSaoConstraintFlag(bool val) { m_noSaoConstraintFlag = val; } +#if JVET_W0066_CCSAO + bool getNoCCSaoConstraintFlag() const { return m_noCCSaoConstraintFlag; } + void setNoCCSaoConstraintFlag(bool val) { m_noCCSaoConstraintFlag = val; } +#endif bool getNoAlfConstraintFlag() const { return m_noAlfConstraintFlag; } void setNoAlfConstraintFlag(bool val) { m_noAlfConstraintFlag = val; } bool getNoCCAlfConstraintFlag() const { return m_noCCAlfConstraintFlag; } @@ -1670,6 +1680,10 @@ public: bool getSingleSlicePerSubPicFlagFlag( ) { return m_singleSlicePerSubPicFlag; } void setUseSAO (bool bVal) { m_bUseSAO = bVal; } bool getUseSAO () { return m_bUseSAO; } +#if JVET_W0066_CCSAO + void setUseCCSAO( bool b ) { m_CCSAO = b; } + bool getUseCCSAO() const { return m_CCSAO; } +#endif void setTestSAODisableAtPictureLevel (bool bVal) { m_bTestSAODisableAtPictureLevel = bVal; } bool getTestSAODisableAtPictureLevel ( ) const { return m_bTestSAODisableAtPictureLevel; } diff --git a/source/Lib/EncoderLib/EncGOP.cpp 
b/source/Lib/EncoderLib/EncGOP.cpp index a60c1a4e81c53c3ad124db717f36e968a79c00f3..6aab82f7dd2d6dbf5c2014b1d81a35f51af3baf3 100644 --- a/source/Lib/EncoderLib/EncGOP.cpp +++ b/source/Lib/EncoderLib/EncGOP.cpp @@ -3100,6 +3100,13 @@ void EncGOP::compressGOP( int iPOCLast, int iNumPicRcvd, PicList& rcListPic, CS::setRefinedMotionField(cs); #endif +#if JVET_W0066_CCSAO + if ( cs.sps->getCCSAOEnabledFlag() ) + { + m_pcSAO->getCcSaoBuf().copyFrom( cs.getRecoBuf() ); + } +#endif + #if JVET_V0094_BILATERAL_FILTER // We need to do this step if at least one of BIF or SAO are enabled. if( pcSlice->getSPS()->getSAOEnabledFlag() || pcSlice->getPPS()->getUseBIF()) @@ -3149,6 +3156,24 @@ void EncGOP::compressGOP( int iPOCLast, int iNumPicRcvd, PicList& rcListPic, #endif } +#if JVET_W0066_CCSAO + if ( pcSlice->getSPS()->getCCSAOEnabledFlag() ) + { + m_pcSAO->initCABACEstimator( m_pcEncLib->getCABACEncoder(), m_pcEncLib->getCtxCache(), pcSlice ); + m_pcSAO->CCSAOProcess( cs, pcSlice->getLambdas(), m_pcCfg->getIntraPeriod() ); + + //assign CCSAO slice header + for (int s = 0; s < uiNumSliceSegments; s++) + { + pcPic->slices[s]->m_ccSaoComParam = m_pcSAO->getCcSaoComParam(); + pcPic->slices[s]->m_ccSaoControl[COMPONENT_Y] = m_pcSAO->getCcSaoControlIdc(COMPONENT_Y); + pcPic->slices[s]->m_ccSaoControl[COMPONENT_Cb] = m_pcSAO->getCcSaoControlIdc(COMPONENT_Cb); + pcPic->slices[s]->m_ccSaoControl[COMPONENT_Cr] = m_pcSAO->getCcSaoControlIdc(COMPONENT_Cr); + } + } + m_pcSAO->jointClipSaoBifCcSao( cs ); +#endif + if( pcSlice->getSPS()->getALFEnabledFlag() ) { for (int s = 0; s < uiNumSliceSegments; s++) @@ -3522,6 +3547,11 @@ void EncGOP::compressGOP( int iPOCLast, int iNumPicRcvd, PicList& rcListPic, { picHeader->setSaoEnabledFlag(CHANNEL_TYPE_LUMA, pcSlice->getSaoEnabledFlag(CHANNEL_TYPE_LUMA )); picHeader->setSaoEnabledFlag(CHANNEL_TYPE_CHROMA, pcSlice->getSaoEnabledFlag(CHANNEL_TYPE_CHROMA)); +#if JVET_W0066_CCSAO + picHeader->setCcSaoEnabledFlag(COMPONENT_Y, 
pcSlice->getCcSaoEnabledFlag(COMPONENT_Y)); + picHeader->setCcSaoEnabledFlag(COMPONENT_Cb, pcSlice->getCcSaoEnabledFlag(COMPONENT_Cb)); + picHeader->setCcSaoEnabledFlag(COMPONENT_Cr, pcSlice->getCcSaoEnabledFlag(COMPONENT_Cr)); +#endif } // code ALF parameters in picture header or slice headers diff --git a/source/Lib/EncoderLib/EncLib.cpp b/source/Lib/EncoderLib/EncLib.cpp index 9850b3561a696f075d7214ba5e57896daea171e2..f912f35904f74f2091c0c1a4094e38b367b21e21 100644 --- a/source/Lib/EncoderLib/EncLib.cpp +++ b/source/Lib/EncoderLib/EncLib.cpp @@ -169,9 +169,17 @@ void EncLib::create( const int layerId ) m_cEncALF.create(this, m_iSourceWidth, m_iSourceHeight, m_chromaFormatIDC, m_maxCUWidth, m_maxCUHeight, floorLog2(m_maxCUWidth) - m_log2MinCUSize, m_bitDepth, m_inputBitDepth); } #if JVET_V0094_BILATERAL_FILTER +#if JVET_W0066_CCSAO + if (m_bUseSAO || m_BIF || m_CCSAO) +#else if (m_bUseSAO || m_BIF) +#endif +#else +#if JVET_W0066_CCSAO + if (m_bUseSAO || m_CCSAO) #else if (m_bUseSAO) +#endif #endif { const uint32_t widthInCtus = (m_iSourceWidth + m_maxCUWidth - 1) / m_maxCUWidth; @@ -1268,6 +1276,9 @@ void EncLib::xInitSPS( SPS& sps ) cinfo->setNoQtbttDualTreeIntraConstraintFlag(m_noQtbttDualTreeIntraConstraintFlag); cinfo->setNoPartitionConstraintsOverrideConstraintFlag(m_noPartitionConstraintsOverrideConstraintFlag); cinfo->setNoSaoConstraintFlag(m_noSaoConstraintFlag); +#if JVET_W0066_CCSAO + cinfo->setNoCCSaoConstraintFlag(m_noCCSaoConstraintFlag); +#endif cinfo->setNoAlfConstraintFlag(m_noAlfConstraintFlag); cinfo->setNoCCAlfConstraintFlag(m_noCCAlfConstraintFlag); #if JVET_S0058_GCI @@ -1474,6 +1485,9 @@ void EncLib::xInitSPS( SPS& sps ) sps.setUseWPBiPred( m_useWeightedBiPred ); sps.setSAOEnabledFlag( m_bUseSAO ); +#if JVET_W0066_CCSAO + sps.setCCSAOEnabledFlag( m_CCSAO ); +#endif sps.setJointCbCrEnabledFlag( m_JointCbCrMode ); sps.setMaxTLayers( m_maxTempLayer ); sps.setTemporalIdNestingFlag( ( m_maxTempLayer == 1 ) ? 
true : false ); diff --git a/source/Lib/EncoderLib/EncSampleAdaptiveOffset.cpp b/source/Lib/EncoderLib/EncSampleAdaptiveOffset.cpp index 5458432cb74637912522e3babd3aca7c7ecb2458..ab9961388cbd0c1536a5e91c08f417ed9b1b0f73 100644 --- a/source/Lib/EncoderLib/EncSampleAdaptiveOffset.cpp +++ b/source/Lib/EncoderLib/EncSampleAdaptiveOffset.cpp @@ -56,6 +56,25 @@ #define SAOCtx(c) SubCtx( Ctx::Sao, c ) +#if JVET_W0066_CCSAO +#include <algorithm> + +struct SetIdxCount +{ + uint8_t setIdx; + uint16_t count; +}; + +struct CtbCost +{ + int16_t pos; + double cost; +}; + +bool compareSetIdxCount(SetIdxCount a, SetIdxCount b) { return a.count > b.count; } + +bool compareCtbCost(CtbCost a, CtbCost b) { return a.cost < b.cost; } +#endif //! rounding with IBDI inline double xRoundIbdi2(int bitDepth, double x) @@ -175,6 +194,28 @@ void EncSampleAdaptiveOffset::createEncData(bool isPreDBFSamplesUsed, uint32_t n } } } + +#if JVET_W0066_CCSAO + if (m_createdEnc) + { + return; + } + m_createdEnc = true; + + for (int i = 0; i < MAX_CCSAO_SET_NUM; i++) + { + m_ccSaoStatData[i] = new CcSaoStatData[m_numCTUsInPic]; + } + + m_bestCcSaoControl = new uint8_t[m_numCTUsInPic]; + m_tempCcSaoControl = new uint8_t[m_numCTUsInPic]; + m_initCcSaoControl = new uint8_t[m_numCTUsInPic]; + + for (int i = 0; i < MAX_CCSAO_SET_NUM; i++) + { + m_trainingDistortion[i] = new int64_t[m_numCTUsInPic]; + } +#endif } void EncSampleAdaptiveOffset::destroyEncData() @@ -199,6 +240,28 @@ void EncSampleAdaptiveOffset::destroyEncData() delete[] m_preDBFstatData[i]; } m_preDBFstatData.clear(); + +#if JVET_W0066_CCSAO + if (!m_createdEnc) + { + return; + } + m_createdEnc = false; + + for (int i = 0; i < MAX_CCSAO_SET_NUM; i++) + { + if (m_ccSaoStatData[i]) { delete[] m_ccSaoStatData[i]; m_ccSaoStatData[i] = nullptr; } + } + + if (m_bestCcSaoControl) { delete[] m_bestCcSaoControl; m_bestCcSaoControl = nullptr; } + if (m_tempCcSaoControl) { delete[] m_tempCcSaoControl; m_tempCcSaoControl = nullptr; } + if 
(m_initCcSaoControl) { delete[] m_initCcSaoControl; m_initCcSaoControl = nullptr; } + + for (int i = 0; i < MAX_CCSAO_SET_NUM; i++) + { + if (m_trainingDistortion[i]) { delete[] m_trainingDistortion[i]; m_trainingDistortion[i] = nullptr; } + } +#endif } void EncSampleAdaptiveOffset::initCABACEstimator( CABACEncoder* cabacEncoder, CtxCache* ctxCache, Slice* pcSlice ) @@ -1163,6 +1226,27 @@ void EncSampleAdaptiveOffset::decideBlkParams(CodingStructure& cs, bool* sliceEn } else { +#if JVET_W0066_CCSAO + offsetCTUnoClip(area, srcYuv, resYuv, reconParams[ctuRsAddr], cs); +#if JVET_V0094_BILATERAL_FILTER + if (cs.pps->getUseBIF()) + { + BifParams& bifParams = cs.picture->getBifParam(); + for (auto& currCU : cs.traverseCUs(CS::getArea(cs, area, CH_L), CH_L)) + { + for (auto& currTU : CU::traverseTUs(currCU)) + { + + bool isInter = (currCU.predMode == MODE_INTER) ? true : false; + if (bifParams.ctuOn[ctuRsAddr] && ((TU::getCbf(currTU, COMPONENT_Y) || isInter == false) && (currTU.cu->qp > 17)) && (128 > std::max(currTU.lumaSize().width, currTU.lumaSize().height)) && ((isInter == false) || (32 > std::min(currTU.lumaSize().width, currTU.lumaSize().height)))) + { + bilateralFilter.bilateralFilterDiamond5x5NoClip(srcYuv, resYuv, currTU.cu->qp, cs.slice->clpRng(COMPONENT_Y), currTU); + } + } + } + } +#endif +#else #if JVET_V0094_BILATERAL_FILTER if(cs.pps->getUseBIF()) { @@ -1207,6 +1291,8 @@ void EncSampleAdaptiveOffset::decideBlkParams(CodingStructure& cs, bool* sliceEn #else offsetCTU(area, srcYuv, resYuv, reconParams[ctuRsAddr], cs); #endif +#endif + } ctuRsAddr++; @@ -1251,6 +1337,27 @@ void EncSampleAdaptiveOffset::decideBlkParams(CodingStructure& cs, bool* sliceEn } #endif +#if JVET_W0066_CCSAO + offsetCTUnoClip(area, srcYuv, resYuv, reconParams[ctuRsAddr], cs); +#if JVET_V0094_BILATERAL_FILTER + if (cs.pps->getUseBIF()) + { + BifParams& bifParams = cs.picture->getBifParam(); + for (auto& currCU : cs.traverseCUs(CS::getArea(cs, area, CH_L), CH_L)) + { + for (auto& currTU 
: CU::traverseTUs(currCU)) + { + + bool isInter = (currCU.predMode == MODE_INTER) ? true : false; + if (bifParams.ctuOn[ctuRsAddr] && ((TU::getCbf(currTU, COMPONENT_Y) || isInter == false) && (currTU.cu->qp > 17)) && (128 > std::max(currTU.lumaSize().width, currTU.lumaSize().height)) && ((isInter == false) || (32 > std::min(currTU.lumaSize().width, currTU.lumaSize().height)))) + { + bilateralFilter.bilateralFilterDiamond5x5NoClip(srcYuv, resYuv, currTU.cu->qp, cs.slice->clpRng(COMPONENT_Y), currTU); + } + } + } + } +#endif +#else #if JVET_V0094_BILATERAL_FILTER if(cs.pps->getUseBIF()) { @@ -1295,6 +1402,7 @@ void EncSampleAdaptiveOffset::decideBlkParams(CodingStructure& cs, bool* sliceEn } #else offsetCTU(area, srcYuv, resYuv, reconParams[ctuRsAddr], cs); +#endif #endif ctuRsAddr++; } @@ -1881,6 +1989,720 @@ void EncSampleAdaptiveOffset::getBlkStats(const ComponentID compIdx, const int c } } +#if JVET_W0066_CCSAO +void EncSampleAdaptiveOffset::CCSAOProcess(CodingStructure& cs, const double* lambdas, const int intraPeriod) +{ + PelUnitBuf orgYuv = cs.getOrgBuf(); + PelUnitBuf dstYuv = cs.getRecoBuf(); + PelUnitBuf srcYuv = m_ccSaoBuf.getBuf( cs.area ); + srcYuv.extendBorderPel( MAX_CCSAO_FILTER_LENGTH >> 1 ); + m_intraPeriod = intraPeriod; + + setupCcSaoLambdas(cs, lambdas); + + if (cs.slice->getSPS()->getCCSAOEnabledFlag()) + { + const TempCtx ctxStartCcSao(m_CtxCache, SubCtx(Ctx::CcSaoControlIdc, m_CABACEstimator->getCtx())); + m_CABACEstimator->getCtx() = SubCtx(Ctx::CcSaoControlIdc, ctxStartCcSao); deriveCcSao(cs, COMPONENT_Y, orgYuv, srcYuv, dstYuv); + m_CABACEstimator->getCtx() = SubCtx(Ctx::CcSaoControlIdc, ctxStartCcSao); deriveCcSao(cs, COMPONENT_Cb, orgYuv, srcYuv, dstYuv); + m_CABACEstimator->getCtx() = SubCtx(Ctx::CcSaoControlIdc, ctxStartCcSao); deriveCcSao(cs, COMPONENT_Cr, orgYuv, srcYuv, dstYuv); + applyCcSao(cs, *cs.pcv, srcYuv, dstYuv); + } +} + +void EncSampleAdaptiveOffset::setupCcSaoLambdas(CodingStructure& cs, const double* lambdas) +{ + 
m_lambda[COMPONENT_Y ] = m_picWidth * m_picHeight <= 416 * 240 + ? lambdas[COMPONENT_Y ] * 4.0 + : lambdas[COMPONENT_Y ]; + m_lambda[COMPONENT_Cb] = lambdas[COMPONENT_Cb]; + m_lambda[COMPONENT_Cr] = lambdas[COMPONENT_Cr]; +} + +void EncSampleAdaptiveOffset::deriveCcSao(CodingStructure& cs, const ComponentID compID, const CPelUnitBuf& orgYuv, const CPelUnitBuf& srcYuv, const CPelUnitBuf& dstYuv) +{ + double bestCost = 0; + double tempCost = 0; + double bestCostS[MAX_CCSAO_SET_NUM + 1] = { 0 }; + + double bestCostG[17] = { 0 }; + int classNumG[17] = { 0 }; + int stageNum = m_intraPeriod == 1 ? MAX_CCSAO_CLASS_NUM / 4 : MAX_CCSAO_CLASS_NUM / 16; + for (int stage = 1; stage <= stageNum; stage++) + classNumG[stage] = stage * (MAX_CCSAO_CLASS_NUM / stageNum); + + m_bestCcSaoParam.reset(); + memset(m_bestCcSaoControl, 0, sizeof(uint8_t) * m_numCTUsInPic); + + for (int setNum = 1; setNum <= MAX_CCSAO_SET_NUM; setNum++) + { + if (setNum > 1) + { + getCcSaoStatistics(cs, compID, orgYuv, srcYuv, dstYuv, m_ccSaoStatData, m_bestCcSaoParam); + } + setupInitCcSaoParam(cs, compID, setNum, m_trainingDistortion, m_ccSaoStatData, m_ccSaoStatFrame, + m_initCcSaoParam, m_bestCcSaoParam, m_initCcSaoControl, m_bestCcSaoControl); + + for (int stage = 1; stage <= stageNum; stage++) + { + for (int bandNumY = 1; bandNumY <= MAX_CCSAO_BAND_NUM_Y; bandNumY++) + for (int bandNumU = 1; bandNumU <= MAX_CCSAO_BAND_NUM_U; bandNumU++) + for (int bandNumV = 1; bandNumV <= MAX_CCSAO_BAND_NUM_V; bandNumV++) + for (int candPosY = 0; candPosY < MAX_CCSAO_CAND_POS_Y && bandNumY > 1; candPosY++) + { + if (bandNumY < bandNumU || bandNumY < bandNumV) + continue; + + int classNum = bandNumY * bandNumU * bandNumV; + if (classNum > MAX_CCSAO_CLASS_NUM) + continue; + + if (classNum <= classNumG[stage - 1] || classNum > classNumG[stage]) + continue; + + setupTempCcSaoParam(cs, compID, setNum, candPosY, bandNumY, bandNumU, bandNumV, m_tempCcSaoParam, m_initCcSaoParam, m_tempCcSaoControl, m_initCcSaoControl); + 
getCcSaoStatistics(cs, compID, orgYuv, srcYuv, dstYuv, m_ccSaoStatData, m_tempCcSaoParam); + deriveCcSaoRDO(cs, compID, m_trainingDistortion, m_ccSaoStatData, m_ccSaoStatFrame, + m_bestCcSaoParam, m_tempCcSaoParam, m_bestCcSaoControl, m_tempCcSaoControl, bestCost, tempCost); + } + + bestCostG[stage] = bestCost; + if (bestCostG[stage] >= bestCostG[stage - 1]) + break; + } + + bestCostS[setNum] = bestCost; + if (bestCostS[setNum] >= bestCostS[setNum - 1]) + break; + } + + bool oneBlockFiltered = false; + for (int ctbIdx = 0; m_bestCcSaoParam.setNum > 0 && ctbIdx < m_numCTUsInPic; ctbIdx++) + { + if (m_bestCcSaoControl[ctbIdx]) + { + oneBlockFiltered = true; + break; + } + } + + m_ccSaoComParam.reset(compID); + memset(m_ccSaoControl[compID], 0, sizeof(uint8_t) * m_numCTUsInPic); + + m_ccSaoComParam.enabled[compID] = oneBlockFiltered; + if (oneBlockFiltered) + { + CcSaoEncParam storedBestCcSaoParam = m_bestCcSaoParam; + memcpy(m_tempCcSaoControl, m_bestCcSaoControl, sizeof(uint8_t) * m_numCTUsInPic); + + int setNum = 0; + for (int setIdx = 0; setIdx < MAX_CCSAO_SET_NUM; setIdx++) + { + uint8_t setIdc = m_bestCcSaoParam.mapIdxToIdc[setIdx]; + if (m_bestCcSaoParam.setEnabled[setIdx]) + { + for (int ctbIdx = 0; ctbIdx < m_numCTUsInPic; ctbIdx++) + { + if (m_tempCcSaoControl[ctbIdx] == (setIdx + 1) ) + { + m_bestCcSaoControl[ctbIdx] = setIdc; + } + } + m_bestCcSaoParam.candPos[setIdc - 1][COMPONENT_Y ] = storedBestCcSaoParam.candPos[setIdx][COMPONENT_Y ]; + m_bestCcSaoParam.bandNum[setIdc - 1][COMPONENT_Y ] = storedBestCcSaoParam.bandNum[setIdx][COMPONENT_Y ]; + m_bestCcSaoParam.bandNum[setIdc - 1][COMPONENT_Cb] = storedBestCcSaoParam.bandNum[setIdx][COMPONENT_Cb]; + m_bestCcSaoParam.bandNum[setIdc - 1][COMPONENT_Cr] = storedBestCcSaoParam.bandNum[setIdx][COMPONENT_Cr]; + memcpy(m_bestCcSaoParam.offset[setIdc - 1], storedBestCcSaoParam.offset[setIdx], sizeof(storedBestCcSaoParam.offset[setIdx])); + setNum++; + } + m_bestCcSaoParam.setEnabled[setIdx] = setIdx < 
m_bestCcSaoParam.setNum ? true : false; + } + CHECK(setNum != m_bestCcSaoParam.setNum, "Number of sets enabled != setNum"); + + m_ccSaoComParam.setNum [compID] = m_bestCcSaoParam.setNum; + + for ( int setIdx = 0; setIdx < m_bestCcSaoParam.setNum; setIdx++ ) + { + m_ccSaoComParam.setEnabled[compID][setIdx] = m_bestCcSaoParam.setEnabled[setIdx]; + m_ccSaoComParam.candPos [compID][setIdx][COMPONENT_Y ] = m_bestCcSaoParam.candPos [setIdx][COMPONENT_Y ]; + m_ccSaoComParam.bandNum [compID][setIdx][COMPONENT_Y ] = m_bestCcSaoParam.bandNum [setIdx][COMPONENT_Y ]; + m_ccSaoComParam.bandNum [compID][setIdx][COMPONENT_Cb] = m_bestCcSaoParam.bandNum [setIdx][COMPONENT_Cb]; + m_ccSaoComParam.bandNum [compID][setIdx][COMPONENT_Cr] = m_bestCcSaoParam.bandNum [setIdx][COMPONENT_Cr]; + memcpy(m_ccSaoComParam.offset[compID][setIdx], m_bestCcSaoParam.offset[setIdx], sizeof(m_bestCcSaoParam.offset[setIdx])); + } + memcpy(m_ccSaoControl[compID], m_bestCcSaoControl, sizeof(uint8_t) * m_numCTUsInPic); + } +} + +void EncSampleAdaptiveOffset::setupInitCcSaoParam(CodingStructure& cs, const ComponentID compID, const int setNum, int64_t* trainingDistortion[MAX_CCSAO_SET_NUM] + , CcSaoStatData* blkStats[MAX_CCSAO_SET_NUM], CcSaoStatData frameStats[MAX_CCSAO_SET_NUM] + , CcSaoEncParam& initCcSaoParam, CcSaoEncParam& bestCcSaoParam + , uint8_t* initCcSaoControl, uint8_t* bestCcSaoControl) +{ + initCcSaoParam.reset(); + memset(initCcSaoControl, 0, sizeof(uint8_t) * m_numCTUsInPic); + + if (setNum == 1) + { + std::fill_n(initCcSaoControl, m_numCTUsInPic, 1); + return; + } + + for (int setIdx = 0; setIdx < MAX_CCSAO_SET_NUM; setIdx++) + { + if (bestCcSaoParam.setEnabled[setIdx]) + { + getCcSaoFrameStats(compID, setIdx, bestCcSaoControl, blkStats, frameStats); + getCcSaoDistortion(compID, setIdx, blkStats, bestCcSaoParam.offset, trainingDistortion); + } + } + + initCcSaoParam = bestCcSaoParam; + + int ctbCntOn = 0; + CtbCost *ctbCost = new CtbCost[m_numCTUsInPic]; + + for (int ctbIdx = 0; ctbIdx < 
m_numCTUsInPic; ctbIdx++) + { + int64_t dist = 0; + + if (bestCcSaoControl[ctbIdx]) + { + int setIdx = bestCcSaoControl[ctbIdx] - 1; + dist = trainingDistortion[setIdx][ctbIdx]; + ctbCntOn++; + } + + ctbCost[ctbIdx].pos = ctbIdx; + ctbCost[ctbIdx].cost = (double)dist; + } + + std::stable_sort(ctbCost, ctbCost + m_numCTUsInPic, compareCtbCost); + + for (int ctbIdx = 0; ctbIdx < m_numCTUsInPic; ctbIdx++) + { + int ctbPos = ctbCost[ctbIdx].pos; + if (ctbIdx < ctbCntOn) + { + if (ctbIdx * 2 > ctbCntOn) + { + initCcSaoControl[ctbPos] = setNum; + } + else + { + initCcSaoControl[ctbPos] = bestCcSaoControl[ctbPos]; + } + } + else + { + initCcSaoControl[ctbPos] = 0; + } + } + + delete[] ctbCost; + ctbCost = nullptr; +} + +void EncSampleAdaptiveOffset::setupTempCcSaoParam(CodingStructure& cs, const ComponentID compID, const int setNum + , const int candPosY, const int bandNumY, const int bandNumU, const int bandNumV + , CcSaoEncParam& tempCcSaoParam, CcSaoEncParam& initCcSaoParam + , uint8_t* tempCcSaoControl, uint8_t* initCcSaoControl) +{ + tempCcSaoParam.reset(); + memset(tempCcSaoControl, 0, sizeof(uint8_t) * m_numCTUsInPic); + + tempCcSaoParam = initCcSaoParam;; + memcpy(tempCcSaoControl, initCcSaoControl, sizeof(uint8_t) * m_numCTUsInPic); + + tempCcSaoParam.setNum = setNum; + tempCcSaoParam.setEnabled[setNum - 1] = true; + tempCcSaoParam.candPos [setNum - 1][COMPONENT_Y ] = candPosY; + tempCcSaoParam.bandNum [setNum - 1][COMPONENT_Y ] = bandNumY; + tempCcSaoParam.bandNum [setNum - 1][COMPONENT_Cb] = bandNumU; + tempCcSaoParam.bandNum [setNum - 1][COMPONENT_Cr] = bandNumV; + + for (int setIdx = 0; setIdx <= setNum; setIdx++) + { + tempCcSaoParam.mapIdxToIdc[setIdx] = setIdx < setNum ? 
setIdx + 1 : 0; + } +} + +void EncSampleAdaptiveOffset::getCcSaoStatistics(CodingStructure& cs, const ComponentID compID + , const CPelUnitBuf& orgYuv, const CPelUnitBuf& srcYuv, const CPelUnitBuf& dstYuv + , CcSaoStatData* blkStats[MAX_CCSAO_SET_NUM], const CcSaoEncParam& ccSaoParam) +{ + bool isLeftAvail, isRightAvail, isAboveAvail, isBelowAvail, isAboveLeftAvail, isAboveRightAvail; + + const PreCalcValues& pcv = *cs.pcv; + + int ctuRsAddr = 0; + for( uint32_t yPos = 0; yPos < pcv.lumaHeight; yPos += pcv.maxCUHeight ) + { + for( uint32_t xPos = 0; xPos < pcv.lumaWidth; xPos += pcv.maxCUWidth ) + { + const uint32_t width = (xPos + pcv.maxCUWidth > pcv.lumaWidth) ? (pcv.lumaWidth - xPos) : pcv.maxCUWidth; + const uint32_t height = (yPos + pcv.maxCUHeight > pcv.lumaHeight) ? (pcv.lumaHeight - yPos) : pcv.maxCUHeight; + const UnitArea area( cs.area.chromaFormat, Area(xPos , yPos, width, height) ); + + deriveLoopFilterBoundaryAvailibility(cs, area.Y(), isLeftAvail, isAboveAvail, isAboveLeftAvail ); + + //NOTE: The number of skipped lines during gathering CTU statistics depends on the slice boundary availabilities. + //For simplicity, here only picture boundaries are considered. 
+ + isRightAvail = (xPos + pcv.maxCUWidth < pcv.lumaWidth ); + isBelowAvail = (yPos + pcv.maxCUHeight < pcv.lumaHeight); + isAboveRightAvail = ((yPos > 0) && (isRightAvail)); + + for (int setIdx = 0; setIdx < MAX_CCSAO_SET_NUM; setIdx++) + { + blkStats[setIdx][ctuRsAddr].reset(); + if (!ccSaoParam.setEnabled[setIdx]) + continue; + + const CompArea &compArea = area.block(compID); + const int srcStrideY = srcYuv.get(COMPONENT_Y ).stride; + const int srcStrideU = srcYuv.get(COMPONENT_Cb).stride; + const int srcStrideV = srcYuv.get(COMPONENT_Cr).stride; + const int dstStride = dstYuv.get(compID ).stride; + const int orgStride = orgYuv.get(compID ).stride; + const Pel *srcBlkY = srcYuv.get(COMPONENT_Y ).bufAt(area.block(COMPONENT_Y )); + const Pel *srcBlkU = srcYuv.get(COMPONENT_Cb).bufAt(area.block(COMPONENT_Cb)); + const Pel *srcBlkV = srcYuv.get(COMPONENT_Cr).bufAt(area.block(COMPONENT_Cr)); + const Pel *dstBlk = dstYuv.get(compID ).bufAt(compArea); + const Pel *orgBlk = orgYuv.get(compID ).bufAt(compArea); + + const uint16_t candPosY = ccSaoParam.candPos[setIdx][COMPONENT_Y ]; + const uint16_t bandNumY = ccSaoParam.bandNum[setIdx][COMPONENT_Y ]; + const uint16_t bandNumU = ccSaoParam.bandNum[setIdx][COMPONENT_Cb]; + const uint16_t bandNumV = ccSaoParam.bandNum[setIdx][COMPONENT_Cr]; + + getCcSaoBlkStats(compID, cs.area.chromaFormat, cs.sps->getBitDepth(toChannelType(compID)) + , setIdx, blkStats, ctuRsAddr + , candPosY, bandNumY, bandNumU, bandNumV + , srcBlkY, srcBlkU, srcBlkV, orgBlk, dstBlk + , srcStrideY, srcStrideU, srcStrideV, orgStride, dstStride, compArea.width, compArea.height + , isLeftAvail, isRightAvail, isAboveAvail, isBelowAvail, isAboveLeftAvail, isAboveRightAvail + ); + } + ctuRsAddr++; + } + } +} + +void EncSampleAdaptiveOffset::getCcSaoBlkStats(const ComponentID compID, const ChromaFormat chromaFormat, const int bitDepth + , const int setIdx, CcSaoStatData* blkStats[MAX_CCSAO_SET_NUM], const int ctuRsAddr + , const uint16_t candPosY + , const 
uint16_t bandNumY, const uint16_t bandNumU, const uint16_t bandNumV + , const Pel* srcY, const Pel* srcU, const Pel* srcV, const Pel* org, const Pel* dst + , const int srcStrideY, const int srcStrideU, const int srcStrideV, const int orgStride, const int dstStride + , const int width, const int height + , bool isLeftAvail, bool isRightAvail, bool isAboveAvail, bool isBelowAvail, bool isAboveLeftAvail, bool isAboveRightAvail + ) +{ + const int candPosYX = g_ccSaoCandPosX[COMPONENT_Y][candPosY]; + const int candPosYY = g_ccSaoCandPosY[COMPONENT_Y][candPosY]; + + switch(compID) + { + case COMPONENT_Y: + { + for (int y = 0; y < height; y++) + { + for (int x = 0; x < width; x++) + { + const Pel *colY = srcY + x + srcStrideY * candPosYY + candPosYX; + const Pel *colU = srcU + (x >> 1); + const Pel *colV = srcV + (x >> 1); + + const int bandY = (*colY * bandNumY) >> bitDepth; + const int bandU = (*colU * bandNumU) >> bitDepth; + const int bandV = (*colV * bandNumV) >> bitDepth; + const int bandIdx = bandY * bandNumU * bandNumV + + bandU * bandNumV + + bandV; + const int classIdx = bandIdx; + + blkStats[setIdx][ctuRsAddr].diff [classIdx] += org[x] - dst[x]; + blkStats[setIdx][ctuRsAddr].count[classIdx]++; + } + + srcY += srcStrideY; + srcU += srcStrideU * (y & 0x1); + srcV += srcStrideV * (y & 0x1); + org += orgStride; + dst += dstStride; + } + } + break; + case COMPONENT_Cb: + case COMPONENT_Cr: + { + for (int y = 0; y < height; y++) + { + for (int x = 0; x < width; x++) + { + const Pel *colY = srcY + (x << 1) + srcStrideY * candPosYY + candPosYX; + const Pel *colU = srcU + x; + const Pel *colV = srcV + x; + + const int bandY = (*colY * bandNumY) >> bitDepth; + const int bandU = (*colU * bandNumU) >> bitDepth; + const int bandV = (*colV * bandNumV) >> bitDepth; + const int bandIdx = bandY * bandNumU * bandNumV + + bandU * bandNumV + + bandV; + const int classIdx = bandIdx; + + blkStats[setIdx][ctuRsAddr].diff [classIdx] += org[x] - dst[x]; + 
blkStats[setIdx][ctuRsAddr].count[classIdx]++; + } + + srcY += srcStrideY << 1; + srcU += srcStrideU; + srcV += srcStrideV; + org += orgStride; + dst += dstStride; + } + } + break; + default: + { + THROW("Not a supported CCSAO compID\n"); + } + } +} + +void EncSampleAdaptiveOffset::getCcSaoFrameStats(const ComponentID compID, const int setIdx, const uint8_t* ccSaoControl + , CcSaoStatData* blkStats[MAX_CCSAO_SET_NUM], CcSaoStatData frameStats[MAX_CCSAO_SET_NUM]) +{ + frameStats[setIdx].reset(); + int setIdc = setIdx + 1; + + for (int ctbIdx = 0; ctbIdx < m_numCTUsInPic; ctbIdx++) + { + if (ccSaoControl[ctbIdx] == setIdc) + { + frameStats[setIdx] += blkStats[setIdx][ctbIdx]; + } + } +} + +inline int EncSampleAdaptiveOffset::estCcSaoIterOffset(const double lambda, const int offsetInput, const int64_t count, const int64_t diffSum, const int shift, const int bitIncrease, int64_t& bestDist, double& bestCost, const int offsetTh) +{ + int iterOffset, tempOffset; + int64_t tempDist, tempRate; + double tempCost, tempMinCost; + int offsetOutput = 0; + iterOffset = offsetInput; + // Assuming sending quantized value 0 results in zero offset and sending the value zero needs 1 bit. entropy coder can be used to measure the exact rate here. + tempMinCost = lambda; + while (iterOffset != 0) + { + // Calculate the bits required for signaling the offset + tempRate = lengthUvlc(abs(iterOffset)) + (iterOffset == 0 ? 0 : 1); + + // Do the dequantization before distortion calculation + tempOffset = iterOffset << bitIncrease; + tempDist = estSaoDist(count, tempOffset, diffSum, shift); + tempCost = ((double)tempDist + lambda * (double)tempRate); + if (tempCost < tempMinCost) + { + tempMinCost = tempCost; + offsetOutput = iterOffset; + bestDist = tempDist; + bestCost = tempCost; + } + iterOffset = (iterOffset > 0) ? 
(iterOffset - 1) : (iterOffset + 1); + } + return offsetOutput; +} + +void EncSampleAdaptiveOffset::deriveCcSaoOffsets(const ComponentID compID, const int bitDepth, const int setIdx + , CcSaoStatData frameStats[MAX_CCSAO_SET_NUM] + , short offset[MAX_CCSAO_SET_NUM][MAX_CCSAO_CLASS_NUM]) +{ + int quantOffsets[MAX_CCSAO_CLASS_NUM] = { 0 }; + + for(int k = 0; k < MAX_CCSAO_CLASS_NUM; k++) + { + if(frameStats[setIdx].count[k] == 0) + continue; + + quantOffsets[k] = + (int) xRoundIbdi(bitDepth, (double)(frameStats[setIdx].diff [k] << DISTORTION_PRECISION_ADJUSTMENT(bitDepth)) + / (double)(frameStats[setIdx].count[k])); + quantOffsets[k] = Clip3(-MAX_CCSAO_OFFSET_THR, MAX_CCSAO_OFFSET_THR, quantOffsets[k]); + } + + int64_t dist[MAX_CCSAO_CLASS_NUM] = { 0 }; + double cost[MAX_CCSAO_CLASS_NUM] = { 0 }; + for (int k = 0; k < MAX_CCSAO_CLASS_NUM; k++) + { + cost[k] = m_lambda[compID]; + if (quantOffsets[k] != 0) + { + quantOffsets[k] = estCcSaoIterOffset(m_lambda[compID], quantOffsets[k], frameStats[setIdx].count[k], frameStats[setIdx].diff[k], 0, 0, dist[k], cost[k], MAX_CCSAO_OFFSET_THR); + } + } + + for (int k = 0; k < MAX_CCSAO_CLASS_NUM; k++) + { + CHECK(quantOffsets[k] < -MAX_CCSAO_OFFSET_THR || quantOffsets[k] > MAX_CCSAO_OFFSET_THR, "Exceeded valid range for CCSAO offset"); + offset[setIdx][k] = quantOffsets[k]; + } +} + +void EncSampleAdaptiveOffset::getCcSaoDistortion(const ComponentID compID, const int setIdx, CcSaoStatData* blkStats[MAX_CCSAO_SET_NUM] + , short offset[MAX_CCSAO_SET_NUM][MAX_CCSAO_CLASS_NUM] + , int64_t* trainingDistortion[MAX_CCSAO_SET_NUM]) +{ + ::memset(trainingDistortion[setIdx], 0, sizeof(int64_t) * m_numCTUsInPic); + + for (int ctbIdx = 0; ctbIdx < m_numCTUsInPic; ctbIdx++) + { + for (int k = 0; k < MAX_CCSAO_CLASS_NUM; k++) + { + trainingDistortion[setIdx][ctbIdx] + += estSaoDist(blkStats[setIdx][ctbIdx].count[k], offset[setIdx][k], blkStats[setIdx][ctbIdx].diff[k], 0); + } + } +} + +void 
EncSampleAdaptiveOffset::determineCcSaoControlIdc(CodingStructure& cs, const ComponentID compID + , const int ctuWidthC, const int ctuHeightC, const int picWidthC, const int picHeightC + , CcSaoEncParam& ccSaoParam, uint8_t* ccSaoControl + , int64_t* trainingDistorsion[MAX_CCSAO_SET_NUM] + , int64_t& curTotalDist, double& curTotalRate) +{ + bool setEnabled[MAX_CCSAO_SET_NUM]; + std::fill_n(setEnabled, MAX_CCSAO_SET_NUM, false); + + SetIdxCount setIdxCount[MAX_CCSAO_SET_NUM]; + for (int i = 0; i < MAX_CCSAO_SET_NUM; i++) + { + setIdxCount[i].setIdx = i; + setIdxCount[i].count = 0; + } + + double prevRate = curTotalRate; + + TempCtx ctxInitial(m_CtxCache); + TempCtx ctxBest(m_CtxCache); + TempCtx ctxStart(m_CtxCache); + ctxInitial = SubCtx(Ctx::CcSaoControlIdc, m_CABACEstimator->getCtx()); + ctxBest = SubCtx(Ctx::CcSaoControlIdc, m_CABACEstimator->getCtx()); + + int ctbIdx = 0; + for (int yCtb = 0; yCtb < picHeightC; yCtb += ctuHeightC) + { + for (int xCtb = 0; xCtb < picWidthC; xCtb += ctuWidthC) + { + int64_t bestDist = MAX_INT; + double bestRate = MAX_DOUBLE; + double bestCost = MAX_DOUBLE; + uint8_t bestSetIdc = 0; + uint8_t bestSetIdx = 0; + + m_CABACEstimator->getCtx() = ctxBest; + ctxStart = SubCtx(Ctx::CcSaoControlIdc, m_CABACEstimator->getCtx()); + + for (int setIdx = 0; setIdx <= MAX_CCSAO_SET_NUM; setIdx++) + { + if (setIdx < MAX_CCSAO_SET_NUM && !ccSaoParam.setEnabled[setIdx]) + continue; + + uint8_t setIdc = ccSaoParam.mapIdxToIdc[setIdx]; + m_CABACEstimator->getCtx() = ctxStart; + m_CABACEstimator->resetBits(); + const Position lumaPos = Position({ xCtb << getComponentScaleX(compID, cs.pcv->chrFormat), + yCtb << getComponentScaleY(compID, cs.pcv->chrFormat) }); + m_CABACEstimator->codeCcSaoControlIdc(setIdc, cs, compID, ctbIdx, ccSaoControl, lumaPos, ccSaoParam.setNum); + + int64_t dist = setIdx == MAX_CCSAO_SET_NUM ? 
0 : trainingDistorsion[setIdx][ctbIdx]; + double rate = FRAC_BITS_SCALE * m_CABACEstimator->getEstFracBits(); + double cost = rate * m_lambda[compID] + dist; + + if (cost < bestCost) + { + bestCost = cost; + bestRate = rate; + bestDist = dist; + bestSetIdc = setIdc; + bestSetIdx = setIdx; + ctxBest = SubCtx(Ctx::CcSaoControlIdc, m_CABACEstimator->getCtx()); + ccSaoControl[ctbIdx] = setIdx == MAX_CCSAO_SET_NUM ? 0 : setIdx + 1; + } + } + if (bestSetIdc != 0) + { + setEnabled [bestSetIdx] = true; + setIdxCount[bestSetIdx].count++; + } + curTotalRate += bestRate; + curTotalDist += bestDist; + ctbIdx++; + } + } + + std::copy_n(setEnabled, MAX_CCSAO_SET_NUM, ccSaoParam.setEnabled); + + std::stable_sort(setIdxCount, setIdxCount + MAX_CCSAO_SET_NUM, compareSetIdxCount); + + int setIdc = 1; + ccSaoParam.setNum = 0; + for (SetIdxCount &s : setIdxCount) + { + int setIdx = s.setIdx; + if (ccSaoParam.setEnabled[setIdx]) + { + ccSaoParam.mapIdxToIdc[setIdx] = setIdc; + ccSaoParam.setNum++; + setIdc++; + } + } + + curTotalRate = prevRate; + m_CABACEstimator->getCtx() = ctxInitial; + m_CABACEstimator->resetBits(); + ctbIdx = 0; + for (int yCtb = 0; yCtb < picHeightC; yCtb += ctuHeightC) + { + for (int xCtb = 0; xCtb < picWidthC; xCtb += ctuWidthC) + { + const int setIdxPlus1 = ccSaoControl[ctbIdx]; + const Position lumaPos = Position({ xCtb << getComponentScaleX(compID, cs.pcv->chrFormat), + yCtb << getComponentScaleY(compID, cs.pcv->chrFormat) }); + + m_CABACEstimator->codeCcSaoControlIdc(setIdxPlus1 == 0 ? 
0 : ccSaoParam.mapIdxToIdc[setIdxPlus1 - 1], + cs, compID, ctbIdx, ccSaoControl, lumaPos, ccSaoParam.setNum); + ctbIdx++; + } + } + curTotalRate += FRAC_BITS_SCALE*m_CABACEstimator->getEstFracBits(); + + // restore for next iteration + m_CABACEstimator->getCtx() = ctxInitial; +} + +int EncSampleAdaptiveOffset::lengthUvlc(int uiCode) +{ + int uiLength = 1; + int uiTemp = ++uiCode; + + CHECK(!uiTemp, "Integer overflow"); + + while (1 != uiTemp) + { + uiTemp >>= 1; + uiLength += 2; + } + // Take care of cases where uiLength > 32 + return (uiLength >> 1) + ((uiLength + 1) >> 1); +} + +int EncSampleAdaptiveOffset::getCcSaoParamRate(const ComponentID compID, const CcSaoEncParam& ccSaoParam) +{ + int bits = 0; + + if (ccSaoParam.setNum > 0 ) + { + bits += lengthUvlc(ccSaoParam.setNum - 1); + + int signaledSetNum = 0; + for (int setIdx = 0; setIdx < MAX_CCSAO_SET_NUM; setIdx++) + { + if (ccSaoParam.setEnabled[setIdx]) + { + bits += MAX_CCSAO_CAND_POS_Y_BITS; + bits += MAX_CCSAO_BAND_NUM_Y_BITS; + bits += MAX_CCSAO_BAND_NUM_U_BITS; + bits += MAX_CCSAO_BAND_NUM_V_BITS; + + int classNum = ccSaoParam.bandNum[setIdx][COMPONENT_Y ] + * ccSaoParam.bandNum[setIdx][COMPONENT_Cb] + * ccSaoParam.bandNum[setIdx][COMPONENT_Cr]; + for (int i = 0; i < classNum; i++) + { + bits += lengthUvlc(abs(ccSaoParam.offset[setIdx][i])) + (ccSaoParam.offset[setIdx][i] == 0 ? 
0 : 1); + } + signaledSetNum++; + } + } + CHECK(signaledSetNum != ccSaoParam.setNum, "Number of sets signaled not the same as indicated"); + } + return bits; +} + +void EncSampleAdaptiveOffset::deriveCcSaoRDO(CodingStructure& cs, const ComponentID compID, int64_t* trainingDistortion[MAX_CCSAO_SET_NUM] + , CcSaoStatData* blkStats[MAX_CCSAO_SET_NUM], CcSaoStatData frameStats[MAX_CCSAO_SET_NUM] + , CcSaoEncParam& bestCcSaoParam, CcSaoEncParam& tempCcSaoParam + , uint8_t* bestCcSaoControl, uint8_t* tempCcSaoControl + , double& bestCost, double& tempCost) +{ + const int scaleX = getComponentScaleX(compID, cs.pcv->chrFormat); + const int scaleY = getComponentScaleY(compID, cs.pcv->chrFormat); + const int ctuWidthC = cs.pcv->maxCUWidth >> scaleX; + const int ctuHeightC = cs.pcv->maxCUHeight >> scaleY; + const int picWidthC = cs.pcv->lumaWidth >> scaleX; + const int picHeightC = cs.pcv->lumaHeight >> scaleY; + const int maxTrainingIter = 15; + + const TempCtx ctxStartCcSaoControlFlag ( m_CtxCache, SubCtx( Ctx::CcSaoControlIdc, m_CABACEstimator->getCtx() ) ); + + int trainingIter = 0; + bool keepTraining = true; + bool improved = false; + double prevCost = MAX_DOUBLE; + while (keepTraining) + { + improved = false; + + for (int setIdx = 0; setIdx < MAX_CCSAO_SET_NUM; setIdx++) + { + if (tempCcSaoParam.setEnabled[setIdx]) + { + getCcSaoFrameStats(compID, setIdx, tempCcSaoControl, blkStats, frameStats); + deriveCcSaoOffsets(compID, cs.sps->getBitDepth(toChannelType(compID)), setIdx, frameStats, tempCcSaoParam.offset); + getCcSaoDistortion(compID, setIdx, blkStats, tempCcSaoParam.offset, trainingDistortion); + } + } + + m_CABACEstimator->getCtx() = ctxStartCcSaoControlFlag; + + int64_t curTotalDist = 0; + double curTotalRate = 0; + determineCcSaoControlIdc(cs, compID, ctuWidthC, ctuHeightC, picWidthC, picHeightC, + tempCcSaoParam, tempCcSaoControl, trainingDistortion, + curTotalDist, curTotalRate); + + if (tempCcSaoParam.setNum > 0) + { + curTotalRate += 
getCcSaoParamRate(compID, tempCcSaoParam); + tempCost = curTotalRate * m_lambda[compID] + curTotalDist; + + if (tempCost < prevCost) + { + prevCost = tempCost; + improved = true; + } + + if (tempCost < bestCost) + { + bestCost = tempCost; + bestCcSaoParam = tempCcSaoParam; + memcpy(bestCcSaoControl, tempCcSaoControl, sizeof(uint8_t) * m_numCTUsInPic); + } + } + + trainingIter++; + if (!improved || trainingIter > maxTrainingIter) + { + keepTraining = false; + } + } +} +#endif + void EncSampleAdaptiveOffset::deriveLoopFilterBoundaryAvailibility(CodingStructure& cs, const Position &pos, bool& isLeftAvail, bool& isAboveAvail, bool& isAboveLeftAvail) const { bool isLoopFiltAcrossSlicePPS = cs.pps->getLoopFilterAcrossSlicesEnabledFlag(); diff --git a/source/Lib/EncoderLib/EncSampleAdaptiveOffset.h b/source/Lib/EncoderLib/EncSampleAdaptiveOffset.h index 686592bf8797a71d55802fd5894cd8b6c7ba0c2a..9f6e04c8e08f30ebb0f2afaf8c61f41126583a22 100644 --- a/source/Lib/EncoderLib/EncSampleAdaptiveOffset.h +++ b/source/Lib/EncoderLib/EncSampleAdaptiveOffset.h @@ -79,6 +79,69 @@ struct SAOStatData //data structure for SAO statistics } }; +#if JVET_W0066_CCSAO +struct CcSaoStatData +{ + int64_t diff [MAX_CCSAO_CLASS_NUM]; + uint32_t count[MAX_CCSAO_CLASS_NUM]; + + CcSaoStatData(){} + ~CcSaoStatData(){} + void reset() + { + ::memset(diff, 0, sizeof(int64_t) * MAX_CCSAO_CLASS_NUM); + ::memset(count, 0, sizeof(uint32_t) * MAX_CCSAO_CLASS_NUM); + } + const CcSaoStatData& operator=(const CcSaoStatData& src) + { + ::memcpy(diff, src.diff, sizeof(int64_t) * MAX_CCSAO_CLASS_NUM); + ::memcpy(count, src.count, sizeof(uint32_t) * MAX_CCSAO_CLASS_NUM); + return *this; + } + const CcSaoStatData& operator+= (const CcSaoStatData& src) + { + for(int i = 0; i < MAX_CCSAO_CLASS_NUM; i++) + { + diff [i] += src.diff [i]; + count[i] += src.count[i]; + } + return *this; + } +}; + +struct CcSaoEncParam +{ + uint8_t setNum; + bool setEnabled [MAX_CCSAO_SET_NUM]; + uint16_t candPos 
[MAX_CCSAO_SET_NUM][MAX_NUM_LUMA_COMP]; + uint16_t bandNum [MAX_CCSAO_SET_NUM][MAX_NUM_COMPONENT]; + short offset [MAX_CCSAO_SET_NUM][MAX_CCSAO_CLASS_NUM]; + uint8_t mapIdxToIdc[MAX_CCSAO_SET_NUM + 1]; + + CcSaoEncParam() {} + ~CcSaoEncParam() {} + /* reset(): sets setNum to 0 and zero-fills setEnabled, candPos, bandNum, offset and mapIdxToIdc. */ void reset() + { + setNum = 0; + ::memset(setEnabled, false, sizeof(setEnabled)); + ::memset(candPos, 0, sizeof(candPos)); + ::memset(bandNum, 0, sizeof(bandNum)); + ::memset(offset, 0, sizeof(offset)); + ::memset(mapIdxToIdc, 0, sizeof(mapIdxToIdc)); + } + /* Copy assignment: copies setNum and memcpy's each array from src. NOTE(review): returns *this as a const reference rather than the conventional non-const reference. */ const CcSaoEncParam& operator= (const CcSaoEncParam& src) + { + setNum = src.setNum; + ::memcpy(setEnabled, src.setEnabled, sizeof(setEnabled)); + ::memcpy(candPos, src.candPos, sizeof(candPos)); + ::memcpy(bandNum, src.bandNum, sizeof(bandNum)); + ::memcpy(offset, src.offset, sizeof(offset)); + ::memcpy(mapIdxToIdc, src.mapIdxToIdc, sizeof(mapIdxToIdc)); + return *this; + } +}; +#endif + class EncSampleAdaptiveOffset : public SampleAdaptiveOffset { public: @@ -98,7 +161,9 @@ public: ,BIFCabacEst* BifCABACEstimator #endif ); - +#if JVET_W0066_CCSAO + /* CCSAO entry point, declared ahead of the private-methods section and compiled only under JVET_W0066_CCSAO. NOTE(review): the definition is not part of this chunk. */ void CCSAOProcess(CodingStructure& cs, const double* lambdas, const int intraPeriod); +#endif void disabledRate( CodingStructure& cs, SAOBlkParam* reconParams, const double saoEncodingRate, const double saoEncodingRateChroma ); void getPreDBFStatistics(CodingStructure& cs); private: //methods @@ -129,6 +194,46 @@ private: //methods inline int64_t estSaoDist(int64_t count, int64_t offset, int64_t diffSum, int shift); inline int estIterOffset(int typeIdx, double lambda, int offsetInput, int64_t count, int64_t diffSum, int shift, int bitIncrease, int64_t& bestDist, double& bestCost, int offsetTh ); void addPreDBFStatistics(std::vector<SAOStatData**>& blkStats); +#if JVET_W0066_CCSAO + /* CCSAO helper declarations added to the private-methods section; only the signatures are visible here. */ void setupCcSaoLambdas(CodingStructure& cs, const double* lambdas); + void deriveCcSao(CodingStructure& cs, const ComponentID compID, const CPelUnitBuf& orgYuv, const CPelUnitBuf& srcYuv, const CPelUnitBuf& dstYuv); + void
setupInitCcSaoParam(CodingStructure& cs, const ComponentID compID, const int setNum, int64_t* trainingDistortion[MAX_CCSAO_SET_NUM] + , CcSaoStatData* blkStats[MAX_CCSAO_SET_NUM], CcSaoStatData frameStats[MAX_CCSAO_SET_NUM] + , CcSaoEncParam& initCcSaoParam, CcSaoEncParam& bestCcSaoParam + , uint8_t* initCcSaoControl, uint8_t* bestCcSaoControl); + void setupTempCcSaoParam(CodingStructure& cs, const ComponentID compID, const int setNum + , const int candPosY, const int bandNumY, const int bandNumU, const int bandNumV + , CcSaoEncParam& tempCcSaoParam, CcSaoEncParam& initCcSaoParam + , uint8_t* tempCcSaoControl, uint8_t* initCcSaoControl); + void getCcSaoStatistics(CodingStructure& cs, const ComponentID compID, const CPelUnitBuf& orgYuv, const CPelUnitBuf& srcYuv, const CPelUnitBuf& dstYuv + , CcSaoStatData* blkStats[MAX_CCSAO_SET_NUM], const CcSaoEncParam& ccSaoParam); + void getCcSaoBlkStats(const ComponentID compID, const ChromaFormat chromaFormat, const int bitDepth + , const int setIdx, CcSaoStatData* blkStats[MAX_CCSAO_SET_NUM], const int ctuRsAddr + , const uint16_t candPosY + , const uint16_t bandNumY, const uint16_t bandNumU, const uint16_t bandNumV + , const Pel* srcY, const Pel* srcU, const Pel* srcV, const Pel* org, const Pel* dst + , const int srcStrideY, const int srcStrideU, const int srcStrideV, const int orgStride, const int dstStride, const int width, const int height + , bool isLeftAvail, bool isRightAvail, bool isAboveAvail, bool isBelowAvail, bool isAboveLeftAvail, bool isAboveRightAvail); + void getCcSaoFrameStats(const ComponentID compID, const int setIdx, const uint8_t* ccSaoControl + , CcSaoStatData* blkStats[MAX_CCSAO_SET_NUM], CcSaoStatData frameStats[MAX_CCSAO_SET_NUM]); + void deriveCcSaoOffsets(const ComponentID compID, const int bitDepth, const int setIdx + , CcSaoStatData frameStats[MAX_CCSAO_SET_NUM] + , short offset[MAX_CCSAO_SET_NUM][MAX_CCSAO_CLASS_NUM]); + inline int estCcSaoIterOffset(const double lambda, const int offsetInput,
const int64_t count, const int64_t diffSum, const int shift, const int bitIncrease, int64_t& bestDist, double& bestCost, const int offsetTh); + void getCcSaoDistortion(const ComponentID compID, const int setIdx, CcSaoStatData* blkStats[MAX_CCSAO_SET_NUM] + , short offset[MAX_CCSAO_SET_NUM][MAX_CCSAO_CLASS_NUM], int64_t* trainingDistortion[MAX_CCSAO_SET_NUM]); + void deriveCcSaoRDO(CodingStructure& cs, const ComponentID compID, int64_t* trainingDistortion[MAX_CCSAO_SET_NUM] + , CcSaoStatData* blkStats[MAX_CCSAO_SET_NUM], CcSaoStatData frameStats[MAX_CCSAO_SET_NUM] + , CcSaoEncParam& bestCcSaoParam, CcSaoEncParam& tempCcSaoParam + , uint8_t* bestCcSaoControl, uint8_t* tempCcSaoControl + , double& bestCost, double& tempCost); + /* NOTE(review): the parameter below is spelled trainingDistorsion, while the sibling declarations use trainingDistortion. */ void determineCcSaoControlIdc(CodingStructure& cs, const ComponentID compID, + const int ctuWidthC, const int ctuHeightC, const int picWidthC, const int picHeightC, + CcSaoEncParam& ccSaoParam, uint8_t* ccSaoControl, int64_t* trainingDistorsion[MAX_CCSAO_SET_NUM], + int64_t& curTotalDist, double& curTotalRate); + int getCcSaoParamRate(const ComponentID compID, const CcSaoEncParam& ccSaoParam); + int lengthUvlc(int uiCode); +#endif private: //members //for RDO CABACWriter* m_CABACEstimator; @@ -141,6 +246,20 @@ private: //members double m_saoDisabledRate[MAX_NUM_COMPONENT][MAX_TLAYER]; int m_skipLinesR[MAX_NUM_COMPONENT][NUM_SAO_NEW_TYPES]; int m_skipLinesB[MAX_NUM_COMPONENT][NUM_SAO_NEW_TYPES]; + +#if JVET_W0066_CCSAO + /* CCSAO encoder members. Only m_createdEnc has an in-class initializer; NOTE(review): confirm m_intraPeriod and the raw pointers below are set before first use. */ bool m_createdEnc = false; + int m_intraPeriod; + CcSaoStatData* m_ccSaoStatData [MAX_CCSAO_SET_NUM]; + CcSaoStatData m_ccSaoStatFrame[MAX_CCSAO_SET_NUM]; + CcSaoEncParam m_bestCcSaoParam; + CcSaoEncParam m_tempCcSaoParam; + CcSaoEncParam m_initCcSaoParam; + uint8_t* m_bestCcSaoControl; + uint8_t* m_tempCcSaoControl; + uint8_t* m_initCcSaoControl; + int64_t* m_trainingDistortion[MAX_CCSAO_SET_NUM]; +#endif }; diff --git a/source/Lib/EncoderLib/VLCWriter.cpp b/source/Lib/EncoderLib/VLCWriter.cpp index
18a94142937889a37aa2579125b51f696c4aff72..6e88da5e13b01d53de280176e78837d13e3a9945 100644 --- a/source/Lib/EncoderLib/VLCWriter.cpp +++ b/source/Lib/EncoderLib/VLCWriter.cpp @@ -1233,6 +1233,9 @@ void HLSWriter::codeSPS( const SPS* pcSPS ) } WRITE_FLAG( pcSPS->getSAOEnabledFlag(), "sps_sao_enabled_flag"); +#if JVET_W0066_CCSAO + /* sps_ccsao_enabled_flag is written right after sps_sao_enabled_flag and before sps_alf_enabled_flag. */ WRITE_FLAG( pcSPS->getCCSAOEnabledFlag(), "sps_ccsao_enabled_flag" ); +#endif WRITE_FLAG( pcSPS->getALFEnabledFlag(), "sps_alf_enabled_flag" ); if (pcSPS->getALFEnabledFlag() && pcSPS->getChromaFormatIdc() != CHROMA_400) { @@ -2365,7 +2368,29 @@ void HLSWriter::codePictureHeader( PicHeader* picHeader, bool writeRbspTrailingB picHeader->setSaoEnabledFlag(CHANNEL_TYPE_CHROMA, false); } - +#if JVET_W0066_CCSAO + /* With CCSAO enabled in the SPS, the three per-component picture-header flags are written only when pps->getSaoInfoInPhFlag() is set; otherwise they are forced to true in picHeader without being written. With CCSAO disabled in the SPS they are forced to false. */ if(sps->getCCSAOEnabledFlag()) + { + if (pps->getSaoInfoInPhFlag()) + { + WRITE_FLAG(picHeader->getCcSaoEnabledFlag(COMPONENT_Y), "ph_cc_sao_y_enabled_flag"); + WRITE_FLAG(picHeader->getCcSaoEnabledFlag(COMPONENT_Cb), "ph_cc_sao_cb_enabled_flag"); + WRITE_FLAG(picHeader->getCcSaoEnabledFlag(COMPONENT_Cr), "ph_cc_sao_cr_enabled_flag"); + } + else + { + picHeader->setCcSaoEnabledFlag(COMPONENT_Y, true); + picHeader->setCcSaoEnabledFlag(COMPONENT_Cb, true); + picHeader->setCcSaoEnabledFlag(COMPONENT_Cr, true); + } + } + else + { + picHeader->setCcSaoEnabledFlag(COMPONENT_Y, false); + picHeader->setCcSaoEnabledFlag(COMPONENT_Cb, false); + picHeader->setCcSaoEnabledFlag(COMPONENT_Cr, false); + } +#endif // deblocking filter controls if (pps->getDeblockingFilterControlPresentFlag()) @@ -2822,6 +2847,9 @@ void HLSWriter::codeSliceHeader ( Slice* pcSlice ) } } +#if JVET_W0066_CCSAO + /* Slice-level CCSAO parameters (pcSlice->m_ccSaoComParam) are written here, just before the getDeblockingFilterControlPresentFlag() check. */ codeCcSao(pcSlice, picHeader, pcSlice->getSPS(), pcSlice->m_ccSaoComParam); +#endif if (pcSlice->getPPS()->getDeblockingFilterControlPresentFlag()) { @@ -3033,6 +3061,9 @@ void HLSWriter::codeConstraintInfo ( const ConstraintInfo* cinfo ) /* loop filter */ WRITE_FLAG(cinfo->getNoSaoConstraintFlag() ?
1 : 0, "gci_no_sao_constraint_flag"); +#if JVET_W0066_CCSAO + /* gci_no_ccsao_constraint_flag is written between the SAO and ALF constraint flags. */ WRITE_FLAG(cinfo->getNoCCSaoConstraintFlag() ? 1 : 0, "gci_no_ccsao_constraint_flag"); +#endif WRITE_FLAG(cinfo->getNoAlfConstraintFlag() ? 1 : 0, "gci_no_alf_constraint_flag"); WRITE_FLAG(cinfo->getNoCCAlfConstraintFlag() ? 1 : 0, "gci_no_ccalf_constraint_flag"); WRITE_FLAG(cinfo->getNoLmcsConstraintFlag() ? 1 : 0, "gci_no_lmcs_constraint_flag"); @@ -3566,6 +3597,65 @@ bool HLSWriter::xFindMatchingLTRP(Slice* pcSlice, uint32_t *ltrpsIndex, int ltrp return false; } +#if JVET_W0066_CCSAO +/* Writes the slice-level CCSAO syntax from ccSaoParam. Nothing is written unless pcSlice's SPS has CCSAO enabled. Order: three per-component enable flags; then, for each enabled component, the set count minus one (UVLC) and, per set, the luma candidate position, the Y/Cb/Cr band counts minus one (fixed-length codes) and one offset per class. NOTE(review): the picHeader and sps parameters are not referenced in the body; the SPS is taken from pcSlice instead. */ void HLSWriter::codeCcSao(Slice* pcSlice, PicHeader* picHeader, const SPS* sps, const CcSaoComParam& ccSaoParam) +{ + if (pcSlice->getSPS()->getCCSAOEnabledFlag()) + { + WRITE_FLAG(ccSaoParam.enabled[COMPONENT_Y ] ? 1 : 0, "slice_ccsao_y_enabled_flag"); + WRITE_FLAG(ccSaoParam.enabled[COMPONENT_Cb] ? 1 : 0, "slice_ccsao_cb_enabled_flag"); + WRITE_FLAG(ccSaoParam.enabled[COMPONENT_Cr] ? 1 : 0, "slice_ccsao_cr_enabled_flag"); + + for (int compIdx = 0; compIdx < MAX_NUM_COMPONENT; compIdx++) + { + if (ccSaoParam.enabled[compIdx]) + { + /* setNum is coded minus one, so a value of zero is rejected before writing. */ CHECK(ccSaoParam.setNum[compIdx] == 0 + || ccSaoParam.setNum[compIdx] > MAX_CCSAO_SET_NUM, "CCSAO setNum out of range"); + WRITE_UVLC(ccSaoParam.setNum[compIdx] - 1, "ccsao_set_num"); + + for (int setIdx = 0; setIdx < ccSaoParam.setNum[compIdx]; setIdx++) + { + /* Range checks on the candidate position and on each band count (also coded minus one) before they are written. */ CHECK(ccSaoParam.candPos[compIdx][setIdx][COMPONENT_Y ] >= MAX_CCSAO_CAND_POS_Y, "CCSAO candPosY out of range"); + CHECK(ccSaoParam.bandNum[compIdx][setIdx][COMPONENT_Y ] == 0 + || ccSaoParam.bandNum[compIdx][setIdx][COMPONENT_Y ] > MAX_CCSAO_BAND_NUM_Y, "CCSAO bandNumY out of range"); + CHECK(ccSaoParam.bandNum[compIdx][setIdx][COMPONENT_Cb] == 0 + || ccSaoParam.bandNum[compIdx][setIdx][COMPONENT_Cb] > MAX_CCSAO_BAND_NUM_U, "CCSAO bandNumU out of range"); + CHECK(ccSaoParam.bandNum[compIdx][setIdx][COMPONENT_Cr] == 0 + || ccSaoParam.bandNum[compIdx][setIdx][COMPONENT_Cr] > MAX_CCSAO_BAND_NUM_V, "CCSAO bandNumV out of range"); + + 
WRITE_CODE(ccSaoParam.candPos[compIdx][setIdx][COMPONENT_Y ], MAX_CCSAO_CAND_POS_Y_BITS, "ccsao_cand_pos_y"); + WRITE_CODE(ccSaoParam.bandNum[compIdx][setIdx][COMPONENT_Y ] - 1, MAX_CCSAO_BAND_NUM_Y_BITS, "ccsao_band_num_y"); + WRITE_CODE(ccSaoParam.bandNum[compIdx][setIdx][COMPONENT_Cb] - 1, MAX_CCSAO_BAND_NUM_U_BITS, "ccsao_band_num_u"); + WRITE_CODE(ccSaoParam.bandNum[compIdx][setIdx][COMPONENT_Cr] - 1, MAX_CCSAO_BAND_NUM_V_BITS, "ccsao_band_num_v"); + + const short *offset = ccSaoParam.offset [compIdx][setIdx]; + /* One offset is written per class; the class count is the product of the Y, Cb and Cr band counts of this set. */ const int classNum = ccSaoParam.bandNum[compIdx][setIdx][COMPONENT_Y ] + * ccSaoParam.bandNum[compIdx][setIdx][COMPONENT_Cb] + * ccSaoParam.bandNum[compIdx][setIdx][COMPONENT_Cr]; + for (int i = 0; i < classNum; i++) + { + CHECK((offset[i] > MAX_CCSAO_OFFSET_THR || offset[i] < -MAX_CCSAO_OFFSET_THR), "CCSAO offset out of range"); + /* Magnitude first (UVLC); the sign flag follows only for non-zero offsets, with 1 meaning negative. */ WRITE_UVLC(abs(offset[i]), "ccsao_offset_abs"); + if (abs(offset[i]) != 0) + { + WRITE_FLAG((offset[i] < 0) ? 1 : 0, "ccsao_offset_sign"); + } + } + + /* DTRACE dump of this set's offsets; no WRITE_* call is made in this part. */ DTRACE(g_trace_ctx, D_SYNTAX, "offset setIdx %d: ", setIdx); + for (int i = 0; i < classNum; i++) + { + DTRACE(g_trace_ctx, D_SYNTAX, "%d ", offset[i]); + } + DTRACE(g_trace_ctx, D_SYNTAX, "\n"); + } + } + } + } +} +#endif + #if ALF_IMPROVEMENT void HLSWriter::alfGolombEncode(int coeff, int k, const bool signed_coeff) { diff --git a/source/Lib/EncoderLib/VLCWriter.h b/source/Lib/EncoderLib/VLCWriter.h index 982262d9dab50081992e09238b30cf4213751f1b..037897b325492a8ac317bb58a845b7e979c4de4a 100644 --- a/source/Lib/EncoderLib/VLCWriter.h +++ b/source/Lib/EncoderLib/VLCWriter.h @@ -160,6 +160,9 @@ public: #else void codeScalingList ( const ScalingList &scalingList ); #endif +#if JVET_W0066_CCSAO + void codeCcSao ( Slice* pcSlice, PicHeader* picHeader, const SPS* sps, const CcSaoComParam& ccSaoParam ); +#endif #if ALF_IMPROVEMENT void alfFilter( const AlfParam& alfParam, const bool isChroma, const int altIdx, int order0, int order1 ); void alfGolombEncode(const int coeff, const int
k, const bool signed_coeff = true);