diff --git a/source/App/EncoderApp/EncAppCfg.cpp b/source/App/EncoderApp/EncAppCfg.cpp index ead92686fb3847607b77b1843b0a9526e57bb8df..d5fff470e0c3445ffd2155c40bb63ef4b479b522 100644 --- a/source/App/EncoderApp/EncAppCfg.cpp +++ b/source/App/EncoderApp/EncAppCfg.cpp @@ -1830,14 +1830,22 @@ bool EncAppCfg::parseCfg( int argc, char* argv[] ) msg( WARNING, "*************************************************************************\n" ); } +#if ENABLE_QPA_SUB_CTU + #if QP_SWITCHING_FOR_PARALLEL + if (m_LargeCTU && (m_iQP < 38) && m_bUsePerceptQPA && !m_bUseAdaptiveQP && (m_iSourceWidth <= 2048) && (m_iSourceHeight <= 1280) + #else + if (m_LargeCTU && ((int)m_fQP < 38) && m_bUsePerceptQPA && !m_bUseAdaptiveQP && (m_iSourceWidth <= 2048) && (m_iSourceHeight <= 1280) + #endif + && ((1 << (std::max (m_quadtreeTULog2MaxSize, m_tuLog2MaxSize) + 1)) == m_uiCTUSize) && (m_iSourceWidth > 512 || m_iSourceHeight > 320)) + { + m_iMaxCuDQPDepth = 1; + } +#else #if QP_SWITCHING_FOR_PARALLEL if( m_LargeCTU && ( m_iQP < 38 ) && ( m_iGOPSize > 4 ) && m_bUsePerceptQPA && !m_bUseAdaptiveQP && ( m_iSourceHeight <= 1280 ) && ( m_iSourceWidth <= 2048 ) ) #else if( m_LargeCTU && ( ( int ) m_fQP < 38 ) && ( m_iGOPSize > 4 ) && m_bUsePerceptQPA && !m_bUseAdaptiveQP && ( m_iSourceHeight <= 1280 ) && ( m_iSourceWidth <= 2048 ) ) #endif -#else - if( false ) -#endif { msg( WARNING, "*************************************************************************\n" ); msg( WARNING, "* WARNING: QPA on with large CTU for <=HD sequences, limiting CTU size! *\n" ); @@ -1847,6 +1855,8 @@ bool EncAppCfg::parseCfg( int argc, char* argv[] ) if( ( 1u << m_quadtreeTULog2MaxSize ) > m_uiCTUSize ) m_quadtreeTULog2MaxSize--; if( ( 1u << m_tuLog2MaxSize ) > m_uiCTUSize ) m_tuLog2MaxSize--; } +#endif +#endif // ENABLE_QPA const int minCuSize = 1 << MIN_CU_LOG2; m_uiMaxCodingDepth = 0; diff --git a/source/Lib/CommonLib/Buffer.h b/source/Lib/CommonLib/Buffer.h index e16ab2d3f4a9b163cbbbe0f2b33bdb1c70ef416d..116f5c69fac4469eb69e6ec8ba755d133060ecdd 100644 --- a/source/Lib/CommonLib/Buffer.h +++ b/source/Lib/CommonLib/Buffer.h @@ -118,6 +118,7 @@ struct AreaBuf : public Size void removeHighFreq ( const AreaBuf<T>& other, const bool bClip, const ClpRng& clpRng); void updateHistogram ( std::vector<int32_t>& hist ) const; + T mean () const; T meanDiff ( const AreaBuf<const T> &other ) const; void subtract ( const T val ); @@ -546,6 +547,27 @@ void AreaBuf<T>::extendBorderPel( unsigned margin ) ::memcpy( p - ( y + 1 ) * s, p, sizeof( T ) * ( w + ( margin << 1 ) ) ); } } + +template<typename T> +T AreaBuf<T>::mean() const +{ + int64_t sum = 0; + + CHECK (area() == 0, "size of area is zero"); + + const T* src = buf; + +#define MEAN_INC src += stride +#define MEAN_OP(ADDR) sum += src[ADDR] + + SIZE_AWARE_PER_EL_OP(MEAN_OP, MEAN_INC); + +#undef MEAN_INC +#undef MEAN_OP + + return T ((sum + (area() >> 1)) / area()); +} + template<typename T> T AreaBuf<T>::meanDiff( const AreaBuf<const T> &other ) const { diff --git a/source/Lib/CommonLib/Picture.h b/source/Lib/CommonLib/Picture.h index 2e90cc5382746f8a7675372eceb892748202fa98..55756cdbed31462ec6fae7c98e966efb9c0b34b1 100644 --- a/source/Lib/CommonLib/Picture.h +++ b/source/Lib/CommonLib/Picture.h @@ -297,6 +297,9 @@ public: #if ENABLE_QPA std::vector<double> m_uEnerHpCtu; ///< CTU-wise L2 or squared L1 norm of high-passed luma input std::vector<Pel> m_iOffsetCtu; ///< CTU-wise DC offset (later QP index offset) of luma input + #if ENABLE_QPA_SUB_CTU + std::vector<int8_t> m_subCtuQP; ///< sub-CTU-wise adapted QPs for delta-QP depth of 1 or more + #endif #endif std::vector<SAOBlkParam> m_sao[2]; diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h index b60bfdd4e99d78688d73f37a552e30744313f8de..1d0452b39117b53d0fda1c2196cd0bb4e042e83a 100644 --- a/source/Lib/CommonLib/TypeDef.h +++ b/source/Lib/CommonLib/TypeDef.h @@ -224,7 +224,7 @@ #define SHARP_LUMA_DELTA_QP 1 ///< include non-normative LCU deltaQP and normative chromaQP change #define ER_CHROMA_QP_WCG_PPS 1 ///< Chroma QP model for WCG used in Anchor 3.2 #define ENABLE_QPA 1 ///< Non-normative perceptual QP adaptation according to JVET-H0047 and JVET-K0206. Deactivated by default, activated using encoder arguments --PerceptQPA=1 --SliceChromaQPOffsetPeriodicity=1 - +#define ENABLE_QPA_SUB_CTU ( 1 && ENABLE_QPA ) ///< when maximum delta-QP depth is greater than zero, use sub-CTU QPA #define RDOQ_CHROMA 1 ///< use of RDOQ in chroma diff --git a/source/Lib/EncoderLib/EncCu.cpp b/source/Lib/EncoderLib/EncCu.cpp index d34cdde6d78d357ad6ff06f4bf98053a5433ebe4..f74add71740e260dd27ea94d3353de02ddd5e820 100644 --- a/source/Lib/EncoderLib/EncCu.cpp +++ b/source/Lib/EncoderLib/EncCu.cpp @@ -363,7 +363,7 @@ void EncCu::compressCtu( CodingStructure& cs, const UnitArea& area, const unsign cs.useSubStructure( *bestCS, partitioner->chType, CS::getArea( *bestCS, area, partitioner->chType ), copyUnsplitCTUSignals, false, false, copyUnsplitCTUSignals ); cs.slice->copyMotionLUTs(bestMotCandLUTs, cs.slice->getMotionLUTs()); - if( !cs.pcv->ISingleTree && cs.slice->isIRAP() && cs.pcv->chrFormat != CHROMA_400 ) + if (CS::isDualITree (cs) && isChromaEnabled (cs.pcv->chrFormat)) { m_CABACEstimator->getCtx() = m_CurrCtx->start; @@ -639,7 +639,7 @@ void EncCu::xCompressCU( CodingStructure *&tempCS, CodingStructure *&bestCS, Par { EncTestMode currTestMode = m_modeCtrl->currTestMode(); - if (tempCS->pps->getUseDQP() && CS::isDualITree(*tempCS) && isChroma(partitioner.chType)) + if (pps.getUseDQP() && CS::isDualITree(*tempCS) && isChroma(partitioner.chType)) { const Position chromaCentral(tempCS->area.Cb().chromaPos().offset(tempCS->area.Cb().chromaSize().width >> 1, tempCS->area.Cb().chromaSize().height >> 1)); const Position lumaRefPos(chromaCentral.x << getComponentScaleX(COMPONENT_Cb, tempCS->area.chromaFormat), chromaCentral.y << getComponentScaleY(COMPONENT_Cb, tempCS->area.chromaFormat)); @@ -653,15 +653,24 @@ void EncCu::xCompressCU( CodingStructure *&tempCS, CodingStructure *&bestCS, Par } } +#if SHARP_LUMA_DELTA_QP || ENABLE_QPA_SUB_CTU + if (partitioner.currDepth <= pps.getMaxCuDQPDepth() && ( #if SHARP_LUMA_DELTA_QP - if( m_pcEncCfg->getLumaLevelToDeltaQPMapping().isEnabled() && partitioner.currDepth <= pps.getMaxCuDQPDepth() ) + (m_pcEncCfg->getLumaLevelToDeltaQPMapping().isEnabled()) || +#endif +#if ENABLE_QPA_SUB_CTU + (m_pcEncCfg->getUsePerceptQPA() && !m_pcEncCfg->getUseRateCtrl() && pps.getUseDQP()) +#else + false +#endif + )) { #if ENABLE_SPLIT_PARALLELISM CHECK( tempCS->picture->scheduler.getSplitJobId() > 0, "Changing lambda is only allowed in the master thread!" ); #endif if (currTestMode.qp >= 0) { - updateLambda(&slice, currTestMode.qp); + updateLambda (&slice, currTestMode.qp, CS::isDualITree (*tempCS) || (partitioner.currDepth == 0)); } } #endif @@ -773,8 +782,8 @@ void EncCu::xCompressCU( CodingStructure *&tempCS, CodingStructure *&bestCS, Par CHECK( bestCS->cost == MAX_DOUBLE , "No possible encoding found" ); } -#if SHARP_LUMA_DELTA_QP -void EncCu::updateLambda( Slice* slice, double dQP ) +#if SHARP_LUMA_DELTA_QP || ENABLE_QPA_SUB_CTU +void EncCu::updateLambda (Slice* slice, const int dQP, const bool updateRdCostLambda) { #if WCG_EXT int NumberBFrames = ( m_pcEncCfg->getGOPSize() - 1 ); @@ -838,14 +847,19 @@ void EncCu::updateLambda( Slice* slice, double dQP ) dLambda *= lambdaModifier; int qpBDoffset = slice->getSPS()->getQpBDOffset(CHANNEL_TYPE_LUMA); - int iQP = Clip3(-qpBDoffset, MAX_QP, (int)floor(dQP + 0.5)); + int iQP = Clip3(-qpBDoffset, MAX_QP, (int)floor((double)dQP + 0.5)); m_pcSliceEncoder->setUpLambda(slice, dLambda, iQP); #else - int iQP = (int)dQP; + int iQP = dQP; const double oldQP = (double)slice->getSliceQpBase(); +#if ENABLE_QPA_SUB_CTU + const double oldLambda = (m_pcEncCfg->getUsePerceptQPA() && !m_pcEncCfg->getUseRateCtrl() && slice->getPPS()->getUseDQP()) ? slice->getLambdas()[0] : + m_pcSliceEncoder->calculateLambda (slice, m_pcSliceEncoder->getGopId(), slice->getDepth(), oldQP, oldQP, iQP); +#else const double oldLambda = m_pcSliceEncoder->calculateLambda (slice, m_pcSliceEncoder->getGopId(), slice->getDepth(), oldQP, oldQP, iQP); - const double newLambda = oldLambda * pow (2.0, (dQP - oldQP) / 3.0); +#endif + const double newLambda = oldLambda * pow (2.0, ((double)dQP - oldQP) / 3.0); #if RDOQ_CHROMA_LAMBDA const double chromaLambda = newLambda / m_pcRdCost->getChromaWeight(); const double lambdaArray[MAX_NUM_COMPONENT] = {newLambda, chromaLambda, chromaLambda}; @@ -853,7 +867,7 @@ void EncCu::updateLambda( Slice* slice, double dQP ) #else m_pcTrQuant->setLambda (newLambda); #endif - m_pcRdCost->setLambda( newLambda, slice->getSPS()->getBitDepths() ); + if (updateRdCostLambda) m_pcRdCost->setLambda (newLambda, slice->getSPS()->getBitDepths()); #endif } #endif @@ -1051,7 +1065,12 @@ void EncCu::xCheckModeSplit(CodingStructure *&tempCS, CodingStructure *&bestCS, m_CABACEstimator->getCtx() = SubCtx( Ctx::SplitFlag, ctxStartSP ); m_CABACEstimator->getCtx() = SubCtx( Ctx::BTSplitFlag, ctxStartBT ); - if( cost > bestCS->cost ) + if( cost > bestCS->cost +#if ENABLE_QPA_SUB_CTU + || (m_pcEncCfg->getUsePerceptQPA() && !m_pcEncCfg->getUseRateCtrl() && pps.getUseDQP() && (pps.getMaxCuDQPDepth() > 0) && (split == CU_HORZ_SPLIT || split == CU_VERT_SPLIT) && + (partitioner.currArea().lwidth() == tempCS->pcv->maxCUWidth) && (partitioner.currArea().lheight() == tempCS->pcv->maxCUHeight)) // force quad-split or no split at CTU level +#endif + ) { xCheckBestMode( tempCS, bestCS, partitioner, encTestMode ); return; @@ -2833,10 +2852,15 @@ void EncCu::xCheckRDCostCPRModeMerge2Nx2N(CodingStructure *&tempCS, CodingStruct m_pcInterSearch->encodeResAndCalcRdInterCU(*tempCS, partitioner, (numResidualPass != 0), true, chroma); xEncodeDontSplit(*tempCS, partitioner); +#if ENABLE_QPA_SUB_CTU + xCheckDQP (*tempCS, partitioner); +#else + // this if-check is redundant if (tempCS->pps->getUseDQP() && (partitioner.currDepth) <= tempCS->pps->getMaxCuDQPDepth()) { xCheckDQP(*tempCS, partitioner); } +#endif hasResidual[emtCuFlag] = cu.rootCbf; emtCost[emtCuFlag] = tempCS->cost; @@ -2934,10 +2958,15 @@ void EncCu::xCheckRDCostCPRMode(CodingStructure *&tempCS, CodingStructure *&best } xEncodeDontSplit(*tempCS, partitioner); +#if ENABLE_QPA_SUB_CTU + xCheckDQP (*tempCS, partitioner); +#else + // this if-check is redundant if (tempCS->pps->getUseDQP() && (partitioner.currDepth) <= tempCS->pps->getMaxCuDQPDepth()) { xCheckDQP(*tempCS, partitioner); } +#endif DTRACE_MODE_COST(*tempCS, m_pcRdCost->getLambda()); xCheckBestMode(tempCS, bestCS, partitioner, encTestMode); diff --git a/source/Lib/EncoderLib/EncCu.h b/source/Lib/EncoderLib/EncCu.h index 4adbc26dce805e3710a1d1673de0025bd1f97415..22e50e7a8aaa90999ca4bfb34be4e85c34369aaa 100644 --- a/source/Lib/EncoderLib/EncCu.h +++ b/source/Lib/EncoderLib/EncCu.h @@ -127,8 +127,8 @@ private: #endif int m_bestGbiIdx[2]; double m_bestGbiCost[2]; -#if SHARP_LUMA_DELTA_QP - void updateLambda ( Slice* slice, double dQP ); +#if SHARP_LUMA_DELTA_QP || ENABLE_QPA_SUB_CTU + void updateLambda ( Slice* slice, const int dQP, const bool updateRdCostLambda ); #endif public: diff --git a/source/Lib/EncoderLib/EncGOP.cpp b/source/Lib/EncoderLib/EncGOP.cpp index 7377a0f2dd3c785ffcf78c8b7c153d4fe85f6d87..56da75e88d36164dc175e84f936577ccd3bdad2b 100644 --- a/source/Lib/EncoderLib/EncGOP.cpp +++ b/source/Lib/EncoderLib/EncGOP.cpp @@ -1980,8 +1980,16 @@ void EncGOP::compressGOP( int iPOCLast, int iNumPicRcvd, PicList& rcListPic, std::vector<OutputBitstream> substreamsOut(numSubstreams); #if ENABLE_QPA - pcPic->m_uEnerHpCtu.resize( numberOfCtusInFrame ); - pcPic->m_iOffsetCtu.resize( numberOfCtusInFrame ); + pcPic->m_uEnerHpCtu.resize (numberOfCtusInFrame); + pcPic->m_iOffsetCtu.resize (numberOfCtusInFrame); + #if ENABLE_QPA_SUB_CTU + if (pcSlice->getPPS()->getUseDQP() && pcSlice->getPPS()->getMaxCuDQPDepth() > 0) + { + const PreCalcValues &pcv = *pcPic->cs->pcv; + const unsigned mtsLog2 = (unsigned)g_aucLog2[std::min (pcPic->cs->sps->getMaxTrSize(), pcv.maxCUWidth)]; + pcPic->m_subCtuQP.resize ((pcv.maxCUWidth >> mtsLog2) * (pcv.maxCUHeight >> mtsLog2)); + } + #endif #endif if (pcSlice->getSPS()->getUseSAO()) { diff --git a/source/Lib/EncoderLib/EncModeCtrl.cpp b/source/Lib/EncoderLib/EncModeCtrl.cpp index eeee01f51e3c8c2262e37cb09b968b09afba6ebc..f5032831e5bdf4c74c8e47dabfe943fe56c5e7e4 100644 --- a/source/Lib/EncoderLib/EncModeCtrl.cpp +++ b/source/Lib/EncoderLib/EncModeCtrl.cpp @@ -151,19 +151,18 @@ void EncModeCtrl::xGetMinMaxQP( int& minQP, int& maxQP, const CodingStructure& c minQP = Clip3( -sps.getQpBDOffset( CHANNEL_TYPE_LUMA ), MAX_QP, baseQP - deltaQP ); maxQP = Clip3( -sps.getQpBDOffset( CHANNEL_TYPE_LUMA ), MAX_QP, baseQP + deltaQP ); } +#if ENABLE_QPA_SUB_CTU + else if (pps.getUseDQP() && pps.getMaxCuDQPDepth() > 0 && (!CS::isDualITree (cs) || isLuma (partitioner.chType))) + { + minQP = baseQP; + maxQP = baseQP; + } +#endif else { minQP = cs.currQP[partitioner.chType]; maxQP = cs.currQP[partitioner.chType]; } - -#if SHARP_LUMA_DELTA_QP - if( m_pcEncCfg->getLumaLevelToDeltaQPMapping().isEnabled() ) - { - minQP = Clip3( -sps.getQpBDOffset( CHANNEL_TYPE_LUMA ), MAX_QP, baseQP - m_lumaQPOffset ); - maxQP = minQP; // force encode choose the modified QO - } -#endif } else { @@ -173,7 +172,11 @@ void EncModeCtrl::xGetMinMaxQP( int& minQP, int& maxQP, const CodingStructure& c minQP = Clip3( -sps.getQpBDOffset( CHANNEL_TYPE_LUMA ), MAX_QP, baseQP - deltaQP ); maxQP = Clip3( -sps.getQpBDOffset( CHANNEL_TYPE_LUMA ), MAX_QP, baseQP + deltaQP ); } - else if( currDepth < pps.getMaxCuDQPDepth() ) + else if (currDepth < pps.getMaxCuDQPDepth() +#if ENABLE_QPA_SUB_CTU + || (pps.getUseDQP() && pps.getMaxCuDQPDepth() > 0 && (!CS::isDualITree (cs) || isLuma (partitioner.chType))) +#endif + ) { minQP = baseQP; maxQP = baseQP; @@ -183,15 +186,15 @@ void EncModeCtrl::xGetMinMaxQP( int& minQP, int& maxQP, const CodingStructure& c minQP = cs.currQP[partitioner.chType]; maxQP = cs.currQP[partitioner.chType]; } - + } #if SHARP_LUMA_DELTA_QP - if( m_pcEncCfg->getLumaLevelToDeltaQPMapping().isEnabled() ) - { - minQP = Clip3( -sps.getQpBDOffset( CHANNEL_TYPE_LUMA ), MAX_QP, baseQP - m_lumaQPOffset ); - maxQP = minQP; - } -#endif + + if (m_pcEncCfg->getLumaLevelToDeltaQPMapping().isEnabled() && (!CS::isDualITree (cs) || isLuma (partitioner.chType))) + { + minQP = Clip3 (-sps.getQpBDOffset (CHANNEL_TYPE_LUMA), MAX_QP, baseQP - m_lumaQPOffset); + maxQP = minQP; } +#endif } @@ -247,16 +250,8 @@ int EncModeCtrl::calculateLumaDQP( const CPelBuf& rcOrg ) CHECK( m_pcEncCfg->getLumaLevelToDeltaQPMapping().mode != LUMALVL_TO_DQP_AVG_METHOD, "invalid delta qp mode" ); #endif { - // Use avg method - int sum = 0; - for( uint32_t y = 0; y < rcOrg.height; y++ ) - { - for( uint32_t x = 0; x < rcOrg.width; x++ ) - { - sum += rcOrg.at( x, y ); - } - } - avg = ( double ) sum / rcOrg.area(); + // Use average luma value + avg = (double) rcOrg.mean(); } #if !WCG_EXT else @@ -870,6 +865,18 @@ void EncModeCtrlMTnoRQT::initCTUEncoding( const Slice &slice ) } } +#if ENABLE_QPA_SUB_CTU +static Position getMaxLumaDQPDepthPos (const CodingStructure &cs, const Partitioner &partitioner) +{ + if (partitioner.currDepth <= cs.pps->getMaxCuDQPDepth()) + { + return partitioner.currArea().lumaPos(); + } + const PartLevel splitAtMaxDepth = partitioner.getPartStack().at (cs.pps->getMaxCuDQPDepth()); + // the parent node of qtDepth + mttDepth == maxDqpDepth + return splitAtMaxDepth.parts[splitAtMaxDepth.idx].lumaPos(); +} +#endif void EncModeCtrlMTnoRQT::initCULevel( Partitioner &partitioner, const CodingStructure& cs ) { @@ -927,28 +934,38 @@ void EncModeCtrlMTnoRQT::initCULevel( Partitioner &partitioner, const CodingStru // QP int baseQP = cs.baseQP; - if( m_pcEncCfg->getUseAdaptiveQP() ) + if (!CS::isDualITree (cs) || isLuma (partitioner.chType)) { - if (!CS::isDualITree(cs) || isLuma(partitioner.chType)) + if (m_pcEncCfg->getUseAdaptiveQP()) { - baseQP = Clip3(-cs.sps->getQpBDOffset(CHANNEL_TYPE_LUMA), MAX_QP, baseQP + xComputeDQP(cs, partitioner)); + baseQP = Clip3 (-cs.sps->getQpBDOffset (CHANNEL_TYPE_LUMA), MAX_QP, baseQP + xComputeDQP (cs, partitioner)); } - } +#if ENABLE_QPA_SUB_CTU + else if (m_pcEncCfg->getUsePerceptQPA() && !m_pcEncCfg->getUseRateCtrl() && cs.pps->getUseDQP() && cs.pps->getMaxCuDQPDepth() > 0) + { + const PreCalcValues &pcv = *cs.pcv; - int minQP = baseQP; - int maxQP = baseQP; + if ((partitioner.currArea().lwidth() < pcv.maxCUWidth) && (partitioner.currArea().lheight() < pcv.maxCUHeight) && cs.picture) + { + const Position &pos = getMaxLumaDQPDepthPos (cs, partitioner); + const unsigned mtsLog2 = (unsigned)g_aucLog2[std::min (cs.sps->getMaxTrSize(), pcv.maxCUWidth)]; + const unsigned stride = pcv.maxCUWidth >> mtsLog2; + baseQP = cs.picture->m_subCtuQP[((pos.x & pcv.maxCUWidthMask) >> mtsLog2) + stride * ((pos.y & pcv.maxCUHeightMask) >> mtsLog2)]; + } + } +#endif #if SHARP_LUMA_DELTA_QP - if( m_pcEncCfg->getLumaLevelToDeltaQPMapping().isEnabled() ) - { - if( partitioner.currDepth <= cs.pps->getMaxCuDQPDepth() ) + if (m_pcEncCfg->getLumaLevelToDeltaQPMapping().isEnabled() && partitioner.currDepth <= cs.pps->getMaxCuDQPDepth()) { CompArea clipedArea = clipArea( cs.area.Y(), cs.picture->Y() ); // keep using the same m_QP_LUMA_OFFSET in the same CTU m_lumaQPOffset = calculateLumaDQP( cs.getOrgBuf( clipedArea ) ); } - } #endif + } + int minQP = baseQP; + int maxQP = baseQP; xGetMinMaxQP( minQP, maxQP, cs, partitioner, baseQP, *cs.sps, *cs.pps, true ); bool checkCpr = true; diff --git a/source/Lib/EncoderLib/EncSlice.cpp b/source/Lib/EncoderLib/EncSlice.cpp index acdebc0583c458926d47e4c70da4314e93864085..1fa2f008aaaf0a19e12e575e3be2ea5909869a9e 100644 --- a/source/Lib/EncoderLib/EncSlice.cpp +++ b/source/Lib/EncoderLib/EncSlice.cpp @@ -151,6 +151,11 @@ static inline int apprI3Log2 (const double d) // rounded 3*log2(d) return d < 1.5e-13 ? -128 : int (floor (3.0 * log (d) / log (2.0) + 0.5)); } +static inline int lumaDQPOffset (const uint32_t avgLumaValue, const int bitDepth) +{ + return (1 - int ((3 * uint64_t (avgLumaValue * avgLumaValue)) >> uint64_t (2 * bitDepth - 1))); +} + static void filterAndCalculateAverageEnergies (const Pel* pSrc, const int iSrcStride, double &hpEner, const int iHeight, const int iWidth, const uint32_t uBitDepth /* luma bit-depth (4-16) */) @@ -203,52 +208,111 @@ static double getAveragePictureEnergy (const CPelBuf picOrig, const uint32_t uBi } #endif +static int getGlaringColorQPOffset (Picture* const pcPic, const int ctuAddr, const uint32_t startAddr, const uint32_t boundingAddr, + const int bitDepth, uint32_t &avgLumaValue) +{ + const PreCalcValues& pcv = *pcPic->cs->pcv; + const ChromaFormat chrFmt = pcPic->chromaFormat; + const uint32_t chrWidth = pcv.maxCUWidth >> getChannelTypeScaleX (CH_C, chrFmt); + const uint32_t chrHeight = pcv.maxCUHeight >> getChannelTypeScaleY (CH_C, chrFmt); + const int midLevel = 1 << (bitDepth - 1); + int chrValue = MAX_INT; + avgLumaValue = (startAddr < boundingAddr) ? 0 : (uint32_t)pcPic->getOrigBuf().Y().mean(); + + if (ctuAddr >= 0) // luma + { + avgLumaValue = (uint32_t)pcPic->m_iOffsetCtu[ctuAddr]; + } + else if (startAddr < boundingAddr) + { + for (uint32_t ctuTsAddr = startAddr; ctuTsAddr < boundingAddr; ctuTsAddr++) + { +#if HEVC_TILES_WPP + const uint32_t ctuRsAddr = pcPic->tileMap->getCtuTsToRsAddrMap (ctuTsAddr); +#else + const uint32_t ctuRsAddr = ctuTsAddr; +#endif + + avgLumaValue += pcPic->m_iOffsetCtu[ctuRsAddr]; + } + avgLumaValue = (avgLumaValue + ((boundingAddr - startAddr) >> 1)) / (boundingAddr - startAddr); + } + + for (uint32_t comp = COMPONENT_Cb; comp < MAX_NUM_COMPONENT; comp++) + { + const ComponentID compID = (ComponentID)comp; + int avgCompValue; + + if (ctuAddr >= 0) // chroma + { + const CompArea chrArea = clipArea (CompArea (compID, chrFmt, Area ((ctuAddr % pcv.widthInCtus) * chrWidth, (ctuAddr / pcv.widthInCtus) * chrHeight, chrWidth, chrHeight)), pcPic->block (compID)); + + avgCompValue = pcPic->getOrigBuf (chrArea).mean(); + } + else avgCompValue = pcPic->getOrigBuf (pcPic->block (compID)).mean(); + + if (chrValue > avgCompValue) chrValue = avgCompValue; // minimum of the DC offsets + } + CHECK (chrValue < 0, "DC offset cannot be negative!"); + + chrValue = (int)avgLumaValue - chrValue; + + if (chrValue > midLevel) return apprI3Log2 (double (chrValue * chrValue) / double (midLevel * midLevel)); + + return 0; +} + static int applyQPAdaptationChroma (Picture* const pcPic, Slice* const pcSlice, EncCfg* const pcEncCfg, const int sliceQP) { + const int iBitDepth = pcSlice->getSPS()->getBitDepth (CHANNEL_TYPE_LUMA); // overall image bit-depth double hpEner[MAX_NUM_COMPONENT] = {0.0, 0.0, 0.0}; int optSliceChromaQpOffset[2] = {0, 0}; int savedLumaQP = -1; + uint32_t meanLuma = MAX_UINT; for (uint32_t comp = 0; comp < getNumberValidComponents (pcPic->chromaFormat); comp++) { const ComponentID compID = (ComponentID)comp; const CPelBuf picOrig = pcPic->getOrigBuf (pcPic->block (compID)); - filterAndCalculateAverageEnergies (picOrig.buf, picOrig.stride, hpEner[comp], picOrig.height, picOrig.width, - pcSlice->getSPS()->getBitDepth (toChannelType (compID)) - (isChroma (compID) ? 1 : 0)); + filterAndCalculateAverageEnergies (picOrig.buf, picOrig.stride, hpEner[comp], + picOrig.height, picOrig.width, iBitDepth - (isChroma (compID) ? 1 : 0)); if (isChroma (compID)) { const int adaptChromaQPOffset = 2.0 * hpEner[comp] <= hpEner[0] ? 0 : apprI3Log2 (2.0 * hpEner[comp] / hpEner[0]); - #if GLOBAL_AVERAGING - int averageAdaptedLumaQP = Clip3 (0, MAX_QP, sliceQP + apprI3Log2 (hpEner[0] / getAveragePictureEnergy (pcPic->getOrigBuf().Y(), pcSlice->getSPS()->getBitDepth (CH_L)))); - #else - int averageAdaptedLumaQP = Clip3 (0, MAX_QP, sliceQP); // mean slice QP - #endif - #if SHARP_LUMA_DELTA_QP - - // change mean picture QP index based on picture's average luma value (Sharp) - if (pcEncCfg->getLumaLevelToDeltaQPMapping().mode == LUMALVL_TO_DQP_NUM_MODES) + + if (savedLumaQP < 0) { - const CPelBuf picLuma = pcPic->getOrigBuf().Y(); - uint64_t uAvgLuma = 0; +#if GLOBAL_AVERAGING + int averageAdaptedLumaQP = Clip3 (0, MAX_QP, sliceQP + apprI3Log2 (hpEner[0] / getAveragePictureEnergy (pcPic->getOrigBuf().Y(), iBitDepth))); +#else + int averageAdaptedLumaQP = Clip3 (0, MAX_QP, sliceQP); // mean slice QP +#endif - for (SizeType y = 0; y < picLuma.height; y++) + averageAdaptedLumaQP += getGlaringColorQPOffset (pcPic, -1 /*ctuRsAddr*/, 0 /*startAddr*/, 0 /*boundingAddr*/, iBitDepth, meanLuma); + + if (averageAdaptedLumaQP > MAX_QP +#if SHARP_LUMA_DELTA_QP + && (pcEncCfg->getLumaLevelToDeltaQPMapping().mode != LUMALVL_TO_DQP_NUM_MODES) +#endif + ) averageAdaptedLumaQP = MAX_QP; +#if SHARP_LUMA_DELTA_QP + + // change mean picture QP index based on picture's average luma value (Sharp) + if (pcEncCfg->getLumaLevelToDeltaQPMapping().mode == LUMALVL_TO_DQP_NUM_MODES) { - for (SizeType x = 0; x < picLuma.width; x++) - { - uAvgLuma += (uint64_t)picLuma.at (x, y); - } + if (meanLuma == MAX_UINT) meanLuma = pcPic->getOrigBuf().Y().mean(); + + averageAdaptedLumaQP = Clip3 (0, MAX_QP, averageAdaptedLumaQP + lumaDQPOffset (meanLuma, iBitDepth)); } - uAvgLuma = (uAvgLuma + (picLuma.area() >> 1)) / picLuma.area(); +#endif - averageAdaptedLumaQP = Clip3 (0, MAX_QP, averageAdaptedLumaQP + 1 - int((3 * uAvgLuma * uAvgLuma) >> uint64_t (2 * pcSlice->getSPS()->getBitDepth (CH_L) - 1))); - } - #endif - const int lumaChromaMappingDQP = averageAdaptedLumaQP - getScaledChromaQP (averageAdaptedLumaQP, pcEncCfg->getChromaFormatIdc()); + savedLumaQP = averageAdaptedLumaQP; + } // savedLumaQP < 0 - optSliceChromaQpOffset[comp-1] = std::min (3 + lumaChromaMappingDQP, adaptChromaQPOffset + lumaChromaMappingDQP); + const int lumaChromaMappingDQP = savedLumaQP - getScaledChromaQP (savedLumaQP, pcEncCfg->getChromaFormatIdc()); - if (savedLumaQP < 0) savedLumaQP = averageAdaptedLumaQP; // save it for later + optSliceChromaQpOffset[comp-1] = std::min (3 + lumaChromaMappingDQP, adaptChromaQPOffset + lumaChromaMappingDQP); } } @@ -794,9 +858,9 @@ void EncSlice::resetQP( Picture* pic, int sliceQP, double lambda ) } #if ENABLE_QPA -static bool applyQPAdaptation (Picture* const pcPic, Slice* const pcSlice, const PreCalcValues& pcv, - const uint32_t startAddr, const uint32_t boundingAddr, const bool useSharpLumaDQP, - const double hpEnerAvg, const double hpEnerMax, const bool useFrameWiseQPA, const int previouslyAdaptedLumaQP = -1) +static bool applyQPAdaptation (Picture* const pcPic, Slice* const pcSlice, const PreCalcValues& pcv, + const uint32_t startAddr, const uint32_t boundingAddr, const bool useSharpLumaDQP, + const bool useFrameWiseQPA, const int previouslyAdaptedLumaQP = -1) { const int iBitDepth = pcSlice->getSPS()->getBitDepth (CHANNEL_TYPE_LUMA); const int iQPIndex = pcSlice->getSliceQp(); // initial QP index for current slice, used in following loops @@ -804,6 +868,35 @@ static bool applyQPAdaptation (Picture* const pcPic, Slice* const pcSlice, const TileMap& tileMap = *pcPic->tileMap; #endif bool sliceQPModified = false; + uint32_t meanLuma = MAX_UINT; + double hpEnerAvg = 0.0; + +#if GLOBAL_AVERAGING + if (!useFrameWiseQPA || previouslyAdaptedLumaQP < 0) // mean visual activity value and luma value in each CTU +#endif + { + for (uint32_t ctuTsAddr = startAddr; ctuTsAddr < boundingAddr; ctuTsAddr++) + { +#if HEVC_TILES_WPP + const uint32_t ctuRsAddr = tileMap.getCtuTsToRsAddrMap (ctuTsAddr); +#else + const uint32_t ctuRsAddr = ctuTsAddr; +#endif + const Position pos ((ctuRsAddr % pcv.widthInCtus) * pcv.maxCUWidth, (ctuRsAddr / pcv.widthInCtus) * pcv.maxCUHeight); + const CompArea ctuArea = clipArea (CompArea (COMPONENT_Y, pcPic->chromaFormat, Area (pos.x, pos.y, pcv.maxCUWidth, pcv.maxCUHeight)), pcPic->Y()); + const CompArea fltArea = clipArea (CompArea (COMPONENT_Y, pcPic->chromaFormat, Area (pos.x > 0 ? pos.x - 1 : 0, pos.y > 0 ? pos.y - 1 : 0, pcv.maxCUWidth + (pos.x > 0 ? 2 : 1), pcv.maxCUHeight + (pos.y > 0 ? 2 : 1))), pcPic->Y()); + const CPelBuf picOrig = pcPic->getOrigBuf (fltArea); + double hpEner = 0.0; + + filterAndCalculateAverageEnergies (picOrig.buf, picOrig.stride, hpEner, + picOrig.height, picOrig.width, iBitDepth); + hpEnerAvg += hpEner; + pcPic->m_uEnerHpCtu[ctuRsAddr] = hpEner; + pcPic->m_iOffsetCtu[ctuRsAddr] = pcPic->getOrigBuf (ctuArea).mean(); + } + + hpEnerAvg /= double (boundingAddr - startAddr); + } #if GLOBAL_AVERAGING const double hpEnerPic = 1.0 / getAveragePictureEnergy (pcPic->getOrigBuf().Y(), iBitDepth); // inverse, speed #else @@ -812,36 +905,40 @@ static bool applyQPAdaptation (Picture* const pcPic, Slice* const pcSlice, if (useFrameWiseQPA || (iQPIndex >= MAX_QP)) { - int iQPFixed; + int iQPFixed = (previouslyAdaptedLumaQP < 0) ? Clip3 (0, MAX_QP, iQPIndex + apprI3Log2 (hpEnerAvg * hpEnerPic)) : previouslyAdaptedLumaQP; - if (useFrameWiseQPA) + if (isChromaEnabled (pcPic->chromaFormat) && (iQPIndex < MAX_QP) && (previouslyAdaptedLumaQP < 0)) { - iQPFixed = (previouslyAdaptedLumaQP < 0) ? Clip3 (0, MAX_QP, iQPIndex + apprI3Log2 (hpEnerAvg * hpEnerPic)) : previouslyAdaptedLumaQP; // average-activity slice QP - } - else - { - iQPFixed = Clip3 (0, MAX_QP, iQPIndex + ((apprI3Log2 (hpEnerAvg * hpEnerPic) + apprI3Log2 (hpEnerMax * hpEnerPic) + 1) >> 1)); // adapted slice QP = (mean(QP) + max(QP)) / 2 + iQPFixed += getGlaringColorQPOffset (pcPic, -1 /*ctuRsAddr*/, startAddr, boundingAddr, iBitDepth, meanLuma); + + if (iQPFixed > MAX_QP +#if SHARP_LUMA_DELTA_QP + && !useSharpLumaDQP +#endif + ) iQPFixed = MAX_QP; } #if SHARP_LUMA_DELTA_QP // change new fixed QP based on average CTU luma value (Sharp) if (useSharpLumaDQP && (iQPIndex < MAX_QP) && (previouslyAdaptedLumaQP < 0)) { - uint64_t uAvgLuma = 0; - - for (uint32_t ctuTsAddr = startAddr; ctuTsAddr < boundingAddr; ctuTsAddr++) + if (meanLuma == MAX_UINT) // collect picture mean luma value { -#if HEVC_TILES_WPP - const uint32_t ctuRsAddr = tileMap.getCtuTsToRsAddrMap (ctuTsAddr); -#else - const uint32_t ctuRsAddr = ctuTsAddr; -#endif + meanLuma = 0; - uAvgLuma += (uint64_t)pcPic->m_iOffsetCtu[ctuRsAddr]; - } - uAvgLuma = (uAvgLuma + ((boundingAddr - startAddr) >> 1)) / (boundingAddr - startAddr); + for (uint32_t ctuTsAddr = startAddr; ctuTsAddr < boundingAddr; ctuTsAddr++) + { + #if HEVC_TILES_WPP + const uint32_t ctuRsAddr = tileMap.getCtuTsToRsAddrMap (ctuTsAddr); + #else + const uint32_t ctuRsAddr = ctuTsAddr; + #endif - iQPFixed = Clip3 (0, MAX_QP, iQPFixed + 1 - int((3 * uAvgLuma * uAvgLuma) >> uint64_t(2 * iBitDepth - 1))); + meanLuma += pcPic->m_iOffsetCtu[ctuRsAddr]; // CTU mean + } + meanLuma = (meanLuma + ((boundingAddr - startAddr) >> 1)) / (boundingAddr - startAddr); + } + iQPFixed = Clip3 (0, MAX_QP, iQPFixed + lumaDQPOffset (meanLuma, iBitDepth)); } #endif @@ -872,7 +969,7 @@ static bool applyQPAdaptation (Picture* const pcPic, Slice* const pcSlice, pcPic->m_iOffsetCtu[ctuRsAddr] = (Pel)iQPFixed; // fixed QPs } } - else + else // CTU-wise QPA { for (uint32_t ctuTsAddr = startAddr; ctuTsAddr < boundingAddr; ctuTsAddr++) { @@ -884,21 +981,30 @@ static bool applyQPAdaptation (Picture* const pcPic, Slice* const pcSlice, int iQPAdapt = Clip3 (0, MAX_QP, iQPIndex + apprI3Log2 (pcPic->m_uEnerHpCtu[ctuRsAddr] * hpEnerPic)); -#if SHARP_LUMA_DELTA_QP if (pcv.widthInCtus > 1) // try to enforce CTU SNR greater than zero dB -#else - if (!pcSlice->isIntra()) // try to enforce CTU SNR greater than zero dB -#endif { - const Pel dcOffset = pcPic->m_iOffsetCtu[ctuRsAddr]; + meanLuma = (uint32_t)pcPic->m_iOffsetCtu[ctuRsAddr]; + + if (isChromaEnabled (pcPic->chromaFormat)) + { + iQPAdapt += getGlaringColorQPOffset (pcPic, (int)ctuRsAddr, startAddr, boundingAddr, iBitDepth, meanLuma); + + if (iQPAdapt > MAX_QP +#if SHARP_LUMA_DELTA_QP + && !useSharpLumaDQP +#endif + ) iQPAdapt = MAX_QP; + CHECK (meanLuma != (uint32_t)pcPic->m_iOffsetCtu[ctuRsAddr], "luma DC offsets don't match"); + } #if SHARP_LUMA_DELTA_QP // change adaptive QP based on mean CTU luma value (Sharp) if (useSharpLumaDQP) { - const uint64_t uAvgLuma = (uint64_t)dcOffset; - - iQPAdapt = std::max (0, iQPAdapt + 1 - int((3 * uAvgLuma * uAvgLuma) >> uint64_t(2 * iBitDepth - 1))); + #if ENABLE_QPA_SUB_CTU + pcPic->m_uEnerHpCtu[ctuRsAddr] = (double)meanLuma; // for sub-CTU QPA + #endif + iQPAdapt = Clip3 (0, MAX_QP, iQPAdapt + lumaDQPOffset (meanLuma, iBitDepth)); } #endif @@ -915,7 +1021,7 @@ static bool applyQPAdaptation (Picture* const pcPic, Slice* const pcSlice, { for (SizeType w = 0; w < iSrcWidth; w++) { - uAbsDCless += (uint32_t)abs (pSrc[w] - dcOffset); + uAbsDCless += (uint32_t)abs (pSrc[w] - (Pel)meanLuma); } pSrc += iSrcStride; } @@ -937,15 +1043,15 @@ static bool applyQPAdaptation (Picture* const pcPic, Slice* const pcSlice, iQPAdapt = std::max (0, iQPAdapt + redVal); } -#if SHARP_LUMA_DELTA_QP - - if (iQPAdapt > MAX_QP) iQPAdapt = MAX_QP; -#endif } pcPic->m_iOffsetCtu[ctuRsAddr] = (Pel)iQPAdapt; // adapted QPs +#if ENABLE_QPA_SUB_CTU + if (pcv.widthInCtus > 1 && pcSlice->getPPS()->getMaxCuDQPDepth() == 0) // reduce local DQP rate peaks +#else if (pcv.widthInCtus > 1) // try to reduce local bitrate peaks via minimum smoothing of the adapted QPs +#endif { iQPAdapt = ctuRsAddr % pcv.widthInCtus; // horizontal offset if (iQPAdapt == 0) @@ -978,6 +1084,97 @@ static bool applyQPAdaptation (Picture* const pcPic, Slice* const pcSlice, return sliceQPModified; } + +#if ENABLE_QPA_SUB_CTU +static int applyQPAdaptationSubCtu (CodingStructure &cs, const UnitArea ctuArea, const uint32_t ctuAddr, const bool useSharpLumaDQP) +{ + const PreCalcValues &pcv = *cs.pcv; + const Picture *pcPic = cs.picture; + const int iBitDepth = cs.slice->getSPS()->getBitDepth (CHANNEL_TYPE_LUMA); // overall image bit-depth + const int adaptedCtuQP = pcPic ? pcPic->m_iOffsetCtu[ctuAddr] : cs.slice->getSliceQpBase(); + + if (!pcPic || cs.pps->getMaxCuDQPDepth() == 0) return adaptedCtuQP; + + for (unsigned addr = 0; addr < cs.picture->m_subCtuQP.size(); addr++) + { + cs.picture->m_subCtuQP[addr] = (int8_t)adaptedCtuQP; + } + if (cs.slice->getSliceQp() < MAX_QP && pcv.widthInCtus > 1) + { +#if SHARP_LUMA_DELTA_QP + const int lumaCtuDQP = useSharpLumaDQP ? lumaDQPOffset ((uint32_t)pcPic->m_uEnerHpCtu[ctuAddr], iBitDepth) : 0; +#endif + const unsigned mts = std::min (cs.sps->getMaxTrSize(), pcv.maxCUWidth); + const unsigned mtsLog2 = (unsigned)g_aucLog2[mts]; + const unsigned stride = pcv.maxCUWidth >> mtsLog2; + unsigned numAct = 0; // number of block activities + double sumAct = 0.0; // sum of all block activities + double subAct[16]; // individual block activities +#if SHARP_LUMA_DELTA_QP + uint32_t subMLV[16]; // individual mean luma values +#endif + + CHECK (mts * 4 < pcv.maxCUWidth || mts * 4 < pcv.maxCUHeight, "max. transform size is too small for given CTU size"); + + for (unsigned h = 0; h < (pcv.maxCUHeight >> mtsLog2); h++) + { + for (unsigned w = 0; w < stride; w++) + { + const unsigned addr = w + h * stride; + const PosType x = ctuArea.lx() + w * mts; + const PosType y = ctuArea.ly() + h * mts; + const CompArea fltArea = clipArea (CompArea (COMPONENT_Y, pcPic->chromaFormat, Area (x > 0 ? x - 1 : 0, y > 0 ? y - 1 : 0, mts + (x > 0 ? 2 : 1), mts + (y > 0 ? 2 : 1))), pcPic->Y()); + const CPelBuf picOrig = pcPic->getOrigBuf (fltArea); + + if (x >= pcPic->lwidth() || y >= pcPic->lheight()) + { + continue; + } + filterAndCalculateAverageEnergies (picOrig.buf, picOrig.stride, subAct[addr], + picOrig.height, picOrig.width, iBitDepth); + numAct++; + sumAct += subAct[addr]; +#if SHARP_LUMA_DELTA_QP + + if (useSharpLumaDQP) + { + const CompArea subArea = clipArea (CompArea (COMPONENT_Y, pcPic->chromaFormat, Area (x, y, mts, mts)), pcPic->Y()); + + subMLV[addr] = pcPic->getOrigBuf (subArea).mean(); + } +#endif + } + } + if (sumAct <= 0.0) return adaptedCtuQP; + + sumAct = double(numAct) / sumAct; // 1.0 / (average CTU activity) + + for (unsigned h = 0; h < (pcv.maxCUHeight >> mtsLog2); h++) + { + for (unsigned w = 0; w < stride; w++) + { + const unsigned addr = w + h * stride; + + if (ctuArea.lx() + w * mts >= pcPic->lwidth() || ctuArea.ly() + h * mts >= pcPic->lheight()) + { + continue; + } + cs.picture->m_subCtuQP[addr] = (int8_t)Clip3 (0, MAX_QP, adaptedCtuQP + apprI3Log2 (subAct[addr] * sumAct)); +#if SHARP_LUMA_DELTA_QP + + // change adapted QP based on mean sub-CTU luma value (Sharp) + if (useSharpLumaDQP) + { + cs.picture->m_subCtuQP[addr] = (int8_t)Clip3 (0, MAX_QP, (int)cs.picture->m_subCtuQP[addr] - lumaCtuDQP + lumaDQPOffset (subMLV[addr], iBitDepth)); + } +#endif + } + } + } + + return adaptedCtuQP; +} +#endif // ENABLE_QPA_SUB_CTU #endif // ENABLE_QPA // ==================================================================================================================== @@ -1261,12 +1458,9 @@ void EncSlice::compressSlice( Picture* pcPic, const bool bCompressEntireSlice, c CHECK( pcPic->m_prevQP[0] == std::numeric_limits<int>::max(), "Invalid previous QP" ); CodingStructure& cs = *pcPic->cs; -#if ENABLE_QPA || ENABLE_WPP_PARALLELISM - const PreCalcValues& pcv = *cs.pcv; - const uint32_t widthInCtus = pcv.widthInCtus; -#endif - - cs.slice = pcSlice; + cs.slice = pcSlice; + cs.pcv = pcSlice->getPPS()->pcv; + cs.fracBits = 0; if (startCtuTsAddr == 0) { @@ -1274,74 +1468,19 @@ void EncSlice::compressSlice( Picture* pcPic, const bool bCompressEntireSlice, c } #if ENABLE_QPA - double hpEnerMax = 1.0; - double hpEnerPic = 0.0; - int iSrcOffset; - - if (m_pcCfg->getUsePerceptQPA() && !m_pcCfg->getUseRateCtrl()) - { - for (uint32_t ctuTsAddr = startCtuTsAddr; ctuTsAddr < boundingCtuTsAddr; ctuTsAddr++) - { - #if HEVC_TILES_WPP - const uint32_t ctuRsAddr = tileMap.getCtuTsToRsAddrMap (ctuTsAddr); - #else - const uint32_t ctuRsAddr = ctuTsAddr; - #endif - const Position pos ((ctuRsAddr % widthInCtus) * pcv.maxCUWidth, (ctuRsAddr / widthInCtus) * pcv.maxCUHeight); - const CompArea subArea = clipArea (CompArea (COMPONENT_Y, pcPic->chromaFormat, Area (pos.x, pos.y, pcv.maxCUWidth, pcv.maxCUHeight)), pcPic->Y()); - const CompArea fltArea = clipArea (CompArea (COMPONENT_Y, pcPic->chromaFormat, Area (pos.x > 0 ? pos.x - 1 : 0, pos.y > 0 ? pos.y - 1 : 0, pcv.maxCUWidth + (pos.x > 0 ? 2 : 1), pcv.maxCUHeight + (pos.y > 0 ? 2 : 1))), pcPic->Y()); - const SizeType iSrcStride = pcPic->getOrigBuf (subArea).stride; - const Pel* pSrc = pcPic->getOrigBuf (subArea).buf; - const SizeType iSrcHeight = pcPic->getOrigBuf (subArea).height; - const SizeType iSrcWidth = pcPic->getOrigBuf (subArea).width; - const SizeType iFltHeight = pcPic->getOrigBuf (fltArea).height; - const SizeType iFltWidth = pcPic->getOrigBuf (fltArea).width; - double hpEner = 0.0; - - DTRACE_UPDATE (g_trace_ctx, std::make_pair ("ctu", ctuRsAddr)); - - // compute DC offset to be subtracted from luma values - iSrcOffset = 0; - for (SizeType h = 0; h < iSrcHeight; h++) - { - for (SizeType w = 0; w < iSrcWidth; w++) - { - iSrcOffset += pSrc[w]; - } - pSrc += iSrcStride; - } - CHECK (iSrcOffset < 0, "DC offset cannot be negative!"); - - int x = iSrcHeight * iSrcWidth; - iSrcOffset = (iSrcOffset + (x >> 1)) / x; // slow division - - filterAndCalculateAverageEnergies (pcPic->getOrigBuf (fltArea).buf, iSrcStride, - hpEner, iFltHeight, iFltWidth, - pcSlice->getSPS()->getBitDepth (CHANNEL_TYPE_LUMA)); - - if (hpEner > hpEnerMax) hpEnerMax = hpEner; - hpEnerPic += hpEner; - pcPic->m_uEnerHpCtu[ctuRsAddr] = hpEner; - pcPic->m_iOffsetCtu[ctuRsAddr] = (Pel)iSrcOffset; - } // end iteration over all CTUs in current slice - - } - if (m_pcCfg->getUsePerceptQPA() && !m_pcCfg->getUseRateCtrl() && (boundingCtuTsAddr > startCtuTsAddr)) { - const double hpEnerAvg = hpEnerPic / double(boundingCtuTsAddr - startCtuTsAddr); - - if (applyQPAdaptation (pcPic, pcSlice, pcv, startCtuTsAddr, boundingCtuTsAddr, m_pcCfg->getLumaLevelToDeltaQPMapping().mode == LUMALVL_TO_DQP_NUM_MODES, - hpEnerAvg, hpEnerMax, (m_pcCfg->getBaseQP() >= 38) || (m_pcCfg->getSourceWidth() <= 512 && m_pcCfg->getSourceHeight() <= 320), m_adaptedLumaQP)) + if (applyQPAdaptation (pcPic, pcSlice, *cs.pcv, startCtuTsAddr, boundingCtuTsAddr, m_pcCfg->getLumaLevelToDeltaQPMapping().mode == LUMALVL_TO_DQP_NUM_MODES, + (m_pcCfg->getBaseQP() >= 38) || (m_pcCfg->getSourceWidth() <= 512 && m_pcCfg->getSourceHeight() <= 320), m_adaptedLumaQP)) { m_CABACEstimator->initCtxModels (*pcSlice); - #if ENABLE_SPLIT_PARALLELISM || ENABLE_WPP_PARALLELISM +#if ENABLE_SPLIT_PARALLELISM || ENABLE_WPP_PARALLELISM for (int jId = 1; jId < m_pcLib->getNumCuEncStacks(); jId++) { CABACWriter* cw = m_pcLib->getCABACEncoder (jId)->getCABACEstimator (pcSlice->getSPS()); cw->initCtxModels (*pcSlice); } - #endif +#endif #if HEVC_DEPENDENT_SLICES if (!pcSlice->getDependentSliceSegmentFlag()) { @@ -1358,10 +1497,6 @@ void EncSlice::compressSlice( Picture* pcPic, const bool bCompressEntireSlice, c } #endif // ENABLE_QPA - cs.pcv = pcSlice->getPPS()->pcv; - cs.fracBits = 0; - - #if ENABLE_WPP_PARALLELISM bool bUseThreads = m_pcCfg->getNumWppThreads() > 1; if( bUseThreads ) @@ -1420,7 +1555,6 @@ void EncSlice::encodeCtus( Picture* pcPic, const bool bCompressEntireSlice, cons #endif #if ENABLE_QPA const int iQPIndex = pcSlice->getSliceQpBase(); - int iSrcOffset = 0; #endif #if ENABLE_WPP_PARALLELISM @@ -1515,7 +1649,7 @@ void EncSlice::encodeCtus( Picture* pcPic, const bool bCompressEntireSlice, cons #else #endif -#if RDOQ_CHROMA_LAMBDA && ENABLE_QPA +#if RDOQ_CHROMA_LAMBDA && ENABLE_QPA && !ENABLE_QPA_SUB_CTU double oldLambdaArray[MAX_NUM_COMPONENT] = {0.0}; #endif const double oldLambda = pRdCost->getLambda(); @@ -1561,9 +1695,14 @@ void EncSlice::encodeCtus( Picture* pcPic, const bool bCompressEntireSlice, cons #if ENABLE_QPA else if (pCfg->getUsePerceptQPA() && pcSlice->getPPS()->getUseDQP()) { - iSrcOffset = pcPic->m_iOffsetCtu[ctuRsAddr]; - const double newLambda = oldLambda * pow (2.0, double(iSrcOffset - iQPIndex) / 3.0); - pcPic->m_uEnerHpCtu[ctuRsAddr] = newLambda; +#if ENABLE_QPA_SUB_CTU + const int adaptedQP = applyQPAdaptationSubCtu (cs, ctuArea, ctuRsAddr, m_pcCfg->getLumaLevelToDeltaQPMapping().mode == LUMALVL_TO_DQP_NUM_MODES); +#else + const int adaptedQP = pcPic->m_iOffsetCtu[ctuRsAddr]; +#endif + const double newLambda = pcSlice->getLambdas()[0] * pow (2.0, double (adaptedQP - iQPIndex) / 3.0); + pcPic->m_uEnerHpCtu[ctuRsAddr] = newLambda; // for ALF and SAO +#if !ENABLE_QPA_SUB_CTU #if RDOQ_CHROMA_LAMBDA pTrQuant->getLambdas (oldLambdaArray); // save the old lambdas const double chromaLambda = newLambda / pRdCost->getChromaWeight(); @@ -1573,7 +1712,8 @@ void EncSlice::encodeCtus( Picture* pcPic, const bool bCompressEntireSlice, cons pTrQuant->setLambda (newLambda); #endif pRdCost->setLambda (newLambda, pcSlice->getSPS()->getBitDepths()); - currQP[0] = currQP[1] = iSrcOffset; +#endif + currQP[0] = currQP[1] = adaptedQP; } #endif @@ -1684,7 +1824,7 @@ void EncSlice::encodeCtus( Picture* pcPic, const bool bCompressEntireSlice, cons pRateCtrl->getRCPic()->updateAfterCTU( pRateCtrl->getRCPic()->getLCUCoded(), actualBits, actualQP, actualLambda, pcSlice->isIRAP() ? 0 : pCfg->getLCULevelRC() ); } -#if ENABLE_QPA +#if ENABLE_QPA && !ENABLE_QPA_SUB_CTU else if (pCfg->getUsePerceptQPA() && pcSlice->getPPS()->getUseDQP()) { #if RDOQ_CHROMA_LAMBDA