Commit 46e8a1d5 authored by Adam Wieckowski

fixed cu parallel encoding

parent 26981a4f
......@@ -81,7 +81,7 @@
#define FIX_DB_MAX_TRANSFORM_SIZE 1
#define JVET_M0908_CIIP_DB 1
#define JVET_M0471_LONG_DEBLOCKING_FILTERS 1
#define JVET_M0427_INLOOP_RESHAPER 1
#define JVET_M0427_INLOOP_RESHAPER 0
#define JVET_M0470 1 // Fixed GR/TU+EG-k transition point, use limited prefix length for escape codes
#define JVET_M0253_HASH_ME 1
......@@ -185,7 +185,7 @@ typedef std::pair<int, int> TrCost;
#define ENABLE_JVET_L0283_MRL 1 // 1: Enable MRL, 0: Disable MRL
#define JVET_L0090_PAIR_AVG 1 // Add pairwise average candidates, replace HEVC combined candidates
#define REUSE_CU_RESULTS 1
#define REUSE_CU_RESULTS 0
#if REUSE_CU_RESULTS && JVET_M0102_INTRA_SUBPARTITIONS
#define REUSE_CU_RESULTS_WITH_MULTIPLE_TUS 1
#define MAX_NUM_TUS 4
......@@ -221,7 +221,7 @@ typedef std::pair<int, int> TrCost;
#endif
#ifndef ENABLE_SPLIT_PARALLELISM
#define ENABLE_SPLIT_PARALLELISM 0
#define ENABLE_SPLIT_PARALLELISM 1
#endif
#if ENABLE_SPLIT_PARALLELISM
#define PARL_SPLIT_MAX_NUM_JOBS 6 // number of parallel jobs that can be defined and need memory allocated
......
......@@ -324,13 +324,30 @@ void EncCu::compressCtu( CodingStructure& cs, const UnitArea& area, const unsign
{
EncCu* jobEncCu = m_pcEncLib->getCuEncoder( cs.picture->scheduler.getSplitDataId( jId ) );
CacheBlkInfoCtrl* cacheCtrl = dynamic_cast< CacheBlkInfoCtrl* >( jobEncCu->m_modeCtrl );
#if REUSE_CU_RESULTS
BestEncInfoCache* bestCache = dynamic_cast< BestEncInfoCache* >( jobEncCu->m_modeCtrl );
#endif
SaveLoadEncInfoSbt *sbtCache = dynamic_cast< SaveLoadEncInfoSbt* >( jobEncCu->m_modeCtrl );
if( cacheCtrl )
{
cacheCtrl->init( *cs.slice );
}
#if REUSE_CU_RESULTS
if (bestCache)
{
bestCache->init(*cs.slice);
}
#endif
if (sbtCache)
{
sbtCache->init(*cs.slice);
}
}
}
#if REUSE_CU_RESULTS
if( auto* cacheCtrl = dynamic_cast<BestEncInfoCache*>( m_modeCtrl ) ) { cacheCtrl->tick(); }
#endif
if( auto* cacheCtrl = dynamic_cast<CacheBlkInfoCtrl*>( m_modeCtrl ) ) { cacheCtrl->tick(); }
#endif
// init the partitioning manager
......@@ -658,6 +675,16 @@ void EncCu::xCompressCU( CodingStructure *&tempCS, CodingStructure *&bestCS, Par
int maxSLSize = sps.getUseSBT() ? tempCS->slice->getSPS()->getMaxSbtSize() : EMT_INTER_MAX_CU_WITH_QTBT;
#endif
slsSbt->resetSaveloadSbt( maxSLSize );
#if ENABLE_SPLIT_PARALLELISM
if (m_pcEncCfg->getNumSplitThreads() > 1)
{
for (int jId = 1; jId < NUM_RESERVERD_SPLIT_JOBS; jId++)
{
auto slsSbt = dynamic_cast<SaveLoadEncInfoSbt *>(m_pcEncLib->getCuEncoder(jId)->m_modeCtrl);
slsSbt->resetSaveloadSbt(maxSLSize);
}
}
#endif
}
m_sbtCostSave[0] = m_sbtCostSave[1] = MAX_DOUBLE;
#endif
......@@ -1010,12 +1037,14 @@ void EncCu::xCompressCUParallel( CodingStructure *&tempCS, CodingStructure *&bes
const int wppTId = picture->scheduler.getWppThreadId();
#endif
const bool doParallel = !m_pcEncCfg->getForceSingleSplitThread();
#if 1
#if _MSC_VER && ENABLE_WPP_PARALLELISM
#pragma omp parallel for schedule(dynamic,1) num_threads(NUM_SPLIT_THREADS_IF_MSVC) if(doParallel)
#else
omp_set_num_threads( m_pcEncCfg->getNumSplitThreads() );
#pragma omp parallel for schedule(dynamic,1) if(doParallel)
#endif
#endif
for( int jId = 1; jId <= numJobs; jId++ )
{
......@@ -1029,6 +1058,9 @@ void EncCu::xCompressCUParallel( CodingStructure *&tempCS, CodingStructure *&bes
Partitioner* jobPartitioner = PartitionerFactory::get( *tempCS->slice );
EncCu* jobCuEnc = m_pcEncLib->getCuEncoder( picture->scheduler.getSplitDataId( jId ) );
auto* jobBlkCache = dynamic_cast<CacheBlkInfoCtrl*>( jobCuEnc->m_modeCtrl );
#if REUSE_CU_RESULTS
auto* jobBestCache = dynamic_cast<BestEncInfoCache*>( jobCuEnc->m_modeCtrl );
#endif
jobPartitioner->copyState( partitioner );
jobCuEnc ->copyState( this, *jobPartitioner, currArea, true );
......@@ -1038,6 +1070,13 @@ void EncCu::xCompressCUParallel( CodingStructure *&tempCS, CodingStructure *&bes
jobBlkCache->tick();
}
#if REUSE_CU_RESULTS
if( jobBestCache )
{
jobBestCache->tick();
}
#endif
CodingStructure *&jobBest = jobCuEnc->m_pBestCS[wIdx][hIdx];
CodingStructure *&jobTemp = jobCuEnc->m_pTempCS[wIdx][hIdx];
......@@ -1094,7 +1133,22 @@ void EncCu::xCompressCUParallel( CodingStructure *&tempCS, CodingStructure *&bes
blkCache->tick();
}
#if REUSE_CU_RESULTS
if( auto *blkCache = dynamic_cast<BestEncInfoCache*>( m_modeCtrl ) )
{
for( int jId = 1; jId <= numJobs; jId++ )
{
if( !jobUsed[jId] || jId == bestJId ) continue;
auto *jobBlkCache = dynamic_cast<BestEncInfoCache*>( m_pcEncLib->getCuEncoder( picture->scheduler.getSplitDataId( jId ) )->m_modeCtrl );
CHECK( !jobBlkCache, "If own mode controller has blk info cache capability so should all other mode controllers!" );
blkCache->BestEncInfoCache::copyState( *jobBlkCache, partitioner.currArea() );
}
blkCache->tick();
}
#endif
}
void EncCu::copyState( EncCu* other, Partitioner& partitioner, const UnitArea& currArea, const bool isDist )
......@@ -1110,7 +1164,7 @@ void EncCu::copyState( EncCu* other, Partitioner& partitioner, const UnitArea& c
else
{
CodingStructure* dst = m_pBestCS[wIdx][hIdx];
const CodingStructure *src = other->m_pBestCS[wIdx][hIdx];
const CodingStructure* src = other->m_pBestCS[wIdx][hIdx];
bool keepResi = KEEP_PRED_AND_RESI_SIGNALS;
dst->useSubStructure( *src, partitioner.chType, currArea, KEEP_PRED_AND_RESI_SIGNALS, true, keepResi, keepResi );
......@@ -1129,6 +1183,7 @@ void EncCu::copyState( EncCu* other, Partitioner& partitioner, const UnitArea& c
m_modeCtrl ->copyState( *other->m_modeCtrl, partitioner.currArea() );
m_pcRdCost ->copyState( *other->m_pcRdCost );
m_pcTrQuant ->copyState( *other->m_pcTrQuant );
//m_pcReshape ->copyState( *other->m_pcReshape );
m_CABACEstimator->getCtx() = other->m_CABACEstimator->getCtx();
}
......
......@@ -147,7 +147,16 @@ void EncLib::create ()
#if JVET_M0427_INLOOP_RESHAPER
if (m_lumaReshapeEnable)
{
#if ENABLE_SPLIT_PARALLELISM || ENABLE_WPP_PARALLELISM
m_cReshaper = new EncReshape[m_numCuEncStacks];
for (int jId = 0; jId < m_numCuEncStacks; jId++)
{
m_cReshaper[jId].createEnc(getSourceWidth(), getSourceHeight(), m_maxCUWidth, m_maxCUHeight, m_bitDepth[COMPONENT_Y]);
}
#else
m_cReshaper.createEnc( getSourceWidth(), getSourceHeight(), m_maxCUWidth, m_maxCUHeight, m_bitDepth[COMPONENT_Y]);
#endif
}
#endif
if ( m_RCEnableRateControl )
......@@ -180,8 +189,15 @@ void EncLib::destroy ()
m_cLoopFilter. destroy();
m_cRateCtrl. destroy();
#if JVET_M0427_INLOOP_RESHAPER
#if ENABLE_SPLIT_PARALLELISM || ENABLE_WPP_PARALLELISM
for (int jId = 0; jId < m_numCuEncStacks; jId++)
{
m_cReshaper[jId]. destroy();
}
#else
m_cReshaper. destroy();
#endif
#endif
#if ENABLE_SPLIT_PARALLELISM || ENABLE_WPP_PARALLELISM
for( int jId = 0; jId < m_numCuEncStacks; jId++ )
{
......@@ -315,13 +331,22 @@ void EncLib::init( bool isFieldCoding, AUWriterIf* auWriterIf )
&m_cTrQuant[jId],
&m_cRdCost[jId],
cabacEstimator,
getCtxCache( jId ), m_maxCUWidth, m_maxCUHeight, m_maxTotalCUDepth );
getCtxCache( jId ), m_maxCUWidth, m_maxCUHeight, m_maxTotalCUDepth
#if JVET_M0427_INLOOP_RESHAPER
, &m_cReshaper[jId]
#endif
);
m_cInterSearch[jId].init( this,
&m_cTrQuant[jId],
m_iSearchRange,
m_bipredSearchRange,
m_motionEstimationSearchMethod,
m_maxCUWidth, m_maxCUHeight, m_maxTotalCUDepth, &m_cRdCost[jId], cabacEstimator, getCtxCache( jId ) );
getUseCompositeRef(),
m_maxCUWidth, m_maxCUHeight, m_maxTotalCUDepth, &m_cRdCost[jId], cabacEstimator, getCtxCache( jId )
#if JVET_M0427_INLOOP_RESHAPER
, &m_cReshaper[jId]
#endif
);
// link temporary buffers from intra search with inter search to avoid unnecessary memory overhead
m_cInterSearch[jId].setTempBuffers( m_cIntraSearch[jId].getSplitCSBuf(), m_cIntraSearch[jId].getFullCSBuf(), m_cIntraSearch[jId].getSaveCSBuf() );
......
......@@ -102,7 +102,11 @@ private:
#endif
#if JVET_M0427_INLOOP_RESHAPER
#if ENABLE_SPLIT_PARALLELISM || ENABLE_WPP_PARALLELISM
EncReshape *m_cReshaper; ///< reshaper class
#else
EncReshape m_cReshaper; ///< reshaper class
#endif
#endif
// processing unit
......@@ -242,7 +246,11 @@ public:
#endif
#if JVET_M0427_INLOOP_RESHAPER
#if ENABLE_SPLIT_PARALLELISM || ENABLE_WPP_PARALLELISM
EncReshape* getReshaper( int jId = 0 ) { return &m_cReshaper[jId]; }
#else
EncReshape* getReshaper() { return &m_cReshaper; }
#endif
#endif
// -------------------------------------------------------------------------------------------------------------------
// encoder function
......
......@@ -621,6 +621,13 @@ bool SaveLoadEncInfoSbt::saveBestSbt( const UnitArea& area, const uint32_t curPu
return true;
}
#if ENABLE_SPLIT_PARALLELISM
void SaveLoadEncInfoSbt::copyState(const SaveLoadEncInfoSbt &other)
{
m_sliceSbt = other.m_sliceSbt;
}
#endif
void SaveLoadEncInfoSbt::resetSaveloadSbt( int maxSbtSize )
{
int numSizeIdx = gp_sizeIdxInfo->idxFrom( maxSbtSize ) - MIN_CU_LOG2 + 1;
......@@ -917,6 +924,10 @@ void BestEncInfoCache::init( const Slice &slice )
}
}
}
#if ENABLE_SPLIT_PARALLELISM
m_currTemporalId = 0;
#endif
}
bool BestEncInfoCache::setFromCs( const CodingStructure& cs, const Partitioner& partitioner )
......@@ -1062,6 +1073,70 @@ bool BestEncInfoCache::setCsFrom( CodingStructure& cs, EncTestMode& testMode, co
return true;
}
#if ENABLE_SPLIT_PARALLELISM
void BestEncInfoCache::copyState(const BestEncInfoCache &other, const UnitArea &area)
{
m_slice_bencinf = other.m_slice_bencinf;
m_currTemporalId = other.m_currTemporalId;
if( m_slice_bencinf->isIntra() ) return;
const int cuSizeMask = m_slice_bencinf->getSPS()->getMaxCUWidth() - 1;
const int minPosX = ( area.lx() & cuSizeMask ) >> MIN_CU_LOG2;
const int minPosY = ( area.ly() & cuSizeMask ) >> MIN_CU_LOG2;
const int maxPosX = ( area.Y().bottomRight().x & cuSizeMask ) >> MIN_CU_LOG2;
const int maxPosY = ( area.Y().bottomRight().y & cuSizeMask ) >> MIN_CU_LOG2;
for( unsigned x = minPosX; x <= maxPosX; x++ )
{
for( unsigned y = minPosY; y <= maxPosY; y++ )
{
for( int wIdx = 0; wIdx < gp_sizeIdxInfo->numWidths(); wIdx++ )
{
const int width = gp_sizeIdxInfo->sizeFrom( wIdx );
if( m_bestEncInfo[x][y][wIdx] && width <= area.lwidth() && x + ( width >> MIN_CU_LOG2 ) <= ( maxPosX + 1 ) )
{
for( int hIdx = 0; hIdx < gp_sizeIdxInfo->numHeights(); hIdx++ )
{
const int height = gp_sizeIdxInfo->sizeFrom( hIdx );
if( gp_sizeIdxInfo->isCuSize( height ) && height <= area.lheight() && y + ( height >> MIN_CU_LOG2 ) <= ( maxPosY + 1 ) )
{
if( other.m_bestEncInfo[x][y][wIdx][hIdx]->temporalId > m_bestEncInfo[x][y][wIdx][hIdx]->temporalId )
{
memcpy( m_bestEncInfo[x][y][wIdx][hIdx], other.m_bestEncInfo[x][y][wIdx][hIdx], sizeof( BestEncodingInfo ) );
m_bestEncInfo[x][y][wIdx][hIdx]->temporalId = m_currTemporalId;
}
}
else if( y + ( height >> MIN_CU_LOG2 ) > maxPosY + 1 )
{
break;;
}
}
}
else if( x + ( width >> MIN_CU_LOG2 ) > maxPosX + 1 )
{
break;
}
}
}
}
}
// Stamps the cache entry belonging to the luma block of `area` with the
// current temporal id (m_currTemporalId).
//
// \param area  region whose luma block selects the entry via getAreaIdx()
void BestEncInfoCache::touch( const UnitArea &area )
{
  unsigned xIdx, yIdx, wIdx, hIdx;
  getAreaIdx( area.Y(), *m_slice_bencinf->getPPS()->pcv, xIdx, yIdx, wIdx, hIdx );

  m_bestEncInfo[xIdx][yIdx][wIdx][hIdx]->temporalId = m_currTemporalId;
}
#endif
#endif
static bool interHadActive( const ComprCUCtx& ctx )
......@@ -1961,7 +2036,10 @@ bool EncModeCtrlMTnoRQT::tryMode( const EncTestMode& encTestmode, const CodingSt
relatedCU.isIntra = true;
}
#if ENABLE_SPLIT_PARALLELISM
touch( partitioner.currArea() );
#if REUSE_CU_RESULTS
BestEncInfoCache::touch(partitioner.currArea());
#endif
CacheBlkInfoCtrl::touch(partitioner.currArea());
#endif
cuECtx.set( IS_BEST_NOSPLIT_SKIP, bestCU->skip );
}
......@@ -2109,6 +2187,10 @@ void EncModeCtrlMTnoRQT::copyState( const EncModeCtrl& other, const UnitArea& ar
this->EncModeCtrl ::copyState( *pOther, area );
this->CacheBlkInfoCtrl ::copyState( *pOther, area );
#if REUSE_CU_RESULTS
this->BestEncInfoCache ::copyState( *pOther, area );
#endif
this->SaveLoadEncInfoSbt ::copyState( *pOther );
m_skipThreshold = pOther->m_skipThreshold;
}
......
......@@ -369,7 +369,13 @@ struct SaveLoadStructSbt
class SaveLoadEncInfoSbt
{
protected:
#if ENABLE_SPLIT_PARALLELISM
public:
#endif
void init( const Slice &slice );
#if ENABLE_SPLIT_PARALLELISM
protected:
#endif
void create();
void destroy();
......@@ -382,6 +388,9 @@ public:
void resetSaveloadSbt( int maxSbtSize );
uint16_t findBestSbt( const UnitArea& area, const uint32_t curPuSse );
bool saveBestSbt( const UnitArea& area, const uint32_t curPuSse, const uint8_t curPuSbt, const uint8_t curPuTrs );
#if ENABLE_SPLIT_PARALLELISM
void copyState(const SaveLoadEncInfoSbt& other);
#endif
};
#endif
......@@ -467,6 +476,10 @@ struct BestEncodingInfo
EncTestMode testMode;
int poc;
#if ENABLE_SPLIT_PARALLELISM
int64_t temporalId;
#endif
};
class BestEncInfoCache
......@@ -480,23 +493,31 @@ private:
Pel *m_pPcmBuf;
CodingStructure m_dummyCS;
XUCache m_dummyCache;
#if ENABLE_SPLIT_PARALLELISM
int64_t m_currTemporalId;
#endif
protected:
void create ( const ChromaFormat chFmt );
void destroy ();
void init ( const Slice &slice );
bool setFromCs( const CodingStructure& cs, const Partitioner& partitioner );
bool isValid ( const CodingStructure& cs, const Partitioner& partitioner, int qp );
// TODO: implement copyState
bool isValid ( const CodingStructure &cs, const Partitioner &partitioner, int qp );
#if ENABLE_SPLIT_PARALLELISM
void touch ( const UnitArea& area );
#endif
public:
BestEncInfoCache() : m_slice_bencinf( nullptr ), m_dummyCS( m_dummyCache.cuCache, m_dummyCache.puCache, m_dummyCache.tuCache ) {}
virtual ~BestEncInfoCache() {}
#if ENABLE_SPLIT_PARALLELISM
void copyState( const BestEncInfoCache &other, const UnitArea &area );
void tick () { m_currTemporalId++; CHECK( m_currTemporalId <= 0, "Problem with integer overflow!" ); }
#endif
void init ( const Slice &slice );
bool setCsFrom( CodingStructure& cs, EncTestMode& testMode, const Partitioner& partitioner ) const;
};
......
......@@ -1267,6 +1267,43 @@ void EncReshape::constructReshaperSDR()
updateChromaScaleLUT();
}
#if ENABLE_SPLIT_PARALLELISM || ENABLE_WPP_PARALLELISM
void EncReshape::copyState(const EncReshape &other)
{
m_srcReshaped = other.m_srcReshaped;
m_picWidth = other.m_picWidth;
m_picHeight = other.m_picHeight;
m_maxCUWidth = other.m_maxCUWidth;
m_maxCUHeight = other.m_maxCUHeight;
m_widthInCtus = other.m_widthInCtus;
m_heightInCtus = other.m_heightInCtus;
m_numCtuInFrame = other.m_numCtuInFrame;
m_exceedSTD = other.m_exceedSTD;
m_binImportance = other.m_binImportance;
m_tcase = other.m_tcase;
m_rateAdpMode = other.m_rateAdpMode;
m_useAdpCW = other.m_useAdpCW;
m_initCWAnalyze = other.m_initCWAnalyze;
m_dQPModel = other.m_dQPModel;
m_reshapeCW = other.m_reshapeCW;
memcpy( m_cwLumaWeight, other.m_cwLumaWeight, sizeof( m_cwLumaWeight ) );
m_chromaWeight = other.m_chromaWeight;
m_chromaAdj = other.m_chromaAdj;
m_sliceReshapeInfo = other.m_sliceReshapeInfo;
m_CTUFlag = other.m_CTUFlag;
m_recReshaped = other.m_recReshaped;
m_invLUT = other.m_invLUT;
m_fwdLUT = other.m_fwdLUT;
m_chromaAdjHelpLUT = other.m_chromaAdjHelpLUT;
m_binCW = other.m_binCW;
m_initCW = other.m_initCW;
m_reshape = other.m_reshape;
m_reshapePivot = other.m_reshapePivot;
m_lumaBD = other.m_lumaBD;
m_reshapeLUTSize = other.m_reshapeLUTSize;
}
#endif
#endif
//
//! \}
......@@ -99,6 +99,10 @@ public:
ReshapeCW * getReshapeCW() { return &m_reshapeCW; }
Pel * getWeightTable() { return m_cwLumaWeight; }
double getCWeight() { return m_chromaWeight; }
#if ENABLE_SPLIT_PARALLELISM || ENABLE_WPP_PARALLELISM
void copyState(const EncReshape& other);
#endif
};// END CLASS DEFINITION EncReshape
//! \}
......
......@@ -175,11 +175,6 @@ void InterSearch::setTempBuffers( CodingStructure ****pSplitCS, CodingStructure
#if ENABLE_SPLIT_PARALLELISM
// Copies search state from `other` into this InterSearch instance.
// m_integerMv2Nx2N is copied only when the encoder configuration reports QTBT as
// disabled; m_aaiAdaptSR is always copied. Both copies are bytewise over the full
// size of the destination member.
void InterSearch::copyState( const InterSearch& other )
{
// the integer-MV table is transferred only for the non-QTBT configuration
if( !m_pcEncCfg->getQTBT() )
{
memcpy( m_integerMv2Nx2N, other.m_integerMv2Nx2N, sizeof( m_integerMv2Nx2N ) );
}
// unconditional copy of m_aaiAdaptSR
memcpy( m_aaiAdaptSR, other.m_aaiAdaptSR, sizeof( m_aaiAdaptSR ) );
}
#endif
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment