    /* The copyright in this software is being made available under the BSD
     * License, included below. This software may be subject to other third party
     * and contributor rights, including patent rights, and no such rights are
     * granted under this license.
     *
     * Copyright (c) 2010-2018, ITU/ISO/IEC
     * All rights reserved.
     *
     * Redistribution and use in source and binary forms, with or without
     * modification, are permitted provided that the following conditions are met:
     *
     *  * Redistributions of source code must retain the above copyright notice,
     *    this list of conditions and the following disclaimer.
     *  * Redistributions in binary form must reproduce the above copyright notice,
     *    this list of conditions and the following disclaimer in the documentation
     *    and/or other materials provided with the distribution.
     *  * Neither the name of the ITU/ISO/IEC nor the names of its contributors may
     *    be used to endorse or promote products derived from this software without
     *    specific prior written permission.
     *
     * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
     * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
     * THE POSSIBILITY OF SUCH DAMAGE.
     */
    
    /** \file     EncCu.cpp
        \brief    Coding Unit (CU) encoder class
    */
    
    #include "EncCu.h"
    
    #include "EncLib.h"
    #include "Analyze.h"
    #include "AQp.h"
    
    #include "CommonLib/dtrace_codingstruct.h"
    #include "CommonLib/Picture.h"
    #include "CommonLib/UnitTools.h"
    
    
    #include "CommonLib/dtrace_buffer.h"
    
    #include <stdio.h>
    #include <cmath>
    #include <algorithm>
    #if ENABLE_WPP_PARALLELISM
    #include <mutex>
    extern std::recursive_mutex g_cache_mutex;
    #endif
    
    
    
    //! \ingroup EncoderLib
    //! \{
    
    // ====================================================================================================================
    // Constructor / destructor / create / destroy
    // ====================================================================================================================
    
    /** Allocates every working buffer used by the CU encoder.
     *
     *  For each (width index, height index) pair reported by gp_sizeIdxInfo a temporary and a best
     *  CodingStructure are created (and, with JVET_L0266_HMVP, three motion-candidate LUTs), but only
     *  when both sizes are valid CU sizes and - without QTBT - the block is square. All remaining table
     *  entries are set to nullptr. The mode controller, the merge prediction buffers and the context
     *  buffer stack are set up as well.
     *
     *  \param encCfg encoder configuration; dereferenced unconditionally, so it must not be null
     */
    void EncCu::create( EncCfg* encCfg )
    {
      const unsigned      uiMaxWidth    = encCfg->getMaxCUWidth();
      const unsigned      uiMaxHeight   = encCfg->getMaxCUHeight();
      const ChromaFormat  chromaFormat  = encCfg->getChromaFormatIdc();
      const bool          BTnoRQT       = encCfg->getQTBT();

      const unsigned      numWidths     = gp_sizeIdxInfo->numWidths();
      const unsigned      numHeights    = gp_sizeIdxInfo->numHeights();
      const unsigned      maxMEPart     = BTnoRQT ? 1 : NUMBER_OF_PART_SIZES;

      m_pTempCS = new CodingStructure**  [numWidths];
      m_pBestCS = new CodingStructure**  [numWidths];

    #if JVET_L0266_HMVP
      m_pTempMotLUTs      = new LutMotionCand**[numWidths];
      m_pBestMotLUTs      = new LutMotionCand**[numWidths];
      m_pSplitTempMotLUTs = new LutMotionCand**[numWidths];
    #endif

      for( unsigned w = 0; w < numWidths; w++ )
      {
        m_pTempCS[w] = new CodingStructure*  [numHeights];
        m_pBestCS[w] = new CodingStructure*  [numHeights];

    #if JVET_L0266_HMVP
        m_pTempMotLUTs[w]      = new LutMotionCand*[numHeights];
        m_pBestMotLUTs[w]      = new LutMotionCand*[numHeights];
        m_pSplitTempMotLUTs[w] = new LutMotionCand*[numHeights];
    #endif

        for( unsigned h = 0; h < numHeights; h++ )
        {
          const unsigned width  = gp_sizeIdxInfo->sizeFrom( w );
          const unsigned height = gp_sizeIdxInfo->sizeFrom( h );

          if( ( BTnoRQT || w == h ) && gp_sizeIdxInfo->isCuSize( width ) && gp_sizeIdxInfo->isCuSize( height ) )
          {
            m_pTempCS[w][h] = new CodingStructure( m_unitCache.cuCache, m_unitCache.puCache, m_unitCache.tuCache );
            m_pBestCS[w][h] = new CodingStructure( m_unitCache.cuCache, m_unitCache.puCache, m_unitCache.tuCache );

            m_pTempCS[w][h]->create( chromaFormat, Area( 0, 0, width, height ), false );
            m_pBestCS[w][h]->create( chromaFormat, Area( 0, 0, width, height ), false );

    #if JVET_L0266_HMVP
            // each LUT is a single object (scalar new) owning an array of MAX_NUM_HMVP_CANDS candidates
            m_pTempMotLUTs[w][h]      = new LutMotionCand;
            m_pBestMotLUTs[w][h]      = new LutMotionCand;
            m_pSplitTempMotLUTs[w][h] = new LutMotionCand;

            m_pSplitTempMotLUTs[w][h]->currCnt    = 0;
            m_pSplitTempMotLUTs[w][h]->motionCand = new MotionInfo[MAX_NUM_HMVP_CANDS];

            m_pTempMotLUTs[w][h]->currCnt    = 0;
            m_pTempMotLUTs[w][h]->motionCand = new MotionInfo[MAX_NUM_HMVP_CANDS];

            m_pBestMotLUTs[w][h]->currCnt    = 0;
            m_pBestMotLUTs[w][h]->motionCand = new MotionInfo[MAX_NUM_HMVP_CANDS];
    #endif
          }
          else
          {
            m_pTempCS[w][h] = nullptr;
            m_pBestCS[w][h] = nullptr;

    #if JVET_L0266_HMVP
            m_pTempMotLUTs[w][h]      = nullptr;
            m_pBestMotLUTs[w][h]      = nullptr;
            m_pSplitTempMotLUTs[w][h] = nullptr;
    #endif
          }
        }
      }

      // IMV buffers: only the width==height case is relevant without QTBT
      m_pImvTempCS = nullptr;

      if( IMV_OFF != encCfg->getIMV() && !BTnoRQT )
      {
        m_pImvTempCS = new CodingStructure**[numWidths];

        for( unsigned w = 0; w < numWidths; w++ )
        {
          // square blocks only, hence the same size index for both dimensions
          const unsigned width  = gp_sizeIdxInfo->sizeFrom( w );
          const unsigned height = gp_sizeIdxInfo->sizeFrom( w );

          m_pImvTempCS[w] = new CodingStructure*[maxMEPart];

          for( unsigned p = 0; p < maxMEPart; p++ )
          {
            if( gp_sizeIdxInfo->isCuSize( width ) )
            {
              m_pImvTempCS[w][p] = new CodingStructure( m_unitCache.cuCache, m_unitCache.puCache, m_unitCache.tuCache );
              m_pImvTempCS[w][p]->create( chromaFormat, Area( 0, 0, width, height ), false );
            }
            else
            {
              m_pImvTempCS[w][p] = nullptr;
            }
          }
        }
      }

      m_cuChromaQpOffsetIdxPlus1 = 0;

      // one context-buffer entry per level of the stack sized below
      const unsigned maxDepth = numWidths + numHeights;

      if( BTnoRQT )
      {
        m_modeCtrl = new EncModeCtrlMTnoRQT();
      }
      else
      {
        THROW( "Unknown partitioner!" );
      }

    #if REUSE_CU_RESULTS
      m_modeCtrl->create( *encCfg );

    #endif

    #if JVET_L0054_MMVD
      for (unsigned ui = 0; ui < MMVD_MRG_MAX_RD_BUF_NUM; ui++)
    #else
      for( unsigned ui = 0; ui < MRG_MAX_NUM_CANDS; ui++ )
    #endif
      {
        m_acMergeBuffer[ui].create( chromaFormat, Area( 0, 0, uiMaxWidth, uiMaxHeight ) );
      }

    #if JVET_L0100_MULTI_HYPOTHESIS_INTRA && JVET_L0054_MMVD
      for (unsigned ui = 0; ui < MRG_MAX_NUM_CANDS; ui++)
      {
        m_acRealMergeBuffer[ui].create(chromaFormat, Area(0, 0, uiMaxWidth, uiMaxHeight));
      }
    #endif

      m_CtxBuffer.resize( maxDepth );
      m_CurrCtx = nullptr;
    }
    
    
    void EncCu::destroy()
    {
      bool          BTnoRQT   = m_pcEncCfg->getQTBT();
      unsigned      maxMEPart = BTnoRQT ? 1 : NUMBER_OF_PART_SIZES;
    
      unsigned numWidths  = gp_sizeIdxInfo->numWidths();
      unsigned numHeights = gp_sizeIdxInfo->numHeights();
    
      for( unsigned w = 0; w < numWidths; w++ )
      {
        for( unsigned h = 0; h < numHeights; h++ )
        {
          if( BTnoRQT || w == h )
          {
            if( m_pBestCS[w][h] ) m_pBestCS[w][h]->destroy();
            if( m_pTempCS[w][h] ) m_pTempCS[w][h]->destroy();
    
            delete m_pBestCS[w][h];
            delete m_pTempCS[w][h];
    
    #if JVET_L0266_HMVP
            if (m_pTempMotLUTs[w][h])
            {
    
    Li's avatar
    Li committed
              delete[] m_pTempMotLUTs[w][h]->motionCand;
              m_pTempMotLUTs[w][h]->motionCand = nullptr;
    
              delete[] m_pTempMotLUTs[w][h];
            }
            if (m_pBestMotLUTs[w][h])
            {
    
    Li's avatar
    Li committed
              delete[] m_pBestMotLUTs[w][h]->motionCand;
              m_pBestMotLUTs[w][h]->motionCand = nullptr;
    
              delete[] m_pBestMotLUTs[w][h];
            }
    
            if (m_pSplitTempMotLUTs[w][h])
            {
    
    Li's avatar
    Li committed
              delete[] m_pSplitTempMotLUTs[w][h]->motionCand;
              m_pSplitTempMotLUTs[w][h]->motionCand = nullptr;
    
              delete[] m_pSplitTempMotLUTs[w][h];
            }
    #endif
    
    #if JVET_L0266_HMVP
        delete[] m_pBestMotLUTs[w];
        delete[] m_pTempMotLUTs[w];
        delete[] m_pSplitTempMotLUTs[w];
    #endif
    
      }
    
      delete[] m_pBestCS; m_pBestCS = nullptr;
      delete[] m_pTempCS; m_pTempCS = nullptr;
    
    #if JVET_L0266_HMVP
      delete[] m_pSplitTempMotLUTs; m_pSplitTempMotLUTs = nullptr;
      delete[] m_pBestMotLUTs; m_pBestMotLUTs = nullptr;
      delete[] m_pTempMotLUTs; m_pTempMotLUTs = nullptr;
    #endif
    
    
    #if REUSE_CU_RESULTS
      m_modeCtrl->destroy();
    
    #endif
      delete m_modeCtrl;
      m_modeCtrl = nullptr;
    
      // WIA: only the weight==height case is relevant without QTBT
      if( m_pImvTempCS )
      {
        for( unsigned w = 0; w < numWidths; w++ )
        {
          for( unsigned p = 0; p < maxMEPart; p++ )
          {
            if( m_pImvTempCS[w][p] ) m_pImvTempCS[w][p]->destroy();
            delete m_pImvTempCS[w][p];
          }
          delete[] m_pImvTempCS[w];
        }
    
        delete[] m_pImvTempCS;
        m_pImvTempCS = nullptr;
      }
    
    
    #if JVET_L0054_MMVD
      for (unsigned ui = 0; ui < MMVD_MRG_MAX_RD_BUF_NUM; ui++)
    #else
    
      for( unsigned ui = 0; ui < MRG_MAX_NUM_CANDS; ui++ )
    
    #if JVET_L0100_MULTI_HYPOTHESIS_INTRA && JVET_L0054_MMVD
    
      for (unsigned ui = 0; ui < MRG_MAX_NUM_CANDS; ui++)
      {
        m_acRealMergeBuffer[ui].destroy();
      }
    #endif
    
    }
    
    
    
    /// Destructor; performs no work of its own. Buffers are released by an explicit call to destroy().
    EncCu::~EncCu() = default;
    
    
    
    /** Connects this CU encoder to the tools owned by the encoder library and resets per-picture state.
     *
     *  \param    pcEncLib      pointer of encoder class; also stored as the encoder configuration
     *  \param    sps           sequence parameter set passed on when fetching the CABAC estimator
     *
     *  When PARL_PARAM expands its argument, an additional tId is forwarded to the per-tool getters.
     */
    void EncCu::init( EncLib* pcEncLib, const SPS& sps PARL_PARAM( const int tId ) )
    {
      EncLib& encLib = *pcEncLib;

      // tool handles
      m_pcEncCfg       = pcEncLib;
      m_pcIntraSearch  = encLib.getIntraSearch( PARL_PARAM0( tId ) );
      m_pcInterSearch  = encLib.getInterSearch( PARL_PARAM0( tId ) );
      m_pcTrQuant      = encLib.getTrQuant( PARL_PARAM0( tId ) );
      m_pcRdCost       = encLib.getRdCost ( PARL_PARAM0( tId ) );

      // entropy estimation
      m_CABACEstimator = encLib.getCABACEncoder( PARL_PARAM0( tId ) )->getCABACEstimator( &sps );
      m_CABACEstimator->setEncCu(this);
      m_CtxCache       = encLib.getCtxCache( PARL_PARAM0( tId ) );

      // shared (not per-thread) components
      m_pcRateCtrl     = encLib.getRateCtrl();
      m_pcSliceEncoder = encLib.getSliceEncoder();
    #if ENABLE_SPLIT_PARALLELISM || ENABLE_WPP_PARALLELISM
      m_pcEncLib       = pcEncLib;
      m_dataId         = tId;
    #endif

    #if REUSE_CU_RESULTS
      DecCu::init( m_pcTrQuant, m_pcIntraSearch, m_pcInterSearch );

    #endif
      // the mode controller must be initialised before it is handed to the inter search
      m_modeCtrl->init( m_pcEncCfg, m_pcRateCtrl, m_pcRdCost );
      m_pcInterSearch->setModeCtrl( m_modeCtrl );

      // clear the sub-merge block statistics and the picture-tracking state
      ::memset(m_subMergeBlkSize, 0, sizeof(m_subMergeBlkSize));
      ::memset(m_subMergeBlkNum, 0, sizeof(m_subMergeBlkNum));
      m_prevPOC             = MAX_UINT;
      m_clearSubMergeStatic = false;
    }
    
    // ====================================================================================================================
    // Public member functions
    // ====================================================================================================================
    
    /** Runs the mode decision for one CTU and copies the winning result into the top-level coding structure.
     *
     *  The luma channel type (CH_L) is compressed first. If the picture does not use a single tree,
     *  the slice is an IRAP slice and the chroma format is not 4:0:0, a second pass is run for CH_C.
     *
     *  \param cs         top-level coding structure receiving the result
     *  \param area       area of the CTU
     *  \param ctuRsAddr  CTU address, used only to store the rate-control MSE
     *  \param prevQP     previous QP, indexed by channel type
     *  \param currQP     current QP, indexed by channel type
     */
    void EncCu::compressCtu( CodingStructure& cs, const UnitArea& area, const unsigned ctuRsAddr, const int prevQP[], const int currQP[] )
    {
      m_modeCtrl->initCTUEncoding( *cs.slice );

    #if ENABLE_SPLIT_PARALLELISM
      if( m_pcEncCfg->getNumSplitThreads() > 1 )
      {
        for( int jId = 1; jId < NUM_RESERVERD_SPLIT_JOBS; jId++ )
        {
          EncCu*            jobEncCu  = m_pcEncLib->getCuEncoder( cs.picture->scheduler.getSplitDataId( jId ) );
          CacheBlkInfoCtrl* cacheCtrl = dynamic_cast< CacheBlkInfoCtrl* >( jobEncCu->m_modeCtrl );
          if( cacheCtrl )
          {
            cacheCtrl->init( *cs.slice );
          }
        }
      }

      if( auto* cacheCtrl = dynamic_cast<CacheBlkInfoCtrl*>( m_modeCtrl ) ) { cacheCtrl->tick(); }
    #endif
      // init the partitioning manager
      Partitioner *partitioner = PartitionerFactory::get( *cs.slice );
      partitioner->initCtu( area, CH_L, *cs.slice );
      // init current context pointer
      m_CurrCtx = m_CtxBuffer.data();

      // the working buffers are selected by the size indices of the CTU's luma dimensions
      const unsigned wIdx = gp_sizeIdxInfo->idxFrom( area.lumaSize().width );
      const unsigned hIdx = gp_sizeIdxInfo->idxFrom( area.lumaSize().height );

      CodingStructure *tempCS = m_pTempCS[wIdx][hIdx];
      CodingStructure *bestCS = m_pBestCS[wIdx][hIdx];

    #if JVET_L0266_HMVP
      LutMotionCand *tempMotCandLUTs = m_pTempMotLUTs[wIdx][hIdx];
      LutMotionCand *bestMotCandLUTs = m_pBestMotLUTs[wIdx][hIdx];

      // seed both working LUTs from the slice's LUT
      cs.slice->copyMotionLUTs(cs.slice->getMotionLUTs(), tempMotCandLUTs);
      cs.slice->copyMotionLUTs(cs.slice->getMotionLUTs(), bestMotCandLUTs);
    #endif

      cs.initSubStructure( *tempCS, partitioner->chType, partitioner->currArea(), false );
      cs.initSubStructure( *bestCS, partitioner->chType, partitioner->currArea(), false );
      tempCS->currQP[CH_L] = bestCS->currQP[CH_L] =
      tempCS->baseQP       = bestCS->baseQP       = currQP[CH_L];
      tempCS->prevQP[CH_L] = bestCS->prevQP[CH_L] = prevQP[CH_L];

      xCompressCU( tempCS, bestCS, *partitioner
    #if JVET_L0266_HMVP
        , tempMotCandLUTs
        , bestMotCandLUTs
    #endif
      );

      // all signals were already copied during compression if the CTU was split - at this point only the structures are copied to the top level CS
      const bool copyUnsplitCTUSignals = bestCS->cus.size() == 1 && KEEP_PRED_AND_RESI_SIGNALS;
      cs.useSubStructure( *bestCS, partitioner->chType, CS::getArea( *bestCS, area, partitioner->chType ), copyUnsplitCTUSignals, false, false, copyUnsplitCTUSignals );

    #if JVET_L0266_HMVP
      // publish the LUT of the winning partitioning back to the slice
      cs.slice->copyMotionLUTs(bestMotCandLUTs, cs.slice->getMotionLUTs());
    #endif

      if( !cs.pcv->ISingleTree && cs.slice->isIRAP() && cs.pcv->chrFormat != CHROMA_400 )
      {
        // separate chroma pass: restart from the context state saved at the CTU start
        m_CABACEstimator->getCtx() = m_CurrCtx->start;

        partitioner->initCtu( area, CH_C, *cs.slice );

        cs.initSubStructure( *tempCS, partitioner->chType, partitioner->currArea(), false );
        cs.initSubStructure( *bestCS, partitioner->chType, partitioner->currArea(), false );
        tempCS->currQP[CH_C] = bestCS->currQP[CH_C] =
        tempCS->baseQP       = bestCS->baseQP       = currQP[CH_C];
        tempCS->prevQP[CH_C] = bestCS->prevQP[CH_C] = prevQP[CH_C];

        xCompressCU( tempCS, bestCS, *partitioner
    #if JVET_L0266_HMVP
          , tempMotCandLUTs
          , bestMotCandLUTs
    #endif
        );

        const bool copyUnsplitChromaSignals = bestCS->cus.size() == 1 && KEEP_PRED_AND_RESI_SIGNALS;
        cs.useSubStructure( *bestCS, partitioner->chType, CS::getArea( *bestCS, area, partitioner->chType ), copyUnsplitChromaSignals, false, false, copyUnsplitChromaSignals );
      }

      if (m_pcEncCfg->getUseRateCtrl())
      {
        (m_pcRateCtrl->getRCPic()->getLCU(ctuRsAddr)).m_actualMSE = (double)bestCS->dist / (double)m_pcRateCtrl->getRCPic()->getLCU(ctuRsAddr).m_numberOfPixel;
      }

      // reset context states and uninit context pointer
      m_CABACEstimator->getCtx() = m_CurrCtx->start;
      m_CurrCtx                  = nullptr;
      delete partitioner;

    #if ENABLE_SPLIT_PARALLELISM && ENABLE_WPP_PARALLELISM
      if( m_pcEncCfg->getNumSplitThreads() > 1 && m_pcEncCfg->getNumWppThreads() > 1 )
      {
        cs.picture->finishCtuPart( area );
      }
    #endif

      // Ensure that a coding was found
      // Selected mode's RD-cost must be not MAX_DOUBLE.
      CHECK( bestCS->cus.empty()                                   , "No possible encoding found" );
      CHECK( bestCS->cus[0]->partSize == NUMBER_OF_PART_SIZES      , "No possible encoding found" );
      CHECK( bestCS->cus[0]->predMode == NUMBER_OF_PREDICTION_MODES, "No possible encoding found" );
      CHECK( bestCS->cost             == MAX_DOUBLE                , "No possible encoding found" );
    }
    
    // ====================================================================================================================
    // Protected member functions
    // ====================================================================================================================
    
    /** Sum of absolute 8x8 Hadamard coefficients of a block of original samples, with the DC term excluded.
     *
     *  \param piOrg      pointer to the top-left sample of the 8x8 block
     *  \param iStrideOrg offset, in samples, from one row to the next
     *  \return           ( sum of |coefficient| over the block - |DC coefficient| + 2 ) >> 2
     */
    static int xCalcHADs8x8_ISlice(const Pel *piOrg, const int iStrideOrg)
    {
      int coef[8][8];

      // fetch the block row by row
      for (int row = 0; row < 8; row++, piOrg += iStrideOrg)
      {
        for (int col = 0; col < 8; col++)
        {
          coef[row][col] = piOrg[col];
        }
      }

      // horizontal: three in-place butterfly stages per row (distance 4, then 2, then 1)
      for (int row = 0; row < 8; row++)
      {
        for (int span = 4; span >= 1; span >>= 1)
        {
          for (int base = 0; base < 8; base += 2 * span)
          {
            for (int k = 0; k < span; k++)
            {
              const int a = coef[row][base + k];
              const int b = coef[row][base + k + span];
              coef[row][base + k]        = a + b;
              coef[row][base + k + span] = a - b;
            }
          }
        }
      }

      // vertical: the same three stages applied down each column
      for (int col = 0; col < 8; col++)
      {
        for (int span = 4; span >= 1; span >>= 1)
        {
          for (int base = 0; base < 8; base += 2 * span)
          {
            for (int k = 0; k < span; k++)
            {
              const int a = coef[base + k][col];
              const int b = coef[base + k + span][col];
              coef[base + k][col]        = a + b;
              coef[base + k + span][col] = a - b;
            }
          }
        }
      }

      int sumAbs = 0;
      for (int row = 0; row < 8; row++)
      {
        for (int col = 0; col < 8; col++)
        {
          sumAbs += abs(coef[row][col]);
        }
      }

      // drop the DC contribution, then divide by four with rounding
      sumAbs -= abs(coef[0][0]);
      return (sumAbs + 2) >> 2;
    }
    
    int  EncCu::updateCtuDataISlice(const CPelBuf buf)
    {
      int  xBl, yBl;
      const int iBlkSize = 8;
      const Pel* pOrgInit = buf.buf;
      int  iStrideOrig = buf.stride;
    
      int iSumHad = 0;
      for( yBl = 0; ( yBl + iBlkSize ) <= buf.height; yBl += iBlkSize )
      {
        for( xBl = 0; ( xBl + iBlkSize ) <= buf.width; xBl += iBlkSize )
        {
          const Pel* pOrg = pOrgInit + iStrideOrig*yBl + xBl;
          iSumHad += xCalcHADs8x8_ISlice( pOrg, iStrideOrig );
        }
      }
      return( iSumHad );
    }
    
    
    /** Decides whether the candidate in tempCS replaces the current best result.
     *
     *  If tempCS holds at least one CU and the mode controller accepts the result, tempCS and bestCS
     *  are swapped and the current CABAC context is stored as the best context. The estimator's
     *  context is always reset to the CU start state before returning.
     *
     *  \param tempCS       candidate coding structure (swapped with bestCS on acceptance)
     *  \param bestCS       best coding structure so far
     *  \param partitioner  current partitioner state, forwarded to the mode controller
     *  \param encTestMode  the mode that produced tempCS
     *  \return             with JVET_L0266_HMVP: true if bestCS was replaced; otherwise nothing
     */
    #if JVET_L0266_HMVP
    bool EncCu::xCheckBestMode( CodingStructure *&tempCS, CodingStructure *&bestCS, Partitioner &partitioner, const EncTestMode& encTestMode )
    #else
    void EncCu::xCheckBestMode( CodingStructure *&tempCS, CodingStructure *&bestCS, Partitioner &partitioner, const EncTestMode& encTestMode )
    #endif
    {
    #if JVET_L0266_HMVP
      bool bestCSUpdated = false;
    #endif

      if( !tempCS->cus.empty() )
      {
        if( tempCS->cus.size() == 1 )
        {
          const CodingUnit& cu = *tempCS->cus.front();
          CHECK( cu.skip && !cu.firstPU->mergeFlag, "Skip flag without a merge flag is not allowed!" );
        }

    #if WCG_EXT
        DTRACE_BEST_MODE( tempCS, bestCS, m_pcRdCost->getLambda( true ) );
    #else
        DTRACE_BEST_MODE( tempCS, bestCS, m_pcRdCost->getLambda() );
    #endif

        if( m_modeCtrl->useModeResult( encTestMode, tempCS, partitioner ) )
        {
          if( tempCS->cus.size() == 1 )
          {
            // if tempCS is not a split-mode
            CodingUnit &cu = *tempCS->cus.front();

            if( CU::isLosslessCoded( cu ) && !cu.ipcm )
            {
              xFillPCMBuffer( cu );
            }
          }

          std::swap( tempCS, bestCS );
          // store temp best CI for next CU coding
          m_CurrCtx->best = m_CABACEstimator->getCtx();

    #if JVET_L0266_HMVP
          bestCSUpdated = true;
    #endif
        }
      }

      // reset context states
      m_CABACEstimator->getCtx() = m_CurrCtx->start;

    #if JVET_L0266_HMVP
      return bestCSUpdated;
    #endif
    }
    
    /** Tests every mode offered by the mode controller for the current partitioner area and keeps the best.
     *
     *  Each test mode is dispatched to the matching xCheck* routine. After the loop the best CABAC
     *  context is restored, the last CU's QP is stored as prevQP, the reconstruction of bestCS is
     *  copied into the picture and (with JVET_L0266_HMVP) the best motion LUT is updated when the
     *  result is a single inter CU covering the whole area.
     *
     *  \param tempCS           scratch coding structure for the mode under test
     *  \param bestCS           best coding structure found so far
     *  \param partitioner      current partitioning state
     *  \param tempMotCandLUTs  (JVET_L0266_HMVP) motion LUT copied into the slice before testing non-intra slices
     *  \param bestMotCandLUTs  (JVET_L0266_HMVP) motion LUT belonging to the best result
     */
    void EncCu::xCompressCU( CodingStructure *&tempCS, CodingStructure *&bestCS, Partitioner &partitioner
    #if JVET_L0266_HMVP
      , LutMotionCand *&tempMotCandLUTs
      , LutMotionCand *&bestMotCandLUTs
    #endif
    )
    {
    #if ENABLE_SPLIT_PARALLELISM
      CHECK( m_dataId != tempCS->picture->scheduler.getDataId(), "Working in the wrong dataId!" );

      if( m_pcEncCfg->getNumSplitThreads() != 1 && tempCS->picture->scheduler.getSplitJobId() == 0 )
      {
        if( m_modeCtrl->isParallelSplit( *tempCS, partitioner ) )
        {
          m_modeCtrl->setParallelSplit( true );
          xCompressCUParallel( tempCS, bestCS, partitioner );
          return;
        }
      }

    #endif

      Slice&   slice      = *tempCS->slice;
      const PPS &pps      = *tempCS->pps;
      const SPS &sps      = *tempCS->sps;
      const uint32_t uiLPelX  = tempCS->area.Y().lumaPos().x;
      const uint32_t uiTPelY  = tempCS->area.Y().lumaPos().y;

      const unsigned wIdx = gp_sizeIdxInfo->idxFrom( partitioner.currArea().lwidth()  );

      const UnitArea currCsArea = clipArea( CS::getArea( *bestCS, bestCS->area, partitioner.chType ), *tempCS->picture );
      if( m_pImvTempCS && !slice.isIntra() )
      {
        const unsigned maxMEPart = tempCS->pcv->only2Nx2N ? 1 : NUMBER_OF_PART_SIZES;
        for( unsigned p = 0; p < maxMEPart; p++ )
        {
          tempCS->initSubStructure( *m_pImvTempCS[wIdx][p], partitioner.chType, partitioner.currArea(), false );
        }
      }

      m_modeCtrl->initCULevel( partitioner, *tempCS );

      // remember the context state at CU entry; every tested mode starts from it
      m_CurrCtx->start = m_CABACEstimator->getCtx();

      m_cuChromaQpOffsetIdxPlus1 = 0;

      if( slice.getUseChromaQpAdj() )
      {
        int lgMinCuSize = sps.getLog2MinCodingBlockSize() +
          std::max<int>( 0, sps.getLog2DiffMaxMinCodingBlockSize() - int( pps.getPpsRangeExtension().getDiffCuChromaQpOffsetDepth() ) );
        m_cuChromaQpOffsetIdxPlus1 = ( ( uiLPelX >> lgMinCuSize ) + ( uiTPelY >> lgMinCuSize ) ) % ( pps.getPpsRangeExtension().getChromaQpOffsetListLen() + 1 );
      }

      if( !m_modeCtrl->anyMode() )
      {
        m_modeCtrl->finishCULevel( partitioner );
        return;
      }

    #if JVET_L0266_HMVP
      if (!slice.isIntra())
      {
        tempCS->slice->copyMotionLUTs(tempMotCandLUTs, tempCS->slice->getMotionLUTs());
      }
    #endif

      DTRACE_UPDATE( g_trace_ctx, std::make_pair( "cux", uiLPelX ) );
      DTRACE_UPDATE( g_trace_ctx, std::make_pair( "cuy", uiTPelY ) );
      DTRACE_UPDATE( g_trace_ctx, std::make_pair( "cuw", tempCS->area.lwidth() ) );
      DTRACE_UPDATE( g_trace_ctx, std::make_pair( "cuh", tempCS->area.lheight() ) );
      DTRACE( g_trace_ctx, D_COMMON, "@(%4d,%4d) [%2dx%2d]\n", tempCS->area.lx(), tempCS->area.ly(), tempCS->area.lwidth(), tempCS->area.lheight() );

      do
      {
        const EncTestMode currTestMode = m_modeCtrl->currTestMode();

    #if SHARP_LUMA_DELTA_QP
        if( m_pcEncCfg->getLumaLevelToDeltaQPMapping().isEnabled() && partitioner.currDepth <= pps.getMaxCuDQPDepth() )
        {
    #if ENABLE_SPLIT_PARALLELISM
          CHECK( tempCS->picture->scheduler.getSplitJobId() > 0, "Changing lambda is only allowed in the master thread!" );
    #endif
          if (currTestMode.qp >= 0)
          {
            updateLambda(&slice, currTestMode.qp);
          }
        }
    #endif

        if( currTestMode.type == ETM_INTER_ME )
        {
          if( ( currTestMode.opts & ETO_IMV ) != 0 )
          {
            xCheckRDCostInterIMV(tempCS, bestCS, partitioner, currTestMode);
          }
          else
          {
            xCheckRDCostInter( tempCS, bestCS, partitioner, currTestMode );
          }

        }
        else if( currTestMode.type == ETM_AFFINE )
        {
          xCheckRDCostAffineMerge2Nx2N( tempCS, bestCS, partitioner, currTestMode );
        }
    #if REUSE_CU_RESULTS
        else if( currTestMode.type == ETM_RECO_CACHED )
        {
          xReuseCachedResult( tempCS, bestCS, partitioner );
        }
    #endif
        else if( currTestMode.type == ETM_MERGE_SKIP )
        {
          xCheckRDCostMerge2Nx2N( tempCS, bestCS, partitioner, currTestMode );

    #if JVET_L0054_MMVD
          // A non-skipped CU must not keep the MMVD skip flag. The getCU() result is checked
          // before use so that a missing CU does not lead to a null dereference.
          CodingUnit* cu = bestCS->getCU(partitioner.chType);
          if( cu != nullptr && !cu->skip )
          {
            cu->mmvdSkip = false;
          }
    #endif

        }
        else if( currTestMode.type == ETM_INTRA )
        {
          xCheckRDCostIntra( tempCS, bestCS, partitioner, currTestMode );
        }
        else if( currTestMode.type == ETM_IPCM )
        {
          xCheckIntraPCM( tempCS, bestCS, partitioner, currTestMode );
        }
        else if( isModeSplit( currTestMode ) )
        {
          xCheckModeSplit( tempCS, bestCS, partitioner, currTestMode
    #if JVET_L0266_HMVP
            , tempMotCandLUTs
            , bestMotCandLUTs
            , partitioner.currArea()
    #endif
          );

        }
        else
        {
          THROW( "Don't know how to handle mode: type = " << currTestMode.type << ", size = " << currTestMode.partSize << ", options = " << currTestMode.opts );
        }
      } while( m_modeCtrl->nextMode( *tempCS, partitioner ) );

      //////////////////////////////////////////////////////////////////////////
      // Finishing CU
    #if ENABLE_SPLIT_PARALLELISM
      if( bestCS->cus.empty() )
      {
        CHECK( bestCS->cost != MAX_DOUBLE, "Cost should be maximal if no encoding found" );
        CHECK( bestCS->picture->scheduler.getSplitJobId() == 0, "Should always get a result in serial case" );

        m_modeCtrl->finishCULevel( partitioner );
        return;
      }

    #endif
      // set context states
      m_CABACEstimator->getCtx() = m_CurrCtx->best;

      // QP from last processed CU for further processing
      bestCS->prevQP[partitioner.chType] = bestCS->cus.back()->qp;

    #if JVET_L0266_HMVP
      // only an unsplit inter CU that covers the whole area contributes to the best LUT here
      if (!slice.isIntra() && bestCS->cus.size() == 1 && bestCS->cus.back()->predMode == MODE_INTER && bestCS->area == *bestCS->cus.back())
      {
        bestCS->slice->updateMotionLUTs(bestMotCandLUTs, (*bestCS->cus.back()));
      }
    #endif

      bestCS->picture->getRecoBuf( currCsArea ).copyFrom( bestCS->getRecoBuf( currCsArea ) );
      m_modeCtrl->finishCULevel( partitioner );

    #if ENABLE_SPLIT_PARALLELISM
      if( tempCS->picture->scheduler.getSplitJobId() == 0 && m_pcEncCfg->getNumSplitThreads() != 1 )
      {
        tempCS->picture->finishParallelPart( currCsArea );
      }

    #endif
      // Assert if Best prediction mode is NONE
      // Selected mode's RD-cost must be not MAX_DOUBLE.
      CHECK( bestCS->cus.empty()                                   , "No possible encoding found" );
      CHECK( bestCS->cus[0]->partSize == NUMBER_OF_PART_SIZES      , "No possible encoding found" );
      CHECK( bestCS->cus[0]->predMode == NUMBER_OF_PREDICTION_MODES, "No possible encoding found" );
      CHECK( bestCS->cost             == MAX_DOUBLE                , "No possible encoding found" );
    }
    
    #if SHARP_LUMA_DELTA_QP
    /** Recomputes the rate-distortion lambda for a given QP and installs it.
     *
     *  With WCG_EXT the lambda is derived from scratch (QP factor, GOP-size scale, depth-dependent
     *  clipping, HAD-ME factor, lambda modifiers) and handed to the slice encoder via setUpLambda().
     *  Without WCG_EXT the lambda returned by calculateLambda() for the slice base QP is rescaled by
     *  2^((dQP - oldQP) / 3) and written directly to the transform/quantiser and the RD cost object.
     *
     *  \param slice slice whose type, depth, QP and SPS are consulted
     *  \param dQP   QP for which the lambda is computed
     */
    void EncCu::updateLambda( Slice* slice, double dQP )
    {
    #if WCG_EXT
      int    NumberBFrames = ( m_pcEncCfg->getGOPSize() - 1 );
      int    SHIFT_QP = 12;
      // scale shrinks by 0.05 per B frame (per two for field pictures) and is clipped to [0.5, 1.0]
      double dLambda_scale = 1.0 - Clip3( 0.0, 0.5, 0.05*(double)(slice->getPic()->fieldPic ? NumberBFrames/2 : NumberBFrames) );

      // QP offset of 6 per bit of luma bit depth above 8, reduced by the distortion precision adjustment
      int bitdepth_luma_qp_scale = 6
                                   * (slice->getSPS()->getBitDepth(CHANNEL_TYPE_LUMA) - 8
                                      - DISTORTION_PRECISION_ADJUSTMENT(slice->getSPS()->getBitDepth(CHANNEL_TYPE_LUMA)));
      double qp_temp = (double) dQP + bitdepth_luma_qp_scale - SHIFT_QP;

      // start from the QP factor configured for the current GOP entry
      double dQPFactor = m_pcEncCfg->getGOPEntry( m_pcSliceEncoder->getGopId() ).m_QPFactor;

      if( slice->getSliceType() == I_SLICE )
      {
        // a non-negative configured intra QP factor takes precedence for I slices
        if( m_pcEncCfg->getIntraQpFactor() >= 0.0 /*&& m_pcEncCfg->getGOPEntry( m_pcSliceEncoder->getGopId() ).m_sliceType != I_SLICE*/ )
        {
          dQPFactor = m_pcEncCfg->getIntraQpFactor();
        }
        else
        {
          if( m_pcEncCfg->getLambdaFromQPEnable() )
          {
            dQPFactor = 0.57;
          }
          else
          {
            dQPFactor = 0.57*dLambda_scale;
          }
        }
      }
      else if( m_pcEncCfg->getLambdaFromQPEnable() )
      {
        dQPFactor = 0.57*dQPFactor;
      }

      // lambda = QP factor * 2^(qp_temp / 3)
      double dLambda = dQPFactor*pow( 2.0, qp_temp/3.0 );
      int depth = slice->getDepth();

      if( !m_pcEncCfg->getLambdaFromQPEnable() && depth>0 )
      {
        int qp_temp_slice = slice->getSliceQp() + bitdepth_luma_qp_scale - SHIFT_QP; // avoid lambda  over adjustment,  use slice_qp here
        dLambda *= Clip3( 2.00, 4.00, (qp_temp_slice / 6.0) ); // (j == B_SLICE && p_cur_frm->layer != 0 )
      }
      // without Hadamard-based motion estimation, non-intra slices get a 5% smaller lambda
      if( !m_pcEncCfg->getUseHADME() && slice->getSliceType( ) != I_SLICE )
      {
        dLambda *= 0.95;
      }

      const int temporalId = m_pcEncCfg->getGOPEntry( m_pcSliceEncoder->getGopId() ).m_temporalId;
      const std::vector<double> &intraLambdaModifiers = m_pcEncCfg->getIntraLambdaModifier();
      double lambdaModifier;
      if( slice->getSliceType( ) != I_SLICE || intraLambdaModifiers.empty())
      {
        lambdaModifier = m_pcEncCfg->getLambdaModifier(temporalId);
      }
      else
      {
        // I slice with intra modifiers: index by temporal id, falling back to the last entry
        lambdaModifier = intraLambdaModifiers[(temporalId < intraLambdaModifiers.size()) ? temporalId : (intraLambdaModifiers.size() - 1)];
      }
      dLambda *= lambdaModifier;

      int qpBDoffset = slice->getSPS()->getQpBDOffset(CHANNEL_TYPE_LUMA);

      // round dQP to the nearest integer and clip it to [-qpBDoffset, MAX_QP]
      int iQP = Clip3(-qpBDoffset, MAX_QP, (int)floor(dQP + 0.5));

      m_pcSliceEncoder->setUpLambda(slice, dLambda, iQP);

    #else
      // note: the cast truncates dQP, unlike the rounding used in the WCG_EXT path
      int iQP = (int)dQP;
      const double oldQP     = (double)slice->getSliceQpBase();
      const double oldLambda = m_pcSliceEncoder->calculateLambda (slice, m_pcSliceEncoder->getGopId(), slice->getDepth(), oldQP, oldQP, iQP);
      // rescale the base lambda by 2^((dQP - oldQP) / 3)
      const double newLambda = oldLambda * pow (2.0, (dQP - oldQP) / 3.0);
    #if RDOQ_CHROMA_LAMBDA
      // both chroma components share the luma lambda divided by the chroma weight
      const double chromaLambda = newLambda / m_pcRdCost->getChromaWeight();
      const double lambdaArray[MAX_NUM_COMPONENT] = {newLambda, chromaLambda, chromaLambda};
      m_pcTrQuant->setLambdas (lambdaArray);
    #else
      m_pcTrQuant->setLambda (newLambda);
    #endif
      m_pcRdCost->setLambda( newLambda, slice->getSPS()->getBitDepths() );
    #endif
    }
    #endif
    
    #if ENABLE_SPLIT_PARALLELISM
    //#undef DEBUG_PARALLEL_TIMINGS
    //#define DEBUG_PARALLEL_TIMINGS 1
    /** Compresses the current partitioner area by running several xCompressCU jobs in an OpenMP loop
     *  and adopting the result of the cheapest one.
     *
     * Every job id in [1, numJobs] gets its own Partitioner copy and its own EncCu instance (looked up
     * in m_pcEncLib through the scheduler's split data id) and calls xCompressCU on that instance.
     * After the loop, the job whose best coding structure has the lowest cost that is strictly below
     * the incoming bestCS->cost is selected. If such a job exists, its state is copied into this
     * encoder and tempCS / bestCS are re-pointed at this encoder's entries for the current block size;
     * otherwise both are left as passed in. Finally the block-info caches of the remaining jobs are
     * merged into this encoder's mode controller, if it is a CacheBlkInfoCtrl.
     *
     * \param tempCS      in/out: working coding structure; reassigned when a job result is adopted
     * \param bestCS      in/out: best coding structure so far; its cost is the value a job has to beat
     * \param partitioner partitioner positioned on the area to compress; its state is copied into every job
     */
    void EncCu::xCompressCUParallel( CodingStructure *&tempCS, CodingStructure *&bestCS, Partitioner &partitioner )
    {
      // Indices derived from the luma width/height of the current area; they address the
      // m_pTempCS / m_pBestCS entries of this encoder and of the job encoders below.
      const unsigned wIdx = gp_sizeIdxInfo->idxFrom( partitioner.currArea().lwidth() );
      const unsigned hIdx = gp_sizeIdxInfo->idxFrom( partitioner.currArea().lheight() );

      Picture* picture = tempCS->picture;

      int numJobs = m_modeCtrl->getNumParallelJobs( *bestCS, partitioner );

      // Indexed by job id. Jobs are numbered from 1, so slot 0 is never set in this function.
      // NOTE(review): assumes numJobs < NUM_RESERVERD_SPLIT_JOBS - confirm against getNumParallelJobs.
      bool    jobUsed                            [NUM_RESERVERD_SPLIT_JOBS];
      std::fill( jobUsed, jobUsed + NUM_RESERVERD_SPLIT_JOBS, false );

      const UnitArea currArea = CS::getArea( *tempCS, partitioner.currArea(), partitioner.chType );
    #if ENABLE_WPP_PARALLELISM
      // Remember the caller's WPP thread id so that it can be set again inside every job.
      const int      wppTId   = picture->scheduler.getWppThreadId();
    #endif
      // Feeds the `if` clause of the OpenMP pragmas below: when false the loop is not parallelised.
      const bool doParallel   = !m_pcEncCfg->getForceSingleSplitThread();
      // The MSVC + WPP build fixes the thread count in the pragma itself; all other builds set it
      // from the encoder configuration before entering the parallel loop.
    #if _MSC_VER && ENABLE_WPP_PARALLELISM
    #pragma omp parallel for schedule(dynamic,1) num_threads(NUM_SPLIT_THREADS_IF_MSVC) if(doParallel)
    #else
      omp_set_num_threads( m_pcEncCfg->getNumSplitThreads() );

    #pragma omp parallel for schedule(dynamic,1) if(doParallel)
    #endif
      for( int jId = 1; jId <= numJobs; jId++ )
      {
        // thread start: register this iteration with the picture's scheduler before any per-job lookup
    #if ENABLE_WPP_PARALLELISM
        picture->scheduler.setWppThreadId( wppTId );
    #endif
        picture->scheduler.setSplitThreadId();
        picture->scheduler.setSplitJobId( jId );

        // Per-job working set: a fresh partitioner (deleted at the end of the iteration), the EncCu
        // assigned to this job, and that encoder's mode controller viewed as a block-info cache
        // (nullptr when the controller is not a CacheBlkInfoCtrl).
        Partitioner* jobPartitioner = PartitionerFactory::get( *tempCS->slice );
        EncCu*       jobCuEnc       = m_pcEncLib->getCuEncoder( picture->scheduler.getSplitDataId( jId ) );
        auto*        jobBlkCache    = dynamic_cast<CacheBlkInfoCtrl*>( jobCuEnc->m_modeCtrl );

        // Bring the job's partitioner and encoder to the state of the calling encoder.
        // NOTE(review): the meaning of the trailing `true` flag is defined by copyState, not visible here.
        jobPartitioner->copyState( partitioner );
        jobCuEnc      ->copyState( this, *jobPartitioner, currArea, true );

        if( jobBlkCache )
        {
          jobBlkCache->tick();
        }

        // The job works on the coding structures owned by its own encoder.
        CodingStructure *&jobBest = jobCuEnc->m_pBestCS[wIdx][hIdx];
        CodingStructure *&jobTemp = jobCuEnc->m_pTempCS[wIdx][hIdx];

        // Mark the job as executed so that the selection and cache-merge loops below consider it.
        jobUsed[jId] = true;

        jobCuEnc->xCompressCU( jobTemp, jobBest, *jobPartitioner );

        delete jobPartitioner;

        picture->scheduler.setSplitJobId( 0 );
        // thread stop
      }
      // Back outside the loop: reset the split thread id of the calling thread to 0.
      picture->scheduler.setSplitThreadId( 0 );

      // Pick the executed job with the lowest cost; bestJId stays 0 when no job is strictly
      // cheaper than the bestCS that was passed in. On equal cost the lower job id is kept.
      int    bestJId  = 0;
      double bestCost = bestCS->cost;
      for( int jId = 1; jId <= numJobs; jId++ )
      {
        EncCu* jobCuEnc = m_pcEncLib->getCuEncoder( picture->scheduler.getSplitDataId( jId ) );

        if( jobUsed[jId] && jobCuEnc->m_pBestCS[wIdx][hIdx]->cost < bestCost )
        {
          bestCost = jobCuEnc->m_pBestCS[wIdx][hIdx]->cost;
          bestJId  = jId;
        }
      }

      if( bestJId > 0 )
      {
        // Take over the winning job's state, store the current CABAC estimator context as the
        // best context, and hand this encoder's own coding structures back to the caller.
        copyState( m_pcEncLib->getCuEncoder( picture->scheduler.getSplitDataId( bestJId ) ), partitioner, currArea, false );
        m_CurrCtx->best = m_CABACEstimator->getCtx();

        tempCS = m_pTempCS[wIdx][hIdx];
        bestCS = m_pBestCS[wIdx][hIdx];
      }

      const int      bitDepthY = tempCS->sps->getBitDepth( CH_L );
      const UnitArea clipdArea = clipArea( currArea, *picture );

      // Consistency check: the luma reconstruction stored in the picture must match the one held by
      // bestCS over the area clipped to the picture.
      CHECK( calcCheckSum( picture->getRecoBuf( clipdArea.Y() ), bitDepthY ) != calcCheckSum( bestCS->getRecoBuf( clipdArea.Y() ), bitDepthY ), "Data copied incorrectly!" );

      picture->finishParallelPart( currArea );

      // Merge the block-info caches of all executed jobs except the adopted one into this encoder's
      // cache. NOTE(review): the adopted job is presumably skipped because its state was already
      // taken over by copyState above - confirm.
      if( auto *blkCache = dynamic_cast<CacheBlkInfoCtrl*>( m_modeCtrl ) )
      {
        for( int jId = 1; jId <= numJobs; jId++ )
        {
          if( !jobUsed[jId] || jId == bestJId ) continue;

          auto *jobBlkCache = dynamic_cast<CacheBlkInfoCtrl*>( m_pcEncLib->getCuEncoder( picture->scheduler.getSplitDataId( jId ) )->m_modeCtrl );
          CHECK( !jobBlkCache, "If own mode controller has blk info cache capability so should all other mode controllers!" );
          blkCache->CacheBlkInfoCtrl::copyState( *jobBlkCache, partitioner.currArea() );
        }

        blkCache->tick();
      }

    }