Skip to content
Snippets Groups Projects
IntraSearch.cpp 199 KiB
Newer Older
  • Learn to ignore specific revisions
  •           if (mode == numNonISPModes) // the list needs to be sorted only once
                if (m_pcEncCfg->getUseFastISP())
                if (!xSortISPCandList(bestCurrentCost, csBest->cost, uiBestPUMode))
              xGetNextISPMode(uiRdModeList[mode], (mode > 0 ? &uiRdModeList[mode - 1] : nullptr), Size(width, height));
              if (uiRdModeList[mode].ispMod == INTRA_SUBPARTITIONS_RESERVED)
              cu.lfnstIdx = m_curIspLfnstIdx;
              uiOrgMode = uiRdModeList[mode];
          cu.mipFlag                     = uiOrgMode.mipFlg;
          pu.mipTransposedFlag           = uiOrgMode.mipTrFlg;
          cu.ispMode                     = uiOrgMode.ispMod;
          pu.multiRefIdx                 = uiOrgMode.mRefId;
          pu.intraDir[CHANNEL_TYPE_LUMA] = uiOrgMode.modeId;
          CHECK(cu.mipFlag && pu.multiRefIdx, "Error: combination of MIP and MRL not supported");
          CHECK(pu.multiRefIdx && (pu.intraDir[0] == PLANAR_IDX), "Error: combination of MRL and Planar mode not supported");
          CHECK(cu.ispMode && cu.mipFlag, "Error: combination of ISP and MIP not supported");
          CHECK(cu.ispMode && pu.multiRefIdx, "Error: combination of ISP and MRL not supported");
          CHECK(cu.ispMode&& cu.colorTransform, "Error: combination of ISP and ACT not supported");
          pu.intraDir[CHANNEL_TYPE_CHROMA] = cu.colorTransform ? DM_CHROMA_IDX : pu.intraDir[CHANNEL_TYPE_CHROMA];
          // set context models
          m_CABACEstimator->getCtx() = ctxStart;
          // determine residual for partition
          cs.initSubStructure( *csTemp, partitioner.chType, cs.area, true );
          bool tmpValidReturn = false;
            if ( m_pcEncCfg->getUseFastISP() )
            tmpValidReturn = xIntraCodingLumaISP(*csTemp, subTuPartitioner, bestCurrentCost);
            if (csTemp->tus.size() == 0)
              // no TUs were coded
              csTemp->cost = MAX_DOUBLE;
            // we save the data for future tests
            m_ispTestedModes[m_curIspLfnstIdx].setModeResults((ISPType)cu.ispMode, (int)uiOrgMode.modeId, (int)csTemp->tus.size(), csTemp->cus[0]->firstTU->cbf[COMPONENT_Y] ? csTemp->cost : MAX_DOUBLE, csBest->cost);
            csTemp->cost = !tmpValidReturn ? MAX_DOUBLE : csTemp->cost;
            if (cu.colorTransform)
              tmpValidReturn = xRecurIntraCodingACTQT(*csTemp, partitioner, mtsCheckRangeFlag, mtsFirstCheckId, mtsLastCheckId, moreProbMTSIdxFirst);
            tmpValidReturn = xRecurIntraCodingLumaQT( *csTemp, partitioner, uiBestPUMode.ispMod ? bestCurrentCost : MAX_DOUBLE, -1, TU_NO_ISP, uiBestPUMode.ispMod,
                                                      mtsCheckRangeFlag, mtsFirstCheckId, mtsLastCheckId, moreProbMTSIdxFirst );
          if (!cu.ispMode && !cu.mtsFlag && !cu.lfnstIdx && !cu.bdpcmMode && !pu.multiRefIdx && !cu.mipFlag && testISP)
            m_regIntraRDListWithCosts.push_back( ModeInfoWithCost( cu.mipFlag, pu.mipTransposedFlag, pu.multiRefIdx, cu.ispMode, uiOrgMode.modeId, csTemp->cost ) );
          if( cu.ispMode && !csTemp->cus[0]->firstTU->cbf[COMPONENT_Y] )
            csTemp->cost = MAX_DOUBLE;
    Nan Hu's avatar
    Nan Hu committed
            csTemp->costDbOffset = 0;
          validReturn |= tmpValidReturn;
          if( sps.getUseLFNST() && mtsUsageFlag == 1 && !cu.ispMode && mode >= 0 )
            m_modeCostStore[lfnstIdx][mode] = tmpValidReturn ? csTemp->cost : (MAX_DOUBLE / 2.0); //(MAX_DOUBLE / 2.0) ??
          DTRACE(g_trace_ctx, D_INTRA_COST, "IntraCost T [x=%d,y=%d,w=%d,h=%d] %f (%d,%d,%d,%d,%d,%d) \n", cu.blocks[0].x,
            cu.blocks[0].y, (int)width, (int)height, csTemp->cost, uiOrgMode.modeId, uiOrgMode.ispMod,
            pu.multiRefIdx, cu.mipFlag, cu.lfnstIdx, cu.mtsFlag);
            if (isFirstColorSpace)
              if (m_pcEncCfg->getRGBFormatFlag() || !cu.ispMode)
                sortRdModeListFirstColorSpace(uiOrgMode, csTemp->cost, cu.bdpcmMode, m_savedRdModeFirstColorSpace[m_savedRdModeIdx], m_savedRdCostFirstColorSpace[m_savedRdModeIdx], m_savedBDPCMModeFirstColorSpace[m_savedRdModeIdx], m_numSavedRdModeFirstColorSpace[m_savedRdModeIdx]);
            // check r-d cost
            if( csTemp->cost < csBest->cost )
              std::swap( csTemp, csBest );
              uiBestPUMode  = uiOrgMode;
              bestBDPCMMode = cu.bdpcmMode;
              if( sps.getUseLFNST() && mtsUsageFlag == 1 && !cu.ispMode )
                m_bestModeCostStore[ lfnstIdx ] = csBest->cost; //cs.cost;
                m_bestModeCostValid[ lfnstIdx ] = true;
              if( csBest->cost < bestCurrentCost )
                bestCurrentCost = csBest->cost;
              if ( cu.ispMode )
                bestLfnstIdx = cu.lfnstIdx;
              else if ( testISP )
            if( !cu.ispMode && !cu.bdpcmMode && csBest->cost < bestCostNonBDPCM )
              bestCostNonBDPCM = csBest->cost;
          if( m_pcEncCfg->getFastLocalDualTreeMode() )
            if( cu.isConsIntra() && !cu.slice->isIntra() && csBest->cost != MAX_DOUBLE && costInterCU != COST_UNKNOWN && mode >= 0 )
              if( m_pcEncCfg->getFastLocalDualTreeMode() == 2 )
                //Note: only try one intra mode, which is especially useful to reduce EncT for LDB case (around 4%)
          if (sps.getUseColorTrans() && !CS::isDualITree(cs))
            if ((m_pcEncCfg->getRGBFormatFlag() && !cu.colorTransform) && csBest->cost != MAX_DOUBLE && bestCS->cost != MAX_DOUBLE && mode >= 0)
              if (csBest->cost > bestCS->cost)
        cu.ispMode = uiBestPUMode.ispMod;
        cu.lfnstIdx = bestLfnstIdx;
          if (cu.colorTransform)
            cs.useSubStructure(*csBest, partitioner.chType, pu, true, true, keepResi, keepResi);
          cs.useSubStructure( *csBest, partitioner.chType, pu.singleChan( CHANNEL_TYPE_LUMA ), true, true, keepResi, keepResi );
        if( validReturn )
          //=== update PU data ====
          cu.mipFlag = uiBestPUMode.mipFlg;
          pu.mipTransposedFlag             = uiBestPUMode.mipTrFlg;
          pu.multiRefIdx = uiBestPUMode.mRefId;
          pu.intraDir[ CHANNEL_TYPE_LUMA ] = uiBestPUMode.modeId;
          cu.bdpcmMode = bestBDPCMMode;
          if (cu.colorTransform)
            CHECK(pu.intraDir[CHANNEL_TYPE_CHROMA] != DM_CHROMA_IDX, "chroma should use DM mode for adaptive color transform");
      //===== reset context models =====
      m_CABACEstimator->getCtx() = ctxStart;
    // Chroma intra-mode decision for one CU.
    //
    // Visible flow: (1) collect the chroma candidate modes, (2) rank them with a cheap
    // min(2*SAD, SATD) estimate over Cb and Cr, (3) mark the two worst-ranked modes as disabled,
    // (4) run full chroma coding on the remaining candidates (preceded by up to two chroma-BDPCM
    // passes), keeping the cheapest result in a scratch CodingStructure, and (5) copy the winner
    // back into `cs`/`cu`/`pu`.
    //
    // cu             - coding unit being searched; its first PU carries the chroma direction.
    // partitioner    - supplies the current area / channel type and performs the TU split.
    // maxCostAllowed - initial value of the running cost bound; when luma uses ISP and no chroma
    //                  mode beats it, cu.ispMode is cleared at the end.
    //
    // NOTE(review): this listing has lost its brace-only lines and several short statements
    // (`else`, `continue`, `do`, the declaration of `baseDist`, ...). Comments below describe only
    // what is visible; confirm control flow against the upstream file before relying on them.
    void IntraSearch::estIntraPredChromaQT( CodingUnit &cu, Partitioner &partitioner, const double maxCostAllowed )
      const ChromaFormat format   = cu.chromaFormat;
      const uint32_t    numberValidComponents = getNumberValidComponents(format);
      CodingStructure &cs = *cu.cs;
      // snapshot of the CABAC contexts; re-loaded before each candidate and once more at the end
      const TempCtx ctxStart  ( m_CtxCache, m_CABACEstimator->getCtx() );
      cs.setDecomp( cs.area.Cb(), false );
      double    bestCostSoFar = maxCostAllowed;
      // ISP only influences chroma when luma and chroma share one tree
      bool      lumaUsesISP   = !cu.isSepTree() && cu.ispMode;
      PartSplit ispType       = lumaUsesISP ? CU::getISPType( cu, COMPONENT_Y ) : TU_NO_ISP;
      CHECK( cu.ispMode && bestCostSoFar < 0, "bestCostSoFar must be positive!" );
      auto &pu = *cu.firstPU;
        // best-so-far trackers for the RD loop below
        uint32_t       uiBestMode = 0;
        Distortion uiBestDist = 0;
        double     dBestCost = MAX_DOUBLE;
        int32_t bestBDPCMMode = 0;
          // candidate index range [uiMinMode, uiMaxMode)
          int32_t  uiMinMode = 0;
          int32_t  uiMaxMode = NUM_CHROMA_MODE;
          //----- check chroma modes -----
          uint32_t chromaCandModes[ NUM_CHROMA_MODE ];
          PU::getIntraChromaCandModes( pu, chromaCandModes );
          // create a temporary CS
          CodingStructure &saveCS = *m_pSaveCS[0];
          saveCS.pcv      = cs.pcv;
          saveCS.picture  = cs.picture;
          saveCS.area.repositionTo( cs.area );
          // NOTE(review): the bodies / `else` branches of the next conditions are not visible here
          if( !cu.isSepTree() && cu.ispMode )
          if( cu.isSepTree() )
            if( partitioner.canSplit( TU_MAX_TR_SPLIT, cs ) )
              partitioner.splitCurrArea( TU_MAX_TR_SPLIT, cs );
                // one TU per sub-area produced by the max-transform split
                cs.addTU( CS::getArea( cs, partitioner.currArea(), partitioner.chType ), partitioner.chType ).depth = partitioner.currTrDepth;
              } while( partitioner.nextPart( cs ) );
            cs.addTU( CS::getArea( cs, partitioner.currArea(), partitioner.chType ), partitioner.chType );
          // orgTUs[j] is the TU in `cs` that saveCS.tus[j] mirrors (same insertion order below)
          std::vector<TransformUnit*> orgTUs;
          if( lumaUsesISP )
            // saveCS needs its own CU/PU so the ISP information is available in the scratch structure
            CodingUnit& auxCU = saveCS.addCU( cu, partitioner.chType );
            auxCU.ispMode = cu.ispMode;
            saveCS.sps = cu.cs->sps;
            saveCS.addPU( *cu.firstPU, partitioner.chType );
          // create a store for the TUs
          for( const auto &ptu : cs.tus )
            // for split TUs in HEVC, add the TUs without Chroma parts for correct setting of Cbfs
            if( lumaUsesISP || pu.contains( *ptu, CHANNEL_TYPE_CHROMA ) )
              saveCS.addTU( *ptu, partitioner.chType );
              orgTUs.push_back( ptu );
          // SATD pre-selecting.
          // satdModeList / satdSortedCost are parallel arrays: mode id and its estimated cost
          int satdModeList[NUM_CHROMA_MODE];
          int64_t satdSortedCost[NUM_CHROMA_MODE];
          for (int i = 0; i < NUM_CHROMA_MODE; i++)
            satdSortedCost[i] = 0; // for the mode not pre-select by SATD, do RDO by default, so set the initial value 0.
            satdModeList[i] = 0;
          bool modeIsEnable[NUM_INTRA_MODE + 1]; // use intra mode idx to check whether enable
          for (int i = 0; i < NUM_INTRA_MODE + 1; i++)
            modeIsEnable[i] = 1;
          DistParam distParamSad;
          DistParam distParamSatd;
          pu.intraDir[1] = MDLM_L_IDX; // temporary assigned, just to indicate this is a MDLM mode. for luma down-sampling operation.
          initIntraPatternChType(cu, pu.Cb());
          initIntraPatternChType(cu, pu.Cr());
          xGetLumaRecPixels(pu, pu.Cb());
          // estimate a cost for every candidate
          for (int idx = uiMinMode; idx <= uiMaxMode - 1; idx++)
            int mode = chromaCandModes[idx];
            satdModeList[idx] = mode;
            // NOTE(review): the bodies of the next two conditions are missing in this listing
            // (presumably `continue`, leaving the cost at 0 for those modes) - confirm.
            if (PU::isLMCMode(mode) && !PU::isLMCModeEnabled(pu, mode))
            if ((mode == LM_CHROMA_IDX) || (mode == PLANAR_IDX) || (mode == DM_CHROMA_IDX)) // only pre-check regular modes and MDLM modes, not including DM ,Planar, and LM
            pu.intraDir[1] = mode; // temporary assigned, for SATD checking.
            int64_t sad = 0;
            int64_t sadCb = 0;
            int64_t satdCb = 0;
            int64_t sadCr = 0;
            int64_t satdCr = 0;
            // note: this local `cs` shadows the function-level `cs` declared above
            CodingStructure& cs = *(pu.cs);
            // --- Cb: predict, then measure SAD and SATD against the original ---
            CompArea areaCb = pu.Cb();
            PelBuf orgCb = cs.getOrgBuf(areaCb);
            PelBuf predCb = cs.getPredBuf(areaCb);
            m_pcRdCost->setDistParam(distParamSad, orgCb, predCb, pu.cs->sps->getBitDepth(CHANNEL_TYPE_CHROMA), COMPONENT_Cb, false);
            m_pcRdCost->setDistParam(distParamSatd, orgCb, predCb, pu.cs->sps->getBitDepth(CHANNEL_TYPE_CHROMA), COMPONENT_Cb, true);
            distParamSad.applyWeight = false;
            distParamSatd.applyWeight = false;
            // NOTE(review): an `else` between the LM and angular prediction calls is presumably missing
            if (PU::isLMCMode(mode))
              predIntraChromaLM(COMPONENT_Cb, predCb, pu, areaCb, mode);
              initPredIntraParams(pu, pu.Cb(), *pu.cs->sps);
              predIntraAng(COMPONENT_Cb, predCb, pu);
            // per-component estimate is min(2 * SAD, SATD)
            sadCb = distParamSad.distFunc(distParamSad) * 2;
            satdCb = distParamSatd.distFunc(distParamSatd);
            sad += std::min(sadCb, satdCb);
            // --- Cr: same procedure ---
            CompArea areaCr = pu.Cr();
            PelBuf orgCr = cs.getOrgBuf(areaCr);
            PelBuf predCr = cs.getPredBuf(areaCr);
            m_pcRdCost->setDistParam(distParamSad, orgCr, predCr, pu.cs->sps->getBitDepth(CHANNEL_TYPE_CHROMA), COMPONENT_Cr, false);
            m_pcRdCost->setDistParam(distParamSatd, orgCr, predCr, pu.cs->sps->getBitDepth(CHANNEL_TYPE_CHROMA), COMPONENT_Cr, true);
            distParamSad.applyWeight = false;
            distParamSatd.applyWeight = false;
            if (PU::isLMCMode(mode))
              predIntraChromaLM(COMPONENT_Cr, predCr, pu, areaCr, mode);
              initPredIntraParams(pu, pu.Cr(), *pu.cs->sps);
              predIntraAng(COMPONENT_Cr, predCr, pu);
            sadCr = distParamSad.distFunc(distParamSad) * 2;
            satdCr = distParamSatd.distFunc(distParamSatd);
            sad += std::min(sadCr, satdCr);
            satdSortedCost[idx] = sad;
          // sort the mode based on the cost from small to large.
          // (exchange sort; satdModeList is permuted together with satdSortedCost)
          int tempIdx = 0;
          int64_t tempCost = 0;
          for (int i = uiMinMode; i <= uiMaxMode - 1; i++)
            for (int j = i + 1; j <= uiMaxMode - 1; j++)
              if (satdSortedCost[j] < satdSortedCost[i])
                tempIdx = satdModeList[i];
                satdModeList[i] = satdModeList[j];
                satdModeList[j] = tempIdx;
                tempCost = satdSortedCost[i];
                satdSortedCost[i] = satdSortedCost[j];
                satdSortedCost[j] = tempCost;
          int reducedModeNumber = 2; // reduce the number of chroma modes
          for (int i = 0; i < reducedModeNumber; i++)
            modeIsEnable[satdModeList[uiMaxMode - 1 - i]] = 0; // disable the last reducedModeNumber modes
          // chroma BDPCM is tried only without ISP, MTS and LFNST on this CU
          bool testBDPCM = true;
          testBDPCM = testBDPCM && CU::bdpcmAllowed(cu, COMPONENT_Cb) && cu.ispMode == 0 && cu.mtsFlag == 0 && cu.lfnstIdx == 0;
          // full RD loop; negative uiMode values (-2, -1) are the BDPCM passes
          for (int32_t uiMode = uiMinMode - (2 * int(testBDPCM)); uiMode < uiMaxMode; uiMode++)
            // NOTE(review): when testBDPCM is true uiMode starts negative, so this initialiser reads
            // chromaCandModes[-2] / [-1] (out of bounds) before the value is overwritten below.
            int chromaIntraMode = chromaCandModes[uiMode];
            if (uiMode < 0)
                cu.bdpcmModeChroma = -uiMode;
                chromaIntraMode = chromaCandModes[0];
                // NOTE(review): an `else` is presumably missing before the next line
                cu.bdpcmModeChroma = 0;
            // NOTE(review): bodies of the next two conditions are not visible (presumably `continue`)
            if( PU::isLMCMode( chromaIntraMode ) && ! PU::isLMCModeEnabled( pu, chromaIntraMode ) )
            if (!modeIsEnable[chromaIntraMode] && PU::isLMCModeEnabled(pu, chromaIntraMode)) // when CCLM is disable, then MDLM is disable. not use satd checking
            cs.setDecomp( pu.Cb(), false );
            // NOTE(review): `baseDist` is not declared anywhere in this listing
            cs.dist = baseDist;
            //----- restore context models -----
            m_CABACEstimator->getCtx() = ctxStart;
            //----- chroma coding -----
            pu.intraDir[1] = chromaIntraMode;
            xRecurIntraChromaCodingQT( cs, partitioner, bestCostSoFar, ispType );
            // NOTE(review): bodies of the next two conditions are not visible in this listing
            if( lumaUsesISP && cs.dist == MAX_UINT )
            if (cs.sps->getTransformSkipEnabledFlag())
            uint64_t fracBits   = xGetIntraFracBitsQT( cs, partitioner, false, true, -1, ispType );
            Distortion uiDist = cs.dist;
            // cost uses only the distortion added by this candidate (uiDist - baseDist)
            double    dCost   = m_pcRdCost->calcRdCost( fracBits, uiDist - baseDist );
            //----- compare -----
            if( dCost < dBestCost )
              if( lumaUsesISP && dCost < bestCostSoFar )
                bestCostSoFar = dCost;
              // stash the winning chroma buffers and TU data in saveCS (and in the picture buffers)
              for( uint32_t i = getFirstComponentOfChannel( CHANNEL_TYPE_CHROMA ); i < numberValidComponents; i++ )
                const CompArea &area = pu.blocks[i];
                saveCS.getRecoBuf     ( area ).copyFrom( cs.getRecoBuf   ( area ) );
                saveCS.getPredBuf     ( area ).copyFrom( cs.getPredBuf   ( area ) );
                saveCS.getResiBuf     ( area ).copyFrom( cs.getResiBuf   ( area ) );
    // NOTE(review): the next two lines look like web-page residue (commit author banner), not code
    Taoran Lu's avatar
    Taoran Lu committed
                saveCS.getPredBuf     ( area ).copyFrom( cs.getPredBuf   (area ) );
                cs.picture->getPredBuf( area ).copyFrom( cs.getPredBuf   (area ) );
                cs.picture->getRecoBuf( area ).copyFrom( cs.getRecoBuf( area ) );
                for( uint32_t j = 0; j < saveCS.tus.size(); j++ )
                  saveCS.tus[j]->copyComponentFrom( *orgTUs[j], area.compID );
              dBestCost  = dCost;
              uiBestDist = uiDist;
              uiBestMode = chromaIntraMode;
              bestBDPCMMode = cu.bdpcmModeChroma;
          // copy the best candidate's buffers and TU data back from saveCS into cs
          for( uint32_t i = getFirstComponentOfChannel( CHANNEL_TYPE_CHROMA ); i < numberValidComponents; i++ )
            const CompArea &area = pu.blocks[i];
            cs.getRecoBuf         ( area ).copyFrom( saveCS.getRecoBuf( area ) );
            cs.getPredBuf         ( area ).copyFrom( saveCS.getPredBuf( area ) );
            cs.getResiBuf         ( area ).copyFrom( saveCS.getResiBuf( area ) );
    // NOTE(review): the next two lines look like web-page residue (commit author banner), not code
    Taoran Lu's avatar
    Taoran Lu committed
            cs.getPredBuf         ( area ).copyFrom( saveCS.getPredBuf( area ) );
            cs.picture->getPredBuf( area ).copyFrom( cs.getPredBuf    ( area ) );
            cs.picture->getRecoBuf( area ).copyFrom( cs.    getRecoBuf( area ) );
            for( uint32_t j = 0; j < saveCS.tus.size(); j++ )
              orgTUs[ j ]->copyComponentFrom( *saveCS.tus[ j ], area.compID );
        // commit the winning mode, distortion and BDPCM choice
        pu.intraDir[1] = uiBestMode;
        cs.dist        = uiBestDist;
        cu.bdpcmModeChroma = bestBDPCMMode;
      //----- restore context models -----
      m_CABACEstimator->getCtx() = ctxStart;
      // with ISP luma, no chroma candidate got below the allowed cost: clear the ISP mode on the CU
      if( lumaUsesISP && bestCostSoFar >= maxCostAllowed )
        cu.ispMode = 0;
    // Appends one (area, cost) record to the fixed-size SCIPU store.
    //
    // area - area of the CU to remember.
    // cost - cost associated with that CU.
    //
    // The record is silently dropped once NUM_INTER_CU_INFO_SAVE entries are stored.
    // The counter is advanced after each store so consecutive calls fill consecutive slots;
    // initCuAreaCostInSCIPU() rewinds it to 0.
    void IntraSearch::saveCuAreaCostInSCIPU( Area area, double cost )
    {
      if( m_numCuInSCIPU < NUM_INTER_CU_INFO_SAVE )
      {
        m_cuAreaInSCIPU[m_numCuInSCIPU] = area;
        m_cuCostInSCIPU[m_numCuInSCIPU] = cost;
        m_numCuInSCIPU++;
      }
    }
    void IntraSearch::initCuAreaCostInSCIPU()
      for( int i = 0; i < NUM_INTER_CU_INFO_SAVE; i++ )
        m_cuAreaInSCIPU[i] = Area();
        m_cuCostInSCIPU[i] = 0;
      m_numCuInSCIPU = 0;
    // Palette-mode search for the CU addressed by `partitioner.chType`.
    //
    // Visible flow: derive and reorder the palette, pre-compute per-pixel index costs, build the
    // index map for the horizontal scan (and the vertical scan when more than one index is in use),
    // reconstruct the pixels from the chosen indices, then accumulate the distortion into cs.dist.
    //
    // compBegin / numComp - first component and number of components handled by this call.
    //
    // NOTE(review): this listing has lost its brace-only lines, at least one `if` body, and parts of
    // three expressions (flagged below). Confirm against the upstream file.
    void IntraSearch::PLTSearch(CodingStructure &cs, Partitioner& partitioner, ComponentID compBegin, uint32_t numComp)
      CodingUnit    &cu = *cs.getCU(partitioner.chType);
      TransformUnit &tu = *cs.getTU(partitioner.chType);
      uint32_t height = cu.block(compBegin).height;
      uint32_t width = cu.block(compBegin).width;
      // NOTE(review): the body of this LMCS condition is not visible in this listing
      if (m_pcEncCfg->getLmcs() && (cs.picHeader->getLmcsEnabledFlag() && m_pcReshape->getCTUFlag()))
      cu.lastPLTSize[compBegin] = cs.prevPLT.curPLTSize[compBegin];
      //derive palette
      derivePLTLossy(cs, partitioner, compBegin, numComp);
      reorderPLT(cs, partitioner, compBegin, numComp);
      preCalcPLTIndexRD(cs, partitioner, compBegin, numComp); // Pre-calculate distortions for each pixel 
      // rdCost is passed by reference: each deriveIndexMap call may lower it (see its dMinCost parameter)
      double rdCost = MAX_DOUBLE;
      deriveIndexMap(cs, partitioner, compBegin, numComp, PLT_SCAN_HORTRAV, rdCost); // Optimize palette index map (horizontal scan)
      // the vertical scan is only tried when more than one index value (palette entries + escape) exists
      if ((cu.curPLTSize[compBegin] + cu.useEscape[compBegin]) > 1)
        deriveIndexMap(cs, partitioner, compBegin, numComp, PLT_SCAN_VERTRAV, rdCost); // Optimize palette index map (vertical scan)
    // NOTE(review): the next two lines look like web-page residue (commit author banner), not code
    Yung-Hsuan Chao (Jessie)'s avatar
    Yung-Hsuan Chao (Jessie) committed
      cu.useRotation[compBegin] = m_bestScanRotationMode;
      // number of distinct index values: palette entries plus one for escape when it is used
      int indexMaxSize = cu.useEscape[compBegin] ? (cu.curPLTSize[compBegin] + 1) : cu.curPLTSize[compBegin];
      if (indexMaxSize <= 1)
        cu.useRotation[compBegin] = false;
      //reconstruct pixel
      PelBuf    curPLTIdx = tu.getcurPLTIdx(compBegin);
      for (uint32_t y = 0; y < height; y++)
        for (uint32_t x = 0; x < width; x++)
          // NOTE(review): the left operand is truncated in this listing; presumably
          // `curPLTIdx.at(x, y)`, i.e. "index equals palette size => escape pixel" - confirm.
          if (, y) == cu.curPLTSize[compBegin])
            calcPixelPred(cs, partitioner, y, x, compBegin, numComp);
            // NOTE(review): an `else` is presumably missing before this loop (palette-entry path)
            for (uint32_t compID = compBegin; compID < (compBegin + numComp); compID++)
              CompArea area = cu.blocks[compID];
              PelBuf   recBuf = cs.getRecoBuf(area);
              uint32_t scaleX = getComponentScaleX((ComponentID)COMPONENT_Cb, cs.sps->getChromaFormatIdc());
              uint32_t scaleY = getComponentScaleY((ComponentID)COMPONENT_Cb, cs.sps->getChromaFormatIdc());
              if (compBegin != COMPONENT_Y || compID == COMPONENT_Y)
      // NOTE(review): statement truncated in this listing; presumably writes recBuf at (x, y) from
      // the palette entry selected by the index at (x, y) - confirm.
      , y) = cu.curPLT[compID][, y)];
              // chroma handled together with luma: only positions aligned to the chroma grid are written
              else if (compBegin == COMPONENT_Y && compID != COMPONENT_Y && y % (1 << scaleY) == 0 && x % (1 << scaleX) == 0)
       // NOTE(review): statement truncated in this listing; presumably the same write at the
       // sub-sampled position (x >> scaleX, y >> scaleY) - confirm.
       >> scaleX, y >> scaleY) = cu.curPLT[compID][, y)];
      cs.fracBits = MAX_UINT;
      cs.cost = MAX_DOUBLE;
      // sum the reconstruction distortion over all handled components
      Distortion distortion = 0;
      for (uint32_t comp = compBegin; comp < (compBegin + numComp); comp++)
        const ComponentID compID = ComponentID(comp);
        CPelBuf reco = cs.getRecoBuf(compID);
        CPelBuf org = cs.getOrgBuf(compID);
        // weighted SSE (DF_SSE_WTD) when luma-level delta-QP mapping or LMCS is active, plain SSE otherwise
        // NOTE(review): the `else` lines separating the three getDistPart calls are not visible here
        if (m_pcEncCfg->getLumaLevelToDeltaQPMapping().isEnabled() || (
          m_pcEncCfg->getLmcs() && (cs.picHeader->getLmcsEnabledFlag() && m_pcReshape->getCTUFlag())))
          const CPelBuf orgLuma = cs.getOrgBuf(cs.area.blocks[COMPONENT_Y]);
          if (compID == COMPONENT_Y && !(m_pcEncCfg->getLumaLevelToDeltaQPMapping().isEnabled()))
            const CompArea &areaY = cu.Y();
            CompArea tmpArea1(COMPONENT_Y, areaY.chromaFormat, Position(0, 0), areaY.size());
            // NOTE(review): tmpRecLuma is read as-is here; any statement that fills it is not visible
            PelBuf   tmpRecLuma = m_tmpStorageLCU.getBuf(tmpArea1);
            distortion += m_pcRdCost->getDistPart(org, tmpRecLuma, cs.sps->getBitDepth(toChannelType(compID)), compID, DF_SSE_WTD, &orgLuma);
            distortion += m_pcRdCost->getDistPart(org, reco, cs.sps->getBitDepth(toChannelType(compID)), compID, DF_SSE_WTD, &orgLuma);
          distortion += m_pcRdCost->getDistPart(org, reco, cs.sps->getBitDepth(toChannelType(compID)), compID, DF_SSE);
      cs.dist += distortion;
      // NOTE(review): `area` is not used by any visible line; the rest of the function may be missing
      const CompArea &area = cu.blocks[compBegin];
    // Quantises one pixel and reconstructs it again, per component, for the palette RD estimate.
    //
    // For every component ch in [compBegin, compBegin + numComp):
    //   paPixelValue[ch] - receives the quantised level of orgBuf[ch], clamped at 0.
    //   paRecoValue[ch]  - receives the de-quantised level, clipped to the channel bit depth.
    //
    // orgBuf, paPixelValue and paRecoValue are indexed by component id (not by loop count), so
    // each must provide at least compBegin + numComp entries.
    //
    // The per-component quantiser parameters are computed inside the loop; no fixed-size scratch
    // arrays are needed.
    void IntraSearch::calcPixelPredRD(CodingStructure& cs, Partitioner& partitioner, Pel* orgBuf, Pel* paPixelValue, Pel* paRecoValue, ComponentID compBegin, uint32_t numComp)
    {
      CodingUnit    &cu = *cs.getCU(partitioner.chType);
      TransformUnit &tu = *cs.getTU(partitioner.chType);

      for (uint32_t ch = compBegin; ch < (compBegin + numComp); ch++)
      {
        QpParam cQP(tu, ComponentID(ch));
        const int qp    = cQP.Qp(true);
        const int qpRem = qp % 6;
        const int qpPer = qp / 6;

        // forward quantiser: scale, add the rounding offset, shift
        const int quantiserScale      = g_quantScales[0][qpRem];
        const int quantiserRightShift = QUANT_SHIFT + qpPer;
        const int rightShiftOffset    = 1 << (quantiserRightShift - 1);

        // inverse quantiser: fixed shift with its own rounding offset
        const int invquantiserRightShift = IQUANT_SHIFT;
        const int add                    = 1 << (invquantiserRightShift - 1);

        const int channelBitDepth = cu.cs->sps->getBitDepth(toChannelType((ComponentID)ch));

        paPixelValue[ch] = Pel(std::max<int>(0, ((orgBuf[ch] * quantiserScale + rightShiftOffset) >> quantiserRightShift)));
        assert(paPixelValue[ch] < (1 << (channelBitDepth + 1)));

        // the intermediate result is stored as Pel before clipping, exactly as in the original two-step form
        paRecoValue[ch] = (((paPixelValue[ch] * g_invQuantScales[0][qpRem]) << qpPer) + add) >> invquantiserRightShift;
        paRecoValue[ch] = Pel(ClipBD<int>(paRecoValue[ch], channelBitDepth));//to be checked
      }
    }
    // Pre-computes, for every pixel of the CU, the cost of coding it with each palette entry and
    // with an escape value.
    //
    // Results written by the visible code:
    //   m_indexError[idx][rasPos]          - squared error of palette entry idx at raster position rasPos
    //   m_indexError[curPLTSize][rasPos]   - RD cost (error + lambda * rate) of the escape alternative
    //   m_minErrorIndexMap[rasPos]         - index with the smallest of those values
    //
    // NOTE(review): this listing has lost its brace-only lines and several short statements
    // (`else`, the `pltIdx` increment); confirm control flow against the upstream file.
    void IntraSearch::preCalcPLTIndexRD(CodingStructure& cs, Partitioner& partitioner, ComponentID compBegin, uint32_t numComp)
      CodingUnit &cu = *cs.getCU(partitioner.chType);
      uint32_t height = cu.block(compBegin).height;
      uint32_t width = cu.block(compBegin).width;
      CPelBuf   orgBuf[3];
      for (int comp = compBegin; comp < (compBegin + numComp); comp++)
        CompArea  area = cu.blocks[comp];
        if (m_pcEncCfg->getLmcs() && (cs.picHeader->getLmcsEnabledFlag() && m_pcReshape->getCTUFlag()))
    // NOTE(review): the following line is web-page residue (a run of line numbers), not code
    1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000
          // source samples: the prediction buffer under the LMCS condition above, the original buffer otherwise
          // NOTE(review): the `else` between the next two assignments is presumably missing
          orgBuf[comp] = cs.getPredBuf(area);
          orgBuf[comp] = cs.getOrgBuf(area);
      int rasPos;
      uint32_t scaleX = getComponentScaleX(COMPONENT_Cb, cs.sps->getChromaFormatIdc());
      uint32_t scaleY = getComponentScaleY(COMPONENT_Cb, cs.sps->getChromaFormatIdc());
      for (uint32_t y = 0; y < height; y++)
        for (uint32_t x = 0; x < width; x++)
          rasPos = y * width + x;;
          // chroma discard
          // when starting from luma, chroma is ignored wherever (y & scaleY) or (x & scaleX) is non-zero
          bool discardChroma = (compBegin == COMPONENT_Y) && (y&scaleY || x&scaleX);
          // gather the source sample of every component; chroma coordinates are sub-sampled when coded with luma
          Pel curPel[3];
          for (int comp = compBegin; comp < (compBegin + numComp); comp++)
            uint32_t pX1 = (comp > 0 && compBegin == COMPONENT_Y) ? (x >> scaleX) : x;
            uint32_t pY1 = (comp > 0 && compBegin == COMPONENT_Y) ? (y >> scaleY) : y;
            curPel[comp] = orgBuf[comp].at(pX1, pY1);
          // --- cost of each palette entry ---
          uint8_t  pltIdx = 0;
          double minError = MAX_DOUBLE;
          uint8_t  bestIdx = 0;
          // NOTE(review): the increment of pltIdx is not visible in this listing
          while (pltIdx < cu.curPLTSize[compBegin])
            // despite its name, sqrtError accumulates squared differences
            uint64_t sqrtError = 0;
            for (int comp = compBegin; comp < (discardChroma ? 1 : (compBegin + numComp)); comp++)
              int64_t tmpErr = int64_t(curPel[comp] - cu.curPLT[comp][pltIdx]);
              // chroma error is scaled by ENC_CHROMA_WEIGHTING
              // NOTE(review): an `else` between the two accumulations is presumably missing
              if (isChroma((ComponentID)comp))
                sqrtError += uint64_t(tmpErr*tmpErr*ENC_CHROMA_WEIGHTING);
                sqrtError += tmpErr*tmpErr;
            m_indexError[pltIdx][rasPos] = (double)sqrtError;
            if (sqrtError < minError)
              minError = (double)sqrtError;
              bestIdx = pltIdx;
          // --- cost of the escape alternative: quantise/reconstruct the pixel, then error + lambda * rate ---
          Pel paPixelValue[3], paRecoValue[3];
          calcPixelPredRD(cs, partitioner, curPel, paPixelValue, paRecoValue, compBegin, numComp);
          uint64_t error = 0, rate = 0;
          for (int comp = compBegin; comp < (discardChroma ? 1 : (compBegin + numComp)); comp++)
            int64_t tmpErr = int64_t(curPel[comp] - paRecoValue[comp]);
            // NOTE(review): an `else` between the two accumulations is presumably missing
            if (isChroma((ComponentID)comp))
              error += uint64_t(tmpErr*tmpErr*ENC_CHROMA_WEIGHTING);
              error += tmpErr*tmpErr;
            rate += m_escapeNumBins[paPixelValue[comp]]; // encode quantized escape color
          double rdCost = (double)error + m_pcRdCost->getLambda()*(double)rate;
          // the escape alternative is stored under index == palette size
          m_indexError[cu.curPLTSize[compBegin]][rasPos] = rdCost;
          if (rdCost < minError) 
            minError = rdCost;
            bestIdx = (uint8_t)cu.curPLTSize[compBegin];
          m_minErrorIndexMap[rasPos] = bestIdx; // save the optimal index of the current pixel
    // Builds the palette index / run-type map of the CU for one scan direction and keeps it if it
    // is cheaper than the best found so far.
    //
    // pltScanMode - scan direction to evaluate (selects the traverse scan order below).
    // dMinCost    - in/out: best cost so far; when this scan's total cost is lower, the TU's index
    //               and run-type arrays, cu.useEscape and m_bestScanRotationMode are overwritten
    //               and dMinCost is updated.
    //
    // NOTE(review): this listing has lost its brace-only lines and several short statements
    // (`return`, `else`, `break`); confirm control flow against the upstream file.
    void IntraSearch::deriveIndexMap(CodingStructure& cs, Partitioner& partitioner, ComponentID compBegin, uint32_t numComp, PLTScanMode pltScanMode, double& dMinCost)
      CodingUnit    &cu = *cs.getCU(partitioner.chType);
      TransformUnit &tu = *cs.getTU(partitioner.chType);
      uint32_t      height = cu.block(compBegin).height;
      uint32_t      width = cu.block(compBegin).width;
      int   total     = height*width;
      // output arrays inside the TU; written only if this scan wins (see the end of the function)
      Pel  *runIndex = tu.getPLTIndex(compBegin);
      bool *runType  = tu.getRunTypes(compBegin);
      // a non-zero pltScanMode selects the vertical traverse order, zero the horizontal one
      m_scanOrder = g_scanOrder[SCAN_UNGROUPED][pltScanMode ? SCAN_TRAV_VER : SCAN_TRAV_HOR][gp_sizeIdxInfo->idxFrom(width)][gp_sizeIdxInfo->idxFrom(height)];
    // Trellis initialization
      for (int i = 0; i < 2; i++)
        memset(m_prevRunTypeRDOQ[i], 0, sizeof(Pel)*NUM_TRELLIS_STATE);
        memset(m_prevRunPosRDOQ[i],  0, sizeof(int)*NUM_TRELLIS_STATE);
        memset(m_stateCostRDOQ[i],  0, sizeof (double)*NUM_TRELLIS_STATE);
      for (int state = 0; state < NUM_TRELLIS_STATE; state++)
        m_statePtRDOQ[state][0] = 0;
    // Context modeling
      // fetch the fractional-bit tables once; they are handed to the per-subblock search below
      const FracBitsAccess& fracBits = m_CABACEstimator->getCtx().getFracBitsAcess();
      BinFracBits fracBitsPltCopyFlagIndex[RUN_IDX_THRE + 1];
      for (int dist = 0; dist <= RUN_IDX_THRE; dist++)
        const unsigned  ctxId = DeriveCtx::CtxPltCopyFlag(PLT_RUN_INDEX, dist);
        fracBitsPltCopyFlagIndex[dist] = fracBits.getFracBitsArray(Ctx::IdxRunModel( ctxId ) );
      BinFracBits fracBitsPltCopyFlagAbove[RUN_IDX_THRE + 1];
      for (int dist = 0; dist <= RUN_IDX_THRE; dist++)
        const unsigned  ctxId = DeriveCtx::CtxPltCopyFlag(PLT_RUN_COPY, dist);
        fracBitsPltCopyFlagAbove[dist] = fracBits.getFracBitsArray(Ctx::CopyRunModel( ctxId ) );
      const BinFracBits fracBitsPltRunType = fracBits.getFracBitsArray( Ctx::RunTypeFlag() );
    // Trellis RDO per CG
      // process the scan in groups of (1 << LOG2_PALETTE_CG_SIZE) positions; stop as soon as a group reports false
      bool contTrellisRD = true;
      for (int subSetId = 0; ( subSetId <= (total - 1) >> LOG2_PALETTE_CG_SIZE ) && contTrellisRD; subSetId++)
        int minSubPos = subSetId << LOG2_PALETTE_CG_SIZE;
        int maxSubPos = minSubPos + (1 << LOG2_PALETTE_CG_SIZE);
        maxSubPos = (maxSubPos > total) ? total : maxSubPos; // if last position is out of the current CU size
        contTrellisRD = deriveSubblockIndexMap(cs, partitioner, compBegin, pltScanMode, minSubPos, maxSubPos, fracBitsPltRunType, fracBitsPltCopyFlagIndex, fracBitsPltCopyFlagAbove, dMinCost, (bool)pltScanMode);
      // NOTE(review): the body of this condition is not visible (presumably an early `return`)
      if (!contTrellisRD)
    // best state at the last scan position
      double  sumRdCost = MAX_DOUBLE;
      uint8_t bestState = 0;
      for (uint8_t state = 0; state < NUM_TRELLIS_STATE; state++)
        if (m_stateCostRDOQ[0][state] < sumRdCost)
          sumRdCost = m_stateCostRDOQ[0][state];
          bestState = state;
         // scratch tables, indexed by raster position (bestStateTable is indexed by scan position)
         bool checkRunTable  [MAX_CU_BLKSIZE_PLT*MAX_CU_BLKSIZE_PLT];
      uint8_t checkIndexTable[MAX_CU_BLKSIZE_PLT*MAX_CU_BLKSIZE_PLT];
      uint8_t bestStateTable [MAX_CU_BLKSIZE_PLT*MAX_CU_BLKSIZE_PLT];
      uint8_t nextState = bestState;
    // best trellis path
      // walk the state pointers backwards from the last scan position
      for (int i = (width*height - 1); i >= 0; i--)
        bestStateTable[i] = nextState;
        int rasterPos = m_scanOrder[i].idx;
        nextState = m_statePtRDOQ[nextState][rasterPos];
    // reconstruct index and runs based on the state pointers
      for (int i = 0; i < (width*height); i++)
        int rasterPos = m_scanOrder[i].idx;
        // "above" neighbour: one row up for the horizontal scan, one column left otherwise
        int  abovePos = (pltScanMode == PLT_SCAN_HORTRAV) ? m_scanOrder[i].idx - width : m_scanOrder[i].idx - 1;
            nextState = bestStateTable[i];
        // NOTE(review): state 0 reads m_scanOrder[i - 1]; this assumes state 0 is never selected at i == 0 - confirm
        if ( nextState == 0 ) // same as the previous
          checkRunTable[rasterPos] = checkRunTable[ m_scanOrder[i - 1].idx ];
          // NOTE(review): an `else` between the two index assignments is presumably missing
          if ( checkRunTable[rasterPos] == PLT_RUN_INDEX )
            checkIndexTable[rasterPos] = checkIndexTable[m_scanOrder[i - 1].idx];
            checkIndexTable[rasterPos] = checkIndexTable[ abovePos ];
        else if (nextState == 1) // CopyAbove mode
          checkRunTable[rasterPos] = PLT_RUN_COPY;
          checkIndexTable[rasterPos] = checkIndexTable[abovePos];
        else if (nextState == 2) // Index mode
          checkRunTable[rasterPos] = PLT_RUN_INDEX;
          checkIndexTable[rasterPos] = m_minErrorIndexMap[rasterPos];
    // Escape flag
      // escape is in use if any pixel was assigned the index equal to the palette size
      m_bestEscape = false;
      for (int pos = 0; pos < (width*height); pos++)
        uint8_t index = checkIndexTable[pos];
        if (index == cu.curPLTSize[compBegin])
          m_bestEscape = true;
    // Horizontal scan v.s vertical scan
      // commit this scan's result only when it beats the best cost seen so far
      if (sumRdCost < dMinCost)
        cu.useEscape[compBegin] = m_bestEscape;
        m_bestScanRotationMode = pltScanMode;
        for (int pos = 0; pos < (width*height); pos++)
          runIndex[pos] = checkIndexTable[pos];
          runType[pos] = checkRunTable[pos];
        dMinCost = sumRdCost;
    bool IntraSearch::deriveSubblockIndexMap(
      CodingStructure& cs,
      Partitioner&  partitioner,
      ComponentID   compBegin,
      PLTScanMode   pltScanMode,
      int           minSubPos,
      int           maxSubPos,
      const BinFracBits& fracBitsPltRunType,
      const BinFracBits* fracBitsPltIndexINDEX,
      const BinFracBits* fracBitsPltIndexCOPY,
      const double minCost,
      bool         useRotate
      CodingUnit &cu    = *cs.getCU(partitioner.chType);
      uint32_t   height = cu.block(compBegin).height;
      uint32_t   width  = cu.block(compBegin).width;
      int indexMaxValue = cu.curPLTSize[compBegin];
      int refId = 0;
      int currRasterPos, currScanPos, prevScanPos, aboveScanPos, roffset;
      int log2Width = (pltScanMode == PLT_SCAN_HORTRAV) ? floorLog2(width): floorLog2(height);
      int buffersize = (pltScanMode == PLT_SCAN_HORTRAV) ? 2*width: 2*height;
      for (int curPos = minSubPos; curPos < maxSubPos; curPos++)
        currRasterPos = m_scanOrder[curPos].idx;
        prevScanPos = (curPos == 0) ? 0 : (curPos - 1) % buffersize;
        roffset = (curPos >> log2Width) << log2Width;
        aboveScanPos = roffset - (curPos - roffset + 1);
        aboveScanPos %= buffersize;
        currScanPos = curPos % buffersize;
        if ((pltScanMode == PLT_SCAN_HORTRAV && curPos < width) || (pltScanMode == PLT_SCAN_VERTRAV && curPos < height))
          aboveScanPos = -1; // first column/row: above row is not valid
    // Trellis states:
    // 1st state: same as previous scanned sample
    // 2nd state: Copy_Above mode
    // 3rd state: Index mode 
    // Loop of current state
        for ( int curState = 0; curState < NUM_TRELLIS_STATE; curState++ ) 
          double    minRdCost          = MAX_DOUBLE;
          int       minState           = 0; // best prevState
          uint8_t   bestRunIndex       = 0;
          bool      bestRunType        = 0;
          bool      bestPrevCodedType  = 0;
          int       bestPrevCodedPos   = 0;
          if ( ( curState == 0 && curPos == 0 ) || ( curState == 1 && aboveScanPos < 0 ) ) // state not available
            m_stateCostRDOQ[1 - refId][curState] = MAX_DOUBLE;
          bool    runType  = 0;
          uint8_t runIndex = 0;
          if ( curState == 1 ) // 2nd state: Copy_Above mode
            runType = PLT_RUN_COPY;
          else if ( curState == 2 ) // 3rd state: Index mode 
            runType = PLT_RUN_INDEX;
            runIndex = m_minErrorIndexMap[currRasterPos];
    // Loop of previous state
          for ( int stateID = 0; stateID < NUM_TRELLIS_STATE; stateID++ ) 
            if ( m_stateCostRDOQ[refId][stateID] == MAX_DOUBLE )
            if ( curState == 0 ) // 1st state: same as previous scanned sample
              runType = m_runMapRDOQ[refId][stateID][prevScanPos];
              runIndex = ( runType == PLT_RUN_INDEX ) ? m_indexMapRDOQ[refId][stateID][ prevScanPos ] : m_indexMapRDOQ[refId][stateID][ aboveScanPos ];
            else if ( curState == 1 ) // 2nd state: Copy_Above mode
              runIndex = m_indexMapRDOQ[refId][stateID][aboveScanPos];
            bool    prevRunType   = m_runMapRDOQ[refId][stateID][prevScanPos];
            uint8_t prevRunIndex  = m_indexMapRDOQ[refId][stateID][prevScanPos];
            uint8_t aboveRunIndex = (aboveScanPos >= 0) ? m_indexMapRDOQ[refId][stateID][aboveScanPos] : 0;
            int      dist = curPos - m_prevRunPosRDOQ[refId][stateID] - 1;
            double rdCost = m_stateCostRDOQ[refId][stateID];
            if ( rdCost >= minRdCost ) continue;
    // Calculate Rd cost 
            bool prevCodedRunType = m_prevRunTypeRDOQ[refId][stateID];
            int  prevCodedPos     = m_prevRunPosRDOQ [refId][stateID];
            const BinFracBits* fracBitsPt = (m_prevRunTypeRDOQ[refId][stateID] == PLT_RUN_INDEX) ? fracBitsPltIndexINDEX : fracBitsPltIndexCOPY;
            rdCost += rateDistOptPLT(runType, runIndex, prevRunType, prevRunIndex, aboveRunIndex, prevCodedRunType, prevCodedPos, curPos, (pltScanMode == PLT_SCAN_HORTRAV) ? width : height, dist, indexMaxValue, fracBitsPt, fracBitsPltRunType);
            if (rdCost < minRdCost) // update minState ( minRdCost )
              minRdCost    = rdCost;
              minState     = stateID;
              bestRunType  = runType;
              bestRunIndex = runIndex;
              bestPrevCodedType = prevCodedRunType;
              bestPrevCodedPos  = prevCodedPos;
    // Update trellis info of current state
          m_stateCostRDOQ  [1 - refId][curState]  = minRdCost;
          m_prevRunTypeRDOQ[1 - refId][curState]  = bestPrevCodedType;
          m_prevRunPosRDOQ [1 - refId][curState]  = bestPrevCodedPos;
          m_statePtRDOQ[curState][currRasterPos] = minState;
          int buffer2update = std::min(buffersize, curPos);
          memcpy(m_indexMapRDOQ[1 - refId][curState], m_indexMapRDOQ[refId][minState], sizeof(uint8_t)*buffer2update);
          memcpy(m_runMapRDOQ[1 - refId][curState], m_runMapRDOQ[refId][minState], sizeof(bool)*buffer2update);
          m_indexMapRDOQ[1 - refId][curState][currScanPos] = bestRunIndex;
          m_runMapRDOQ  [1 - refId][curState][currScanPos] = bestRunType;