QuantRDOQ.cpp

                uint32_t uiBlkPos = cctx.blockPos( iScanPos );

                if (piDstCoeff[ uiBlkPos ])
                {
                  piDstCoeff [ uiBlkPos ] = 0;
                  pdCostCoeff[ iScanPos ] = pdCostCoeff0[ iScanPos ];
                  pdCostSig  [ iScanPos ] = 0;
                }
              }
            } // end if ( d64CostAllZeros < d64BaseCost )
          }
        } // end if if (uiSigCoeffGroupFlag[ uiCGBlkPos ] == 0)
      }
      else
      {
        cctx.setSigGroup();
      }
    }
  } //end for (cctx.subSetId)


  //===== estimate last position =====
  if ( iLastScanPos < 0 )
  {
    return;
  }

  double  d64BestCost         = 0;
  int     iBestLastIdxP1      = 0;


  if( !CU::isIntra( *tu.cu ) && isLuma( compID ) && tu.depth == 0 )
  {
    const BinFracBits fracBitsQtRootCbf = fracBits.getFracBitsArray( Ctx::QtRootCbf() );
    d64BestCost  = d64BlockUncodedCost + xGetICost( fracBitsQtRootCbf.intBits[ 0 ] );
    d64BaseCost += xGetICost( fracBitsQtRootCbf.intBits[ 1 ] );
  }
  else
  {
    bool previousCbf       = tu.cbf[COMPONENT_Cb];
    bool lastCbfIsInferred = false;
    if( useIntraSubPartitions )
    {
      bool rootCbfSoFar       = false;
      bool isLastSubPartition = CU::isISPLast(*tu.cu, tu.Y(), compID);
      uint32_t nTus = tu.cu->ispMode == HOR_INTRA_SUBPARTITIONS ? tu.cu->lheight() >> g_aucLog2[tu.lheight()] : tu.cu->lwidth() >> g_aucLog2[tu.lwidth()];
      if( isLastSubPartition )
      {
        TransformUnit* tuPointer = tu.cu->firstTU;
        for( int tuIdx = 0; tuIdx < nTus - 1; tuIdx++ )
        {
          rootCbfSoFar |= TU::getCbfAtDepth(*tuPointer, COMPONENT_Y, tu.depth);
          tuPointer     = tuPointer->next;
        }
        if( !rootCbfSoFar )
        {
          lastCbfIsInferred = true;
        }
      }
      if( !lastCbfIsInferred )
      {
        previousCbf = TU::getPrevTuCbfAtDepth(tu, compID, tu.depth);
      }
    }
    BinFracBits fracBitsQtCbf = fracBits.getFracBitsArray( Ctx::QtCbf[compID]( DeriveCtx::CtxQtCbf( rect.compID, tu.depth, previousCbf, useIntraSubPartitions ) ) );

    if( !lastCbfIsInferred )
    {
      d64BestCost  = d64BlockUncodedCost + xGetICost(fracBitsQtCbf.intBits[0]);
      d64BaseCost += xGetICost(fracBitsQtCbf.intBits[1]);
    }
    else
    {
      d64BestCost  = d64BlockUncodedCost;
    }
  }

  int lastBitsX[LAST_SIGNIFICANT_GROUPS] = { 0 };
  int lastBitsY[LAST_SIGNIFICANT_GROUPS] = { 0 };
  {
#if HEVC_USE_MDCS
    int dim1  = ( cctx.scanType() == SCAN_VER ? uiHeight : uiWidth  );
    int dim2  = ( cctx.scanType() == SCAN_VER ? uiWidth  : uiHeight );
#else
    int dim1 = std::min<int>(JVET_C0024_ZERO_OUT_TH, uiWidth);
    int dim2 = std::min<int>(JVET_C0024_ZERO_OUT_TH, uiHeight);
#endif
    int bitsX = 0;
    int bitsY = 0;
    int ctxId;
    //X-coordinate
    for ( ctxId = 0; ctxId < g_uiGroupIdx[dim1-1]; ctxId++)
    {
      const BinFracBits fB = fracBits.getFracBitsArray( cctx.lastXCtxId(ctxId) );
      lastBitsX[ ctxId ]   = bitsX + fB.intBits[ 0 ];
      bitsX               +=         fB.intBits[ 1 ];
    }
    lastBitsX[ctxId] = bitsX;
    //Y-coordinate
    for ( ctxId = 0; ctxId < g_uiGroupIdx[dim2-1]; ctxId++)
    {
      const BinFracBits fB = fracBits.getFracBitsArray( cctx.lastYCtxId(ctxId) );
      lastBitsY[ ctxId ]   = bitsY + fB.intBits[ 0 ];
      bitsY               +=         fB.intBits[ 1 ];
    }
    lastBitsY[ctxId] = bitsY;
  }


  bool bFoundLast = false;
  for (int iCGScanPos = iCGLastScanPos; iCGScanPos >= 0; iCGScanPos--)
  {
    d64BaseCost -= pdCostCoeffGroupSig [ iCGScanPos ];
    if (cctx.isSigGroup( iCGScanPos ) )
    {
#if JVET_N0193_LFNST
      uint32_t maxNonZeroPosInCG = iCGSizeM1;
      if( lfnstIdx > 0 && ( ( uiWidth == 4 && uiHeight == 4 ) || ( uiWidth == 8 && uiHeight == 8 && cctx.cgPosX() == 0 && cctx.cgPosY() == 0 ) ) )
      {
        maxNonZeroPosInCG = 7;
      }
      for( int iScanPosinCG = maxNonZeroPosInCG; iScanPosinCG >= 0; iScanPosinCG-- )
#else
      for (int iScanPosinCG = iCGSizeM1; iScanPosinCG >= 0; iScanPosinCG--)
#endif
      {
        iScanPos = iCGScanPos * (iCGSizeM1 + 1) + iScanPosinCG;

        if (iScanPos > iLastScanPos)
        {
          continue;
        }
        uint32_t   uiBlkPos     = cctx.blockPos( iScanPos );

        if( piDstCoeff[ uiBlkPos ] )
        {
          uint32_t   uiPosY = uiBlkPos >> uiLog2BlockWidth;
          uint32_t   uiPosX = uiBlkPos - ( uiPosY << uiLog2BlockWidth );
#if HEVC_USE_MDCS
          double d64CostLast  = ( cctx.scanType() == SCAN_VER ? xGetRateLast( lastBitsX, lastBitsY, uiPosY, uiPosX ) : xGetRateLast( lastBitsX, lastBitsY, uiPosX, uiPosY ) );
#else
          double d64CostLast  = xGetRateLast( lastBitsX, lastBitsY, uiPosX, uiPosY );
#endif

          double totalCost = d64BaseCost + d64CostLast - pdCostSig[ iScanPos ];

          if( totalCost < d64BestCost )
          {
            iBestLastIdxP1  = iScanPos + 1;
            d64BestCost     = totalCost;
          }
          if( piDstCoeff[ uiBlkPos ] > 1 )
          {
            bFoundLast = true;
            break;
          }
          d64BaseCost      -= pdCostCoeff[ iScanPos ];
          d64BaseCost      += pdCostCoeff0[ iScanPos ];
        }
        else
        {
          d64BaseCost      -= pdCostSig[ iScanPos ];
        }
      } //end for
      if (bFoundLast)
      {
        break;
      }
    } // end if (uiSigCoeffGroupFlag[ uiCGBlkPos ])
    DTRACE( g_trace_ctx, D_RDOQ_COST, "%d: %3d, %3d, %dx%d, comp=%d\n", DTRACE_GET_COUNTER( g_trace_ctx, D_RDOQ_COST ), rect.x, rect.y, rect.width, rect.height, compID );
    DTRACE( g_trace_ctx, D_RDOQ_COST, "Uncoded=%d\n", (int64_t)( d64BlockUncodedCost ) );
    DTRACE( g_trace_ctx, D_RDOQ_COST, "Coded  =%d\n", (int64_t)( d64BaseCost ) );

  } // end for


  for ( int scanPos = 0; scanPos < iBestLastIdxP1; scanPos++ )
  {
    int blkPos = cctx.blockPos( scanPos );
    TCoeff level = piDstCoeff[ blkPos ];
    uiAbsSum += level;
    piDstCoeff[ blkPos ] = ( plSrcCoeff[ blkPos ] < 0 ) ? -level : level;
  }

  //===== clean uncoded coefficients =====
  for ( int scanPos = iBestLastIdxP1; scanPos <= iLastScanPos; scanPos++ )
  {
    piDstCoeff[ cctx.blockPos( scanPos ) ] = 0;
  }

#if HEVC_USE_SIGN_HIDING
  if( cctx.signHiding() && uiAbsSum>=2)
  {
#if JVET_N0246_MODIFIED_QUANTSCALES
    const double inverseQuantScale = double(g_invQuantScales[0][cQP.rem]);
#else
    const double inverseQuantScale = double(g_invQuantScales[cQP.rem]);
#endif
    int64_t rdFactor = (int64_t)(inverseQuantScale * inverseQuantScale * (1 << (2 * cQP.per)) / m_dLambda / 16
                               / (1 << (2 * DISTORTION_PRECISION_ADJUSTMENT(channelBitDepth)))
#if HM_QTBT_AS_IN_JEM_QUANT
#else
                              * blkErrScale
#endif
                             + 0.5);

    int lastCG = -1;
    int absSum = 0 ;
    int n ;
    for (int subSet = iCGNum - 1; subSet >= 0; subSet--)
    {
      int  subPos         = subSet << cctx.log2CGSize();
      int  firstNZPosInCG = iCGSizeM1 + 1, lastNZPosInCG = -1;
      absSum = 0 ;

      for( n = iCGSizeM1; n >= 0; --n )
      {
        if( piDstCoeff[ cctx.blockPos( n + subPos )] )
        {
          lastNZPosInCG = n;
          break;
        }
      }

      for( n = 0; n <= iCGSizeM1; n++ )
      {
        if( piDstCoeff[ cctx.blockPos( n + subPos )] )
        {
          firstNZPosInCG = n;
          break;
        }
      }

      for( n = firstNZPosInCG; n <= lastNZPosInCG; n++ )
      {
        absSum += int(piDstCoeff[ cctx.blockPos( n + subPos )]);
      }

      if(lastNZPosInCG>=0 && lastCG==-1)
      {
        lastCG = 1;
      }

      if( lastNZPosInCG-firstNZPosInCG>=SBH_THRESHOLD )
      {
        uint32_t signbit = (piDstCoeff[cctx.blockPos(subPos+firstNZPosInCG)]>0?0:1);
        if( signbit!=(absSum&0x1) )  // hide but need tune
        {
          // calculate the cost
          int64_t minCostInc = std::numeric_limits<int64_t>::max(), curCost = std::numeric_limits<int64_t>::max();
          int minPos = -1, finalChange = 0, curChange = 0;

          for( n = (lastCG == 1 ? lastNZPosInCG : iCGSizeM1); n >= 0; --n )
          {
            uint32_t uiBlkPos   = cctx.blockPos( n + subPos );
            if(piDstCoeff[ uiBlkPos ] != 0 )
            {
              int64_t costUp   = rdFactor * ( - deltaU[uiBlkPos] ) + rateIncUp[uiBlkPos];
              int64_t costDown = rdFactor * (   deltaU[uiBlkPos] ) + rateIncDown[uiBlkPos]
                -   ((abs(piDstCoeff[uiBlkPos]) == 1) ? sigRateDelta[uiBlkPos] : 0);

              if(lastCG==1 && lastNZPosInCG==n && abs(piDstCoeff[uiBlkPos])==1)
              {
                costDown -= (4<<SCALE_BITS);
              }

              if(costUp<costDown)
              {
                curCost = costUp;
                curChange =  1;
              }
              else
              {
                curChange = -1;
                if(n==firstNZPosInCG && abs(piDstCoeff[uiBlkPos])==1)
                {
                  curCost = std::numeric_limits<int64_t>::max();
                }
                else
                {
                  curCost = costDown;
                }
              }
            }
            else
            {
              curCost = rdFactor * ( - (abs(deltaU[uiBlkPos])) ) + (1<<SCALE_BITS) + rateIncUp[uiBlkPos] + sigRateDelta[uiBlkPos] ;
              curChange = 1 ;

              if(n<firstNZPosInCG)
              {
                uint32_t thissignbit = (plSrcCoeff[uiBlkPos]>=0?0:1);
                if(thissignbit != signbit )
                {
                  curCost = std::numeric_limits<int64_t>::max();
                }
              }
            }

            if( curCost<minCostInc)
            {
              minCostInc = curCost;
              finalChange = curChange;
              minPos = uiBlkPos;
            }
          }

          if(piDstCoeff[minPos] == entropyCodingMaximum || piDstCoeff[minPos] == entropyCodingMinimum)
          {
            finalChange = -1;
          }

          if(plSrcCoeff[minPos]>=0)
          {
            piDstCoeff[minPos] += finalChange ;
          }
          else
          {
            piDstCoeff[minPos] -= finalChange ;
          }
        }
      }

      if(lastCG==1)
      {
        lastCG=0 ;
      }
    }
  }
#endif
}

#if JVET_N0280_RESIDUAL_CODING_TS
/** Rate-distortion optimised quantisation of a transform-skip residual block.
 *
 *  The block is visited sub-block by sub-block in scan order. For every
 *  coefficient xGetCodedLevelTS() selects a level; after each sub-block the
 *  accumulated cost of coding it is compared with the cost of signalling the
 *  whole sub-block as zero, and the cheaper alternative is kept.
 *
 *  \param tu      transform unit; quantised levels are written to tu.getCoeffs( compID )
 *  \param compID  colour component being quantised
 *  \param coeffs  source (unquantised) coefficients
 *  \param absSum  increased by the sum of the absolute quantised levels (not reset here)
 *  \param qp      quantisation parameter (its per / rem parts are used)
 *  \param ctx     context state from which the fractional bit estimates are read
 */
void QuantRDOQ::xRateDistOptQuantTS( TransformUnit &tu, const ComponentID &compID, const CCoeffBuf &coeffs, TCoeff &absSum, const QpParam &qp, const Ctx &ctx )
{
  const FracBitsAccess& fracBits = ctx.getFracBitsAcess();

  const SPS &sps            = *tu.cs->sps;
  const CompArea &rect      = tu.blocks[compID];
  const uint32_t width      = rect.width;
  const uint32_t height     = rect.height;
  const ChannelType chType  = toChannelType(compID);
  const int channelBitDepth = sps.getBitDepth( chType );

  const bool extendedPrecision     = sps.getSpsRangeExtension().getExtendedPrecisionProcessingFlag();
  const int  maxLog2TrDynamicRange = sps.getMaxLog2TrDynamicRange(chType);

  int transformShift = getTransformShift( channelBitDepth, rect.size(), maxLog2TrDynamicRange );

  if( extendedPrecision )
  {
    transformShift = std::max<int>( 0, transformShift );
  }

        double   blockUncodedCost                   = 0;
#if HEVC_USE_SCALING_LISTS
  const uint32_t log2BlockHeight                    = g_aucLog2[height];
#endif
  const uint32_t maxNumCoeff                        = rect.area();

  CHECK( compID >= MAX_NUM_TBLOCKS, "Invalid component ID" );

#if HEVC_USE_SCALING_LISTS
  int scalingListType = getScalingListType( tu.cu->predMode, compID );
  CHECK( scalingListType >= SCALING_LIST_NUM, "Invalid scaling list" );
#endif

  const TCoeff *srcCoeff = coeffs.buf;
        TCoeff *dstCoeff = tu.getCoeffs( compID ).buf;

  // per-coefficient cost buffers, indexed by scan position
  double *costCoeff  = m_pdCostCoeff;
  double *costSig    = m_pdCostSig;
  double *costCoeff0 = m_pdCostCoeff0;

  memset( m_pdCostCoeff,  0, sizeof( double ) *  maxNumCoeff );
  memset( m_pdCostSig,    0, sizeof( double ) *  maxNumCoeff );

#if JVET_N0246_MODIFIED_QUANTSCALES
  const bool   needsSqrt2Scale = TU::needsSqrt2Scale( tu, compID );  // should always be false - transform-skipped blocks don't require sqrt(2) compensation.
  const int    qBits = QUANT_SHIFT + qp.per + transformShift + (needsSqrt2Scale?-1:0);  // Right shift of non-RDOQ quantizer;  level = (coeff*uiQ + offset)>>q_bits
  const int    quantisationCoefficient = g_quantScales[needsSqrt2Scale?1:0][qp.rem];
  const double errorScale              = xGetErrScaleCoeff( TU::needsSqrt2Scale( tu, compID ), width, height, qp.rem, maxLog2TrDynamicRange, channelBitDepth );
#else

  const int qBits = QUANT_SHIFT + qp.per + transformShift;                   // Right shift of non-RDOQ quantizer;  level = (coeff*uiQ + offset)>>q_bits

#if HM_QTBT_AS_IN_JEM_QUANT
  const int    quantisationCoefficient = ( TU::needsSqrt2Scale( tu, compID ) ? ( g_quantScales[qp.rem] * 181 ) >> 7 : g_quantScales[qp.rem] );
  const double errorScale              = xGetErrScaleCoeff( TU::needsSqrt2Scale( tu, compID ), width, height, qp.rem, maxLog2TrDynamicRange, channelBitDepth );
#else
  const double blkErrScale             = ( TU::needsQP3Offset( tu, compID ) ? 2.0 : 1.0 );
  const int    quantisationCoefficient = g_quantScales[qp.rem];
  const double errorScale              = blkErrScale * xGetErrScaleCoeff( width, height, qp.rem, maxLog2TrDynamicRange, channelBitDepth );
#endif
#endif

  // largest level magnitude that is allowed to be produced
  const TCoeff entropyCodingMaximum = ( 1 << maxLog2TrDynamicRange ) - 1;

#if HEVC_USE_SIGN_HIDING
  CoeffCodingContext cctx( tu, compID, tu.cs->slice->getSignDataHidingEnabledFlag() );
#else
  CoeffCodingContext cctx( tu, compID );
#endif
  const int sbSizeM1    = ( 1 << cctx.log2CGSize() ) - 1;   // coefficients per sub-block, minus one
  double    baseCost    = 0;
  uint32_t  goRiceParam = 0;

  // per-sub-block cost of the significant-group flag
  double *costSigSubBlock = m_pdCostCoeffGroupSig;
  memset( costSigSubBlock, 0, ( maxNumCoeff >> cctx.log2CGSize() ) * sizeof( double ) );

  const int sbNum = width * height >> cctx.log2CGSize();
  int scanPos;
  coeffGroupRDStats rdStats;

  bool anySigCG = false;   // set once a sub-block has been kept as significant

  for( int sbId = 0; sbId < sbNum; sbId++ )
  {
    cctx.initSubblock( sbId );

    memset( &rdStats, 0, sizeof (coeffGroupRDStats));

    for( int scanPosInSB = 0; scanPosInSB <= sbSizeM1; scanPosInSB++ )
    {
      scanPos = cctx.minSubPos() + scanPosInSB;
      //===== quantization =====
      uint32_t blkPos = cctx.blockPos( scanPos );

      // set coeff
      // scaled magnitude, clamped so that adding the rounding offset below cannot overflow Intermediate_Int
      const int64_t          tmpLevel    = int64_t( abs( srcCoeff[blkPos] ) ) * quantisationCoefficient;
      const Intermediate_Int levelDouble = (Intermediate_Int)std::min<int64_t>( tmpLevel, std::numeric_limits<Intermediate_Int>::max() - ( Intermediate_Int( 1 ) << ( qBits - 1 ) ) );
      // rounded level, capped at entropyCodingMaximum
            uint32_t         maxAbsLevel = std::min<uint32_t>( uint32_t( entropyCodingMaximum ), uint32_t( ( levelDouble + ( Intermediate_Int( 1 ) << ( qBits - 1 ) ) ) >> qBits ) );

      // cost of quantising this coefficient to zero
      const double err       = double( levelDouble );
      costCoeff0[ scanPos ]  = err * err * errorScale;
      blockUncodedCost      += costCoeff0[ scanPos ];
      dstCoeff[ blkPos ]     = maxAbsLevel;

      //===== coefficient level estimation =====
            unsigned    ctxIdSig = cctx.sigCtxIdAbsTS( scanPos, dstCoeff );
            uint32_t    cLevel;
      const BinFracBits fracBitsPar = fracBits.getFracBitsArray( cctx.parityCtxIdAbsTS() );

      goRiceParam = cctx.templateAbsSumTS( scanPos, dstCoeff );
      const BinFracBits fracBitsSign = fracBits.getFracBitsArray( Ctx::TsResidualSign( toChannelType(compID) ) );
      const uint8_t     sign         = srcCoeff[ blkPos ] < 0 ? 1 : 0;

      DTRACE_COND( ( maxAbsLevel != 0 ), g_trace_ctx, D_RDOQ_MORE, " uiCtxSig=%d", ctxIdSig );

      const BinFracBits fracBitsSig = fracBits.getFracBitsArray( ctxIdSig );
      cLevel = xGetCodedLevelTS( costCoeff[ scanPos ], costCoeff0[ scanPos ], costSig[ scanPos ],
                                 levelDouble, maxAbsLevel, &fracBitsSig, fracBitsPar, cctx, fracBits, fracBitsSign, sign, goRiceParam, qBits, errorScale, 0, extendedPrecision, maxLog2TrDynamicRange );
      dstCoeff[ blkPos ]  = cLevel;
      baseCost           += costCoeff[ scanPos ];
      rdStats.d64SigCost += costSig[ scanPos ];

      if( scanPosInSB == 0 )
      {
        rdStats.d64SigCost_0 = costSig[ scanPos ];
      }
      if( dstCoeff[ blkPos ] )
      {
        cctx.setSigGroup();
        rdStats.d64CodedLevelandDist += costCoeff [ scanPos ] - costSig[ scanPos ];
        rdStats.d64UncodedDist       += costCoeff0[ scanPos ];
        if( scanPosInSB != 0 )
        {
          rdStats.iNNZbeforePos0++;
        }
      }
    } //end for (scanPosInSB)

    if( !cctx.isSigGroup() )
    {
      // no level survived: replace the individual sig costs by the cost of a zero group flag
      const BinFracBits fracBitsSigGroup = fracBits.getFracBitsArray( cctx.sigGroupCtxId( true ) );
      baseCost += xGetRateSigCoeffGroup( fracBitsSigGroup, 0 ) - rdStats.d64SigCost;
      costSigSubBlock[cctx.subSetId()] = xGetRateSigCoeffGroup( fracBitsSigGroup, 0 );
    }
    // NOTE(review): sbId is a sub-block index while sbSizeM1 is the number of coefficients
    // per sub-block minus one - confirm that this comparison is the intended one.
    else if( sbId != sbSizeM1 || anySigCG )
    {
      if( rdStats.iNNZbeforePos0 == 0 )
      {
        // no non-zero level at any position other than 0: drop the sig cost of position 0
        baseCost -= rdStats.d64SigCost_0;
        rdStats.d64SigCost -= rdStats.d64SigCost_0;
      }
      // rd-cost if SigCoeffGroupFlag = 0, initialization
      double costZeroSB = baseCost;

      const BinFracBits fracBitsSigGroup = fracBits.getFracBitsArray( cctx.sigGroupCtxId( true ) );

      baseCost   += xGetRateSigCoeffGroup( fracBitsSigGroup, 1 );
      costZeroSB += xGetRateSigCoeffGroup( fracBitsSigGroup, 0 );
      costSigSubBlock[ cctx.subSetId() ] = xGetRateSigCoeffGroup( fracBitsSigGroup, 1 );

      costZeroSB += rdStats.d64UncodedDist;         // distortion for resetting non-zero levels to zero levels
      costZeroSB -= rdStats.d64CodedLevelandDist;   // distortion and level cost for keeping all non-zero levels
      costZeroSB -= rdStats.d64SigCost;             // sig cost for all coeffs, including zero levels and non-zero levels

      if( costZeroSB < baseCost )
      {
        cctx.resetSigGroup();
        baseCost = costZeroSB;
        costSigSubBlock[ cctx.subSetId() ] = xGetRateSigCoeffGroup( fracBitsSigGroup, 0 );

        // clear every position of the sub-block (sbSizeM1 is inclusive), so that no
        // level remains in a sub-block that is signalled as zero
        for( int scanPosInSB = 0; scanPosInSB <= sbSizeM1; scanPosInSB++ )
        {
          scanPos = cctx.minSubPos() + scanPosInSB;
          uint32_t blkPos = cctx.blockPos( scanPos );

          if( dstCoeff[ blkPos ] )
          {
            dstCoeff[ blkPos ] = 0;
            costCoeff[ scanPos ] = costCoeff0[ scanPos ];
            costSig[ scanPos] = 0;
          }
        }
      }
      else
      {
        anySigCG = true;
      }
    }
  }

  //===== accumulate absSum and apply the sign of the source coefficients =====
  for( int scanPos = 0; scanPos < maxNumCoeff; scanPos++ )
  {
    int blkPos = cctx.blockPos( scanPos );
    TCoeff level = dstCoeff[ blkPos ];
    absSum += level;
    dstCoeff[ blkPos ] = ( level != 0 && srcCoeff[ blkPos ] < 0 ) ? -level : level;
  }
}

#if JVET_N0413_RDPCM
/** Rate-distortion optimised quantisation with forward residual DPCM.
 *
 *  Each coefficient is predicted from the reconstructed value of its neighbour
 *  (left neighbour when tu.cu->bdpcmMode is 1, upper neighbour otherwise), the
 *  prediction difference is quantised with xGetCodedLevelTS(), and the
 *  reconstruction is stored in m_fullCoeff (indexed by block position) for the
 *  prediction of later coefficients. After each sub-block the cost of coding it
 *  is compared with the cost of signalling it as zero.
 *
 *  \param tu      transform unit; signed quantised levels are written to tu.getCoeffs( compID )
 *  \param compID  colour component being quantised
 *  \param coeffs  source (unquantised) coefficients
 *  \param absSum  increased by the sum of the absolute quantised levels (not reset here)
 *  \param qp      quantisation parameter (its per / rem parts are used)
 *  \param ctx     context state from which the fractional bit estimates are read
 *
 *  NOTE(review): trQuantParams is only declared when JVET_N0246_MODIFIED_QUANTSCALES
 *  is enabled, but it is used unconditionally further down.
 */
void QuantRDOQ::forwardRDPCM( TransformUnit &tu, const ComponentID &compID, const CCoeffBuf &coeffs, TCoeff &absSum, const QpParam &qp, const Ctx &ctx )
{
  const FracBitsAccess& fracBits = ctx.getFracBitsAcess();

  const SPS &sps = *tu.cs->sps;
  const CompArea &rect = tu.blocks[compID];
  const uint32_t width = rect.width;
  const uint32_t height = rect.height;
  const ChannelType chType = toChannelType(compID);
  const int channelBitDepth = sps.getBitDepth(chType);

  const bool extendedPrecision = sps.getSpsRangeExtension().getExtendedPrecisionProcessingFlag();
  const int  maxLog2TrDynamicRange = sps.getMaxLog2TrDynamicRange(chType);
  const int  dirMode = tu.cu->bdpcmMode;

  int transformShift = getTransformShift(channelBitDepth, rect.size(), maxLog2TrDynamicRange);

  if (extendedPrecision)
  {
    transformShift = std::max<int>(0, transformShift);
  }

  double   blockUncodedCost = 0;
#if HEVC_USE_SCALING_LISTS
  const uint32_t log2BlockHeight = g_aucLog2[height];
#endif
  const uint32_t maxNumCoeff = rect.area();

  CHECK(compID >= MAX_NUM_TBLOCKS, "Invalid component ID");

#if HEVC_USE_SCALING_LISTS
  int scalingListType = getScalingListType(tu.cu->predMode, compID);
  CHECK(scalingListType >= SCALING_LIST_NUM, "Invalid scaling list");
#endif

  const TCoeff *srcCoeff = coeffs.buf;
  TCoeff *dstCoeff = tu.getCoeffs(compID).buf;

  // per-coefficient cost buffers, indexed by scan position
  double *costCoeff = m_pdCostCoeff;
  double *costSig = m_pdCostSig;
  double *costCoeff0 = m_pdCostCoeff0;

  memset(m_pdCostCoeff, 0, sizeof(double) *  maxNumCoeff);
  memset(m_pdCostSig, 0, sizeof(double) *  maxNumCoeff);
  memset(m_fullCoeff, 0, sizeof(TCoeff) * maxNumCoeff);

#if JVET_N0246_MODIFIED_QUANTSCALES
  const bool   needsSqrt2Scale = TU::needsSqrt2Scale(tu, compID);  // should always be false - transform-skipped blocks don't require sqrt(2) compensation.
  const int    qBits = QUANT_SHIFT + qp.per + transformShift + (needsSqrt2Scale ? -1 : 0);  // Right shift of non-RDOQ quantizer;  level = (coeff*uiQ + offset)>>q_bits
  const int    quantisationCoefficient = g_quantScales[needsSqrt2Scale ? 1 : 0][qp.rem];
  const double errorScale = xGetErrScaleCoeff(TU::needsSqrt2Scale(tu, compID), width, height, qp.rem, maxLog2TrDynamicRange, channelBitDepth);

  // parameters handed to xDequantSample() for reconstructing each quantised sample
  TrQuantParams trQuantParams;
  trQuantParams.rightShift = (IQUANT_SHIFT - (transformShift + qp.per));
  trQuantParams.qScale = g_invQuantScales[needsSqrt2Scale ? 1 : 0][qp.rem];
#else

  const int qBits = QUANT_SHIFT + qp.per + transformShift;                   // Right shift of non-RDOQ quantizer;  level = (coeff*uiQ + offset)>>q_bits

#if HM_QTBT_AS_IN_JEM_QUANT
  const int    quantisationCoefficient = (TU::needsSqrt2Scale(tu, compID) ? (g_quantScales[qp.rem] * 181) >> 7 : g_quantScales[qp.rem]);
  const double errorScale = xGetErrScaleCoeff(TU::needsSqrt2Scale(tu, compID), width, height, qp.rem, maxLog2TrDynamicRange, channelBitDepth);
#else
  const double blkErrScale = (TU::needsQP3Offset(tu, compID) ? 2.0 : 1.0);
  const int    quantisationCoefficient = g_quantScales[qp.rem];
  const double errorScale = blkErrScale * xGetErrScaleCoeff(width, height, qp.rem, maxLog2TrDynamicRange, channelBitDepth);
#endif
#endif

  // largest level magnitude that is allowed to be produced
  const TCoeff entropyCodingMaximum = (1 << maxLog2TrDynamicRange) - 1;

#if HEVC_USE_SIGN_HIDING
  CoeffCodingContext cctx(tu, compID, tu.cs->slice->getSignDataHidingEnabledFlag());
#else
  CoeffCodingContext cctx(tu, compID);
#endif
  const int sbSizeM1 = (1 << cctx.log2CGSize()) - 1;   // coefficients per sub-block, minus one
  double    baseCost = 0;
  uint32_t  goRiceParam = 0;

  // per-sub-block cost of the significant-group flag
  double *costSigSubBlock = m_pdCostCoeffGroupSig;
  memset(costSigSubBlock, 0, (maxNumCoeff >> cctx.log2CGSize()) * sizeof(double));

  const int sbNum = width * height >> cctx.log2CGSize();
  int scanPos;
  coeffGroupRDStats rdStats;

  bool anySigCG = false;   // set once a sub-block has been kept as significant

  for (int sbId = 0; sbId < sbNum; sbId++)
  {
    cctx.initSubblock(sbId);

    memset(&rdStats, 0, sizeof(coeffGroupRDStats));

    for (int scanPosInSB = 0; scanPosInSB <= sbSizeM1; scanPosInSB++)
    {
      scanPos = cctx.minSubPos() + scanPosInSB;
      //===== quantization =====
      uint32_t blkPos = cctx.blockPos(scanPos);

      // prediction: reconstructed neighbour in the DPCM direction, or 0 in the first column / row
      const int posX = cctx.posX(scanPos);
      const int posY = cctx.posY(scanPos);
      const int posS = (1 == dirMode) ? posX : posY;
      const int posNb = (1 == dirMode) ? (posX - 1) + posY * coeffs.stride : posX + (posY - 1) * coeffs.stride;
      TCoeff predCoeff = (0 != posS) ? m_fullCoeff[posNb] : 0;

      // set coeff
      // scaled magnitude of the prediction difference, clamped so that adding the
      // rounding offset below cannot overflow Intermediate_Int
      const int64_t          tmpLevel = int64_t(abs(srcCoeff[blkPos] - predCoeff)) * quantisationCoefficient;
      const Intermediate_Int levelDouble = (Intermediate_Int)std::min<int64_t>(tmpLevel, std::numeric_limits<Intermediate_Int>::max() - (Intermediate_Int(1) << (qBits - 1)));
      // rounded level, capped at entropyCodingMaximum
      uint32_t         maxAbsLevel = std::min<uint32_t>(uint32_t(entropyCodingMaximum), uint32_t((levelDouble + (Intermediate_Int(1) << (qBits - 1))) >> qBits));

      // cost of quantising this difference to zero
      const double err = double(levelDouble);
      costCoeff0[scanPos] = err * err * errorScale;
      blockUncodedCost += costCoeff0[scanPos];
      dstCoeff[blkPos] = maxAbsLevel;

      //===== coefficient level estimation =====
      unsigned    ctxIdSig = cctx.sigCtxIdAbsTS(scanPos, dstCoeff);
      uint32_t    cLevel;
      const BinFracBits fracBitsPar = fracBits.getFracBitsArray(cctx.parityCtxIdAbsTS());

      goRiceParam = cctx.templateAbsSumTS(scanPos, dstCoeff);
      const BinFracBits fracBitsSign = fracBits.getFracBitsArray(Ctx::TsResidualSign(1));
      const uint8_t     sign = srcCoeff[blkPos] - predCoeff < 0 ? 1 : 0;

      DTRACE_COND((maxAbsLevel != 0), g_trace_ctx, D_RDOQ_MORE, " uiCtxSig=%d", ctxIdSig);

      const BinFracBits fracBitsSig = fracBits.getFracBitsArray(ctxIdSig);
      cLevel = xGetCodedLevelTS(costCoeff[scanPos], costCoeff0[scanPos], costSig[scanPos],
        levelDouble, maxAbsLevel, &fracBitsSig, fracBitsPar, cctx, fracBits, fracBitsSign, sign, goRiceParam, qBits, errorScale, 0, extendedPrecision, maxLog2TrDynamicRange);
      dstCoeff[blkPos] = cLevel;

      if (sign)
      {
        dstCoeff[blkPos] = -dstCoeff[blkPos];
      }
      // reconstruction = dequantised difference + prediction; read back as predictor later
      xDequantSample( m_fullCoeff[blkPos], dstCoeff[blkPos], trQuantParams );
      m_fullCoeff[blkPos] += predCoeff;

      baseCost += costCoeff[scanPos];
      rdStats.d64SigCost += costSig[scanPos];

      if (scanPosInSB == 0)
      {
        rdStats.d64SigCost_0 = costSig[scanPos];
      }
      if (dstCoeff[blkPos])
      {
        cctx.setSigGroup();
        rdStats.d64CodedLevelandDist += costCoeff[scanPos] - costSig[scanPos];
        rdStats.d64UncodedDist += costCoeff0[scanPos];
        if (scanPosInSB != 0)
        {
          rdStats.iNNZbeforePos0++;
        }
      }
    } //end for (scanPosInSB)

    if (!cctx.isSigGroup())
    {
      // no level survived: replace the individual sig costs by the cost of a zero group flag
      const BinFracBits fracBitsSigGroup = fracBits.getFracBitsArray(cctx.sigGroupCtxId(true));
      baseCost += xGetRateSigCoeffGroup(fracBitsSigGroup, 0) - rdStats.d64SigCost;
      costSigSubBlock[cctx.subSetId()] = xGetRateSigCoeffGroup(fracBitsSigGroup, 0);
    }
    // NOTE(review): sbId is a sub-block index while sbSizeM1 is the number of coefficients
    // per sub-block minus one - confirm that this comparison is the intended one.
    else if (sbId != sbSizeM1 || anySigCG)
    {
      if (rdStats.iNNZbeforePos0 == 0)
      {
        // no non-zero level at any position other than 0: drop the sig cost of position 0
        baseCost -= rdStats.d64SigCost_0;
        rdStats.d64SigCost -= rdStats.d64SigCost_0;
      }
      // rd-cost if SigCoeffGroupFlag = 0, initialization
      double costZeroSB = baseCost;

      const BinFracBits fracBitsSigGroup = fracBits.getFracBitsArray(cctx.sigGroupCtxId(true));

      baseCost += xGetRateSigCoeffGroup(fracBitsSigGroup, 1);
      costZeroSB += xGetRateSigCoeffGroup(fracBitsSigGroup, 0);
      costSigSubBlock[cctx.subSetId()] = xGetRateSigCoeffGroup(fracBitsSigGroup, 1);

      costZeroSB += rdStats.d64UncodedDist;         // distortion for resetting non-zero levels to zero levels
      costZeroSB -= rdStats.d64CodedLevelandDist;   // distortion and level cost for keeping all non-zero levels
      costZeroSB -= rdStats.d64SigCost;             // sig cost for all coeffs, including zero levels and non-zero levels

      if (costZeroSB < baseCost)
      {
        cctx.resetSigGroup();
        baseCost = costZeroSB;
        costSigSubBlock[cctx.subSetId()] = xGetRateSigCoeffGroup(fracBitsSigGroup, 0);

        // clear every position of the sub-block (sbSizeM1 is inclusive), so that no
        // level remains in a sub-block that is signalled as zero
        for (int scanPosInSB = 0; scanPosInSB <= sbSizeM1; scanPosInSB++)
        {
          scanPos = cctx.minSubPos() + scanPosInSB;
          uint32_t blkPos = cctx.blockPos(scanPos);

          // with a zero level the reconstruction equals the prediction; m_fullCoeff is
          // addressed by block position, matching the accesses in the quantisation loop
          const int posX = cctx.posX(scanPos);
          const int posY = cctx.posY(scanPos);
          const int posS = (1 == dirMode) ? posX : posY;
          const int posNb = (1 == dirMode) ? (posX - 1) + posY * coeffs.stride : posX + (posY - 1) * coeffs.stride;
          m_fullCoeff[blkPos] = (0 != posS) ? m_fullCoeff[posNb] : 0;

          if (dstCoeff[blkPos])
          {
            dstCoeff[blkPos] = 0;
            costCoeff[scanPos] = costCoeff0[scanPos];
            costSig[scanPos] = 0;
          }
        }
      }
      else
      {
        anySigCG = true;
      }
    }
  }

  //===== accumulate the sum of absolute levels (levels already carry their sign) =====
  for (int scanPos = 0; scanPos < maxNumCoeff; scanPos++)
  {
    int blkPos = cctx.blockPos(scanPos);
    TCoeff level = dstCoeff[blkPos];
    absSum += abs(level);
  }
}

/** Dequantise a single level.
 *
 *  \param pRes           receives the dequantised value
 *  \param coeff          quantised level (may be negative); not modified
 *  \param trQuantParams  supplies qScale and rightShift; a non-positive rightShift
 *                        means the scaled value is scaled up by 2^(-rightShift)
 */
void QuantRDOQ::xDequantSample(TCoeff& pRes, TCoeff& coeff, const TrQuantParams& trQuantParams)
{
  // xDequant
  if (trQuantParams.rightShift > 0)
  {
    // rounding offset: half of the divisor implied by the right shift
    const Intermediate_Int qAdd = Intermediate_Int(1) << (trQuantParams.rightShift - 1);
    pRes = TCoeff((Intermediate_Int(coeff) * trQuantParams.qScale + qAdd) >> trQuantParams.rightShift);
  }
  else
  {
    // The scaled level is negative for negative levels, and left-shifting a negative
    // signed value is undefined before C++20, so scale by the power of two instead.
    pRes = TCoeff((Intermediate_Int(coeff) * trQuantParams.qScale) * (Intermediate_Int(1) << -trQuantParams.rightShift));
  }
}
#endif
/** Choose the level of one transform-skip coefficient by rate-distortion cost.
 *
 *  Candidates are maxAbsLevel and, when maxAbsLevel > 1, maxAbsLevel - 1; level 0
 *  is an additional candidate when the coefficient is not the last one and
 *  maxAbsLevel < 3.
 *
 *  \param codedCost     out: cost of the chosen level
 *  \param codedCost0    in:  distortion cost of coding the coefficient as zero
 *  \param codedCostSig  out: significance-flag cost belonging to the chosen level
 *                            (left untouched when isLast is set and no candidate wins)
 *  \param levelDouble   scaled, unrounded coefficient magnitude
 *  \param maxAbsLevel   rounded level, upper end of the search range
 *  \return the selected absolute level
 */
inline uint32_t QuantRDOQ::xGetCodedLevelTS(       double&             codedCost,
                                                   double&             codedCost0,
                                                   double&             codedCostSig,
                                                   Intermediate_Int    levelDouble,
                                                   uint32_t            maxAbsLevel,
                                             const BinFracBits*        fracBitsSig,
                                             const BinFracBits&        fracBitsPar,
                                             const CoeffCodingContext& cctx,
                                             const FracBitsAccess&     fracBitsAccess,
                                             const BinFracBits&        fracBitsSign,
                                             const uint8_t             sign,
                                                   uint16_t            ricePar,
                                                   int                 qBits,
                                                   double              errorScale,
                                                   bool                isLast,
                                                   bool                useLimitedPrefixLength,
                                                   const int           maxLog2TrDynamicRange
                                           ) const
{
  uint32_t chosenLevel = 0;

  // Seed the search with the "zero" candidate where it is permitted.
  const bool zeroIsCandidate = !isLast && maxAbsLevel < 3;
  if( zeroIsCandidate )
  {
    codedCostSig = xGetRateSigCoef( *fracBitsSig, 0 );
    codedCost    = codedCost0 + codedCostSig;
    if( maxAbsLevel == 0 )
    {
      // nothing but zero can be chosen
      return chosenLevel;
    }
  }
  else
  {
    codedCost = MAX_DOUBLE;
  }

  // Significance cost shared by all non-zero candidates (none for the last coefficient).
  double sigCostNonZero = 0;
  if( !isLast )
  {
    sigCostNonZero = xGetRateSigCoef( *fracBitsSig, 1 );
  }

  // Walk downwards from maxAbsLevel; never go below level 1.
  const uint32_t lowestLevel = ( maxAbsLevel > 1 ) ? ( maxAbsLevel - 1 ) : 1;
  int            candidate   = maxAbsLevel;
  while( candidate >= lowestLevel )
  {
    const double diff     = double( levelDouble  - ( Intermediate_Int( candidate ) << qBits ) );
    double       candCost = diff * diff * errorScale + xGetICost( xGetICRateTS( candidate, fracBitsPar, cctx, fracBitsAccess, fracBitsSign, sign, ricePar, useLimitedPrefixLength, maxLog2TrDynamicRange ) );
    candCost += sigCostNonZero;

    if( candCost < codedCost )
    {
      chosenLevel  = candidate;
      codedCost    = candCost;
      codedCostSig = sigCostNonZero;
    }
    candidate--;
  }

  return chosenLevel;
}

inline int QuantRDOQ::xGetICRateTS( const uint32_t            absLevel,
                                    const BinFracBits&        fracBitsPar,
                                    const CoeffCodingContext& cctx,
                                    const FracBitsAccess&     fracBitsAccess,
                                    const BinFracBits&        fracBitsSign,
                                    const uint8_t             sign,
                                    const uint16_t            ricePar,
                                    const bool                useLimitedPrefixLength,
                                    const int                 maxLog2TrDynamicRange  ) const
{
  int rate = fracBitsSign.intBits[sign];

  const uint16_t     ctxGt1      = cctx.greaterXCtxIdAbsTS( 0 );
  const BinFracBits &fracBitsGt1 = fracBitsAccess.getFracBitsArray( ctxGt1 );

  if( absLevel > 1 )
  {
    rate += fracBitsGt1.intBits[1];
    rate += fracBitsPar.intBits[( absLevel - 2 ) & 1];

          int cutoffVal = 2;
    const int numGtBins = 4;
    for( int i = 0; i < numGtBins; i++ )
    {
      if( absLevel >= cutoffVal )
      {
        const uint16_t ctxGtX = cctx.greaterXCtxIdAbsTS( cutoffVal>>1 );
        const BinFracBits &fracBitsGtX = fracBitsAccess.getFracBitsArray( ctxGtX );
        unsigned gtX = ( absLevel >= ( cutoffVal + 2 ) );
        rate += fracBitsGtX.intBits[gtX];
      }
      cutoffVal += 2;
    }

    if( absLevel >= cutoffVal )
    {
      uint32_t symbol = ( absLevel - cutoffVal ) >> 1;
      uint32_t length;
      const int threshold = COEF_REMAIN_BIN_REDUCTION;
      if( symbol < ( threshold << ricePar ) )
      {
        length = symbol >> ricePar;
        rate  += ( length + 1 + ricePar ) << SCALE_BITS;
      }
      else if( useLimitedPrefixLength )
      {
        const uint32_t maximumPrefixLength = ( 32 - ( COEF_REMAIN_BIN_REDUCTION + maxLog2TrDynamicRange ) );

        uint32_t prefixLength = 0;
        uint32_t suffix = ( symbol >> ricePar ) - COEF_REMAIN_BIN_REDUCTION;

        while( ( prefixLength < maximumPrefixLength ) && ( suffix > ( ( 2 << prefixLength ) - 2 ) ) )
        {
          prefixLength++;
        }

        const uint32_t suffixLength = ( prefixLength == maximumPrefixLength ) ? ( maxLog2TrDynamicRange - ricePar ) : ( prefixLength + 1/*separator*/ );

        rate += ( COEF_REMAIN_BIN_REDUCTION + prefixLength + suffixLength + ricePar ) << SCALE_BITS;
      }
      else
      {
        length = ricePar;
        symbol = symbol - ( threshold << ricePar );
        while( symbol >= ( 1 << length ) )
        {
          symbol -= ( 1 << ( length++ ) );
        }
        rate += ( threshold + length + 1 - ricePar + length ) << SCALE_BITS;
      }
    }
  }
  else if( absLevel == 1 )
  {
    rate += fracBitsGt1.intBits[0];
  }
  else
  {
    rate = 0;
  }
  return rate;
}
#endif

//! \}