// RdCost.cpp (142.52 KiB)
// Forked from jvet / VVCSoftware_VTM; 3122 commits behind the upstream repository.
/* The copyright in this software is being made available under the BSD
 * License, included below. This software may be subject to other third party
 * and contributor rights, including patent rights, and no such rights are
 * granted under this license.
 *
 * Copyright (c) 2010-2020, ITU/ISO/IEC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *  * Neither the name of the ITU/ISO/IEC nor the names of its contributors may
 *    be used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

/** \file     RdCost.cpp
    \brief    RD cost computation class
*/

#define DONT_UNDEF_SIZE_AWARE_PER_EL_OP

#include "RdCost.h"

#include "Rom.h"
#include "UnitPartitioner.h"

#include <limits>

//! \ingroup CommonLib
//! \{


// Table of distortion function pointers, indexed by the DF_* constants.
// The entries are filled in by RdCost::init().
FpDistFunc RdCost::m_afpDistortFunc[DF_TOTAL_FUNCTIONS] = { nullptr, };

// Construct an RdCost; all set-up work is delegated to init().
RdCost::RdCost()
{
  this->init();
}

// The destructor performs no explicit work.
RdCost::~RdCost() = default;

#if WCG_EXT
double RdCost::calcRdCost( uint64_t fracBits, Distortion distortion, bool useUnadjustedLambda )
#else
double RdCost::calcRdCost( uint64_t fracBits, Distortion distortion )
#endif
{
  if (m_costMode == COST_LOSSLESS_CODING && 0 != distortion && m_isLosslessRDCost)
  {
    return MAX_DOUBLE;
  }
#if WCG_EXT
  return ( useUnadjustedLambda ? m_DistScaleUnadjusted : m_DistScale ) * double( distortion ) + double( fracBits );
#else
  return m_DistScale * double( distortion ) + double( fracBits );
#endif
}

// Store a new lambda and refresh the two values derived from it:
// the distortion scale ((1 << SCALE_BITS) / lambda) and the motion SAD lambda (sqrt(lambda)).
// The bitDepths argument is not used by this implementation.
void RdCost::setLambda( double dLambda, const BitDepths &bitDepths )
{
  m_dLambda          = dLambda;
  m_dLambdaMotionSAD = sqrt( dLambda );
  m_DistScale        = static_cast<double>( 1 << SCALE_BITS ) / dLambda;
}

#if JVET_S0234_ACT_CRS_FIX
void RdCost::lambdaAdjustColorTrans(bool forward, ComponentID componentID, bool applyChromaScale, int* resScaleInv)
#else
void RdCost::lambdaAdjustColorTrans(bool forward, ComponentID componentID)
#endif
{
  if (m_resetStore)
  {
    for (uint8_t component = 0; component < MAX_NUM_COMPONENT; component++)
    {
      ComponentID compID = (ComponentID)component;
      int       delta_QP = DELTA_QP_ACT[compID];
      double lamdbaAdjustRate = pow(2.0, delta_QP / 3.0);

      m_lambdaStore[0][component] = m_dLambda;
      m_DistScaleStore[0][component] = m_DistScale;

      m_lambdaStore[1][component] = m_dLambda * lamdbaAdjustRate;
      m_DistScaleStore[1][component] = double(1 << SCALE_BITS) / m_lambdaStore[1][component];
    }
    m_resetStore = false;
  }

  if (forward)
  {
    CHECK(m_pairCheck == 1, "lambda has been already adjusted");
    m_pairCheck = 1;
  }
  else
  {
    CHECK(m_pairCheck == 0, "lambda has not been adjusted");
    m_pairCheck = 0;
  }

  m_dLambda = m_lambdaStore[m_pairCheck][componentID];
  m_DistScale = m_DistScaleStore[m_pairCheck][componentID];
#if JVET_S0234_ACT_CRS_FIX
  if (applyChromaScale)
  {
    CHECK(m_pairCheck == 0 || componentID == COMPONENT_Y, "wrong lambda adjustment for CS");
    double cResScale = (double)(1 << CSCALE_FP_PREC) / (double)(*resScaleInv);
    m_dLambda = m_dLambda / (cResScale*cResScale);
    m_DistScale = double(1 << SCALE_BITS) / m_dLambda;
  }
#endif
  if (m_pairCheck == 0)
  {
    CHECK(m_DistScale != m_DistScaleUnadjusted, "lambda should be adjusted to the original value");
  }
}

// Initialize Function Pointer by [eDFunc]
void RdCost::init()
{
  m_afpDistortFunc[DF_SSE    ] = RdCost::xGetSSE;
  m_afpDistortFunc[DF_SSE2   ] = RdCost::xGetSSE;
  m_afpDistortFunc[DF_SSE4   ] = RdCost::xGetSSE4;
  m_afpDistortFunc[DF_SSE8   ] = RdCost::xGetSSE8;
  m_afpDistortFunc[DF_SSE16  ] = RdCost::xGetSSE16;
  m_afpDistortFunc[DF_SSE32  ] = RdCost::xGetSSE32;
  m_afpDistortFunc[DF_SSE64  ] = RdCost::xGetSSE64;
  m_afpDistortFunc[DF_SSE16N ] = RdCost::xGetSSE16N;

  m_afpDistortFunc[DF_SAD    ] = RdCost::xGetSAD;
  m_afpDistortFunc[DF_SAD2   ] = RdCost::xGetSAD;
  m_afpDistortFunc[DF_SAD4   ] = RdCost::xGetSAD4;
  m_afpDistortFunc[DF_SAD8   ] = RdCost::xGetSAD8;
  m_afpDistortFunc[DF_SAD16  ] = RdCost::xGetSAD16;
  m_afpDistortFunc[DF_SAD32  ] = RdCost::xGetSAD32;
  m_afpDistortFunc[DF_SAD64  ] = RdCost::xGetSAD64;
  m_afpDistortFunc[DF_SAD16N ] = RdCost::xGetSAD16N;

  m_afpDistortFunc[DF_SAD12  ] = RdCost::xGetSAD12;
  m_afpDistortFunc[DF_SAD24  ] = RdCost::xGetSAD24;
  m_afpDistortFunc[DF_SAD48  ] = RdCost::xGetSAD48;

  m_afpDistortFunc[DF_HAD    ] = RdCost::xGetHADs;
  m_afpDistortFunc[DF_HAD2   ] = RdCost::xGetHADs;
  m_afpDistortFunc[DF_HAD4   ] = RdCost::xGetHADs;
  m_afpDistortFunc[DF_HAD8   ] = RdCost::xGetHADs;
  m_afpDistortFunc[DF_HAD16  ] = RdCost::xGetHADs;
  m_afpDistortFunc[DF_HAD32  ] = RdCost::xGetHADs;
  m_afpDistortFunc[DF_HAD64  ] = RdCost::xGetHADs;
  m_afpDistortFunc[DF_HAD16N ] = RdCost::xGetHADs;

  m_afpDistortFunc[DF_MRSAD    ] = RdCost::xGetMRSAD;
  m_afpDistortFunc[DF_MRSAD2   ] = RdCost::xGetMRSAD;
  m_afpDistortFunc[DF_MRSAD4   ] = RdCost::xGetMRSAD4;
  m_afpDistortFunc[DF_MRSAD8   ] = RdCost::xGetMRSAD8;
  m_afpDistortFunc[DF_MRSAD16  ] = RdCost::xGetMRSAD16;
  m_afpDistortFunc[DF_MRSAD32  ] = RdCost::xGetMRSAD32;
  m_afpDistortFunc[DF_MRSAD64  ] = RdCost::xGetMRSAD64;
  m_afpDistortFunc[DF_MRSAD16N ] = RdCost::xGetMRSAD16N;

  m_afpDistortFunc[DF_MRSAD12  ] = RdCost::xGetMRSAD12;
  m_afpDistortFunc[DF_MRSAD24  ] = RdCost::xGetMRSAD24;
  m_afpDistortFunc[DF_MRSAD48  ] = RdCost::xGetMRSAD48;

  m_afpDistortFunc[DF_MRHAD    ] = RdCost::xGetMRHADs;
  m_afpDistortFunc[DF_MRHAD2   ] = RdCost::xGetMRHADs;
  m_afpDistortFunc[DF_MRHAD4   ] = RdCost::xGetMRHADs;
  m_afpDistortFunc[DF_MRHAD8   ] = RdCost::xGetMRHADs;
  m_afpDistortFunc[DF_MRHAD16  ] = RdCost::xGetMRHADs;
  m_afpDistortFunc[DF_MRHAD32  ] = RdCost::xGetMRHADs;
  m_afpDistortFunc[DF_MRHAD64  ] = RdCost::xGetMRHADs;
  m_afpDistortFunc[DF_MRHAD16N ] = RdCost::xGetMRHADs;

  m_afpDistortFunc[DF_SAD_FULL_NBIT   ] = RdCost::xGetSAD_full;
  m_afpDistortFunc[DF_SAD_FULL_NBIT2  ] = RdCost::xGetSAD_full;
  m_afpDistortFunc[DF_SAD_FULL_NBIT4  ] = RdCost::xGetSAD_full;
  m_afpDistortFunc[DF_SAD_FULL_NBIT8  ] = RdCost::xGetSAD_full;
  m_afpDistortFunc[DF_SAD_FULL_NBIT16 ] = RdCost::xGetSAD_full;
  m_afpDistortFunc[DF_SAD_FULL_NBIT32 ] = RdCost::xGetSAD_full;
  m_afpDistortFunc[DF_SAD_FULL_NBIT64 ] = RdCost::xGetSAD_full;
  m_afpDistortFunc[DF_SAD_FULL_NBIT16N] = RdCost::xGetSAD_full;

#if WCG_EXT
  m_afpDistortFunc[DF_SSE_WTD   ] = RdCost::xGetSSE_WTD;
  m_afpDistortFunc[DF_SSE2_WTD  ] = RdCost::xGetSSE2_WTD;
  m_afpDistortFunc[DF_SSE4_WTD  ] = RdCost::xGetSSE4_WTD;
  m_afpDistortFunc[DF_SSE8_WTD  ] = RdCost::xGetSSE8_WTD;
  m_afpDistortFunc[DF_SSE16_WTD ] = RdCost::xGetSSE16_WTD;
  m_afpDistortFunc[DF_SSE32_WTD ] = RdCost::xGetSSE32_WTD;
  m_afpDistortFunc[DF_SSE64_WTD ] = RdCost::xGetSSE64_WTD;
  m_afpDistortFunc[DF_SSE16N_WTD] = RdCost::xGetSSE16N_WTD;
#endif

  m_afpDistortFunc[DF_SAD_INTERMEDIATE_BITDEPTH] = RdCost::xGetSAD;

  m_afpDistortFunc[DF_SAD_WITH_MASK] = RdCost::xGetSADwMask;

#if ENABLE_SIMD_OPT_DIST
#ifdef TARGET_SIMD_X86
  initRdCostX86();
#endif
#endif

  m_costMode                   = COST_STANDARD_LOSSY;

  m_motionLambda               = 0;
  m_iCostScale                 = 0;
  m_resetStore = true;
  m_pairCheck    = 0;
}


#if ENABLE_SPLIT_PARALLELISM

// Take over the cost configuration of another RdCost instance.
// Only the members assigned below are copied.
void RdCost::copyState( const RdCost& other )
{
  // cost mode and lambda-related values
  m_costMode         = other.m_costMode;
  m_dLambda          = other.m_dLambda;
  m_dLambdaMotionSAD = other.m_dLambdaMotionSAD;
  m_DistScale        = other.m_DistScale;
#if WCG_EXT
  m_dLambda_unadjusted  = other.m_dLambda_unadjusted;
  m_DistScaleUnadjusted = other.m_DistScaleUnadjusted;
#endif

  // distortion weights are copied as one raw array
  memcpy( m_distortionWeight, other.m_distortionWeight, sizeof( m_distortionWeight ) );

  // motion cost state
  m_mvPredictor  = other.m_mvPredictor;
  m_motionLambda = other.m_motionLambda;
  m_iCostScale   = other.m_iCostScale;
}
#endif

// Prepare rcDP for comparing the block `org` against a reference given as a raw
// pointer plus stride.
//   - the reference buffer takes its width and height from org
//   - the distortion function is chosen from useHadamard, rcDP.useMR and org.width
//   - subShiftMode selects the rule used to derive rcDP.subShift from the block size
void RdCost::setDistParam( DistParam &rcDP, const CPelBuf &org, const Pel* piRefY, int iRefStride, int bitDepth, ComponentID compID, int subShiftMode, int step, bool useHadamard )
{
  rcDP.bitDepth = bitDepth;
  rcDP.compID   = compID;
  rcDP.step     = step;
  rcDP.maximumDistortionForEarlyExit = std::numeric_limits<Distortion>::max();

  // original block, and the reference block with the same dimensions
  rcDP.org        = org;
  rcDP.cur.buf    = piRefY;
  rcDP.cur.stride = iRefStride;
  rcDP.cur.width  = org.width;
  rcDP.cur.height = org.height;

  // pick the table index; the mean-removed variants sit at a fixed offset from the plain ones
  const int mrOffset = ( rcDP.useMR ? DF_MRSAD - DF_SAD : 0 );
  int       funcIdx  = 0;
  if( useHadamard )
  {
    funcIdx = DF_HAD + mrOffset;
    if( isPowerOf2( org.width ) )
    {
      funcIdx += floorLog2( org.width );
    }
  }
  else
  {
    switch( org.width )
    {
    case 12:
      funcIdx = DF_SAD12 + mrOffset;
      break;
    case 24:
      funcIdx = DF_SAD24 + mrOffset;
      break;
    case 48:
      funcIdx = DF_SAD48 + mrOffset;
      break;
    default:
      // power-of-two widths get their dedicated entry, anything else the generic one
      funcIdx = DF_SAD + mrOffset;
      if( isPowerOf2( org.width ) )
      {
        funcIdx += floorLog2( org.width );
      }
      break;
    }
  }
  rcDP.distFunc = m_afpDistortFunc[funcIdx];

  // row sub-sampling; stays 0 unless the selected mode finds a matching rule
  rcDP.subShift = 0;

  const auto height = rcDP.org.height;
  switch( subShiftMode )
  {
  case 1:
    if( height > 32 && ( height & 15 ) == 0 )
    {
      rcDP.subShift = 4;
    }
    else if( height > 16 && ( height & 7 ) == 0 )
    {
      rcDP.subShift = 3;
    }
    else if( height > 8 && ( height & 3 ) == 0 )
    {
      rcDP.subShift = 2;
    }
    else if( ( height & 1 ) == 0 )
    {
      rcDP.subShift = 1;
    }
    break;
  case 2:
    if( height > 8 && rcDP.org.width <= 64 )
    {
      rcDP.subShift = 1;
    }
    break;
  case 3:
    if( height > 8 )
    {
      rcDP.subShift = 1;
    }
    break;
  default:
    break;
  }
}

// Prepare rcDP for comparing two buffers without row sub-sampling
// (step 1, subShift 0). The distortion function is chosen from useHadamard,
// rcDP.useMR and org.width.
void RdCost::setDistParam( DistParam &rcDP, const CPelBuf &org, const CPelBuf &cur, int bitDepth, ComponentID compID, bool useHadamard )
{
  rcDP.org      = org;
  rcDP.cur      = cur;
  rcDP.bitDepth = bitDepth;
  rcDP.compID   = compID;
  rcDP.step     = 1;
  rcDP.subShift = 0;

  // the mean-removed variants sit at a fixed offset from the plain ones
  const int mrOffset = ( rcDP.useMR ? DF_MRSAD - DF_SAD : 0 );
  int       funcIdx  = 0;

  if( useHadamard )
  {
    // NOTE(review): unlike the SAD path below, no isPowerOf2() check is made here - confirm this is intended.
    funcIdx = DF_HAD + mrOffset + floorLog2( org.width );
  }
  else
  {
    switch( org.width )
    {
    case 12:
      funcIdx = DF_SAD12 + mrOffset;
      break;
    case 24:
      funcIdx = DF_SAD24 + mrOffset;
      break;
    case 48:
      funcIdx = DF_SAD48 + mrOffset;
      break;
    default:
      // power-of-two widths get their dedicated entry, anything else the generic one
      funcIdx = DF_SAD + mrOffset;
      if( isPowerOf2( org.width ) )
      {
        funcIdx += floorLog2( org.width );
      }
      break;
    }
  }
  rcDP.distFunc = m_afpDistortFunc[funcIdx];

  rcDP.maximumDistortionForEarlyExit = std::numeric_limits<Distortion>::max();
}

// Prepare rcDP from raw pointers and strides for a width x height block.
// Here subShiftMode is stored directly as rcDP.subShift. Hadamard and
// mean-removed distortion are rejected by the CHECK below. With bioApplied the
// DF_SAD_INTERMEDIATE_BITDEPTH entry is used; otherwise the SAD entry matching
// the width is selected.
void RdCost::setDistParam( DistParam &rcDP, const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShiftMode, int step, bool useHadamard, bool bioApplied )
{
  rcDP.bitDepth = bitDepth;
  rcDP.compID   = compID;

  // both buffers describe a block of the same size
  rcDP.org.buf    = pOrg;
  rcDP.org.stride = iOrgStride;
  rcDP.org.width  = width;
  rcDP.org.height = height;

  rcDP.cur.buf    = piRefY;
  rcDP.cur.stride = iRefStride;
  rcDP.cur.width  = width;
  rcDP.cur.height = height;

  rcDP.subShift = subShiftMode;
  rcDP.step     = step;
  rcDP.maximumDistortionForEarlyExit = std::numeric_limits<Distortion>::max();

  CHECK( useHadamard || rcDP.useMR, "only used in xDMVRCost with these default parameters (so far...)" );

  int funcIdx = 0;
  if( bioApplied )
  {
    funcIdx = DF_SAD_INTERMEDIATE_BITDEPTH;
  }
  else
  {
    switch( width )
    {
    case 12:
      funcIdx = DF_SAD12;
      break;
    case 24:
      funcIdx = DF_SAD24;
      break;
    case 48:
      funcIdx = DF_SAD48;
      break;
    default:
      funcIdx = DF_SAD + floorLog2( width );
      break;
    }
  }
  rcDP.distFunc = m_afpDistortFunc[funcIdx];
}

// Compute the distortion between org and cur with the function family eDFunc.
// For power-of-two widths the width-specific table entry is used, otherwise the
// family's base entry. Chroma results are multiplied by the component's
// distortion weight and converted back to Distortion.
// With WCG_EXT, a non-null orgLuma additionally supplies the luma samples and
// chroma scaling shifts to the distortion parameters.
#if WCG_EXT
Distortion RdCost::getDistPart( const CPelBuf &org, const CPelBuf &cur, int bitDepth, const ComponentID compID, DFunc eDFunc, const CPelBuf *orgLuma )
#else
Distortion RdCost::getDistPart( const CPelBuf &org, const CPelBuf &cur, int bitDepth, const ComponentID compID, DFunc eDFunc )
#endif
{
  DistParam param;

  param.org      = org;
  param.cur      = cur;
  param.step     = 1;
  param.bitDepth = bitDepth;
  param.compID   = compID;

#if WCG_EXT
  if( orgLuma != nullptr )
  {
    param.cShiftX = getComponentScaleX( compID, m_cf );
    param.cShiftY = getComponentScaleY( compID, m_cf );
    // for luma blocks the original itself serves as the luma reference
    if( isChroma( compID ) )
    {
      param.orgLuma = *orgLuma;
    }
    else
    {
      param.orgLuma = org;
    }
  }
#endif

  if( isPowerOf2( org.width ) )
  {
    param.distFunc = m_afpDistortFunc[eDFunc + floorLog2( org.width )];
  }
  else
  {
    param.distFunc = m_afpDistortFunc[eDFunc];
  }

  if( !isChroma( compID ) )
  {
    return param.distFunc( param );
  }

  return ( (Distortion) ( m_distortionWeight[MAP_CHROMA( compID )] * param.distFunc( param ) ) );
}

// ====================================================================================================================
// Distortion functions
// ====================================================================================================================

// --------------------------------------------------------------------------------------------------------------------
// SAD
// --------------------------------------------------------------------------------------------------------------------

// Sum of absolute differences between org and cur without the bit-depth
// dependent precision shift that the other SAD functions in this file apply.
// Weighted prediction is not supported here (enforced by the CHECK).
// The result is scaled up by (1 << subShift) before it is returned.
Distortion RdCost::xGetSAD_full( const DistParam& rcDtParam )
{
  CHECK( rcDtParam.applyWeight, "Cannot apply weight when using full-bit SAD!" );
  const Pel* piOrg = rcDtParam.org.buf;
  const Pel* piCur = rcDtParam.cur.buf;
  int  height      = rcDtParam.org.height;
  int  width       = rcDtParam.org.width;
  int  iSubShift   = rcDtParam.subShift;
  int  iSubStep    = ( 1 << iSubShift );
  // strides are pre-multiplied so that each row advance skips iSubStep rows
  int  iStrideCur  = rcDtParam.cur.stride * iSubStep;
  int  iStrideOrg  = rcDtParam.org.stride * iSubStep;

  Distortion uiSum = 0;

  // SAD_OP accumulates one sample difference; SAD_INC advances both row pointers.
#define SAD_OP( ADDR ) uiSum += abs( piOrg[ADDR] - piCur[ADDR] );
#define SAD_INC piOrg += iStrideOrg; piCur += iStrideCur;

  // NOTE(review): SIZE_AWARE_PER_EL_OP is defined outside this file; presumably it
  // applies SAD_OP per sample and SAD_INC per row using the locals `width` and
  // `height` declared above - confirm before renaming them.
  SIZE_AWARE_PER_EL_OP( SAD_OP, SAD_INC )

#undef SAD_OP
#undef SAD_INC

  uiSum <<= iSubShift;
  return uiSum;
}

// Generic SAD for arbitrary block widths.
// Weighted prediction is forwarded to RdCostWeightPrediction::xGetSADw.
// Only every (1 << subShift)-th row is visited and the final sum is scaled up by
// the same factor. After each row the running sum is compared against
// maximumDistortionForEarlyExit; on early exit the partial sum is returned
// without the subShift scaling.
Distortion RdCost::xGetSAD( const DistParam& rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    return RdCostWeightPrediction::xGetSADw( rcDtParam );
  }

  const int      width     = rcDtParam.org.width;
  const int      height    = rcDtParam.org.height;
  const int      shift     = rcDtParam.subShift;
  const int      rowStep   = 1 << shift;
  const int      curStep   = rcDtParam.cur.stride * rowStep;
  const int      orgStep   = rcDtParam.org.stride * rowStep;
  const uint32_t precShift = DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth );

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  Distortion sad = 0;

  for( int rowsLeft = height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < width; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] );
    }

    const Distortion partial = sad >> precShift;
    if( rcDtParam.maximumDistortionForEarlyExit < partial )
    {
      return partial;
    }

    orgRow += orgStep;
    curRow += curStep;
  }

  sad <<= shift;
  return ( sad >> precShift );
}

// SAD for blocks that are 4 samples wide.
// Weighted prediction is forwarded to RdCostWeightPrediction::xGetSADw.
// Only every (1 << subShift)-th row is visited; the sum is scaled up accordingly
// and then reduced by the bit-depth dependent precision shift.
Distortion RdCost::xGetSAD4( const DistParam& rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    return RdCostWeightPrediction::xGetSADw( rcDtParam );
  }

  const int blkWidth = 4;   // constant bound, so the compiler is free to unroll the row loop
  const int shift    = rcDtParam.subShift;
  const int rowStep  = 1 << shift;
  const int curStep  = rcDtParam.cur.stride * rowStep;
  const int orgStep  = rcDtParam.org.stride * rowStep;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  Distortion sad = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < blkWidth; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] );
    }
    orgRow += orgStep;
    curRow += curStep;
  }

  sad <<= shift;
  return ( sad >> DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ) );
}

// SAD for blocks that are 8 samples wide.
// Weighted prediction is forwarded to RdCostWeightPrediction::xGetSADw.
// Only every (1 << subShift)-th row is visited; the sum is scaled up accordingly
// and then reduced by the bit-depth dependent precision shift.
Distortion RdCost::xGetSAD8( const DistParam& rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    return RdCostWeightPrediction::xGetSADw( rcDtParam );
  }

  const int blkWidth = 8;   // constant bound, so the compiler is free to unroll the row loop
  const int shift    = rcDtParam.subShift;
  const int rowStep  = 1 << shift;
  const int curStep  = rcDtParam.cur.stride * rowStep;
  const int orgStep  = rcDtParam.org.stride * rowStep;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  Distortion sad = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < blkWidth; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] );
    }
    orgRow += orgStep;
    curRow += curStep;
  }

  sad <<= shift;
  return ( sad >> DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ) );
}

// SAD for blocks that are 16 samples wide.
// Weighted prediction is forwarded to RdCostWeightPrediction::xGetSADw.
// Only every (1 << subShift)-th row is visited; the sum is scaled up accordingly
// and then reduced by the bit-depth dependent precision shift.
Distortion RdCost::xGetSAD16( const DistParam& rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    return RdCostWeightPrediction::xGetSADw( rcDtParam );
  }

  const int blkWidth = 16;   // constant bound, so the compiler is free to unroll the row loop
  const int shift    = rcDtParam.subShift;
  const int rowStep  = 1 << shift;
  const int curStep  = rcDtParam.cur.stride * rowStep;
  const int orgStep  = rcDtParam.org.stride * rowStep;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  Distortion sad = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < blkWidth; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] );
    }
    orgRow += orgStep;
    curRow += curStep;
  }

  sad <<= shift;
  return ( sad >> DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ) );
}

// SAD for blocks that are 12 samples wide.
// Weighted prediction is forwarded to RdCostWeightPrediction::xGetSADw.
// Only every (1 << subShift)-th row is visited; the sum is scaled up accordingly
// and then reduced by the bit-depth dependent precision shift.
Distortion RdCost::xGetSAD12( const DistParam& rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    return RdCostWeightPrediction::xGetSADw( rcDtParam );
  }

  const int blkWidth = 12;   // constant bound, so the compiler is free to unroll the row loop
  const int shift    = rcDtParam.subShift;
  const int rowStep  = 1 << shift;
  const int curStep  = rcDtParam.cur.stride * rowStep;
  const int orgStep  = rcDtParam.org.stride * rowStep;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  Distortion sad = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < blkWidth; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] );
    }
    orgRow += orgStep;
    curRow += curStep;
  }

  sad <<= shift;
  return ( sad >> DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ) );
}

// SAD for blocks whose width is processed in groups of 16 samples.
// Each row is walked in chunks of 16, so the width is expected to be a multiple
// of 16; a partial trailing chunk would still be read in full.
// Weighted prediction is forwarded to RdCostWeightPrediction::xGetSADw, as in
// the other SAD kernels of this class; previously this function ignored
// applyWeight and returned the unweighted SAD.
// Only every (1 << subShift)-th row is visited; the sum is scaled up accordingly
// and then reduced by the bit-depth dependent precision shift.
Distortion RdCost::xGetSAD16N( const DistParam &rcDtParam )
{
  if ( rcDtParam.applyWeight )
  {
    return RdCostWeightPrediction::xGetSADw( rcDtParam );
  }

  const Pel* piOrg  = rcDtParam.org.buf;
  const Pel* piCur  = rcDtParam.cur.buf;
  int  iRows        = rcDtParam.org.height;
  int  iCols        = rcDtParam.org.width;
  int  iSubShift    = rcDtParam.subShift;
  int  iSubStep     = ( 1 << iSubShift );
  int  iStrideCur   = rcDtParam.cur.stride * iSubStep;
  int  iStrideOrg   = rcDtParam.org.stride * iSubStep;

  Distortion uiSum = 0;

  for( ; iRows != 0; iRows-=iSubStep )
  {
    for (int n = 0; n < iCols; n+=16 )
    {
      // one full chunk of 16 samples; the constant bound lets the compiler unroll it
      for (int k = 0; k < 16; k++ )
      {
        uiSum += abs( piOrg[n+k] - piCur[n+k] );
      }
    }
    piOrg += iStrideOrg;
    piCur += iStrideCur;
  }

  uiSum <<= iSubShift;
  return (uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth));
}

// SAD for blocks that are 32 samples wide.
// Weighted prediction is forwarded to RdCostWeightPrediction::xGetSADw.
// Only every (1 << subShift)-th row is visited; the sum is scaled up accordingly
// and then reduced by the bit-depth dependent precision shift.
Distortion RdCost::xGetSAD32( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    return RdCostWeightPrediction::xGetSADw( rcDtParam );
  }

  const int blkWidth = 32;   // constant bound, so the compiler is free to unroll the row loop
  const int shift    = rcDtParam.subShift;
  const int rowStep  = 1 << shift;
  const int curStep  = rcDtParam.cur.stride * rowStep;
  const int orgStep  = rcDtParam.org.stride * rowStep;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  Distortion sad = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < blkWidth; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] );
    }
    orgRow += orgStep;
    curRow += curStep;
  }

  sad <<= shift;
  return ( sad >> DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ) );
}

// SAD for blocks that are 24 samples wide.
// Weighted prediction is forwarded to RdCostWeightPrediction::xGetSADw.
// Only every (1 << subShift)-th row is visited; the sum is scaled up accordingly
// and then reduced by the bit-depth dependent precision shift.
Distortion RdCost::xGetSAD24( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    return RdCostWeightPrediction::xGetSADw( rcDtParam );
  }

  const int blkWidth = 24;   // constant bound, so the compiler is free to unroll the row loop
  const int shift    = rcDtParam.subShift;
  const int rowStep  = 1 << shift;
  const int curStep  = rcDtParam.cur.stride * rowStep;
  const int orgStep  = rcDtParam.org.stride * rowStep;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  Distortion sad = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < blkWidth; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] );
    }
    orgRow += orgStep;
    curRow += curStep;
  }

  sad <<= shift;
  return ( sad >> DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ) );
}

// SAD for blocks that are 64 samples wide.
// Weighted prediction is forwarded to RdCostWeightPrediction::xGetSADw.
// Only every (1 << subShift)-th row is visited; the sum is scaled up accordingly
// and then reduced by the bit-depth dependent precision shift.
Distortion RdCost::xGetSAD64( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    return RdCostWeightPrediction::xGetSADw( rcDtParam );
  }

  const int blkWidth = 64;   // constant bound, so the compiler is free to unroll the row loop
  const int shift    = rcDtParam.subShift;
  const int rowStep  = 1 << shift;
  const int curStep  = rcDtParam.cur.stride * rowStep;
  const int orgStep  = rcDtParam.org.stride * rowStep;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  Distortion sad = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < blkWidth; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] );
    }
    orgRow += orgStep;
    curRow += curStep;
  }

  sad <<= shift;
  return ( sad >> DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ) );
}

// SAD for blocks that are 48 samples wide.
// Weighted prediction is forwarded to RdCostWeightPrediction::xGetSADw.
// Only every (1 << subShift)-th row is visited; the sum is scaled up accordingly
// and then reduced by the bit-depth dependent precision shift.
Distortion RdCost::xGetSAD48( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    return RdCostWeightPrediction::xGetSADw( rcDtParam );
  }

  const int blkWidth = 48;   // constant bound, so the compiler is free to unroll the row loop
  const int shift    = rcDtParam.subShift;
  const int rowStep  = 1 << shift;
  const int curStep  = rcDtParam.cur.stride * rowStep;
  const int orgStep  = rcDtParam.org.stride * rowStep;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  Distortion sad = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < blkWidth; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] );
    }
    orgRow += orgStep;
    curRow += curStep;
  }

  sad <<= shift;
  return ( sad >> DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ) );
}




// --------------------------------------------------------------------------------------------------------------------
// MRSAD
// --------------------------------------------------------------------------------------------------------------------

// Generic mean-removed SAD for an arbitrary block width.
// Pass 1 sums the signed sample differences (org - cur) over the visited rows
// and derives their average (integer division); pass 2 accumulates the
// absolute differences after subtracting that average.
// Only every (1 << subShift)-th row is visited; the completed sum is scaled
// back up by << subShift before the precision adjustment is applied.
Distortion RdCost::xGetMRSAD( const DistParam& rcDtParam )
{
  const int      width     = rcDtParam.org.width;
  const int      height    = rcDtParam.org.height;
  const int      subShift  = rcDtParam.subShift;
  const int      rowStep   = ( 1 << subShift );
  const int      curPitch  = rcDtParam.cur.stride * rowStep;
  const int      orgPitch  = rcDtParam.org.stride * rowStep;
  const uint32_t precShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  // Pass 1: signed difference total over the sub-sampled rows.
  int32_t diffTotal = 0;
  int     rowsLeft  = height;
  while( rowsLeft != 0 )
  {
    for( int x = 0; x < width; x++ )
    {
      diffTotal += ( orgRow[x] - curRow[x] );
    }
    rowsLeft -= rowStep;
    orgRow   += orgPitch;
    curRow   += curPitch;
  }

  // Average difference per visited sample.
  const Pel meanDiff = Pel( diffTotal / ( width * ( height >> subShift ) ) );

  // Pass 2: absolute differences with the average removed.
  orgRow = rcDtParam.org.buf;
  curRow = rcDtParam.cur.buf;
  Distortion sad = 0;
  for( rowsLeft = height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < width; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] - meanDiff );
    }

    // Early termination: the running sum is tested (and returned) without
    // the << subShift scale that the regular exit path applies.
    const Distortion partial = ( sad >> precShift );
    if( rcDtParam.maximumDistortionForEarlyExit < partial )
    {
      return partial;
    }

    orgRow += orgPitch;
    curRow += curPitch;
  }

  sad <<= subShift;
  return ( sad >> precShift );
}


// Mean-removed SAD specialised for the first 4 samples of each row.
// The average of (org - cur) over the visited rows is computed first, then
// the absolute differences with that average removed are summed.
// Rows are visited with a step of (1 << subShift) and the result is scaled
// by << subShift to compensate.
Distortion RdCost::xGetMRSAD4( const DistParam& rcDtParam )
{
  constexpr int kWidth = 4;   // fixed, compile-time column count

  const int height   = rcDtParam.org.height;
  const int subShift = rcDtParam.subShift;
  const int rowStep  = ( 1 << subShift );
  const int curPitch = rcDtParam.cur.stride * rowStep;
  const int orgPitch = rcDtParam.org.stride * rowStep;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  // Pass 1: signed difference total.
  int32_t diffTotal = 0;
  int     rowsLeft  = height;
  while( rowsLeft != 0 )
  {
    for( int x = 0; x < kWidth; x++ )
    {
      diffTotal += ( orgRow[x] - curRow[x] );
    }
    rowsLeft -= rowStep;
    orgRow   += orgPitch;
    curRow   += curPitch;
  }

  const Pel meanDiff = Pel( diffTotal / ( kWidth * ( height >> subShift ) ) );

  // Pass 2: absolute differences with the average removed.
  orgRow = rcDtParam.org.buf;
  curRow = rcDtParam.cur.buf;
  Distortion sad = 0;
  for( rowsLeft = height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < kWidth; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] - meanDiff );
    }
    orgRow += orgPitch;
    curRow += curPitch;
  }

  sad <<= subShift;
  return ( sad >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) );
}


// Mean-removed SAD specialised for the first 8 samples of each row.
// The average of (org - cur) over the visited rows is computed first, then
// the absolute differences with that average removed are summed.
// Rows are visited with a step of (1 << subShift) and the result is scaled
// by << subShift to compensate.
Distortion RdCost::xGetMRSAD8( const DistParam& rcDtParam )
{
  constexpr int kWidth = 8;   // fixed, compile-time column count

  const int height   = rcDtParam.org.height;
  const int subShift = rcDtParam.subShift;
  const int rowStep  = ( 1 << subShift );
  const int curPitch = rcDtParam.cur.stride * rowStep;
  const int orgPitch = rcDtParam.org.stride * rowStep;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  // Pass 1: signed difference total.
  int32_t diffTotal = 0;
  int     rowsLeft  = height;
  while( rowsLeft != 0 )
  {
    for( int x = 0; x < kWidth; x++ )
    {
      diffTotal += ( orgRow[x] - curRow[x] );
    }
    rowsLeft -= rowStep;
    orgRow   += orgPitch;
    curRow   += curPitch;
  }

  const Pel meanDiff = Pel( diffTotal / ( kWidth * ( height >> subShift ) ) );

  // Pass 2: absolute differences with the average removed.
  orgRow = rcDtParam.org.buf;
  curRow = rcDtParam.cur.buf;
  Distortion sad = 0;
  for( rowsLeft = height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < kWidth; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] - meanDiff );
    }
    orgRow += orgPitch;
    curRow += curPitch;
  }

  sad <<= subShift;
  return ( sad >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) );
}

// Mean-removed SAD specialised for the first 16 samples of each row.
// The average of (org - cur) over the visited rows is computed first, then
// the absolute differences with that average removed are summed.
// Rows are visited with a step of (1 << subShift) and the result is scaled
// by << subShift to compensate.
Distortion RdCost::xGetMRSAD16( const DistParam& rcDtParam )
{
  constexpr int kWidth = 16;  // fixed, compile-time column count

  const int height   = rcDtParam.org.height;
  const int subShift = rcDtParam.subShift;
  const int rowStep  = ( 1 << subShift );
  const int curPitch = rcDtParam.cur.stride * rowStep;
  const int orgPitch = rcDtParam.org.stride * rowStep;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  // Pass 1: signed difference total.
  int32_t diffTotal = 0;
  int     rowsLeft  = height;
  while( rowsLeft != 0 )
  {
    for( int x = 0; x < kWidth; x++ )
    {
      diffTotal += ( orgRow[x] - curRow[x] );
    }
    rowsLeft -= rowStep;
    orgRow   += orgPitch;
    curRow   += curPitch;
  }

  const Pel meanDiff = Pel( diffTotal / ( kWidth * ( height >> subShift ) ) );

  // Pass 2: absolute differences with the average removed.
  orgRow = rcDtParam.org.buf;
  curRow = rcDtParam.cur.buf;
  Distortion sad = 0;
  for( rowsLeft = height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < kWidth; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] - meanDiff );
    }
    orgRow += orgPitch;
    curRow += curPitch;
  }

  sad <<= subShift;
  return ( sad >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) );
}

// Mean-removed SAD specialised for the first 12 samples of each row.
// The average of (org - cur) over the visited rows is computed first, then
// the absolute differences with that average removed are summed.
// Rows are visited with a step of (1 << subShift) and the result is scaled
// by << subShift to compensate.
Distortion RdCost::xGetMRSAD12( const DistParam& rcDtParam )
{
  constexpr int kWidth = 12;  // fixed, compile-time column count

  const int height   = rcDtParam.org.height;
  const int subShift = rcDtParam.subShift;
  const int rowStep  = ( 1 << subShift );
  const int curPitch = rcDtParam.cur.stride * rowStep;
  const int orgPitch = rcDtParam.org.stride * rowStep;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  // Pass 1: signed difference total.
  int32_t diffTotal = 0;
  int     rowsLeft  = height;
  while( rowsLeft != 0 )
  {
    for( int x = 0; x < kWidth; x++ )
    {
      diffTotal += ( orgRow[x] - curRow[x] );
    }
    rowsLeft -= rowStep;
    orgRow   += orgPitch;
    curRow   += curPitch;
  }

  const Pel meanDiff = Pel( diffTotal / ( kWidth * ( height >> subShift ) ) );

  // Pass 2: absolute differences with the average removed.
  orgRow = rcDtParam.org.buf;
  curRow = rcDtParam.cur.buf;
  Distortion sad = 0;
  for( rowsLeft = height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < kWidth; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] - meanDiff );
    }
    orgRow += orgPitch;
    curRow += curPitch;
  }

  sad <<= subShift;
  return ( sad >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) );
}

// Mean-removed SAD for blocks processed in groups of 16 columns.
// Each row is walked in steps of 16 samples and every step reads a full
// group of 16, so the samples touched are determined by the group count,
// not by the width alone. The average of (org - cur) is derived first
// (divided by width * visited rows), then the absolute differences with that
// average removed are summed. Rows are visited with a step of
// (1 << subShift) and the result is scaled by << subShift to compensate.
Distortion RdCost::xGetMRSAD16N( const DistParam &rcDtParam )
{
  constexpr int kGroup = 16;  // samples handled per inner step

  const int height   = rcDtParam.org.height;
  const int width    = rcDtParam.org.width;
  const int subShift = rcDtParam.subShift;
  const int rowStep  = ( 1 << subShift );
  const int curPitch = rcDtParam.cur.stride * rowStep;
  const int orgPitch = rcDtParam.org.stride * rowStep;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  // Pass 1: signed difference total.
  int32_t diffTotal = 0;
  int     rowsLeft  = height;
  while( rowsLeft != 0 )
  {
    for( int base = 0; base < width; base += kGroup )
    {
      for( int k = 0; k < kGroup; k++ )
      {
        diffTotal += ( orgRow[base + k] - curRow[base + k] );
      }
    }
    rowsLeft -= rowStep;
    orgRow   += orgPitch;
    curRow   += curPitch;
  }

  const Pel meanDiff = Pel( diffTotal / ( width * ( height >> subShift ) ) );

  // Pass 2: absolute differences with the average removed.
  orgRow = rcDtParam.org.buf;
  curRow = rcDtParam.cur.buf;
  Distortion sad = 0;
  for( rowsLeft = height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int base = 0; base < width; base += kGroup )
    {
      for( int k = 0; k < kGroup; k++ )
      {
        sad += abs( orgRow[base + k] - curRow[base + k] - meanDiff );
      }
    }
    orgRow += orgPitch;
    curRow += curPitch;
  }

  sad <<= subShift;
  return ( sad >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) );
}

// Mean-removed SAD specialised for the first 32 samples of each row.
// The average of (org - cur) over the visited rows is computed first, then
// the absolute differences with that average removed are summed.
// Rows are visited with a step of (1 << subShift) and the result is scaled
// by << subShift to compensate.
Distortion RdCost::xGetMRSAD32( const DistParam &rcDtParam )
{
  constexpr int kWidth = 32;  // fixed, compile-time column count

  const int height   = rcDtParam.org.height;
  const int subShift = rcDtParam.subShift;
  const int rowStep  = ( 1 << subShift );
  const int curPitch = rcDtParam.cur.stride * rowStep;
  const int orgPitch = rcDtParam.org.stride * rowStep;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  // Pass 1: signed difference total.
  int32_t diffTotal = 0;
  int     rowsLeft  = height;
  while( rowsLeft != 0 )
  {
    for( int x = 0; x < kWidth; x++ )
    {
      diffTotal += ( orgRow[x] - curRow[x] );
    }
    rowsLeft -= rowStep;
    orgRow   += orgPitch;
    curRow   += curPitch;
  }

  const Pel meanDiff = Pel( diffTotal / ( kWidth * ( height >> subShift ) ) );

  // Pass 2: absolute differences with the average removed.
  orgRow = rcDtParam.org.buf;
  curRow = rcDtParam.cur.buf;
  Distortion sad = 0;
  for( rowsLeft = height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < kWidth; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] - meanDiff );
    }
    orgRow += orgPitch;
    curRow += curPitch;
  }

  sad <<= subShift;
  return ( sad >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) );
}

// Mean-removed SAD specialised for the first 24 samples of each row.
// The average of (org - cur) over the visited rows is computed first, then
// the absolute differences with that average removed are summed.
// Rows are visited with a step of (1 << subShift) and the result is scaled
// by << subShift to compensate.
Distortion RdCost::xGetMRSAD24( const DistParam &rcDtParam )
{
  constexpr int kWidth = 24;  // fixed, compile-time column count

  const int height   = rcDtParam.org.height;
  const int subShift = rcDtParam.subShift;
  const int rowStep  = ( 1 << subShift );
  const int curPitch = rcDtParam.cur.stride * rowStep;
  const int orgPitch = rcDtParam.org.stride * rowStep;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  // Pass 1: signed difference total.
  int32_t diffTotal = 0;
  int     rowsLeft  = height;
  while( rowsLeft != 0 )
  {
    for( int x = 0; x < kWidth; x++ )
    {
      diffTotal += ( orgRow[x] - curRow[x] );
    }
    rowsLeft -= rowStep;
    orgRow   += orgPitch;
    curRow   += curPitch;
  }

  const Pel meanDiff = Pel( diffTotal / ( kWidth * ( height >> subShift ) ) );

  // Pass 2: absolute differences with the average removed.
  orgRow = rcDtParam.org.buf;
  curRow = rcDtParam.cur.buf;
  Distortion sad = 0;
  for( rowsLeft = height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < kWidth; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] - meanDiff );
    }
    orgRow += orgPitch;
    curRow += curPitch;
  }

  sad <<= subShift;
  return ( sad >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) );
}

// Mean-removed SAD specialised for the first 64 samples of each row.
// The average of (org - cur) over the visited rows is computed first, then
// the absolute differences with that average removed are summed.
// Rows are visited with a step of (1 << subShift) and the result is scaled
// by << subShift to compensate.
Distortion RdCost::xGetMRSAD64( const DistParam &rcDtParam )
{
  constexpr int kWidth = 64;  // fixed, compile-time column count

  const int height   = rcDtParam.org.height;
  const int subShift = rcDtParam.subShift;
  const int rowStep  = ( 1 << subShift );
  const int curPitch = rcDtParam.cur.stride * rowStep;
  const int orgPitch = rcDtParam.org.stride * rowStep;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  // Pass 1: signed difference total.
  int32_t diffTotal = 0;
  int     rowsLeft  = height;
  while( rowsLeft != 0 )
  {
    for( int x = 0; x < kWidth; x++ )
    {
      diffTotal += ( orgRow[x] - curRow[x] );
    }
    rowsLeft -= rowStep;
    orgRow   += orgPitch;
    curRow   += curPitch;
  }

  const Pel meanDiff = Pel( diffTotal / ( kWidth * ( height >> subShift ) ) );

  // Pass 2: absolute differences with the average removed.
  orgRow = rcDtParam.org.buf;
  curRow = rcDtParam.cur.buf;
  Distortion sad = 0;
  for( rowsLeft = height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < kWidth; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] - meanDiff );
    }
    orgRow += orgPitch;
    curRow += curPitch;
  }

  sad <<= subShift;
  return ( sad >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) );
}

// Mean-removed SAD specialised for the first 48 samples of each row.
// The average of (org - cur) over the visited rows is computed first, then
// the absolute differences with that average removed are summed.
// Rows are visited with a step of (1 << subShift) and the result is scaled
// by << subShift to compensate.
Distortion RdCost::xGetMRSAD48( const DistParam &rcDtParam )
{
  constexpr int kWidth = 48;  // fixed, compile-time column count

  const int height   = rcDtParam.org.height;
  const int subShift = rcDtParam.subShift;
  const int rowStep  = ( 1 << subShift );
  const int curPitch = rcDtParam.cur.stride * rowStep;
  const int orgPitch = rcDtParam.org.stride * rowStep;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  // Pass 1: signed difference total.
  int32_t diffTotal = 0;
  int     rowsLeft  = height;
  while( rowsLeft != 0 )
  {
    for( int x = 0; x < kWidth; x++ )
    {
      diffTotal += ( orgRow[x] - curRow[x] );
    }
    rowsLeft -= rowStep;
    orgRow   += orgPitch;
    curRow   += curPitch;
  }

  const Pel meanDiff = Pel( diffTotal / ( kWidth * ( height >> subShift ) ) );

  // Pass 2: absolute differences with the average removed.
  orgRow = rcDtParam.org.buf;
  curRow = rcDtParam.cur.buf;
  Distortion sad = 0;
  for( rowsLeft = height; rowsLeft != 0; rowsLeft -= rowStep )
  {
    for( int x = 0; x < kWidth; x++ )
    {
      sad += abs( orgRow[x] - curRow[x] - meanDiff );
    }
    orgRow += orgPitch;
    curRow += curPitch;
  }

  sad <<= subShift;
  return ( sad >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) );
}

// --------------------------------------------------------------------------------------------------------------------
// SSE
// --------------------------------------------------------------------------------------------------------------------

// Sum of squared errors between rcDtParam.org and rcDtParam.cur for a block of
// arbitrary width (org.width) and height (org.height).
// Each squared sample difference is right-shifted by twice
// DISTORTION_PRECISION_ADJUSTMENT(bitDepth) before it is accumulated.
// When rcDtParam.applyWeight is set the computation is delegated to
// RdCostWeightPrediction::xGetSSEw.
Distortion RdCost::xGetSSE( const DistParam &rcDtParam )
{
  if ( rcDtParam.applyWeight )
  {
    return RdCostWeightPrediction::xGetSSEw( rcDtParam );
  }

  const int      width     = rcDtParam.org.width;
  const int      orgStride = rcDtParam.org.stride;
  const int      curStride = rcDtParam.cur.stride;
  const uint32_t shift     = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  Distortion sse = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft-- )
  {
    for( int x = 0; x < width; x++ )
    {
      const Intermediate_Int delta = orgRow[x] - curRow[x];
      sse += Distortion( ( delta * delta ) >> shift );
    }

    // step both buffers to the next row
    orgRow += orgStride;
    curRow += curStride;
  }

  return sse;
}

// Sum of squared errors for blocks that are 4 samples wide.
// Exactly 4 columns are read per row; the number of rows is org.height.
// Each squared sample difference is right-shifted by twice
// DISTORTION_PRECISION_ADJUSTMENT(bitDepth) before it is accumulated.
// With rcDtParam.applyWeight set, the width is checked and the computation is
// delegated to RdCostWeightPrediction::xGetSSEw.
Distortion RdCost::xGetSSE4( const DistParam &rcDtParam )
{
  if ( rcDtParam.applyWeight )
  {
    CHECK( rcDtParam.org.width != 4, "Invalid size" );
    return RdCostWeightPrediction::xGetSSEw( rcDtParam );
  }

  const int      orgStride = rcDtParam.org.stride;
  const int      curStride = rcDtParam.cur.stride;
  const uint32_t shift     = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  Distortion sse = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft-- )
  {
    for( int x = 0; x < 4; x++ )
    {
      const Intermediate_Int delta = orgRow[x] - curRow[x];
      sse += Distortion( ( delta * delta ) >> shift );
    }

    // step both buffers to the next row
    orgRow += orgStride;
    curRow += curStride;
  }

  return sse;
}

// Sum of squared errors for blocks that are 8 samples wide.
// Exactly 8 columns are read per row; the number of rows is org.height.
// Each squared sample difference is right-shifted by twice
// DISTORTION_PRECISION_ADJUSTMENT(bitDepth) before it is accumulated.
// With rcDtParam.applyWeight set, the width is checked and the computation is
// delegated to RdCostWeightPrediction::xGetSSEw.
Distortion RdCost::xGetSSE8( const DistParam &rcDtParam )
{
  if ( rcDtParam.applyWeight )
  {
    CHECK( rcDtParam.org.width != 8, "Invalid size" );
    return RdCostWeightPrediction::xGetSSEw( rcDtParam );
  }

  const int      orgStride = rcDtParam.org.stride;
  const int      curStride = rcDtParam.cur.stride;
  const uint32_t shift     = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  Distortion sse = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft-- )
  {
    for( int x = 0; x < 8; x++ )
    {
      const Intermediate_Int delta = orgRow[x] - curRow[x];
      sse += Distortion( ( delta * delta ) >> shift );
    }

    // step both buffers to the next row
    orgRow += orgStride;
    curRow += curStride;
  }

  return sse;
}

// Sum of squared errors for blocks that are 16 samples wide.
// Exactly 16 columns are read per row; the number of rows is org.height.
// Each squared sample difference is right-shifted by twice
// DISTORTION_PRECISION_ADJUSTMENT(bitDepth) before it is accumulated.
// With rcDtParam.applyWeight set, the width is checked and the computation is
// delegated to RdCostWeightPrediction::xGetSSEw.
Distortion RdCost::xGetSSE16( const DistParam &rcDtParam )
{
  if ( rcDtParam.applyWeight )
  {
    CHECK( rcDtParam.org.width != 16, "Invalid size" );
    return RdCostWeightPrediction::xGetSSEw( rcDtParam );
  }

  const int      orgStride = rcDtParam.org.stride;
  const int      curStride = rcDtParam.cur.stride;
  const uint32_t shift     = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  Distortion sse = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft-- )
  {
    for( int x = 0; x < 16; x++ )
    {
      const Intermediate_Int delta = orgRow[x] - curRow[x];
      sse += Distortion( ( delta * delta ) >> shift );
    }

    // step both buffers to the next row
    orgRow += orgStride;
    curRow += curStride;
  }

  return sse;
}

// Sum of squared errors for blocks processed in groups of 16 columns.
// Each row is walked in steps of 16 samples starting at column 0 while the
// group start is below org.width; every group is always processed in full.
// Each squared sample difference is right-shifted by twice
// DISTORTION_PRECISION_ADJUSTMENT(bitDepth) before it is accumulated.
// When rcDtParam.applyWeight is set the computation is delegated to
// RdCostWeightPrediction::xGetSSEw.
Distortion RdCost::xGetSSE16N( const DistParam &rcDtParam )
{
  if ( rcDtParam.applyWeight )
  {
    return RdCostWeightPrediction::xGetSSEw( rcDtParam );
  }

  const int      width     = rcDtParam.org.width;
  const int      orgStride = rcDtParam.org.stride;
  const int      curStride = rcDtParam.cur.stride;
  const uint32_t shift     = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  Distortion sse = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft-- )
  {
    for( int base = 0; base < width; base += 16 )
    {
      const Pel* orgGrp = orgRow + base;
      const Pel* curGrp = curRow + base;

      for( int k = 0; k < 16; k++ )
      {
        const Intermediate_Int delta = orgGrp[k] - curGrp[k];
        sse += Distortion( ( delta * delta ) >> shift );
      }
    }

    // step both buffers to the next row
    orgRow += orgStride;
    curRow += curStride;
  }

  return sse;
}

// Sum of squared errors for blocks that are 32 samples wide.
// Exactly 32 columns are read per row; the number of rows is org.height.
// Each squared sample difference is right-shifted by twice
// DISTORTION_PRECISION_ADJUSTMENT(bitDepth) before it is accumulated.
// With rcDtParam.applyWeight set, the width is checked and the computation is
// delegated to RdCostWeightPrediction::xGetSSEw.
Distortion RdCost::xGetSSE32( const DistParam &rcDtParam )
{
  if ( rcDtParam.applyWeight )
  {
    CHECK( rcDtParam.org.width != 32, "Invalid size" );
    return RdCostWeightPrediction::xGetSSEw( rcDtParam );
  }

  const int      orgStride = rcDtParam.org.stride;
  const int      curStride = rcDtParam.cur.stride;
  const uint32_t shift     = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  Distortion sse = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft-- )
  {
    for( int x = 0; x < 32; x++ )
    {
      const Intermediate_Int delta = orgRow[x] - curRow[x];
      sse += Distortion( ( delta * delta ) >> shift );
    }

    // step both buffers to the next row
    orgRow += orgStride;
    curRow += curStride;
  }

  return sse;
}

// Sum of squared errors for blocks that are 64 samples wide.
// Exactly 64 columns are read per row; the number of rows is org.height.
// Each squared sample difference is right-shifted by twice
// DISTORTION_PRECISION_ADJUSTMENT(bitDepth) before it is accumulated.
// With rcDtParam.applyWeight set, the width is checked and the computation is
// delegated to RdCostWeightPrediction::xGetSSEw.
Distortion RdCost::xGetSSE64( const DistParam &rcDtParam )
{
  if ( rcDtParam.applyWeight )
  {
    CHECK( rcDtParam.org.width != 64, "Invalid size" );
    return RdCostWeightPrediction::xGetSSEw( rcDtParam );
  }

  const int      orgStride = rcDtParam.org.stride;
  const int      curStride = rcDtParam.cur.stride;
  const uint32_t shift     = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  const Pel* orgRow = rcDtParam.org.buf;
  const Pel* curRow = rcDtParam.cur.buf;

  Distortion sse = 0;

  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft-- )
  {
    for( int x = 0; x < 64; x++ )
    {
      const Intermediate_Int delta = orgRow[x] - curRow[x];
      sse += Distortion( ( delta * delta ) >> shift );
    }

    // step both buffers to the next row
    orgRow += orgStride;
    curRow += curStride;
  }

  return sse;
}

// --------------------------------------------------------------------------------------------------------------------
// HADAMARD with step (used in fractional search)
// --------------------------------------------------------------------------------------------------------------------

// 2x2 Hadamard-based SATD between an original and a current block.
//   piOrg, piCur           - top-left sample of the two 2x2 blocks
//   iStrideOrg, iStrideCur - offset (in samples) from one row to the next
//   iStep                  - only 1 is accepted; checked before any sample is read
// Returns the sum of the absolute values of the four transformed differences.
// With JVET_R0164_MEAN_SCALED_SATD enabled, the term built from the sum of all
// four differences is scaled down by a right shift of 2.
Distortion RdCost::xCalcHADs2x2( const Pel *piOrg, const Pel *piCur, int iStrideOrg, int iStrideCur, int iStep )
{
  CHECK( iStep != 1, "Invalid step" );

  // sample differences: top row, then bottom row
  const TCoeff topLeft     = piOrg[0             ] - piCur[0];
  const TCoeff topRight    = piOrg[1             ] - piCur[1];
  const TCoeff bottomLeft  = piOrg[iStrideOrg    ] - piCur[0 + iStrideCur];
  const TCoeff bottomRight = piOrg[iStrideOrg + 1] - piCur[1 + iStrideCur];

  // vertical butterfly: per-column sum and difference
  const TCoeff colSum0 = topLeft  + bottomLeft;
  const TCoeff colSum1 = topRight + bottomRight;
  const TCoeff colDif0 = topLeft  - bottomLeft;
  const TCoeff colDif1 = topRight - bottomRight;

  // horizontal butterfly folded directly into the accumulation
  Distortion total = 0;

#if JVET_R0164_MEAN_SCALED_SATD
  total += abs(colSum0 + colSum1) >> 2;
#else
  total += abs(colSum0 + colSum1);
#endif
  total += abs(colSum0 - colSum1);
  total += abs(colDif0 + colDif1);
  total += abs(colDif0 - colDif1);

  return total;
}

// 4x4 Hadamard-based SATD between an original and a current block.
//   piOrg, piCur           - top-left sample of the two 4x4 blocks
//   iStrideOrg, iStrideCur - offset (in samples) from one row to the next
//   iStep                  - only 1 is accepted (see CHECK below)
// Returns the sum of absolute transformed differences, halved with rounding.
Distortion RdCost::xCalcHADs4x4( const Pel *piOrg, const Pel *piCur, int iStrideOrg, int iStrideCur, int iStep )
{
  int k;
  Distortion satd = 0;
  TCoeff diff[16], m[16], d[16];

  CHECK( iStep != 1, "Invalid step" );
  // gather the 4x4 sample differences row by row; diff[4*r + c] holds row r, column c
  for( k = 0; k < 16; k+=4 )
  {
    diff[k+0] = piOrg[0] - piCur[0];
    diff[k+1] = piOrg[1] - piCur[1];
    diff[k+2] = piOrg[2] - piCur[2];
    diff[k+3] = piOrg[3] - piCur[3];

    piCur += iStrideCur;
    piOrg += iStrideOrg;
  }

  /*===== hadamard transform =====*/
  // vertical pass, stage 1: rows 0/3 and rows 1/2 are combined (sums, then differences)
  m[ 0] = diff[ 0] + diff[12];
  m[ 1] = diff[ 1] + diff[13];
  m[ 2] = diff[ 2] + diff[14];
  m[ 3] = diff[ 3] + diff[15];
  m[ 4] = diff[ 4] + diff[ 8];
  m[ 5] = diff[ 5] + diff[ 9];
  m[ 6] = diff[ 6] + diff[10];
  m[ 7] = diff[ 7] + diff[11];
  m[ 8] = diff[ 4] - diff[ 8];
  m[ 9] = diff[ 5] - diff[ 9];
  m[10] = diff[ 6] - diff[10];
  m[11] = diff[ 7] - diff[11];
  m[12] = diff[ 0] - diff[12];
  m[13] = diff[ 1] - diff[13];
  m[14] = diff[ 2] - diff[14];
  m[15] = diff[ 3] - diff[15];

  // vertical pass, stage 2: combine the stage-1 row groups
  d[ 0] = m[ 0] + m[ 4];
  d[ 1] = m[ 1] + m[ 5];
  d[ 2] = m[ 2] + m[ 6];
  d[ 3] = m[ 3] + m[ 7];
  d[ 4] = m[ 8] + m[12];
  d[ 5] = m[ 9] + m[13];
  d[ 6] = m[10] + m[14];
  d[ 7] = m[11] + m[15];
  d[ 8] = m[ 0] - m[ 4];
  d[ 9] = m[ 1] - m[ 5];
  d[10] = m[ 2] - m[ 6];
  d[11] = m[ 3] - m[ 7];
  d[12] = m[12] - m[ 8];
  d[13] = m[13] - m[ 9];
  d[14] = m[14] - m[10];
  d[15] = m[15] - m[11];

  // horizontal pass, stage 1: within each group of four, combine elements 0/3 and 1/2
  m[ 0] = d[ 0] + d[ 3];
  m[ 1] = d[ 1] + d[ 2];
  m[ 2] = d[ 1] - d[ 2];
  m[ 3] = d[ 0] - d[ 3];
  m[ 4] = d[ 4] + d[ 7];
  m[ 5] = d[ 5] + d[ 6];
  m[ 6] = d[ 5] - d[ 6];
  m[ 7] = d[ 4] - d[ 7];
  m[ 8] = d[ 8] + d[11];
  m[ 9] = d[ 9] + d[10];
  m[10] = d[ 9] - d[10];
  m[11] = d[ 8] - d[11];
  m[12] = d[12] + d[15];
  m[13] = d[13] + d[14];
  m[14] = d[13] - d[14];
  m[15] = d[12] - d[15];

  // horizontal pass, stage 2: final sums and differences inside each group of four
  d[ 0] = m[ 0] + m[ 1];
  d[ 1] = m[ 0] - m[ 1];
  d[ 2] = m[ 2] + m[ 3];
  d[ 3] = m[ 3] - m[ 2];
  d[ 4] = m[ 4] + m[ 5];
  d[ 5] = m[ 4] - m[ 5];
  d[ 6] = m[ 6] + m[ 7];
  d[ 7] = m[ 7] - m[ 6];
  d[ 8] = m[ 8] + m[ 9];
  d[ 9] = m[ 8] - m[ 9];
  d[10] = m[10] + m[11];
  d[11] = m[11] - m[10];
  d[12] = m[12] + m[13];
  d[13] = m[12] - m[13];
  d[14] = m[14] + m[15];
  d[15] = m[15] - m[14];

  // accumulate the magnitudes of all 16 transformed values
  for (k=0; k<16; ++k)
  {
    satd += abs(d[k]);
  }

#if JVET_R0164_MEAN_SCALED_SATD
  // d[0] is the sum of all 16 differences; count only a quarter of its magnitude
  satd -= abs(d[0]);
  satd += abs(d[0]) >> 2;
#endif
  // normalise: divide by 2 with rounding
  satd  = ((satd+1)>>1);

  return satd;
}

// 8x8 Hadamard-based SATD between an original and a current block.
//   piOrg, piCur           - top-left sample of the two 8x8 blocks
//   iStrideOrg, iStrideCur - offset (in samples) from one row to the next
//   iStep                  - only 1 is accepted (see CHECK below)
// Returns the sum of absolute transformed differences, divided by 4 with rounding.
Distortion RdCost::xCalcHADs8x8( const Pel *piOrg, const Pel *piCur, int iStrideOrg, int iStrideCur, int iStep )
{
  int k, i, j, jj;
  Distortion sad = 0;
  TCoeff diff[64], m1[8][8], m2[8][8], m3[8][8];
  CHECK( iStep != 1, "Invalid step" );
  // gather the 8x8 sample differences row by row; diff[8*r + c] holds row r, column c
  for( k = 0; k < 64; k += 8 )
  {
    diff[k+0] = piOrg[0] - piCur[0];
    diff[k+1] = piOrg[1] - piCur[1];
    diff[k+2] = piOrg[2] - piCur[2];
    diff[k+3] = piOrg[3] - piCur[3];
    diff[k+4] = piOrg[4] - piCur[4];
    diff[k+5] = piOrg[5] - piCur[5];
    diff[k+6] = piOrg[6] - piCur[6];
    diff[k+7] = piOrg[7] - piCur[7];

    piCur += iStrideCur;
    piOrg += iStrideOrg;
  }

  //horizontal
  // three butterfly stages along each row (element distances 4, 2, 1); result ends up in m2[j][*]
  for (j=0; j < 8; j++)
  {
    jj = j << 3;   // index of the first element of row j in diff[]
    m2[j][0] = diff[jj  ] + diff[jj+4];
    m2[j][1] = diff[jj+1] + diff[jj+5];
    m2[j][2] = diff[jj+2] + diff[jj+6];
    m2[j][3] = diff[jj+3] + diff[jj+7];
    m2[j][4] = diff[jj  ] - diff[jj+4];
    m2[j][5] = diff[jj+1] - diff[jj+5];
    m2[j][6] = diff[jj+2] - diff[jj+6];
    m2[j][7] = diff[jj+3] - diff[jj+7];

    m1[j][0] = m2[j][0] + m2[j][2];
    m1[j][1] = m2[j][1] + m2[j][3];
    m1[j][2] = m2[j][0] - m2[j][2];
    m1[j][3] = m2[j][1] - m2[j][3];
    m1[j][4] = m2[j][4] + m2[j][6];
    m1[j][5] = m2[j][5] + m2[j][7];
    m1[j][6] = m2[j][4] - m2[j][6];
    m1[j][7] = m2[j][5] - m2[j][7];

    m2[j][0] = m1[j][0] + m1[j][1];
    m2[j][1] = m1[j][0] - m1[j][1];
    m2[j][2] = m1[j][2] + m1[j][3];
    m2[j][3] = m1[j][2] - m1[j][3];
    m2[j][4] = m1[j][4] + m1[j][5];
    m2[j][5] = m1[j][4] - m1[j][5];
    m2[j][6] = m1[j][6] + m1[j][7];
    m2[j][7] = m1[j][6] - m1[j][7];
  }

  //vertical
  // the same three stages down each column (row distances 4, 2, 1); m3 and m1 are scratch, result in m2[*][i]
  for (i=0; i < 8; i++)
  {
    m3[0][i] = m2[0][i] + m2[4][i];
    m3[1][i] = m2[1][i] + m2[5][i];
    m3[2][i] = m2[2][i] + m2[6][i];
    m3[3][i] = m2[3][i] + m2[7][i];
    m3[4][i] = m2[0][i] - m2[4][i];
    m3[5][i] = m2[1][i] - m2[5][i];
    m3[6][i] = m2[2][i] - m2[6][i];
    m3[7][i] = m2[3][i] - m2[7][i];

    m1[0][i] = m3[0][i] + m3[2][i];
    m1[1][i] = m3[1][i] + m3[3][i];
    m1[2][i] = m3[0][i] - m3[2][i];
    m1[3][i] = m3[1][i] - m3[3][i];
    m1[4][i] = m3[4][i] + m3[6][i];
    m1[5][i] = m3[5][i] + m3[7][i];
    m1[6][i] = m3[4][i] - m3[6][i];
    m1[7][i] = m3[5][i] - m3[7][i];

    m2[0][i] = m1[0][i] + m1[1][i];
    m2[1][i] = m1[0][i] - m1[1][i];
    m2[2][i] = m1[2][i] + m1[3][i];
    m2[3][i] = m1[2][i] - m1[3][i];
    m2[4][i] = m1[4][i] + m1[5][i];
    m2[5][i] = m1[4][i] - m1[5][i];
    m2[6][i] = m1[6][i] + m1[7][i];
    m2[7][i] = m1[6][i] - m1[7][i];
  }

  // accumulate the magnitudes of all 64 transformed values
  for (i = 0; i < 8; i++)
  {
    for (j = 0; j < 8; j++)
    {
      sad += abs(m2[i][j]);
    }
  }

#if JVET_R0164_MEAN_SCALED_SATD
  // m2[0][0] is the sum of all 64 differences; count only a quarter of its magnitude
  sad -= abs(m2[0][0]);
  sad += abs(m2[0][0]) >> 2;
#endif
  // normalise: divide by 4 with rounding
  sad  = ((sad+2)>>2);

  return sad;
}
// Hadamard-based SATD for a block that is 16 samples wide and 8 rows high.
//   piOrg, piCur           - top-left sample of the two blocks
//   iStrideOrg, iStrideCur - offset (in samples) from one row to the next
// Returns the sum of absolute transformed differences scaled by 2 / sqrt(16 * 8);
// the scaling is done in floating point and truncated to int.
Distortion RdCost::xCalcHADs16x8( const Pel *piOrg, const Pel *piCur, int iStrideOrg, int iStrideCur )
{   //need to add SIMD implementation ,JCA
  int k, i, j, jj, sad = 0;
  int diff[128], m1[8][16], m2[8][16];
  // gather the sample differences row by row; diff[16*r + c] holds row r, column c
  for( k = 0; k < 128; k += 16 )
  {
    diff[k + 0] = piOrg[0] - piCur[0];
    diff[k + 1] = piOrg[1] - piCur[1];
    diff[k + 2] = piOrg[2] - piCur[2];
    diff[k + 3] = piOrg[3] - piCur[3];
    diff[k + 4] = piOrg[4] - piCur[4];
    diff[k + 5] = piOrg[5] - piCur[5];
    diff[k + 6] = piOrg[6] - piCur[6];
    diff[k + 7] = piOrg[7] - piCur[7];

    diff[k + 8] = piOrg[8] - piCur[8];
    diff[k + 9] = piOrg[9] - piCur[9];
    diff[k + 10] = piOrg[10] - piCur[10];
    diff[k + 11] = piOrg[11] - piCur[11];
    diff[k + 12] = piOrg[12] - piCur[12];
    diff[k + 13] = piOrg[13] - piCur[13];
    diff[k + 14] = piOrg[14] - piCur[14];
    diff[k + 15] = piOrg[15] - piCur[15];

    piCur += iStrideCur;
    piOrg += iStrideOrg;
  }

  //horizontal
  // four butterfly stages along each 16-sample row (element distances 8, 4, 2, 1); result ends up in m1[j][*]
  for( j = 0; j < 8; j++ )
  {
    jj = j << 4;   // index of the first element of row j in diff[]

    m2[j][0] = diff[jj    ] + diff[jj + 8];
    m2[j][1] = diff[jj + 1] + diff[jj + 9];
    m2[j][2] = diff[jj + 2] + diff[jj + 10];
    m2[j][3] = diff[jj + 3] + diff[jj + 11];
    m2[j][4] = diff[jj + 4] + diff[jj + 12];
    m2[j][5] = diff[jj + 5] + diff[jj + 13];
    m2[j][6] = diff[jj + 6] + diff[jj + 14];
    m2[j][7] = diff[jj + 7] + diff[jj + 15];
    m2[j][8] = diff[jj    ] - diff[jj + 8];
    m2[j][9] = diff[jj + 1] - diff[jj + 9];
    m2[j][10] = diff[jj + 2] - diff[jj + 10];
    m2[j][11] = diff[jj + 3] - diff[jj + 11];
    m2[j][12] = diff[jj + 4] - diff[jj + 12];
    m2[j][13] = diff[jj + 5] - diff[jj + 13];
    m2[j][14] = diff[jj + 6] - diff[jj + 14];
    m2[j][15] = diff[jj + 7] - diff[jj + 15];

    m1[j][0] = m2[j][0] + m2[j][4];
    m1[j][1] = m2[j][1] + m2[j][5];
    m1[j][2] = m2[j][2] + m2[j][6];
    m1[j][3] = m2[j][3] + m2[j][7];
    m1[j][4] = m2[j][0] - m2[j][4];
    m1[j][5] = m2[j][1] - m2[j][5];
    m1[j][6] = m2[j][2] - m2[j][6];
    m1[j][7] = m2[j][3] - m2[j][7];
    m1[j][8] = m2[j][8] + m2[j][12];
    m1[j][9] = m2[j][9] + m2[j][13];
    m1[j][10] = m2[j][10] + m2[j][14];
    m1[j][11] = m2[j][11] + m2[j][15];
    m1[j][12] = m2[j][8] - m2[j][12];
    m1[j][13] = m2[j][9] - m2[j][13];
    m1[j][14] = m2[j][10] - m2[j][14];
    m1[j][15] = m2[j][11] - m2[j][15];

    m2[j][0] = m1[j][0] + m1[j][2];
    m2[j][1] = m1[j][1] + m1[j][3];
    m2[j][2] = m1[j][0] - m1[j][2];
    m2[j][3] = m1[j][1] - m1[j][3];
    m2[j][4] = m1[j][4] + m1[j][6];
    m2[j][5] = m1[j][5] + m1[j][7];
    m2[j][6] = m1[j][4] - m1[j][6];
    m2[j][7] = m1[j][5] - m1[j][7];
    m2[j][8] = m1[j][8] + m1[j][10];
    m2[j][9] = m1[j][9] + m1[j][11];
    m2[j][10] = m1[j][8] - m1[j][10];
    m2[j][11] = m1[j][9] - m1[j][11];
    m2[j][12] = m1[j][12] + m1[j][14];
    m2[j][13] = m1[j][13] + m1[j][15];
    m2[j][14] = m1[j][12] - m1[j][14];
    m2[j][15] = m1[j][13] - m1[j][15];

    m1[j][0] = m2[j][0] + m2[j][1];
    m1[j][1] = m2[j][0] - m2[j][1];
    m1[j][2] = m2[j][2] + m2[j][3];
    m1[j][3] = m2[j][2] - m2[j][3];
    m1[j][4] = m2[j][4] + m2[j][5];
    m1[j][5] = m2[j][4] - m2[j][5];
    m1[j][6] = m2[j][6] + m2[j][7];
    m1[j][7] = m2[j][6] - m2[j][7];
    m1[j][8] = m2[j][8] + m2[j][9];
    m1[j][9] = m2[j][8] - m2[j][9];
    m1[j][10] = m2[j][10] + m2[j][11];
    m1[j][11] = m2[j][10] - m2[j][11];
    m1[j][12] = m2[j][12] + m2[j][13];
    m1[j][13] = m2[j][12] - m2[j][13];
    m1[j][14] = m2[j][14] + m2[j][15];
    m1[j][15] = m2[j][14] - m2[j][15];
  }

  //vertical
  // three butterfly stages down each of the 16 columns (row distances 4, 2, 1); result ends up in m2[*][i]
  for( i = 0; i < 16; i++ )
  {
    m2[0][i] = m1[0][i] + m1[4][i];
    m2[1][i] = m1[1][i] + m1[5][i];
    m2[2][i] = m1[2][i] + m1[6][i];
    m2[3][i] = m1[3][i] + m1[7][i];
    m2[4][i] = m1[0][i] - m1[4][i];
    m2[5][i] = m1[1][i] - m1[5][i];
    m2[6][i] = m1[2][i] - m1[6][i];
    m2[7][i] = m1[3][i] - m1[7][i];

    m1[0][i] = m2[0][i] + m2[2][i];
    m1[1][i] = m2[1][i] + m2[3][i];
    m1[2][i] = m2[0][i] - m2[2][i];
    m1[3][i] = m2[1][i] - m2[3][i];
    m1[4][i] = m2[4][i] + m2[6][i];
    m1[5][i] = m2[5][i] + m2[7][i];
    m1[6][i] = m2[4][i] - m2[6][i];
    m1[7][i] = m2[5][i] - m2[7][i];

    m2[0][i] = m1[0][i] + m1[1][i];
    m2[1][i] = m1[0][i] - m1[1][i];
    m2[2][i] = m1[2][i] + m1[3][i];
    m2[3][i] = m1[2][i] - m1[3][i];
    m2[4][i] = m1[4][i] + m1[5][i];
    m2[5][i] = m1[4][i] - m1[5][i];
    m2[6][i] = m1[6][i] + m1[7][i];
    m2[7][i] = m1[6][i] - m1[7][i];
  }

  // accumulate the magnitudes of all 128 transformed values
  for( i = 0; i < 8; i++ )
  {
    for( j = 0; j < 16; j++ )
    {
      sad += abs( m2[i][j] );
    }
  }

#if JVET_R0164_MEAN_SCALED_SATD
  // m2[0][0] is the sum of all 128 differences; count only a quarter of its magnitude
  sad -= abs(m2[0][0]);
  sad += abs(m2[0][0]) >> 2;
#endif
  // normalise by 2 / sqrt(16 * 8) in floating point, truncating the result to int
  sad  = ( int ) ( sad / sqrt( 16.0 * 8 ) * 2 );

  return sad;
}

// Hadamard-based SATD for a block that is 8 samples wide and 16 rows high.
//   piOrg, piCur           - top-left sample of the two blocks
//   iStrideOrg, iStrideCur - offset (in samples) from one row to the next
// Returns the sum of absolute transformed differences scaled by 2 / sqrt(16 * 8);
// the scaling is done in floating point and truncated to int.
Distortion RdCost::xCalcHADs8x16( const Pel *piOrg, const Pel *piCur, int iStrideOrg, int iStrideCur )
{
  int k, i, j, jj, sad = 0;
  int diff[128], m1[16][8], m2[16][8];
  // gather the sample differences row by row; diff[8*r + c] holds row r, column c
  for( k = 0; k < 128; k += 8 )
  {
    diff[k + 0] = piOrg[0] - piCur[0];
    diff[k + 1] = piOrg[1] - piCur[1];
    diff[k + 2] = piOrg[2] - piCur[2];
    diff[k + 3] = piOrg[3] - piCur[3];
    diff[k + 4] = piOrg[4] - piCur[4];
    diff[k + 5] = piOrg[5] - piCur[5];
    diff[k + 6] = piOrg[6] - piCur[6];
    diff[k + 7] = piOrg[7] - piCur[7];

    piCur += iStrideCur;
    piOrg += iStrideOrg;
  }

  //horizontal
  // three butterfly stages along each 8-sample row (element distances 4, 2, 1); result ends up in m2[j][*]
  for( j = 0; j < 16; j++ )
  {
    jj = j << 3;   // index of the first element of row j in diff[]

    m2[j][0] = diff[jj] + diff[jj + 4];
    m2[j][1] = diff[jj + 1] + diff[jj + 5];
    m2[j][2] = diff[jj + 2] + diff[jj + 6];
    m2[j][3] = diff[jj + 3] + diff[jj + 7];
    m2[j][4] = diff[jj] - diff[jj + 4];
    m2[j][5] = diff[jj + 1] - diff[jj + 5];
    m2[j][6] = diff[jj + 2] - diff[jj + 6];
    m2[j][7] = diff[jj + 3] - diff[jj + 7];

    m1[j][0] = m2[j][0] + m2[j][2];
    m1[j][1] = m2[j][1] + m2[j][3];
    m1[j][2] = m2[j][0] - m2[j][2];
    m1[j][3] = m2[j][1] - m2[j][3];
    m1[j][4] = m2[j][4] + m2[j][6];
    m1[j][5] = m2[j][5] + m2[j][7];
    m1[j][6] = m2[j][4] - m2[j][6];
    m1[j][7] = m2[j][5] - m2[j][7];

    m2[j][0] = m1[j][0] + m1[j][1];
    m2[j][1] = m1[j][0] - m1[j][1];
    m2[j][2] = m1[j][2] + m1[j][3];
    m2[j][3] = m1[j][2] - m1[j][3];
    m2[j][4] = m1[j][4] + m1[j][5];
    m2[j][5] = m1[j][4] - m1[j][5];
    m2[j][6] = m1[j][6] + m1[j][7];
    m2[j][7] = m1[j][6] - m1[j][7];
  }

  //vertical
  // four butterfly stages down each of the 8 columns (row distances 8, 4, 2, 1); result ends up in m2[*][i]
  for( i = 0; i < 8; i++ )
  {
    m1[0][i] = m2[0][i] + m2[8][i];
    m1[1][i] = m2[1][i] + m2[9][i];
    m1[2][i] = m2[2][i] + m2[10][i];
    m1[3][i] = m2[3][i] + m2[11][i];
    m1[4][i] = m2[4][i] + m2[12][i];
    m1[5][i] = m2[5][i] + m2[13][i];
    m1[6][i] = m2[6][i] + m2[14][i];
    m1[7][i] = m2[7][i] + m2[15][i];
    m1[8][i] = m2[0][i] - m2[8][i];
    m1[9][i] = m2[1][i] - m2[9][i];
    m1[10][i] = m2[2][i] - m2[10][i];
    m1[11][i] = m2[3][i] - m2[11][i];
    m1[12][i] = m2[4][i] - m2[12][i];
    m1[13][i] = m2[5][i] - m2[13][i];
    m1[14][i] = m2[6][i] - m2[14][i];
    m1[15][i] = m2[7][i] - m2[15][i];

    m2[0][i] = m1[0][i] + m1[4][i];
    m2[1][i] = m1[1][i] + m1[5][i];
    m2[2][i] = m1[2][i] + m1[6][i];
    m2[3][i] = m1[3][i] + m1[7][i];
    m2[4][i] = m1[0][i] - m1[4][i];
    m2[5][i] = m1[1][i] - m1[5][i];
    m2[6][i] = m1[2][i] - m1[6][i];
    m2[7][i] = m1[3][i] - m1[7][i];
    m2[8][i] = m1[8][i] + m1[12][i];
    m2[9][i] = m1[9][i] + m1[13][i];
    m2[10][i] = m1[10][i] + m1[14][i];
    m2[11][i] = m1[11][i] + m1[15][i];
    m2[12][i] = m1[8][i] - m1[12][i];
    m2[13][i] = m1[9][i] - m1[13][i];
    m2[14][i] = m1[10][i] - m1[14][i];
    m2[15][i] = m1[11][i] - m1[15][i];

    m1[0][i] = m2[0][i] + m2[2][i];
    m1[1][i] = m2[1][i] + m2[3][i];
    m1[2][i] = m2[0][i] - m2[2][i];
    m1[3][i] = m2[1][i] - m2[3][i];
    m1[4][i] = m2[4][i] + m2[6][i];
    m1[5][i] = m2[5][i] + m2[7][i];
    m1[6][i] = m2[4][i] - m2[6][i];
    m1[7][i] = m2[5][i] - m2[7][i];
    m1[8][i] = m2[8][i] + m2[10][i];
    m1[9][i] = m2[9][i] + m2[11][i];
    m1[10][i] = m2[8][i] - m2[10][i];
    m1[11][i] = m2[9][i] - m2[11][i];
    m1[12][i] = m2[12][i] + m2[14][i];
    m1[13][i] = m2[13][i] + m2[15][i];
    m1[14][i] = m2[12][i] - m2[14][i];
    m1[15][i] = m2[13][i] - m2[15][i];

    m2[0][i] = m1[0][i] + m1[1][i];
    m2[1][i] = m1[0][i] - m1[1][i];
    m2[2][i] = m1[2][i] + m1[3][i];
    m2[3][i] = m1[2][i] - m1[3][i];
    m2[4][i] = m1[4][i] + m1[5][i];
    m2[5][i] = m1[4][i] - m1[5][i];
    m2[6][i] = m1[6][i] + m1[7][i];
    m2[7][i] = m1[6][i] - m1[7][i];
    m2[8][i] = m1[8][i] + m1[9][i];
    m2[9][i] = m1[8][i] - m1[9][i];
    m2[10][i] = m1[10][i] + m1[11][i];
    m2[11][i] = m1[10][i] - m1[11][i];
    m2[12][i] = m1[12][i] + m1[13][i];
    m2[13][i] = m1[12][i] - m1[13][i];
    m2[14][i] = m1[14][i] + m1[15][i];
    m2[15][i] = m1[14][i] - m1[15][i];
  }

  // accumulate the magnitudes of all 128 transformed values
  for( i = 0; i < 16; i++ )
  {
    for( j = 0; j < 8; j++ )
    {
      sad += abs( m2[i][j] );
    }
  }

#if JVET_R0164_MEAN_SCALED_SATD
  // m2[0][0] is the sum of all 128 differences; count only a quarter of its magnitude
  sad -= abs(m2[0][0]);
  sad += abs(m2[0][0]) >> 2;
#endif
  // normalise by 2 / sqrt(16 * 8) in floating point, truncating the result to int
  sad  = ( int ) ( sad / sqrt( 16.0 * 8 ) * 2 );

  return sad;
}
/**
 * Sum of absolute Hadamard-transformed differences for one block that is
 * 4 samples wide and 8 samples high.
 *
 * \param piOrg       top-left sample of the original block
 * \param piCur       top-left sample of the block it is compared against
 * \param iStrideOrg  distance in samples between consecutive rows of piOrg
 * \param iStrideCur  distance in samples between consecutive rows of piCur
 * \return sum of the absolute transform coefficients, scaled by 2 / sqrt(4 * 8)
 */
Distortion RdCost::xCalcHADs4x8( const Pel *piOrg, const Pel *piCur, int iStrideOrg, int iStrideCur )
{
  int resi[8][4];
  int bufA[8][4];
  int bufB[8][4];

  // residual: one row of four samples per iteration
  for( int row = 0; row < 8; row++ )
  {
    for( int col = 0; col < 4; col++ )
    {
      resi[row][col] = piOrg[col] - piCur[col];
    }
    piOrg += iStrideOrg;
    piCur += iStrideCur;
  }

  // horizontal: two butterfly stages along every row, result lands in bufA
  for( int row = 0; row < 8; row++ )
  {
    const int *src = resi[row];
    int       *tmp = bufB[row];
    int       *dst = bufA[row];

    tmp[0] = src[0] + src[2];
    tmp[1] = src[1] + src[3];
    tmp[2] = src[0] - src[2];
    tmp[3] = src[1] - src[3];

    dst[0] = tmp[0] + tmp[1];
    dst[1] = tmp[0] - tmp[1];
    dst[2] = tmp[2] + tmp[3];
    dst[3] = tmp[2] - tmp[3];
  }

  // vertical: three butterfly stages down every column, ping-ponging bufA -> bufB -> bufA -> bufB
  for( int col = 0; col < 4; col++ )
  {
    // distance 4
    for( int r = 0; r < 4; r++ )
    {
      bufB[r    ][col] = bufA[r][col] + bufA[r + 4][col];
      bufB[r + 4][col] = bufA[r][col] - bufA[r + 4][col];
    }

    // distance 2, separately inside rows 0..3 and rows 4..7
    for( int base = 0; base < 8; base += 4 )
    {
      for( int r = base; r < base + 2; r++ )
      {
        bufA[r    ][col] = bufB[r][col] + bufB[r + 2][col];
        bufA[r + 2][col] = bufB[r][col] - bufB[r + 2][col];
      }
    }

    // distance 1
    for( int r = 0; r < 8; r += 2 )
    {
      bufB[r    ][col] = bufA[r][col] + bufA[r + 1][col];
      bufB[r + 1][col] = bufA[r][col] - bufA[r + 1][col];
    }
  }

  int satd = 0;
  for( int row = 0; row < 8; row++ )
  {
    for( int col = 0; col < 4; col++ )
    {
      satd += abs( bufB[row][col] );
    }
  }

#if JVET_R0164_MEAN_SCALED_SATD
  // coefficient [0][0] contributes only a quarter of its magnitude
  const int dcMagnitude = abs( bufB[0][0] );
  satd -= dcMagnitude;
  satd += dcMagnitude >> 2;
#endif
  satd = static_cast<int>( satd / sqrt( 4.0 * 8 ) * 2 );

  return satd;
}

/**
 * Sum of absolute Hadamard-transformed differences for one block that is
 * 8 samples wide and 4 samples high.
 *
 * \param piOrg       top-left sample of the original block
 * \param piCur       top-left sample of the block it is compared against
 * \param iStrideOrg  distance in samples between consecutive rows of piOrg
 * \param iStrideCur  distance in samples between consecutive rows of piCur
 * \return sum of the absolute transform coefficients, scaled by 2 / sqrt(4 * 8)
 */
Distortion RdCost::xCalcHADs8x4( const Pel *piOrg, const Pel *piCur, int iStrideOrg, int iStrideCur )
{
  int resi[4][8];
  int bufA[4][8];
  int bufB[4][8];

  // residual: one row of eight samples per iteration
  for( int row = 0; row < 4; row++ )
  {
    for( int col = 0; col < 8; col++ )
    {
      resi[row][col] = piOrg[col] - piCur[col];
    }
    piOrg += iStrideOrg;
    piCur += iStrideCur;
  }

  // horizontal: three butterfly stages along every row, result lands in bufB
  for( int row = 0; row < 4; row++ )
  {
    const int *src = resi[row];
    int       *a   = bufA[row];
    int       *b   = bufB[row];

    // distance 4
    for( int c = 0; c < 4; c++ )
    {
      b[c    ] = src[c] + src[c + 4];
      b[c + 4] = src[c] - src[c + 4];
    }

    // distance 2, separately inside columns 0..3 and columns 4..7
    for( int base = 0; base < 8; base += 4 )
    {
      for( int c = base; c < base + 2; c++ )
      {
        a[c    ] = b[c] + b[c + 2];
        a[c + 2] = b[c] - b[c + 2];
      }
    }

    // distance 1
    for( int c = 0; c < 8; c += 2 )
    {
      b[c    ] = a[c] + a[c + 1];
      b[c + 1] = a[c] - a[c + 1];
    }
  }

  // vertical: two butterfly stages down every column, bufB -> bufA -> bufB
  for( int col = 0; col < 8; col++ )
  {
    // distance 2
    for( int r = 0; r < 2; r++ )
    {
      bufA[r    ][col] = bufB[r][col] + bufB[r + 2][col];
      bufA[r + 2][col] = bufB[r][col] - bufB[r + 2][col];
    }

    // distance 1
    for( int r = 0; r < 4; r += 2 )
    {
      bufB[r    ][col] = bufA[r][col] + bufA[r + 1][col];
      bufB[r + 1][col] = bufA[r][col] - bufA[r + 1][col];
    }
  }

  int satd = 0;
  for( int row = 0; row < 4; row++ )
  {
    for( int col = 0; col < 8; col++ )
    {
      satd += abs( bufB[row][col] );
    }
  }

#if JVET_R0164_MEAN_SCALED_SATD
  // coefficient [0][0] contributes only a quarter of its magnitude
  const int dcMagnitude = abs( bufB[0][0] );
  satd -= dcMagnitude;
  satd += dcMagnitude >> 2;
#endif
  satd = static_cast<int>( satd / sqrt( 4.0 * 8 ) * 2 );

  return satd;
}

/**
 * Hadamard (SATD) distortion between rcDtParam.org and rcDtParam.cur.
 *
 * The block is tiled with the first kernel below whose size constraints match, tried in this order:
 * 16x8, 8x16, 8x4, 4x8 (rectangular blocks only), then 8x8, 4x4 and 2x2.
 * When rcDtParam.applyWeight is set the call is forwarded to RdCostWeightPrediction::xGetHADsw().
 *
 * Only the 8x8, 4x4 and 2x2 kernels receive rcDtParam.step; the rectangular kernels take no step argument.
 * NOTE(review): confirm callers never combine a step other than 1 with a rectangular block.
 *
 * \param rcDtParam  buffers, dimensions, step and bit depth of the comparison
 * \return accumulated kernel results, right-shifted by DISTORTION_PRECISION_ADJUSTMENT(bitDepth)
 */
Distortion RdCost::xGetHADs( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    return RdCostWeightPrediction::xGetHADsw( rcDtParam );
  }

  const Pel *orgRow    = rcDtParam.org.buf;
  const Pel *curRow    = rcDtParam.cur.buf;
  const int  height    = rcDtParam.org.height;
  const int  width     = rcDtParam.org.width;
  const int  strideOrg = rcDtParam.org.stride;
  const int  strideCur = rcDtParam.cur.stride;
  const int  step      = rcDtParam.step;

  const bool isWide = width > height;
  const bool isTall = width < height;

  Distortion total = 0;

  if( isWide && ( height & 7 ) == 0 && ( width & 15 ) == 0 )
  {
    // wide block tiled with 16x8 kernels
    for( int y = 0; y < height; y += 8 )
    {
      for( int x = 0; x < width; x += 16 )
      {
        total += xCalcHADs16x8( orgRow + x, curRow + x, strideOrg, strideCur );
      }
      orgRow += strideOrg * 8;
      curRow += strideCur * 8;
    }
  }
  else if( isTall && ( width & 7 ) == 0 && ( height & 15 ) == 0 )
  {
    // tall block tiled with 8x16 kernels
    for( int y = 0; y < height; y += 16 )
    {
      for( int x = 0; x < width; x += 8 )
      {
        total += xCalcHADs8x16( orgRow + x, curRow + x, strideOrg, strideCur );
      }
      orgRow += strideOrg * 16;
      curRow += strideCur * 16;
    }
  }
  else if( isWide && ( height & 3 ) == 0 && ( width & 7 ) == 0 )
  {
    // wide block tiled with 8x4 kernels
    for( int y = 0; y < height; y += 4 )
    {
      for( int x = 0; x < width; x += 8 )
      {
        total += xCalcHADs8x4( orgRow + x, curRow + x, strideOrg, strideCur );
      }
      orgRow += strideOrg * 4;
      curRow += strideCur * 4;
    }
  }
  else if( isTall && ( width & 3 ) == 0 && ( height & 7 ) == 0 )
  {
    // tall block tiled with 4x8 kernels
    for( int y = 0; y < height; y += 8 )
    {
      for( int x = 0; x < width; x += 4 )
      {
        total += xCalcHADs4x8( orgRow + x, curRow + x, strideOrg, strideCur );
      }
      orgRow += strideOrg * 8;
      curRow += strideCur * 8;
    }
  }
  else if( ( height % 8 == 0 ) && ( width % 8 == 0 ) )
  {
    const int rowAdvanceOrg = strideOrg << 3;
    const int rowAdvanceCur = strideCur << 3;
    for( int y = 0; y < height; y += 8 )
    {
      for( int x = 0; x < width; x += 8 )
      {
        total += xCalcHADs8x8( orgRow + x, curRow + x * step, strideOrg, strideCur, step );
      }
      orgRow += rowAdvanceOrg;
      curRow += rowAdvanceCur;
    }
  }
  else if( ( height % 4 == 0 ) && ( width % 4 == 0 ) )
  {
    const int rowAdvanceOrg = strideOrg << 2;
    const int rowAdvanceCur = strideCur << 2;
    for( int y = 0; y < height; y += 4 )
    {
      for( int x = 0; x < width; x += 4 )
      {
        total += xCalcHADs4x4( orgRow + x, curRow + x * step, strideOrg, strideCur, step );
      }
      orgRow += rowAdvanceOrg;
      curRow += rowAdvanceCur;
    }
  }
  else if( ( height % 2 == 0 ) && ( width % 2 == 0 ) )
  {
    const int rowAdvanceOrg = strideOrg << 1;
    const int rowAdvanceCur = strideCur << 1;
    for( int y = 0; y < height; y += 2 )
    {
      for( int x = 0; x < width; x += 2 )
      {
        total += xCalcHADs2x2( orgRow + x, curRow + x * step, strideOrg, strideCur, step );
      }
      orgRow += rowAdvanceOrg;
      curRow += rowAdvanceCur;
    }
  }
  else
  {
    THROW( "Invalid size" );
  }

  return ( total >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) );
}


#if WCG_EXT
// Definitions of the static RdCost members used by the luma-level weighting code in this section.

// Signal type; compared against RESHAPE_SIGNAL_PQ / RESHAPE_SIGNAL_SDR / RESHAPE_SIGNAL_HLG below.
uint32_t   RdCost::m_signalType                 = RESHAPE_SIGNAL_NULL;
// Weight getWeightedMSE() uses for non-luma components when the signal type is SDR or HLG.
double     RdCost::m_chromaWeight               = 1.0;
// The reshape weight tables are sized and iterated as (1 << m_lumaBD) entries.
int        RdCost::m_lumaBD                     = 10;
// Per-luma-level weight table read by getWeightedMSE().
std::vector<double> RdCost::m_reshapeLumaLevelToWeightPLUT;
// Weight table that restoreReshapeLumaLevelToWeightTable() copies back into m_reshapeLumaLevelToWeightPLUT.
std::vector<double> RdCost::m_lumaLevelToWeightPLUT;

// Copy the current distortion scale and lambda into their "unadjusted" counterparts.
void RdCost::saveUnadjustedLambda()
{
  m_DistScaleUnadjusted = m_DistScale;
  m_dLambda_unadjusted  = m_dLambda;
}

/**
 * Fill the first LUMA_LEVEL_TO_DQP_LUT_MAXSIZE entries of m_lumaLevelToWeightPLUT.
 *
 * For luma level x the delta QP is 0.015 * x - 1.5 - 6, clamped to [-3, 6], and the stored weight is
 * 2^(deltaQP / 3).
 *
 * The table is a std::vector indexed with operator[], which performs no bounds check, so it is grown
 * to the required size first when it holds fewer entries. Existing entries beyond that size are kept.
 */
void RdCost::initLumaLevelToWeightTable()
{
  const size_t requiredSize = static_cast<size_t>(LUMA_LEVEL_TO_DQP_LUT_MAXSIZE);
  if (m_lumaLevelToWeightPLUT.size() < requiredSize)
  {
    m_lumaLevelToWeightPLUT.resize(requiredSize, 1.0);
  }

  for (int i = 0; i < LUMA_LEVEL_TO_DQP_LUT_MAXSIZE; i++) {
    double x = i;
    double y;

/*
    //always false
    if (isSDR)  // set SDR weight table
    {
      y = 0.03*x - 3.0;        // this is the Equation used to derive the luma qp LUT for SDR in ST-2084
      y = y<0 ? 0 : (y>12 ? 12 : y);

    }
    else
*/
    { // set HDR weight table
      y = 0.015*x - 1.5 - 6;   // this is the Equation used to derive the luma qp LUT for HDR in MPEG HDR anchor3.2 (JCTCX-X1020)
      y = y<-3 ? -3 : (y>6 ? 6 : y);
    }

    m_lumaLevelToWeightPLUT[i] = pow(2.0, y / 3.0);      // or power(10, dQp/10)      they are almost equal
  }
}

/**
 * Make sure both weight tables hold at least (1 << m_lumaBD) entries and, for PQ signals, fill them.
 *
 * New entries are initialised to 1.0. For RESHAPE_SIGNAL_PQ every level i is first rescaled to a
 * 10-bit level x (shifted left for bit depths below 10, right for bit depths above 10); the weight is
 * 2^(y / 3) with y = 0.015 * x - 1.5 - 6 clamped to [-3, 6], and is written to both tables.
 *
 * The tables are grown whenever they are too short, not only when they are empty, so the operator[]
 * writes in the fill loop stay in bounds even if a table was sized earlier for fewer entries.
 */
void RdCost::initLumaLevelToWeightTableReshape()
{
  const int    lutSize      = 1 << m_lumaBD;
  const size_t requiredSize = static_cast<size_t>(lutSize);

  if (m_reshapeLumaLevelToWeightPLUT.size() < requiredSize)
  {
    m_reshapeLumaLevelToWeightPLUT.resize(requiredSize, 1.0);
  }
  if (m_lumaLevelToWeightPLUT.size() < requiredSize)
  {
    m_lumaLevelToWeightPLUT.resize(requiredSize, 1.0);
  }

  if (m_signalType == RESHAPE_SIGNAL_PQ)
  {
    for (int i = 0; i < lutSize; i++)
    {
      // bring the level to a 10-bit scale before applying the equation
      double x = m_lumaBD < 10 ? i << (10 - m_lumaBD) : m_lumaBD > 10 ? i >> (m_lumaBD - 10) : i;
      double y;
      y = 0.015*x - 1.5 - 6;
      y = y < -3 ? -3 : (y > 6 ? 6 : y);
      m_reshapeLumaLevelToWeightPLUT[i] = pow(2.0, y / 3.0);
      m_lumaLevelToWeightPLUT[i] = m_reshapeLumaLevelToWeightPLUT[i];
    }
  }
}

void RdCost::updateReshapeLumaLevelToWeightTableChromaMD(std::vector<Pel>& ILUT)
{
  for (int i = 0; i < (1 << m_lumaBD); i++)
  {
    m_reshapeLumaLevelToWeightPLUT[i] = m_lumaLevelToWeightPLUT[ILUT[i]];
  }
}

/**
 * Copy the first (1 << m_lumaBD) entries of m_lumaLevelToWeightPLUT back into
 * m_reshapeLumaLevelToWeightPLUT. Both accesses are bounds-checked via at().
 */
void RdCost::restoreReshapeLumaLevelToWeightTable()
{
  const int numLevels = 1 << m_lumaBD;
  int level = 0;
  while (level < numLevels)
  {
    m_reshapeLumaLevelToWeightPLUT.at(level) = m_lumaLevelToWeightPLUT.at(level);
    level++;
  }
}

/**
 * Set m_reshapeLumaLevelToWeightPLUT bin by bin from the slice reshaper model, and store the chroma weight.
 *
 * The (1 << m_lumaBD) levels are split into PIC_CODE_CW_BINS bins of equal length. Each bin gets weight 1.0
 * when it lies outside [reshaperModelMinBinIdx, reshaperModelMaxBinIdx] or when its code-word delta is 1 or
 * minus the bin length; otherwise it gets (wtTable[bin] / binLength) squared.
 *
 * Throws when the signal type is neither SDR nor HLG, or when the slice carries no reshaper model.
 *
 * \param sliceReshape  slice reshaper model (bin range and per-bin code-word deltas)
 * \param wtTable       per-bin value used to derive the weight; indexed by bin
 * \param cwt           value stored into m_chromaWeight
 */
void RdCost::updateReshapeLumaLevelToWeightTable(SliceReshapeInfo &sliceReshape, Pel *wtTable, double cwt)
{
  if (!(m_signalType == RESHAPE_SIGNAL_SDR || m_signalType == RESHAPE_SIGNAL_HLG))
  {
    THROW("updateReshapeLumaLevelToWeightTable not support other signal types!!");
  }
  if (!sliceReshape.getSliceReshapeModelPresentFlag())
  {
    THROW("updateReshapeLumaLevelToWeightTable ERROR!!");
  }

  const int binLength = (1 << m_lumaBD) / PIC_CODE_CW_BINS;

  for (int bin = 0; bin < PIC_CODE_CW_BINS; bin++)
  {
    double binWeight = 1.0;

    const bool insideModelRange = (bin >= sliceReshape.reshaperModelMinBinIdx) && (bin <= sliceReshape.reshaperModelMaxBinIdx);
    if (insideModelRange)
    {
      const bool keepUnitWeight = sliceReshape.reshaperModelBinCWDelta[bin] == 1
                               || sliceReshape.reshaperModelBinCWDelta[bin] == -binLength;
      if (!keepUnitWeight)
      {
        const double ratio = (double)wtTable[bin] / (double)binLength;
        binWeight = ratio * ratio;
      }
    }

    // every level of this bin shares the same weight
    const int firstLevel = bin * binLength;
    for (int level = firstLevel; level < firstLevel + binLength; level++)
    {
      m_reshapeLumaLevelToWeightPLUT[level] = binWeight;
    }
  }

  m_chromaWeight = cwt;
}

/**
 * Weighted squared error of a single sample pair.
 *
 * The weight is m_chromaWeight for non-luma components when the signal type is SDR or HLG, and
 * m_reshapeLumaLevelToWeightPLUT[orgLuma] in every other case. It is converted to 16.16 fixed point,
 * multiplied with the squared difference, rounded, and finally right-shifted by uiShift.
 *
 * \param compIdx  component index; COMPONENT_Y selects the luma path
 * \param org      original sample (checked to be non-negative)
 * \param cur      sample compared against org
 * \param uiShift  right shift applied to the weighted squared error
 * \param orgLuma  luma sample used as table index (checked to equal org for luma)
 * \return weighted squared error after the shift
 */
Distortion RdCost::getWeightedMSE(int compIdx, const Pel org, const Pel cur, const uint32_t uiShift, const Pel orgLuma)
{
  const Intermediate_Int diff = org - cur;
  CHECK( org<0, "");

  if (compIdx == COMPONENT_Y)
  {
     CHECK(org!=orgLuma, "");
  }

  // chroma of SDR/HLG content uses one global weight; everything else is looked up by luma level
  const bool sdrOrHlg        = (m_signalType == RESHAPE_SIGNAL_SDR || m_signalType == RESHAPE_SIGNAL_HLG);
  const bool useChromaWeight = sdrOrHlg && compIdx != COMPONENT_Y;
  const double weight        = useChromaWeight ? m_chromaWeight : m_reshapeLumaLevelToWeightPLUT[orgLuma];

  const int64_t          fixedPTweight = (int64_t)(weight * (double)(1 << 16));
  const Intermediate_Int weightedSqErr = Intermediate_Int((fixedPTweight*(diff*diff) + (1 << 15)) >> 16);
  return Distortion( weightedSqErr >> uiShift);
}

/**
 * Weighted SSE over a block of arbitrary width: sums getWeightedMSE() for every sample pair.
 *
 * The luma sample for column n is read at index (n << cShiftX) of the current luma row, and the luma
 * row pointer advances by (orgLuma.stride << cShiftY) per block row.
 * When rcDtParam.applyWeight is set the call is forwarded to RdCostWeightPrediction::xGetSSEw().
 */
Distortion RdCost::xGetSSE_WTD( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    return RdCostWeightPrediction::xGetSSEw( rcDtParam );  // ignore it for now
  }

  const Pel*   orgRow     = rcDtParam.org.buf;
  const Pel*   curRow     = rcDtParam.cur.buf;
  const Pel*   lumaRow    = rcDtParam.orgLuma.buf;
  const int    width      = rcDtParam.org.width;
  const int    strideOrg  = rcDtParam.org.stride;
  const int    strideCur  = rcDtParam.cur.stride;
  const int    strideLuma = rcDtParam.orgLuma.stride;
  const size_t shiftX     = rcDtParam.cShiftX;
  const size_t shiftY     = rcDtParam.cShiftY;
  const uint32_t precShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  Distortion total = 0;
  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft-- )
  {
    for( int col = 0; col < width; col++ )
    {
      total += getWeightedMSE( rcDtParam.compID, orgRow[col], curRow[col], precShift, lumaRow[col << shiftX] );
    }
    orgRow  += strideOrg;
    curRow  += strideCur;
    lumaRow += strideLuma << shiftY;
  }
  return ( total );
}

/**
 * Weighted SSE that processes exactly 2 columns per row.
 *
 * The luma sample for column c is read at index (c << cShiftX) of the current luma row, and the luma
 * row pointer advances by (orgLuma.stride << cShiftY) per block row.
 * When rcDtParam.applyWeight is set the width is checked to be 2 and the call is forwarded to
 * RdCostWeightPrediction::xGetSSEw().
 */
Distortion RdCost::xGetSSE2_WTD( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    CHECK( rcDtParam.org.width != 2, "" );
    return RdCostWeightPrediction::xGetSSEw( rcDtParam ); // ignore it for now
  }

  const Pel*   orgRow     = rcDtParam.org.buf;
  const Pel*   curRow     = rcDtParam.cur.buf;
  const Pel*   lumaRow    = rcDtParam.orgLuma.buf;
  const int    strideOrg  = rcDtParam.org.stride;
  const int    strideCur  = rcDtParam.cur.stride;
  const size_t strideLuma = rcDtParam.orgLuma.stride;
  const size_t shiftX     = rcDtParam.cShiftX;
  const size_t shiftY     = rcDtParam.cShiftY;
  const uint32_t precShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  Distortion total = 0;
  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft-- )
  {
    for( size_t col = 0; col < 2; col++ )
    {
      total += getWeightedMSE( rcDtParam.compID, orgRow[col], curRow[col], precShift, lumaRow[col << shiftX] );
    }
    orgRow  += strideOrg;
    curRow  += strideCur;
    lumaRow += strideLuma << shiftY;
  }
  return ( total );
}

/**
 * Weighted SSE that processes exactly 4 columns per row.
 *
 * The luma sample for column c is read at index (c << cShiftX) of the current luma row, and the luma
 * row pointer advances by (orgLuma.stride << cShiftY) per block row.
 * When rcDtParam.applyWeight is set the width is checked to be 4 and the call is forwarded to
 * RdCostWeightPrediction::xGetSSEw().
 */
Distortion RdCost::xGetSSE4_WTD( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    CHECK( rcDtParam.org.width != 4, "" );
    return RdCostWeightPrediction::xGetSSEw( rcDtParam ); // ignore it for now
  }

  const Pel*   orgRow     = rcDtParam.org.buf;
  const Pel*   curRow     = rcDtParam.cur.buf;
  const Pel*   lumaRow    = rcDtParam.orgLuma.buf;
  const int    strideOrg  = rcDtParam.org.stride;
  const int    strideCur  = rcDtParam.cur.stride;
  const size_t strideLuma = rcDtParam.orgLuma.stride;
  const size_t shiftX     = rcDtParam.cShiftX;
  const size_t shiftY     = rcDtParam.cShiftY;
  const uint32_t precShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  Distortion total = 0;
  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft-- )
  {
    for( size_t col = 0; col < 4; col++ )
    {
      total += getWeightedMSE( rcDtParam.compID, orgRow[col], curRow[col], precShift, lumaRow[col << shiftX] );
    }
    orgRow  += strideOrg;
    curRow  += strideCur;
    lumaRow += strideLuma << shiftY;
  }
  return ( total );
}

/**
 * Weighted SSE that processes exactly 8 columns per row.
 *
 * The luma sample for column c is read at index (c << cShiftX) of the current luma row, and the luma
 * row pointer advances by (orgLuma.stride << cShiftY) per block row.
 * When rcDtParam.applyWeight is set the width is checked to be 8 and the call is forwarded to
 * RdCostWeightPrediction::xGetSSEw().
 */
Distortion RdCost::xGetSSE8_WTD( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    CHECK( rcDtParam.org.width != 8, "" );
    return RdCostWeightPrediction::xGetSSEw( rcDtParam );
  }

  const Pel*   orgRow     = rcDtParam.org.buf;
  const Pel*   curRow     = rcDtParam.cur.buf;
  const Pel*   lumaRow    = rcDtParam.orgLuma.buf;
  const int    strideOrg  = rcDtParam.org.stride;
  const int    strideCur  = rcDtParam.cur.stride;
  const size_t strideLuma = rcDtParam.orgLuma.stride;
  const size_t shiftX     = rcDtParam.cShiftX;
  const size_t shiftY     = rcDtParam.cShiftY;
  const uint32_t precShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  Distortion total = 0;
  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft-- )
  {
    for( size_t col = 0; col < 8; col++ )
    {
      total += getWeightedMSE( rcDtParam.compID, orgRow[col], curRow[col], precShift, lumaRow[col << shiftX] );
    }
    orgRow  += strideOrg;
    curRow  += strideCur;
    lumaRow += strideLuma << shiftY;
  }
  return ( total );
}

/**
 * Weighted SSE that processes exactly 16 columns per row.
 *
 * The luma sample for column c is read at index (c << cShiftX) of the current luma row, and the luma
 * row pointer advances by (orgLuma.stride << cShiftY) per block row.
 * When rcDtParam.applyWeight is set the width is checked to be 16 and the call is forwarded to
 * RdCostWeightPrediction::xGetSSEw().
 */
Distortion RdCost::xGetSSE16_WTD( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    CHECK( rcDtParam.org.width != 16, "" );
    return RdCostWeightPrediction::xGetSSEw( rcDtParam );
  }

  const Pel*   orgRow     = rcDtParam.org.buf;
  const Pel*   curRow     = rcDtParam.cur.buf;
  const Pel*   lumaRow    = rcDtParam.orgLuma.buf;
  const int    strideOrg  = rcDtParam.org.stride;
  const int    strideCur  = rcDtParam.cur.stride;
  const size_t strideLuma = rcDtParam.orgLuma.stride;
  const size_t shiftX     = rcDtParam.cShiftX;
  const size_t shiftY     = rcDtParam.cShiftY;
  const uint32_t precShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  Distortion total = 0;
  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft-- )
  {
    for( size_t col = 0; col < 16; col++ )
    {
      total += getWeightedMSE( rcDtParam.compID, orgRow[col], curRow[col], precShift, lumaRow[col << shiftX] );
    }
    orgRow  += strideOrg;
    curRow  += strideCur;
    lumaRow += strideLuma << shiftY;
  }
  return ( total );
}

/**
 * Weighted SSE for blocks processed in groups of 16 columns.
 *
 * Every started group is processed in full, so all 16 columns of the last group are read even when
 * the width is not a multiple of 16.
 * The luma sample for column x is read at index (x << cShiftX) of the current luma row, and the luma
 * row pointer advances by (orgLuma.stride << cShiftY) per block row.
 * When rcDtParam.applyWeight is set the call is forwarded to RdCostWeightPrediction::xGetSSEw().
 */
Distortion RdCost::xGetSSE16N_WTD( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    return RdCostWeightPrediction::xGetSSEw( rcDtParam );
  }

  const Pel*   orgRow     = rcDtParam.org.buf;
  const Pel*   curRow     = rcDtParam.cur.buf;
  const Pel*   lumaRow    = rcDtParam.orgLuma.buf;
  const int    width      = rcDtParam.org.width;
  const int    strideOrg  = rcDtParam.org.stride;
  const int    strideCur  = rcDtParam.cur.stride;
  const size_t strideLuma = rcDtParam.orgLuma.stride;
  const size_t shiftX     = rcDtParam.cShiftX;
  const size_t shiftY     = rcDtParam.cShiftY;
  const uint32_t precShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  Distortion total = 0;
  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft-- )
  {
    for( int groupStart = 0; groupStart < width; groupStart += 16 )
    {
      for( int offset = 0; offset < 16; offset++ )
      {
        const int x = groupStart + offset;
        total += getWeightedMSE( rcDtParam.compID, orgRow[x], curRow[x], precShift, lumaRow[size_t(x) << shiftX] );
      }
    }
    orgRow  += strideOrg;
    curRow  += strideCur;
    lumaRow += strideLuma << shiftY;
  }
  return ( total );
}

/**
 * Weighted SSE that processes exactly 32 columns per row.
 *
 * The luma sample for column c is read at index (c << cShiftX) of the current luma row, and the luma
 * row pointer advances by (orgLuma.stride << cShiftY) per block row.
 * When rcDtParam.applyWeight is set the width is checked to be 32 and the call is forwarded to
 * RdCostWeightPrediction::xGetSSEw().
 */
Distortion RdCost::xGetSSE32_WTD( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    CHECK( rcDtParam.org.width != 32, "" );
    return RdCostWeightPrediction::xGetSSEw( rcDtParam );
  }

  const Pel*   orgRow     = rcDtParam.org.buf;
  const Pel*   curRow     = rcDtParam.cur.buf;
  const Pel*   lumaRow    = rcDtParam.orgLuma.buf;
  const int    strideOrg  = rcDtParam.org.stride;
  const int    strideCur  = rcDtParam.cur.stride;
  const size_t strideLuma = rcDtParam.orgLuma.stride;
  const size_t shiftX     = rcDtParam.cShiftX;
  const size_t shiftY     = rcDtParam.cShiftY;
  const uint32_t precShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1;

  Distortion total = 0;
  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft-- )
  {
    for( size_t col = 0; col < 32; col++ )
    {
      total += getWeightedMSE( rcDtParam.compID, orgRow[col], curRow[col], precShift, lumaRow[col << shiftX] );
    }
    orgRow  += strideOrg;
    curRow  += strideCur;
    lumaRow += strideLuma << shiftY;
  }
  return ( total );
}

/** Weighted SSE for blocks that are 64 samples wide.
 *
 *  When rcDtParam.applyWeight is set the work is handed to
 *  RdCostWeightPrediction::xGetSSEw(). Otherwise every sample of each row
 *  (columns 0..63) is passed to getWeightedMSE() together with a sample taken
 *  from the orgLuma buffer, and the per-sample results are accumulated.
 *
 *  \param rcDtParam  distortion parameters; reads org, cur, orgLuma, cShiftX,
 *                    cShiftY, bitDepth, compID and applyWeight
 *  \return accumulated distortion over org.height rows of 64 samples
 */
Distortion RdCost::xGetSSE64_WTD( const DistParam &rcDtParam )
{
  if( rcDtParam.applyWeight )
  {
    CHECK( rcDtParam.org.width != 64, "" );
    return RdCostWeightPrediction::xGetSSEw( rcDtParam );
  }

  // Number of samples processed per row; this routine is specific to width 64.
  const size_t rowWidth = 64;

  const Pel*   orgRow        = rcDtParam.org.buf;
  const Pel*   curRow        = rcDtParam.cur.buf;
  const Pel*   lumaRow       = rcDtParam.orgLuma.buf;
  const int    orgStride     = rcDtParam.org.stride;
  const int    curStride     = rcDtParam.cur.stride;
  const size_t lumaStride    = rcDtParam.orgLuma.stride;
  const size_t lumaShiftX    = rcDtParam.cShiftX;
  const size_t lumaShiftY    = rcDtParam.cShiftY;

  // Twice the per-sample precision adjustment, forwarded to getWeightedMSE().
  const uint32_t sseShift = DISTORTION_PRECISION_ADJUSTMENT((rcDtParam.bitDepth)) << 1;

  Distortion total = 0;
  for( int rowsLeft = rcDtParam.org.height; rowsLeft != 0; rowsLeft-- )
  {
    for( size_t x = 0; x < rowWidth; x++ )
    {
      // The luma sample is addressed at (x << cShiftX) within the current luma row
      // (presumably to map a subsampled position onto the luma grid).
      total += getWeightedMSE( rcDtParam.compID, orgRow[x], curRow[x], sseShift, lumaRow[x << lumaShiftX] );
    }

    orgRow  += orgStride;
    curRow  += curStride;
    // The luma pointer advances by (stride << cShiftY) per processed row.
    lumaRow += lumaStride << lumaShiftY;
  }

  return total;
}
#endif


// Scratch buffer of MAX_CU_SIZE * MAX_CU_SIZE samples. RdCost::xGetMRHADs()
// wraps it in a PelBuf and fills it with an offset-adjusted copy of the
// original block before calling the HAD distortion function.
Pel orgCopy[MAX_CU_SIZE * MAX_CU_SIZE];

// When built with OpenMP the buffer is declared threadprivate, so each thread
// works on its own instance.
// NOTE(review): without OpenMP this is a single shared global; concurrent use
// from several threads would not be safe - confirm against the threading model.
#if _OPENMP
#pragma omp threadprivate(orgCopy)
#endif

/** HAD distortion evaluated after compensating the original block by an offset.
 *
 *  The offset is obtained from org.meanDiff( cur ) (presumably the mean sample
 *  difference between the two blocks - see meanDiff). The original block is
 *  copied into the orgCopy scratch buffer, the offset is subtracted from the
 *  copy, and the registered DF_HAD function is run on a DistParam whose org
 *  member refers to that adjusted copy. rcDtParam itself is not modified.
 *
 *  \param rcDtParam  distortion parameters describing the original and candidate blocks
 *  \return value returned by the DF_HAD distortion function
 */
Distortion RdCost::xGetMRHADs( const DistParam &rcDtParam )
{
  const Pel meanOffset = rcDtParam.org.meanDiff( rcDtParam.cur );

  // View onto the scratch buffer with the dimensions of the original block.
  PelBuf adjustedOrg( orgCopy, rcDtParam.org );
  adjustedOrg.copyFrom( rcDtParam.org );
  adjustedOrg.subtract( meanOffset );

  // Same parameters as the caller's, but pointing at the adjusted original.
  DistParam hadParam = rcDtParam;
  hadParam.org       = adjustedOrg;

  return m_afpDistortFunc[DF_HAD]( hadParam );
}
/** Fill a DistParam for SAD evaluation with a per-sample mask.
 *
 *  \param rcDP          parameter set to fill in
 *  \param org           original block; copied into rcDP.org, and its width/height
 *                       are also used for rcDP.cur
 *  \param piRefY        pointer to the candidate samples (rcDP.cur.buf)
 *  \param iRefStride    stride of the candidate samples (rcDP.cur.stride)
 *  \param mask          pointer to the mask values
 *  \param iMaskStride   stored as rcDP.maskStride
 *  \param stepX         stored as rcDP.stepX (per-sample mask pointer step)
 *  \param iMaskStride2  stored as rcDP.maskStride2
 *  \param bitDepth      stored as rcDP.bitDepth
 *  \param compID        stored as rcDP.compID
 *
 *  The early-exit threshold is set to the largest Distortion value and the
 *  distortion function is set to the DF_SAD_WITH_MASK entry.
 */
void RdCost::setDistParam( DistParam &rcDP, const CPelBuf &org, const Pel* piRefY, int iRefStride, const Pel* mask, int iMaskStride, int stepX, int iMaskStride2, int bitDepth, ComponentID compID)
{
  // original block
  rcDP.org = org;

  // candidate block: caller-supplied samples, dimensions taken from the original
  rcDP.cur.buf    = piRefY;
  rcDP.cur.stride = iRefStride;
  rcDP.cur.width  = org.width;
  rcDP.cur.height = org.height;

  // mask addressing
  rcDP.mask        = mask;
  rcDP.stepX       = stepX;
  rcDP.maskStride  = iMaskStride;
  rcDP.maskStride2 = iMaskStride2;

  // sample format
  rcDP.bitDepth = bitDepth;
  rcDP.compID   = compID;

  // largest possible threshold for the early-exit limit
  rcDP.maximumDistortionForEarlyExit = std::numeric_limits<Distortion>::max();

  // cost function for motion estimation with mask
  rcDP.distFunc = m_afpDistortFunc[ DF_SAD_WITH_MASK ];
}

/** Sum of absolute differences where each term is multiplied by a mask value.
 *
 *  When rcDtParam.applyWeight is set the work is handed to
 *  RdCostWeightPrediction::xGetSADw(). Otherwise every (1 << subShift)-th row
 *  is visited; for each sample |org - cur| is multiplied by the current mask
 *  value and accumulated. The mask pointer moves by stepX per sample and by
 *  maskStride * (1 << subShift) + maskStride2 at the end of each visited row.
 *  The total is scaled up by subShift and then shifted down by the
 *  bit-depth-dependent precision adjustment.
 *
 *  \param rcDtParam  distortion parameters; reads org, cur, mask, maskStride,
 *                    maskStride2, stepX, subShift, bitDepth and applyWeight
 *  \return masked SAD after the shifts described above
 */
Distortion RdCost::xGetSADwMask( const DistParam& rcDtParam )
{
  if ( rcDtParam.applyWeight )
  {
    return RdCostWeightPrediction::xGetSADw( rcDtParam );
  }

  const int      subShift        = rcDtParam.subShift;
  const int      rowStep         = ( 1 << subShift);
  const int      width           = rcDtParam.org.width;
  const int      orgAdvance      = rcDtParam.org.stride * rowStep;
  const int      curAdvance      = rcDtParam.cur.stride * rowStep;
  const int      maskAdvance     = rcDtParam.maskStride * rowStep;
  const int      maskAdvance2    = rcDtParam.maskStride2;
  const int      maskStepX       = rcDtParam.stepX;
  const uint32_t precisionShift  = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);

  const Pel* orgRow  = rcDtParam.org.buf;
  const Pel* curRow  = rcDtParam.cur.buf;
  const Pel* maskPtr = rcDtParam.mask;

  Distortion acc = 0;

  // The loop ends only when the counter reaches exactly zero, so the block
  // height has to be a multiple of rowStep.
  int rowsLeft = rcDtParam.org.height;
  while (rowsLeft != 0)
  {
    for (int x = 0; x < width; x++)
    {
      acc += abs(orgRow[x] - curRow[x]) * *maskPtr;
      maskPtr += maskStepX;
    }

    orgRow  += orgAdvance;
    curRow  += curAdvance;
    // Two separate end-of-row adjustments are applied to the mask pointer.
    maskPtr += maskAdvance;
    maskPtr += maskAdvance2;

    rowsLeft -= rowStep;
  }

  // Scale up for the skipped rows, then apply the precision adjustment.
  return ( (acc << subShift) >> precisionShift );
}
//! \}