/* The copyright in this software is being made available under the BSD
 * License, included below. This software may be subject to other third party
 * and contributor rights, including patent rights, and no such rights are
 * granted under this license.
 *
 * Copyright (c) 2010-2022, ITU/ISO/IEC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *  * Neither the name of the ITU/ISO/IEC nor the names of its contributors may
 *    be used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

/** \file     Prediction.cpp
    \brief    prediction class
*/

#include "InterPrediction.h"

#include "Buffer.h"
#include "UnitTools.h"
#include "MCTS.h"

#include <memory.h>
#include <algorithm>

#if INTER_LIC || (TM_AMVP || TM_MRG) || JVET_W0090_ARMC_TM
#include "Reshape.h"
#endif

#if ENABLE_SIMD_TMP
#include "CommonDefX86.h"
#endif

//! \ingroup CommonLib
//! \{

// ====================================================================================================================
// Constructor / destructor / initialize
// ====================================================================================================================

InterPrediction::InterPrediction()
:
#if INTER_LIC
  m_storeBeforeLIC  (false),
#endif
#if INTER_LIC || (TM_AMVP || TM_MRG) // note: already refactor
  m_pcReshape            ( nullptr ),
#endif
#if INTER_LIC
  m_pcLICRefLeftTemplate ( nullptr ),
  m_pcLICRefAboveTemplate( nullptr ),
  m_pcLICRecLeftTemplate ( nullptr ),
  m_pcLICRecAboveTemplate( nullptr ),
#endif
#if TM_AMVP || TM_MRG
  m_pcCurTplLeft ( nullptr ),
  m_pcCurTplAbove( nullptr ),
  m_pcRefTplLeft ( nullptr ),
  m_pcRefTplAbove( nullptr ),
#endif
  m_currChromaFormat( NUM_CHROMA_FORMAT )
, m_maxCompIDToPred ( MAX_NUM_COMPONENT )
, m_pcRdCost        ( nullptr )
, m_storedMv        ( nullptr )
, m_skipPROF (false)
, m_encOnly  (false)
, m_isBi     (false)
, m_gradX0(nullptr)
, m_gradY0(nullptr)
, m_gradX1(nullptr)
, m_gradY1(nullptr)
#if MULTI_PASS_DMVR || SAMPLE_BASED_BDOF
, m_absGx(nullptr)
, m_absGy(nullptr)
, m_dIx(nullptr)
, m_dIy(nullptr)
, m_dI(nullptr)
, m_signGxGy(nullptr)
, m_tmpx_pixel_32bit(nullptr)
, m_tmpy_pixel_32bit(nullptr)
, m_sumAbsGX_pixel_32bit(nullptr)
, m_sumAbsGY_pixel_32bit(nullptr)
, m_sumDIX_pixel_32bit(nullptr)
, m_sumDIY_pixel_32bit(nullptr)
, m_sumSignGY_GX_pixel_32bit(nullptr)
#endif
, m_subPuMC(false)
{
  for( uint32_t ch = 0; ch < MAX_NUM_COMPONENT; ch++ )
  {
    for( uint32_t refList = 0; refList < NUM_REF_PIC_LIST_01; refList++ )
    {
      m_acYuvPred[refList][ch] = nullptr;
    }
  }

  for( uint32_t c = 0; c < MAX_NUM_COMPONENT; c++ )
  {
    for( uint32_t i = 0; i < LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS_SIGNAL; i++ )
    {
      for( uint32_t j = 0; j < LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS_SIGNAL; j++ )
      {
        m_filteredBlock[i][j][c] = nullptr;
      }

      m_filteredBlockTmp[i][c] = nullptr;
    }
  }
  m_cYuvPredTempDMVRL1 = nullptr;
  m_cYuvPredTempDMVRL0 = nullptr;
  for (uint32_t ch = 0; ch < MAX_NUM_COMPONENT; ch++)
  {
    m_cRefSamplesDMVRL0[ch] = nullptr;
    m_cRefSamplesDMVRL1[ch] = nullptr;
  }

#if INTER_LIC
  m_LICMultApprox[0] = 0;
  for (int k = 1; k < 64; k++)
  {
    m_LICMultApprox[k] = ((1 << 15) + (k >> 1)) / k;
  }
#endif

#if MULTI_PASS_DMVR
  int mvSearchIdx_bilMrg = 0;
#if JVET_X0049_BDMVR_SW_OPT
  uint16_t currtPrio = 0, currIdx = 0;
  ::memset(m_searchEnlargeOffsetNum, 0, sizeof(m_searchEnlargeOffsetNum));
#endif
  for (int y = -BDMVR_INTME_RANGE; y <= BDMVR_INTME_RANGE; y++)
  {
    for (int x = -BDMVR_INTME_RANGE; x <= BDMVR_INTME_RANGE; x++)
    {
#if JVET_X0049_BDMVR_SW_OPT
#else
      m_searchEnlargeOffsetBilMrg[mvSearchIdx_bilMrg] = Mv(x, y);
#endif
      if ( (abs(x) + abs(y)) == 0 )
      {
#if JVET_X0049_BDMVR_SW_OPT
        currtPrio = 0;
        currIdx = m_searchEnlargeOffsetNum[currtPrio];
        m_searchEnlargeOffsetToIdx[currtPrio][currIdx] = mvSearchIdx_bilMrg;
#else
        m_searchPriorityBilMrg[mvSearchIdx_bilMrg] = 0;
#endif
        m_costShiftBilMrg1[mvSearchIdx_bilMrg] = 63;
        m_costShiftBilMrg2[mvSearchIdx_bilMrg++] = 63;
      }
      else if ( (abs(x) + abs(y)) < 4 )
      {
#if JVET_X0049_BDMVR_SW_OPT
        currtPrio = 1;
        currIdx = m_searchEnlargeOffsetNum[currtPrio];
        m_searchEnlargeOffsetToIdx[currtPrio][currIdx] = mvSearchIdx_bilMrg;
#else
        m_searchPriorityBilMrg[mvSearchIdx_bilMrg] = 1;
#endif
        m_costShiftBilMrg1[mvSearchIdx_bilMrg] = 63;
        m_costShiftBilMrg2[mvSearchIdx_bilMrg++] = 63;
      }
      else if ((abs(x) + abs(y)) < 7)
      {
#if JVET_X0049_BDMVR_SW_OPT
        currtPrio = 2;
        currIdx = m_searchEnlargeOffsetNum[currtPrio];
        m_searchEnlargeOffsetToIdx[currtPrio][currIdx] = mvSearchIdx_bilMrg;
#else
        m_searchPriorityBilMrg[mvSearchIdx_bilMrg] = 2;
#endif
        m_costShiftBilMrg1[mvSearchIdx_bilMrg] = 2;
        m_costShiftBilMrg2[mvSearchIdx_bilMrg++] = 63;
      }
      else if ((abs(x) + abs(y)) < 11)
      {
#if JVET_X0049_BDMVR_SW_OPT
        currtPrio = 3;
        currIdx = m_searchEnlargeOffsetNum[currtPrio];
        m_searchEnlargeOffsetToIdx[currtPrio][currIdx] = mvSearchIdx_bilMrg;
#else
        m_searchPriorityBilMrg[mvSearchIdx_bilMrg] = 3;
#endif
        m_costShiftBilMrg1[mvSearchIdx_bilMrg] = 1;
        m_costShiftBilMrg2[mvSearchIdx_bilMrg++] = 63;
      }
      else
      {
#if JVET_X0049_BDMVR_SW_OPT
        currtPrio = 4;
        currIdx = m_searchEnlargeOffsetNum[currtPrio];
        m_searchEnlargeOffsetToIdx[currtPrio][currIdx] = mvSearchIdx_bilMrg;
#else
        m_searchPriorityBilMrg[mvSearchIdx_bilMrg] = 4;
#endif
        m_costShiftBilMrg1[mvSearchIdx_bilMrg] = 1;
        m_costShiftBilMrg2[mvSearchIdx_bilMrg++] = 2;
      }
#if JVET_X0049_BDMVR_SW_OPT
      m_searchEnlargeOffsetBilMrg[currtPrio][currIdx] = Mv(x, y);
      m_searchEnlargeOffsetNum[currtPrio]++;
#endif
    }
  }
  CHECK(mvSearchIdx_bilMrg != (2 * BDMVR_INTME_RANGE + 1) * (2 * BDMVR_INTME_RANGE + 1),
      "this is wrong, mvSearchIdx_bilMrg != (2 * BDMVR_INTME_RANGE + 1) * (2 * BDMVR_INTME_RANGE + 1)");
#endif
#if JVET_W0090_ARMC_TM
  for (uint32_t ch = 0; ch < MAX_NUM_COMPONENT; ch++)
  {
    for (uint32_t tmplt = 0; tmplt < 2; tmplt++)
    {
      m_acYuvCurAMLTemplate[tmplt][ch] = nullptr;
      m_acYuvRefAboveTemplate[tmplt][ch] = nullptr;
      m_acYuvRefLeftTemplate[tmplt][ch] = nullptr;
      m_acYuvRefAMLTemplate[tmplt][ch] = nullptr;
    }
  }
#endif
}

InterPrediction::~InterPrediction()
{
  destroy();
}

void InterPrediction::destroy()
{
  for( uint32_t i = 0; i < NUM_REF_PIC_LIST_01; i++ )
  {
    for( uint32_t c = 0; c < MAX_NUM_COMPONENT; c++ )
    {
      xFree( m_acYuvPred[i][c] );
      m_acYuvPred[i][c] = nullptr;
    }
  }

  for( uint32_t c = 0; c < MAX_NUM_COMPONENT; c++ )
  {
    for( uint32_t i = 0; i < LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS_SIGNAL; i++ )
    {
      for( uint32_t j = 0; j < LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS_SIGNAL; j++ )
      {
        xFree( m_filteredBlock[i][j][c] );
        m_filteredBlock[i][j][c] = nullptr;
      }

      xFree( m_filteredBlockTmp[i][c] );
      m_filteredBlockTmp[i][c] = nullptr;
    }
  }

  m_geoPartBuf[0].destroy();
  m_geoPartBuf[1].destroy();
  m_colorTransResiBuf[0].destroy();
  m_colorTransResiBuf[1].destroy();
  m_colorTransResiBuf[2].destroy();

  if (m_storedMv != nullptr)
  {
    delete[]m_storedMv;
    m_storedMv = nullptr;
  }

  xFree(m_gradX0);   m_gradX0 = nullptr;
  xFree(m_gradY0);   m_gradY0 = nullptr;
  xFree(m_gradX1);   m_gradX1 = nullptr;
  xFree(m_gradY1);   m_gradY1 = nullptr;
#if MULTI_PASS_DMVR || SAMPLE_BASED_BDOF
  xFree(m_absGx);    m_absGx = nullptr;
  xFree(m_absGy);    m_absGy = nullptr;
  xFree(m_dIx);      m_dIx = nullptr;
  xFree(m_dIy);      m_dIy = nullptr;
  xFree(m_dI);    m_dI = nullptr;
  xFree(m_signGxGy); m_signGxGy = nullptr;
  xFree(m_tmpx_pixel_32bit); m_tmpx_pixel_32bit = nullptr;
  xFree(m_tmpy_pixel_32bit); m_tmpy_pixel_32bit = nullptr;
  xFree(m_sumAbsGX_pixel_32bit);     m_sumAbsGX_pixel_32bit = nullptr;
  xFree(m_sumAbsGY_pixel_32bit);     m_sumAbsGY_pixel_32bit = nullptr;
  xFree(m_sumDIX_pixel_32bit);       m_sumDIX_pixel_32bit = nullptr;
  xFree(m_sumDIY_pixel_32bit);       m_sumDIY_pixel_32bit = nullptr;
  xFree(m_sumSignGY_GX_pixel_32bit); m_sumSignGY_GX_pixel_32bit = nullptr;
#endif
#if ENABLE_OBMC
  m_tmpObmcBufL0.destroy();
  m_tmpObmcBufT0.destroy();
  m_tmpSubObmcBuf.destroy();
#endif
  xFree(m_cYuvPredTempDMVRL0);
  m_cYuvPredTempDMVRL0 = nullptr;
  xFree(m_cYuvPredTempDMVRL1);
  m_cYuvPredTempDMVRL1 = nullptr;
  for (uint32_t ch = 0; ch < MAX_NUM_COMPONENT; ch++)
  {
    xFree(m_cRefSamplesDMVRL0[ch]);
    m_cRefSamplesDMVRL0[ch] = nullptr;
    xFree(m_cRefSamplesDMVRL1[ch]);
    m_cRefSamplesDMVRL1[ch] = nullptr;
  }
  m_IBCBuffer.destroy();
#if TM_AMVP || TM_MRG
  xFree(m_pcCurTplLeft ); m_pcCurTplLeft  = nullptr;
  xFree(m_pcCurTplAbove); m_pcCurTplAbove = nullptr;
  xFree(m_pcRefTplLeft ); m_pcRefTplLeft  = nullptr;
  xFree(m_pcRefTplAbove); m_pcRefTplAbove = nullptr;
#endif
#if INTER_LIC
  xFree(m_pcLICRefLeftTemplate);  m_pcLICRefLeftTemplate  = nullptr;
  xFree(m_pcLICRefAboveTemplate); m_pcLICRefAboveTemplate = nullptr;
  xFree(m_pcLICRecLeftTemplate);  m_pcLICRecLeftTemplate  = nullptr;
  xFree(m_pcLICRecAboveTemplate); m_pcLICRecAboveTemplate = nullptr;
#endif
#if MULTI_HYP_PRED
  m_additionalHypothesisStorage.destroy();
#endif
#if JVET_W0090_ARMC_TM
  for (uint32_t ch = 0; ch < MAX_NUM_COMPONENT; ch++)
  {
    for (uint32_t tmplt = 0; tmplt < 2; tmplt++)
    {
      xFree(m_acYuvCurAMLTemplate[tmplt][ch]);
      xFree(m_acYuvRefAboveTemplate[tmplt][ch]);
      xFree(m_acYuvRefLeftTemplate[tmplt][ch]);
      xFree(m_acYuvRefAMLTemplate[tmplt][ch]);
      m_acYuvCurAMLTemplate[tmplt][ch] = nullptr;
      m_acYuvRefAboveTemplate[tmplt][ch] = nullptr;
      m_acYuvRefLeftTemplate[tmplt][ch] = nullptr;
      m_acYuvRefAMLTemplate[tmplt][ch] = nullptr;
  }
}
#endif
}

#if INTER_LIC || (TM_AMVP || TM_MRG) || JVET_W0090_ARMC_TM
void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chromaFormatIDC, const int ctuSize, Reshape* reshape )
#else
void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chromaFormatIDC, const int ctuSize )
#endif
{
  m_pcRdCost = pcRdCost;
#if INTER_LIC || (TM_AMVP || TM_MRG) || JVET_W0090_ARMC_TM
  m_pcReshape = reshape;
#endif

  // if it has been initialised before, but the chroma format has changed, release the memory and start again.
  if( m_acYuvPred[REF_PIC_LIST_0][COMPONENT_Y] != nullptr && m_currChromaFormat != chromaFormatIDC )
  {
    destroy();
  }

  m_currChromaFormat = chromaFormatIDC;
  if( m_acYuvPred[REF_PIC_LIST_0][COMPONENT_Y] == nullptr ) // check if first is null (in which case, nothing initialised yet)
  {
    for( uint32_t c = 0; c < MAX_NUM_COMPONENT; c++ )
    {
#if IF_12TAP || MULTI_PASS_DMVR
#if MULTI_PASS_DMVR
      int extendSize = std::max(2 * BIO_EXTEND_SIZE + 2, 2 * BDMVR_INTME_RANGE);
#else
      int extendSize = std::max(2 * BIO_EXTEND_SIZE + 2, 2 * DMVR_NUM_ITERATION);
#endif
#if IF_12TAP
      int extWidth   = MAX_CU_SIZE + extendSize + 32;
#else
      int extWidth   = MAX_CU_SIZE + extendSize + 16;
#endif
      int extHeight  = MAX_CU_SIZE + extendSize + 1;
#else
      int extWidth = MAX_CU_SIZE + (2 * BIO_EXTEND_SIZE + 2) + 16;
      int extHeight = MAX_CU_SIZE + (2 * BIO_EXTEND_SIZE + 2) + 1;
      extWidth = extWidth > (MAX_CU_SIZE + (2 * DMVR_NUM_ITERATION) + 16) ? extWidth : MAX_CU_SIZE + (2 * DMVR_NUM_ITERATION) + 16;
      extHeight = extHeight > (MAX_CU_SIZE + (2 * DMVR_NUM_ITERATION) + 1) ? extHeight : MAX_CU_SIZE + (2 * DMVR_NUM_ITERATION) + 1;
#endif
      for( uint32_t i = 0; i < LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS_SIGNAL; i++ )
      {
#if IF_12TAP
        m_filteredBlockTmp[i][c] = ( Pel* ) xMalloc( Pel, ( extWidth + 4 ) * ( extHeight + 15 + 4 ) );
#else
        m_filteredBlockTmp[i][c] = ( Pel* ) xMalloc( Pel, ( extWidth + 4 ) * ( extHeight + 7 + 4 ) );
#endif

        for( uint32_t j = 0; j < LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS_SIGNAL; j++ )
        {
          m_filteredBlock[i][j][c] = ( Pel* ) xMalloc( Pel, extWidth * extHeight );
        }
      }

      // new structure
      for( uint32_t i = 0; i < NUM_REF_PIC_LIST_01; i++ )
      {
        m_acYuvPred[i][c] = ( Pel* ) xMalloc( Pel, MAX_CU_SIZE * MAX_CU_SIZE );
      }
    }

    m_geoPartBuf[0].create(UnitArea(chromaFormatIDC, Area(0, 0, MAX_CU_SIZE, MAX_CU_SIZE)));
    m_geoPartBuf[1].create(UnitArea(chromaFormatIDC, Area(0, 0, MAX_CU_SIZE, MAX_CU_SIZE)));
    m_colorTransResiBuf[0].create(UnitArea(chromaFormatIDC, Area(0, 0, MAX_CU_SIZE, MAX_CU_SIZE)));
    m_colorTransResiBuf[1].create(UnitArea(chromaFormatIDC, Area(0, 0, MAX_CU_SIZE, MAX_CU_SIZE)));
    m_colorTransResiBuf[2].create(UnitArea(chromaFormatIDC, Area(0, 0, MAX_CU_SIZE, MAX_CU_SIZE)));
#if MULTI_HYP_PRED
    m_additionalHypothesisStorage.create(UnitArea(chromaFormatIDC, Area(0, 0, MAX_CU_SIZE, MAX_CU_SIZE)));
#endif

    m_iRefListIdx = -1;

    m_gradX0 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE);
    m_gradY0 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE);
    m_gradX1 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE);
    m_gradY1 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE);
#if MULTI_PASS_DMVR || SAMPLE_BASED_BDOF
    m_absGx = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE);
    m_absGy = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE);
    m_dIx = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE);
    m_dIy = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE);
    m_dI = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE);
    m_signGxGy = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE);
    m_tmpx_pixel_32bit = (int*)xMalloc(int, BDOF_SUBPU_SIZE);
    m_tmpy_pixel_32bit = (int*)xMalloc(int, BDOF_SUBPU_SIZE);
    m_sumAbsGX_pixel_32bit = (int*)xMalloc(int, BDOF_SUBPU_SIZE);
    m_sumAbsGY_pixel_32bit = (int*)xMalloc(int, BDOF_SUBPU_SIZE);
    m_sumDIX_pixel_32bit = (int*)xMalloc(int, BDOF_SUBPU_SIZE);
    m_sumDIY_pixel_32bit = (int*)xMalloc(int, BDOF_SUBPU_SIZE);
    m_sumSignGY_GX_pixel_32bit = (int*)xMalloc(int, BDOF_SUBPU_SIZE);
#endif
#if ENABLE_OBMC
    m_tmpObmcBufL0.create(UnitArea(chromaFormatIDC, Area(0, 0, 4, MAX_CU_SIZE)));
    m_tmpObmcBufT0.create(UnitArea(chromaFormatIDC, Area(0, 0, MAX_CU_SIZE, 4)));
    m_tmpSubObmcBuf.create(UnitArea(chromaFormatIDC, Area(0, 0, 20, 4)));
    m_tmpSubObmcBuf.bufs[0].memset(0);
    m_tmpSubObmcBuf.bufs[1].memset(0);
    m_tmpSubObmcBuf.bufs[2].memset(0);
#endif
  }

  if (m_cYuvPredTempDMVRL0 == nullptr && m_cYuvPredTempDMVRL1 == nullptr)
  {
    m_cYuvPredTempDMVRL0 = (Pel*)xMalloc(Pel, (MAX_CU_SIZE + (2 * DMVR_NUM_ITERATION)) * (MAX_CU_SIZE + (2 * DMVR_NUM_ITERATION)));
    m_cYuvPredTempDMVRL1 = (Pel*)xMalloc(Pel, (MAX_CU_SIZE + (2 * DMVR_NUM_ITERATION)) * (MAX_CU_SIZE + (2 * DMVR_NUM_ITERATION)));
    for (uint32_t ch = 0; ch < MAX_NUM_COMPONENT; ch++)
    {
#if IF_12TAP
      m_cRefSamplesDMVRL0[ch] = (Pel*)xMalloc(Pel, (MAX_CU_SIZE + (2 * DMVR_NUM_ITERATION) + NTAPS_LUMA(0)) * (MAX_CU_SIZE + (2 * DMVR_NUM_ITERATION) + NTAPS_LUMA(0)));
      m_cRefSamplesDMVRL1[ch] = (Pel*)xMalloc(Pel, (MAX_CU_SIZE + (2 * DMVR_NUM_ITERATION) + NTAPS_LUMA(0)) * (MAX_CU_SIZE + (2 * DMVR_NUM_ITERATION) + NTAPS_LUMA(0)));
#else
      m_cRefSamplesDMVRL0[ch] = (Pel*)xMalloc(Pel, (MAX_CU_SIZE + (2 * DMVR_NUM_ITERATION) + NTAPS_LUMA) * (MAX_CU_SIZE + (2 * DMVR_NUM_ITERATION) + NTAPS_LUMA));
      m_cRefSamplesDMVRL1[ch] = (Pel*)xMalloc(Pel, (MAX_CU_SIZE + (2 * DMVR_NUM_ITERATION) + NTAPS_LUMA) * (MAX_CU_SIZE + (2 * DMVR_NUM_ITERATION) + NTAPS_LUMA));
#endif
    }
  }
#if !JVET_J0090_MEMORY_BANDWITH_MEASURE
  m_if.initInterpolationFilter( true );
#endif
#if TM_AMVP || TM_MRG
  if (m_pcCurTplLeft == nullptr)
  {
    m_pcCurTplLeft  = (Pel*)xMalloc(Pel, TM_TPL_SIZE * MAX_CU_SIZE);
    m_pcCurTplAbove = (Pel*)xMalloc(Pel, TM_TPL_SIZE * MAX_CU_SIZE);
    m_pcRefTplLeft  = (Pel*)xMalloc(Pel, TM_TPL_SIZE * MAX_CU_SIZE);
    m_pcRefTplAbove = (Pel*)xMalloc(Pel, TM_TPL_SIZE * MAX_CU_SIZE);
  }
#endif
#if INTER_LIC
  if (m_pcLICRefLeftTemplate == nullptr)
  {
    m_pcLICRefLeftTemplate  = (Pel*)xMalloc(Pel, MAX_CU_SIZE);
    m_pcLICRefAboveTemplate = (Pel*)xMalloc(Pel, MAX_CU_SIZE);
    m_pcLICRecLeftTemplate  = (Pel*)xMalloc(Pel, MAX_CU_SIZE);
    m_pcLICRecAboveTemplate = (Pel*)xMalloc(Pel, MAX_CU_SIZE);
  }
#endif
#if JVET_W0090_ARMC_TM
  for (uint32_t ch = 0; ch < MAX_NUM_COMPONENT; ch++)
  {
    for (uint32_t tmplt = 0; tmplt < 2; tmplt++)
    {
      m_acYuvCurAMLTemplate[tmplt][ch] = (Pel*)xMalloc(Pel, MAX_CU_SIZE * MAX_CU_SIZE);
      m_acYuvRefAboveTemplate[tmplt][ch] = (Pel*)xMalloc(Pel, MAX_CU_SIZE * MAX_CU_SIZE);
      m_acYuvRefLeftTemplate[tmplt][ch] = (Pel*)xMalloc(Pel, MAX_CU_SIZE * MAX_CU_SIZE);
      m_acYuvRefAMLTemplate[tmplt][ch] = (Pel*)xMalloc(Pel, MAX_CU_SIZE * MAX_CU_SIZE);
    }
  }
#endif

  if (m_storedMv == nullptr)
  {
    const int MVBUFFER_SIZE = MAX_CU_SIZE / MIN_PU_SIZE;
    m_storedMv = new Mv[MVBUFFER_SIZE*MVBUFFER_SIZE];
  }
  if (m_IBCBuffer.bufs.empty())
  {
    m_IBCBufferWidth = g_IBCBufferSize / ctuSize;
    m_IBCBuffer.create(UnitArea(chromaFormatIDC, Area(0, 0, m_IBCBufferWidth, ctuSize)));
  }
}

// ====================================================================================================================
// Public member functions
// ====================================================================================================================

bool InterPrediction::xCheckIdenticalMotion( const PredictionUnit &pu )
{
  const Slice &slice = *pu.cs->slice;

  if( slice.isInterB() && !pu.cs->pps->getWPBiPred() )
  {
    if( pu.refIdx[0] >= 0 && pu.refIdx[1] >= 0 )
    {
      int RefPOCL0 = slice.getRefPic( REF_PIC_LIST_0, pu.refIdx[0] )->getPOC();
      int RefPOCL1 = slice.getRefPic( REF_PIC_LIST_1, pu.refIdx[1] )->getPOC();

      if( RefPOCL0 == RefPOCL1 )
      {
        if( !pu.cu->affine )
        {
          if( pu.mv[0] == pu.mv[1] )
          {
            return true;
          }
        }
        else
        {
          if ( (pu.cu->affineType == AFFINEMODEL_4PARAM && (pu.mvAffi[0][0] == pu.mvAffi[1][0]) && (pu.mvAffi[0][1] == pu.mvAffi[1][1]))
            || (pu.cu->affineType == AFFINEMODEL_6PARAM && (pu.mvAffi[0][0] == pu.mvAffi[1][0]) && (pu.mvAffi[0][1] == pu.mvAffi[1][1]) && (pu.mvAffi[0][2] == pu.mvAffi[1][2])) )
          {
            return true;
          }
        }
      }
    }
  }

  return false;
}

void InterPrediction::xSubPuMC( PredictionUnit& pu, PelUnitBuf& predBuf, const RefPicList &eRefPicList /*= REF_PIC_LIST_X*/, const bool luma /*= true*/, const bool chroma /*= true*/)
{
#if MULTI_HYP_PRED
  CHECK(!pu.addHypData.empty(), "Multi Hyp: !pu.addHypData.empty()");
#endif
  // compute the location of the current PU
  Position puPos    = pu.lumaPos();
  Size puSize       = pu.lumaSize();

  int numPartLine, numPartCol, puHeight, puWidth;
  {
    numPartLine = std::max(puSize.width >> ATMVP_SUB_BLOCK_SIZE, 1u);
    numPartCol = std::max(puSize.height >> ATMVP_SUB_BLOCK_SIZE, 1u);
    puHeight = numPartCol == 1 ? puSize.height : 1 << ATMVP_SUB_BLOCK_SIZE;
    puWidth = numPartLine == 1 ? puSize.width : 1 << ATMVP_SUB_BLOCK_SIZE;
  }

  PredictionUnit subPu;

  subPu.cs        = pu.cs;
  subPu.cu        = pu.cu;
  subPu.mergeType = MRG_TYPE_DEFAULT_N;

  bool isAffine = pu.cu->affine;
  subPu.cu->affine = false;

  // join sub-pus containing the same motion
  bool verMC = puSize.height > puSize.width;
  int  fstStart = (!verMC ? puPos.y : puPos.x);
  int  secStart = (!verMC ? puPos.x : puPos.y);
  int  fstEnd = (!verMC ? puPos.y + puSize.height : puPos.x + puSize.width);
  int  secEnd = (!verMC ? puPos.x + puSize.width : puPos.y + puSize.height);
  int  fstStep = (!verMC ? puHeight : puWidth);
  int  secStep = (!verMC ? puWidth : puHeight);

  const bool isResamplingPossible = pu.cs->sps->getRprEnabledFlag();

  const bool scaled = isResamplingPossible && ( pu.cu->slice->getRefPic( REF_PIC_LIST_0, 0 )->isRefScaled( pu.cs->pps ) || ( pu.cs->slice->getSliceType() == B_SLICE ? pu.cu->slice->getRefPic( REF_PIC_LIST_1, 0 )->isRefScaled( pu.cs->pps ) : false ) );
  m_subPuMC = true;

  for (int fstDim = fstStart; fstDim < fstEnd; fstDim += fstStep)
  {
    for (int secDim = secStart; secDim < secEnd; secDim += secStep)
    {
      int x = !verMC ? secDim : fstDim;
      int y = !verMC ? fstDim : secDim;
      const MotionInfo &curMi = pu.getMotionInfo(Position{ x, y });

      int length = secStep;
      int later  = secDim + secStep;

      while (later < secEnd)
      {
        const MotionInfo &laterMi = !verMC ? pu.getMotionInfo(Position{ later, fstDim }) : pu.getMotionInfo(Position{ fstDim, later });
        if (!scaled && laterMi == curMi
#if INTER_LIC
          && laterMi.usesLIC == curMi.usesLIC
#endif
          )
        {
          length += secStep;
        }
        else
        {
          break;
        }
        later += secStep;
      }
      int dx = !verMC ? length : puWidth;
      int dy = !verMC ? puHeight : length;

      subPu.UnitArea::operator=(UnitArea(pu.chromaFormat, Area(x, y, dx, dy)));
      subPu = curMi;
      PelUnitBuf subPredBuf = predBuf.subBuf(UnitAreaRelative(pu, subPu));
      subPu.mmvdEncOptMode = 0;
      subPu.mvRefine = false;
      motionCompensation(subPu, subPredBuf, eRefPicList, luma, chroma);
      secDim = later - secStep;
    }
  }
  m_subPuMC = false;

  pu.cu->affine = isAffine;
}
#if !BDOF_RM_CONSTRAINTS
void InterPrediction::xSubPuBio(PredictionUnit& pu, PelUnitBuf& predBuf, const RefPicList &eRefPicList /*= REF_PIC_LIST_X*/, PelUnitBuf* yuvDstTmp /*= NULL*/)
{
  // compute the location of the current PU
  Position puPos = pu.lumaPos();
  Size puSize = pu.lumaSize();

#if JVET_J0090_MEMORY_BANDWITH_MEASURE
  JVET_J0090_SET_CACHE_ENABLE(true);
  int mvShift = (MV_FRACTIONAL_BITS_INTERNAL);
  for (int k = 0; k < NUM_REF_PIC_LIST_01; k++)
  {
    RefPicList refId = (RefPicList)k;
    const Picture* refPic = pu.cu->slice->getRefPic(refId, pu.refIdx[refId]);
    for (int compID = 0; compID < MAX_NUM_COMPONENT; compID++)
    {
      Mv cMv = pu.mv[refId];
      int mvshiftTemp = mvShift + getComponentScaleX((ComponentID)compID, pu.chromaFormat);
      int filtersize = (compID == (COMPONENT_Y)) ? NTAPS_LUMA : NTAPS_CHROMA;
      cMv += Mv(-(((filtersize >> 1) - 1) << mvshiftTemp), -(((filtersize >> 1) - 1) << mvshiftTemp));
      bool wrapRef = false;
      if ( pu.cu->slice->getRefPic(refId, pu.refIdx[refId])->isWrapAroundEnabled( pu.cs->pps ) )
      {
        wrapRef = wrapClipMv(cMv, pu.blocks[0].pos(), pu.blocks[0].size(), pu.cs->sps, pu.cs->pps);
      }
      else
      {
        clipMv(cMv, pu.lumaPos(), pu.lumaSize(), *pu.cs->sps, *pu.cs->pps);
      }

      int width = predBuf.bufs[compID].width + (filtersize - 1);
      int height = predBuf.bufs[compID].height + (filtersize - 1);

      CPelBuf refBuf;
      Position recOffset = pu.blocks[compID].pos().offset(cMv.getHor() >> mvshiftTemp, cMv.getVer() >> mvshiftTemp);
      refBuf = refPic->getRecoBuf(CompArea((ComponentID)compID, pu.chromaFormat, recOffset, pu.blocks[compID].size()), wrapRef);

      JVET_J0090_SET_REF_PICTURE(refPic, (ComponentID)compID);
      for (int row = 0; row < height; row++)
      {
        for (int col = 0; col < width; col++)
        {
          JVET_J0090_CACHE_ACCESS(((Pel *)refBuf.buf) + row * refBuf.stride + col, __FILE__, __LINE__);
        }
      }
    }
  }
  JVET_J0090_SET_CACHE_ENABLE(false);
#endif
  PredictionUnit subPu;

  subPu.cs = pu.cs;
  subPu.cu = pu.cu;
  subPu.mergeType = pu.mergeType;
  subPu.mmvdMergeFlag = pu.mmvdMergeFlag;
  subPu.mmvdEncOptMode = pu.mmvdEncOptMode;
  subPu.mergeFlag = pu.mergeFlag;
  subPu.ciipFlag = pu.ciipFlag;
  subPu.mvRefine = pu.mvRefine;
#if TM_MRG
  subPu.tmMergeFlag = pu.tmMergeFlag;
#endif
  subPu.refIdx[0] = pu.refIdx[0];
  subPu.refIdx[1] = pu.refIdx[1];
  int  fstStart = puPos.y;
  int  secStart = puPos.x;
  int  fstEnd = puPos.y + puSize.height;
  int  secEnd = puPos.x + puSize.width;
  int  fstStep = std::min((int)MAX_BDOF_APPLICATION_REGION, (int)puSize.height);
  int  secStep = std::min((int)MAX_BDOF_APPLICATION_REGION, (int)puSize.width);
  for (int fstDim = fstStart; fstDim < fstEnd; fstDim += fstStep)
  {
    for (int secDim = secStart; secDim < secEnd; secDim += secStep)
    {
      int x = secDim;
      int y = fstDim;
      int dx = secStep;
      int dy = fstStep;

#if !JVET_W0097_GPM_MMVD_TM
      const MotionInfo &curMi = pu.getMotionInfo(Position{ x, y });
#endif

      subPu.UnitArea::operator=(UnitArea(pu.chromaFormat, Area(x, y, dx, dy)));
#if JVET_W0097_GPM_MMVD_TM
      subPu.interDir = pu.interDir;
      for (uint32_t i = 0; i < NUM_REF_PIC_LIST_01; i++)
      {
        subPu.refIdx[i] = pu.refIdx[i];
        subPu.mv[i] = pu.mv[i];
      }
#else
      subPu = curMi;
#endif
      PelUnitBuf subPredBuf = predBuf.subBuf(UnitAreaRelative(pu, subPu));

      if (yuvDstTmp)
      {
        PelUnitBuf subPredBufTmp = yuvDstTmp->subBuf(UnitAreaRelative(pu, subPu));
        motionCompensation(subPu, subPredBuf, eRefPicList, true, true, &subPredBufTmp);
      }
      else
      motionCompensation(subPu, subPredBuf, eRefPicList);
    }
  }
  JVET_J0090_SET_CACHE_ENABLE(true);
}
#endif

#if MULTI_PASS_DMVR
void InterPrediction::xPredInterUni(const PredictionUnit &pu, const RefPicList &eRefPicList, PelUnitBuf &pcYuvPred, const bool &bi,
                                    const bool &bioApplied, const bool luma, const bool chroma, const bool isBdofMvRefine)
#else
void InterPrediction::xPredInterUni(const PredictionUnit &pu, const RefPicList &eRefPicList, PelUnitBuf &pcYuvPred,
                                    const bool &bi, const bool &bioApplied, const bool luma, const bool chroma)
#endif
{
  const SPS &sps = *pu.cs->sps;

  int iRefIdx = pu.refIdx[eRefPicList];
  Mv mv[3];
  bool isIBC = false;
#if !INTER_RM_SIZE_CONSTRAINTS
#if ENABLE_OBMC
  if (pu.cu->isobmcMC == false)
#endif
  CHECK( !CU::isIBC( *pu.cu ) && pu.lwidth() == 4 && pu.lheight() == 4, "invalid 4x4 inter blocks" );
#endif
  if (CU::isIBC(*pu.cu))
  {
    isIBC = true;
  }
  if( pu.cu->affine )
  {
    CHECK( iRefIdx < 0, "iRefIdx incorrect." );

    mv[0] = pu.mvAffi[eRefPicList][0];
    mv[1] = pu.mvAffi[eRefPicList][1];
    mv[2] = pu.mvAffi[eRefPicList][2];
  }
  else
  {
    mv[0] = pu.mv[eRefPicList];
  }

  if( !pu.cu->affine )
  {
    const bool isResamplingPossible = pu.cs->sps->getRprEnabledFlag();

    if( !isIBC && ( !isResamplingPossible || !pu.cu->slice->getRefPic( eRefPicList, iRefIdx )->isRefScaled( pu.cs->pps ) ) )
    {
      if( !pu.cs->pps->getWrapAroundEnabledFlag() )
      {
        clipMv( mv[0], pu.cu->lumaPos(), pu.cu->lumaSize(), sps, *pu.cs->pps );
      }
    }
  }

  for( uint32_t comp = COMPONENT_Y; comp < pcYuvPred.bufs.size() && comp <= m_maxCompIDToPred; comp++ )
  {
    const ComponentID compID = ComponentID( comp );
    if (compID == COMPONENT_Y && !luma)
    {
      continue;
    }
    if (compID != COMPONENT_Y && !chroma)
    {
      continue;
    }
#if MULTI_PASS_DMVR
    if (compID != COMPONENT_Y && bioApplied && isBdofMvRefine)
    {
      continue;
    }
#endif
    if ( pu.cu->affine )
    {
      CHECK( bioApplied, "BIO is not allowed with affine" );
      m_iRefListIdx = eRefPicList;
      bool genChromaMv = (!luma && chroma && compID == COMPONENT_Cb);

      xPredAffineBlk(compID, pu, pu.cu->slice->getRefPic(eRefPicList, iRefIdx)->unscaledPic, mv, pcYuvPred, bi, pu.cu->slice->clpRng(compID), genChromaMv, pu.cu->slice->getScalingRatio(eRefPicList, iRefIdx));
    }
    else
    {
      if (isIBC)
      {
        xPredInterBlk(compID, pu, pu.cu->slice->getPic(), mv[0], pcYuvPred, bi, pu.cu->slice->clpRng(compID),
                      bioApplied, isIBC);
      }
      else
      {
        xPredInterBlk( compID, pu, pu.cu->slice->getRefPic( eRefPicList, iRefIdx )->unscaledPic, mv[0], pcYuvPred, bi, pu.cu->slice->clpRng( compID ), bioApplied, isIBC, pu.cu->slice->getScalingRatio( eRefPicList, iRefIdx ) );
      }
    }
  }
}

#if MULTI_PASS_DMVR
void InterPrediction::xPredInterBiSubPuBDOF(PredictionUnit &pu, PelUnitBuf &pcYuvPred, const bool luma, const bool chroma)
{
  const Slice &slice = *pu.cs->slice;
  bool bioApplied = true;
  // common variable for all subPu
  const bool lumaOnly = (luma && !chroma), chromaOnly = (!luma && chroma);
  const int bioDy = std::min<int>(pu.lumaSize().height, BDOF_SUBPU_DIM);
  const int bioDx = std::min<int>(pu.lumaSize().width,  BDOF_SUBPU_DIM);
  const int scaleX = getComponentScaleX(COMPONENT_Cb, pu.chromaFormat);
  const int scaleY = getComponentScaleY(COMPONENT_Cb, pu.chromaFormat);
  CPelUnitBuf srcPred0 = ( pu.chromaFormat == CHROMA_400 ?
                           CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[0][0], pcYuvPred.Y())) :
                           CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[0][0], pcYuvPred.Y()), PelBuf(m_acYuvPred[0][1], pcYuvPred.Cb()), PelBuf(m_acYuvPred[0][2], pcYuvPred.Cr())) );
  CPelUnitBuf srcPred1 = ( pu.chromaFormat == CHROMA_400 ?
                           CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[1][0], pcYuvPred.Y())) :
                           CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[1][0], pcYuvPred.Y()), PelBuf(m_acYuvPred[1][1], pcYuvPred.Cb()), PelBuf(m_acYuvPred[1][2], pcYuvPred.Cr())) );
  Position puPos = pu.lumaPos();
  PredictionUnit subPu = pu;
  CHECK(subPu.refIdx[0] < 0, "this is not possible for BDOF");
  CHECK(subPu.refIdx[1] < 0, "this is not possible for BDOF");
  int bioSubPuIdx = 0;
  const int bioSubPuStrideIncr = BDOF_SUBPU_STRIDE - std::max(1, (int)(pu.lumaSize().width >> BDOF_SUBPU_DIM_LOG2));
  for (int y = puPos.y, yStart = 0; y < (puPos.y + pu.lumaSize().height); y = y + bioDy, yStart = yStart + bioDy)
  {
    for (int x = puPos.x, xStart = 0; x < (puPos.x + pu.lumaSize().width); x = x + bioDx, xStart = xStart + bioDx)
    {
      Mv bioMv = m_bdofSubPuMvOffset[bioSubPuIdx];

      subPu.UnitArea::operator=(UnitArea(pu.chromaFormat, Area(x, y, bioDx, bioDy)));
      if (pu.bdmvrRefine)
      {
        const int bdmvrSubPuIdx = (yStart >> DMVR_SUBCU_HEIGHT_LOG2) * DMVR_SUBPU_STRIDE + (xStart >> DMVR_SUBCU_WIDTH_LOG2);
        subPu.mv[0] = m_bdmvrSubPuMvBuf[0][bdmvrSubPuIdx] + bioMv;
        subPu.mv[1] = m_bdmvrSubPuMvBuf[1][bdmvrSubPuIdx] - bioMv;
      }
      else
      {
        subPu.mv[0] = pu.mv[0] + bioMv;
        subPu.mv[1] = pu.mv[1] - bioMv;
      }

      // inter pred to generate buf data
      for (uint32_t refList = 0; refList < NUM_REF_PIC_LIST_01; refList++)
      {
        if( subPu.refIdx[refList] < 0)
        {
          continue;
        }
        RefPicList eRefPicList = (refList ? REF_PIC_LIST_1 : REF_PIC_LIST_0);

        CHECK(CU::isIBC(*subPu.cu) && eRefPicList != REF_PIC_LIST_0, "Invalid interdir for ibc mode");
        CHECK(CU::isIBC(*subPu.cu) && subPu.refIdx[refList] != MAX_NUM_REF, "Invalid reference index for ibc mode");
        CHECK((CU::isInter(*subPu.cu) && subPu.refIdx[refList] >= slice.getNumRefIdx(eRefPicList)), "Invalid reference index");
        m_iRefListIdx = refList;

        PelUnitBuf pcMbBuf = ( subPu.chromaFormat == CHROMA_400 ?
                               PelUnitBuf(subPu.chromaFormat, PelBuf(m_acYuvPred[refList][0], pcYuvPred.Y())) :
                               PelUnitBuf(subPu.chromaFormat, PelBuf(m_acYuvPred[refList][0], pcYuvPred.Y()), PelBuf(m_acYuvPred[refList][1], pcYuvPred.Cb()), PelBuf(m_acYuvPred[refList][2], pcYuvPred.Cr())) );
        pcMbBuf = pcMbBuf.subBuf(UnitAreaRelative(pu, subPu));
        if (bioMv.hor == 0 && bioMv.ver == 0)
        {
          // only chroma MC
          if (!lumaOnly)
            xPredInterUni ( subPu, eRefPicList, pcMbBuf, true, bioApplied, false, chroma, false );
        }
        else
        {
          xPredInterUni ( subPu, eRefPicList, pcMbBuf, true, bioApplied, luma, chroma, false );
        }
      }
      // prepare dst sub buf
      PelUnitBuf subYuvPredBuf = pcYuvPred.subBuf(UnitAreaRelative(pu, subPu));
      int dstStride[MAX_NUM_COMPONENT] = { pcYuvPred.bufs[COMPONENT_Y].stride,
                                           isChromaEnabled(pu.chromaFormat) ? pcYuvPred.bufs[COMPONENT_Cb].stride : 0,
                                           isChromaEnabled(pu.chromaFormat) ? pcYuvPred.bufs[COMPONENT_Cr].stride : 0};
      subYuvPredBuf.bufs[COMPONENT_Y].buf = pcYuvPred.bufs[COMPONENT_Y].buf + xStart + yStart * dstStride[COMPONENT_Y];
      if (isChromaEnabled(pu.chromaFormat))
      {
        subYuvPredBuf.bufs[COMPONENT_Cb].buf = pcYuvPred.bufs[COMPONENT_Cb].buf + (xStart >> scaleX) + ((yStart >> scaleY) * dstStride[COMPONENT_Cb]);
        subYuvPredBuf.bufs[COMPONENT_Cr].buf = pcYuvPred.bufs[COMPONENT_Cr].buf + (xStart >> scaleX) + ((yStart >> scaleY) * dstStride[COMPONENT_Cr]);
      }
      // prepare src sub buf
      int srcStride[MAX_NUM_COMPONENT] = { srcPred0.bufs[COMPONENT_Y].stride,
                                           isChromaEnabled(pu.chromaFormat) ? srcPred0.bufs[COMPONENT_Cb].stride : 0,
                                           isChromaEnabled(pu.chromaFormat) ? srcPred0.bufs[COMPONENT_Cr].stride : 0};
      CPelUnitBuf srcSubPred0 = srcPred0.subBuf(UnitAreaRelative(pu, subPu));
      CPelUnitBuf srcSubPred1 = srcPred1.subBuf(UnitAreaRelative(pu, subPu));
      srcSubPred0.bufs[COMPONENT_Y].buf = srcPred0.bufs[COMPONENT_Y].buf + xStart + yStart * srcStride[COMPONENT_Y];
      if (isChromaEnabled(pu.chromaFormat))
      {
        srcSubPred0.bufs[COMPONENT_Cb].buf = srcPred0.bufs[COMPONENT_Cb].buf + (xStart >> scaleX) + ((yStart >> scaleY) * srcStride[COMPONENT_Cb]);
        srcSubPred0.bufs[COMPONENT_Cr].buf = srcPred0.bufs[COMPONENT_Cr].buf + (xStart >> scaleX) + ((yStart >> scaleY) * srcStride[COMPONENT_Cr]);
      }
      srcSubPred1.bufs[COMPONENT_Y].buf = srcPred1.bufs[COMPONENT_Y].buf + xStart + yStart * srcStride[COMPONENT_Y];
      if (isChromaEnabled(pu.chromaFormat))
      {
        srcSubPred1.bufs[COMPONENT_Cb].buf = srcPred1.bufs[COMPONENT_Cb].buf + (xStart >> scaleX) + ((yStart >> scaleY) * srcStride[COMPONENT_Cb]);
        srcSubPred1.bufs[COMPONENT_Cr].buf = srcPred1.bufs[COMPONENT_Cr].buf + (xStart >> scaleX) + ((yStart >> scaleY) * srcStride[COMPONENT_Cr]);
      }
      // generate the dst buf
      {
        if (bioMv.hor == 0 && bioMv.ver == 0)
        {
          // only derive chroma prediction
          if (!lumaOnly)
            xWeightedAverage( false/*isBdofMvRefine*/, 0/*bdofBlockOffset*/, subPu, srcSubPred0, srcSubPred1, subYuvPredBuf, slice.getSPS()->getBitDepths(), slice.clpRngs(), false/*bioApplied*/, lumaOnly, true/*chromaOnly*/, NULL/*yuvPredTmp*/ );
        }
        else
        {
          xWeightedAverage( false/*isBdofMvRefine*/, 0/*bdofBlockOffset*/, subPu, srcSubPred0, srcSubPred1, subYuvPredBuf, slice.getSPS()->getBitDepths(), slice.clpRngs(), bioApplied, lumaOnly, chromaOnly, NULL/*yuvPredTmp*/ );
        }
      }
      bioSubPuIdx += 1;
    }
    bioSubPuIdx += bioSubPuStrideIncr;
  }
}
#endif

#if MULTI_PASS_DMVR
void InterPrediction::xPredInterBiBDMVR(PredictionUnit &pu, PelUnitBuf &pcYuvPred, const bool luma, const bool chroma, PelUnitBuf *yuvPredTmp /*= NULL*/)
{
  const PPS   &pps   = *pu.cs->pps;
  const Slice &slice = *pu.cs->slice;
#if !INTER_RM_SIZE_CONSTRAINTS
#if ENABLE_OBMC
  if (pu.cu->isobmcMC == false)
#endif
  CHECK( !pu.cu->affine && pu.refIdx[0] >= 0 && pu.refIdx[1] >= 0 && ( pu.lwidth() + pu.lheight() == 12 ), "invalid 4x8/8x4 bi-predicted blocks" );
#endif

  int refIdx0 = pu.refIdx[REF_PIC_LIST_0];
  int refIdx1 = pu.refIdx[REF_PIC_LIST_1];

  const WPScalingParam *wp0 = pu.cs->slice->getWpScaling(REF_PIC_LIST_0, refIdx0);
  const WPScalingParam *wp1 = pu.cs->slice->getWpScaling(REF_PIC_LIST_1, refIdx1);

  bool bioApplied = false;
  if (pu.cs->sps->getBDOFEnabledFlag() && (!pu.cs->picHeader->getDisBdofFlag()))
  {
#if INTER_LIC
    if (pu.cu->affine || m_subPuMC || pu.cu->LICFlag
#if ENABLE_OBMC
      || pu.cu->isobmcMC
#endif
      )
#else
    if (pu.cu->affine || m_subPuMC)
#endif
    {
      bioApplied = false;
    }
    else
    {
      const bool biocheck0 =
        !((WPScalingParam::isWeighted(wp0) || WPScalingParam::isWeighted(wp1)) && slice.getSliceType() == B_SLICE);
      const bool biocheck1 = !(pps.getUseWP() && slice.getSliceType() == P_SLICE);
      if (biocheck0
        && biocheck1
        && PU::isBiPredFromDifferentDirEqDistPoc(pu)
#if !BDOF_RM_CONSTRAINTS
        && (pu.Y().height >= 8)
        && (pu.Y().width >= 8)
        && ((pu.Y().height * pu.Y().width) >= 128)
#endif
       )
      {
        bioApplied = true;
      }
    }

    if (bioApplied && pu.ciipFlag)
    {
      bioApplied = false;
    }

    if (bioApplied && pu.cu->smvdMode)
    {
      bioApplied = false;
    }

    if (pu.cu->cs->sps->getUseBcw() && bioApplied && pu.cu->BcwIdx != BCW_DEFAULT)
    {
      bioApplied = false;
    }
  }
  if (pu.mmvdEncOptMode == 2 && pu.mmvdMergeFlag)
  {
    bioApplied = false;
  }
#if ENABLE_OBMC
  if (pu.cu->isobmcMC)
  {
    bioApplied = false;
  }
#endif
  const bool isResamplingPossible = pu.cs->sps->getRprEnabledFlag();
  bool dmvrApplied = false;
  dmvrApplied = (pu.mvRefine) && PU::checkDMVRCondition(pu);
  const bool refIsScaled = isResamplingPossible && ( ( refIdx0 < 0 ? false : pu.cu->slice->getRefPic( REF_PIC_LIST_0, refIdx0 )->isRefScaled( pu.cs->pps ) ) || ( refIdx1 < 0 ? false : pu.cu->slice->getRefPic( REF_PIC_LIST_1, refIdx1 )->isRefScaled( pu.cs->pps ) ) );
  dmvrApplied = dmvrApplied && !refIsScaled;
  bioApplied = bioApplied && !refIsScaled;
  // common variable for all subPu
  const bool lumaOnly = (luma && !chroma), chromaOnly = (!luma && chroma);
  const int dy = std::min<int>(pu.lumaSize().height, DMVR_SUBCU_HEIGHT);
  const int dx = std::min<int>(pu.lumaSize().width,  DMVR_SUBCU_WIDTH);
  const int scaleX = getComponentScaleX(COMPONENT_Cb, pu.chromaFormat);
  const int scaleY = getComponentScaleY(COMPONENT_Cb, pu.chromaFormat);
  CPelUnitBuf srcPred0 = ( pu.chromaFormat == CHROMA_400 ?
                           CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[0][0], pcYuvPred.Y())) :
                           CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[0][0], pcYuvPred.Y()), PelBuf(m_acYuvPred[0][1], pcYuvPred.Cb()), PelBuf(m_acYuvPred[0][2], pcYuvPred.Cr())) );
  CPelUnitBuf srcPred1 = ( pu.chromaFormat == CHROMA_400 ?
                           CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[1][0], pcYuvPred.Y())) :
                           CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[1][0], pcYuvPred.Y()), PelBuf(m_acYuvPred[1][1], pcYuvPred.Cb()), PelBuf(m_acYuvPred[1][2], pcYuvPred.Cr())) );
  Position puPos = pu.lumaPos();
  PredictionUnit subPu = pu;

  int subPuIdx = 0;
  const int dmvrSubPuStrideIncr = DMVR_SUBPU_STRIDE - std::max(1, (int)(pu.lumaSize().width >> DMVR_SUBCU_WIDTH_LOG2));
  int length = 0, later = 0;
  int width = pu.lwidth(), height = pu.lheight();
  int subPuIdxColumn = 0;
  if (height > width)
  {
    for (int x = puPos.x, xStart = 0; x < (puPos.x + pu.lumaSize().width); x = x + dx, xStart = xStart + dx)
    {
      subPuIdx = subPuIdxColumn;
      for (int y = puPos.y, yStart = 0; y < (puPos.y + pu.lumaSize().height); y = y + dy, yStart = yStart + dy)
      {
        subPu.mv[0] = m_bdmvrSubPuMvBuf[0][subPuIdx];
        subPu.mv[1] = m_bdmvrSubPuMvBuf[1][subPuIdx];
        length = dy;
        later = yStart + dy;
        subPuIdx += DMVR_SUBPU_STRIDE;
        while (later < width)
        {
          Mv nextMv[2] = { m_bdmvrSubPuMvBuf[0][subPuIdx] , m_bdmvrSubPuMvBuf[1][subPuIdx] };
          if (nextMv[0] == subPu.mv[0] && nextMv[1] == subPu.mv[1])
          {
            length += dy;
          }
          else
          {
            break;
          }

          later += dy;
          subPuIdx += DMVR_SUBPU_STRIDE;
        }
        subPu.UnitArea::operator=(UnitArea(pu.chromaFormat, Area(x, y, dx, length)));

        // inter pred to generate buf data
        for (uint32_t refList = 0; refList < NUM_REF_PIC_LIST_01; refList++)
        {
          if (subPu.refIdx[refList] < 0)
          {
            continue;
          }

          RefPicList eRefPicList = (refList ? REF_PIC_LIST_1 : REF_PIC_LIST_0);

          CHECK(CU::isIBC(*subPu.cu) && eRefPicList != REF_PIC_LIST_0, "Invalid interdir for ibc mode");
          CHECK(CU::isIBC(*subPu.cu) && subPu.refIdx[refList] != MAX_NUM_REF, "Invalid reference index for ibc mode");
          CHECK((CU::isInter(*subPu.cu) && subPu.refIdx[refList] >= slice.getNumRefIdx(eRefPicList)), "Invalid reference index");
          m_iRefListIdx = refList;

          PelUnitBuf pcMbBuf = (subPu.chromaFormat == CHROMA_400 ?
            PelUnitBuf(subPu.chromaFormat, PelBuf(m_acYuvPred[refList][0], pcYuvPred.Y())) :
            PelUnitBuf(subPu.chromaFormat, PelBuf(m_acYuvPred[refList][0], pcYuvPred.Y()), PelBuf(m_acYuvPred[refList][1], pcYuvPred.Cb()), PelBuf(m_acYuvPred[refList][2], pcYuvPred.Cr())));
          pcMbBuf = pcMbBuf.subBuf(UnitAreaRelative(pu, subPu));
          if (subPu.refIdx[0] >= 0 && subPu.refIdx[1] >= 0)
          {
            bool isBdofMvRefineSkipChromaMC = (yuvPredTmp == NULL);
            xPredInterUni(subPu, eRefPicList, pcMbBuf, true
              , bioApplied, luma, chroma, isBdofMvRefineSkipChromaMC);
          }
          else
          {
            if (((pps.getUseWP() && slice.getSliceType() == P_SLICE) || (pps.getWPBiPred() && slice.getSliceType() == B_SLICE))
#if INTER_LIC
              && !subPu.cu->LICFlag
#endif
              )
            {
              xPredInterUni(subPu, eRefPicList, pcMbBuf, true
                , bioApplied
                , luma, chroma
              );
            }
            else
            {
              xPredInterUni(subPu, eRefPicList, pcMbBuf, subPu.cu->geoFlag
                , bioApplied
                , luma, chroma
              );
            }
          }
        }
        // prepare dst sub buf
        PelUnitBuf subYuvPredBuf = pcYuvPred.subBuf(UnitAreaRelative(pu, subPu));
        int dstStride[MAX_NUM_COMPONENT] = { pcYuvPred.bufs[COMPONENT_Y].stride,
                                             isChromaEnabled(pu.chromaFormat) ? pcYuvPred.bufs[COMPONENT_Cb].stride : 0,
                                             isChromaEnabled(pu.chromaFormat) ? pcYuvPred.bufs[COMPONENT_Cr].stride : 0 };
        subYuvPredBuf.bufs[COMPONENT_Y].buf = pcYuvPred.bufs[COMPONENT_Y].buf + xStart + yStart * dstStride[COMPONENT_Y];
        if (isChromaEnabled(pu.chromaFormat))
        {
          subYuvPredBuf.bufs[COMPONENT_Cb].buf = pcYuvPred.bufs[COMPONENT_Cb].buf + (xStart >> scaleX) + ((yStart >> scaleY) * dstStride[COMPONENT_Cb]);
          subYuvPredBuf.bufs[COMPONENT_Cr].buf = pcYuvPred.bufs[COMPONENT_Cr].buf + (xStart >> scaleX) + ((yStart >> scaleY) * dstStride[COMPONENT_Cr]);
        }
        // prepare src sub buf
        int srcStride[MAX_NUM_COMPONENT] = { srcPred0.bufs[COMPONENT_Y].stride,
                                             isChromaEnabled(pu.chromaFormat) ? srcPred0.bufs[COMPONENT_Cb].stride : 0,
                                             isChromaEnabled(pu.chromaFormat) ? srcPred0.bufs[COMPONENT_Cr].stride : 0 };
        CPelUnitBuf srcSubPred0 = srcPred0.subBuf(UnitAreaRelative(pu, subPu));
        CPelUnitBuf srcSubPred1 = srcPred1.subBuf(UnitAreaRelative(pu, subPu));
        srcSubPred0.bufs[COMPONENT_Y].buf = srcPred0.bufs[COMPONENT_Y].buf + xStart + yStart * srcStride[COMPONENT_Y];
        if (isChromaEnabled(pu.chromaFormat))
        {
          srcSubPred0.bufs[COMPONENT_Cb].buf = srcPred0.bufs[COMPONENT_Cb].buf + (xStart >> scaleX) + ((yStart >> scaleY) * srcStride[COMPONENT_Cb]);
          srcSubPred0.bufs[COMPONENT_Cr].buf = srcPred0.bufs[COMPONENT_Cr].buf + (xStart >> scaleX) + ((yStart >> scaleY) * srcStride[COMPONENT_Cr]);
        }
        srcSubPred1.bufs[COMPONENT_Y].buf = srcPred1.bufs[COMPONENT_Y].buf + xStart + yStart * srcStride[COMPONENT_Y];
        if (isChromaEnabled(pu.chromaFormat))
        {
          srcSubPred1.bufs[COMPONENT_Cb].buf = srcPred1.bufs[COMPONENT_Cb].buf + (xStart >> scaleX) + ((yStart >> scaleY) * srcStride[COMPONENT_Cb]);
          srcSubPred1.bufs[COMPONENT_Cr].buf = srcPred1.bufs[COMPONENT_Cr].buf + (xStart >> scaleX) + ((yStart >> scaleY) * srcStride[COMPONENT_Cr]);
        }
        // generate the dst buf
        {
          const int bioSubPuOffset = (xStart >> BDOF_SUBPU_DIM_LOG2) + (yStart >> BDOF_SUBPU_DIM_LOG2) * BDOF_SUBPU_STRIDE;
          xWeightedAverage(true/*isBdofMvRefine*/, bioSubPuOffset/*bdofBlockOffset*/, subPu, srcSubPred0, srcSubPred1, subYuvPredBuf,
              slice.getSPS()->getBitDepths(), slice.clpRngs(), bioApplied, lumaOnly, chromaOnly, yuvPredTmp);
        }

        yStart = later - dy;
        y = puPos.y + yStart;
      }
      subPuIdxColumn++;
    }
  }
  else
  for (int y = puPos.y, yStart = 0; y < (puPos.y + pu.lumaSize().height); y = y + dy, yStart = yStart + dy)
  {
    for (int x = puPos.x, xStart = 0; x < (puPos.x + pu.lumaSize().width); x = x + dx, xStart = xStart + dx)
    {
      subPu.mv[0] = m_bdmvrSubPuMvBuf[0][subPuIdx];
      subPu.mv[1] = m_bdmvrSubPuMvBuf[1][subPuIdx];
      length = dx;
      later = xStart + dx;
      subPuIdx++;
      while (later < width)
      {
        Mv nextMv[2] = { m_bdmvrSubPuMvBuf[0][subPuIdx] , m_bdmvrSubPuMvBuf[1][subPuIdx] };
        if (nextMv[0] == subPu.mv[0] && nextMv[1] == subPu.mv[1])
        {
          length += dx;
        }
        else
        {
          break;
        }

        later += dx;
        subPuIdx++;
      }
      subPu.UnitArea::operator=(UnitArea(pu.chromaFormat, Area(x, y, length, dy)));
      // inter pred to generate buf data
      for (uint32_t refList = 0; refList < NUM_REF_PIC_LIST_01; refList++)
      {
        if( subPu.refIdx[refList] < 0)
        {
          continue;
        }

        RefPicList eRefPicList = (refList ? REF_PIC_LIST_1 : REF_PIC_LIST_0);

        CHECK(CU::isIBC(*subPu.cu) && eRefPicList != REF_PIC_LIST_0, "Invalid interdir for ibc mode");
        CHECK(CU::isIBC(*subPu.cu) && subPu.refIdx[refList] != MAX_NUM_REF, "Invalid reference index for ibc mode");
        CHECK((CU::isInter(*subPu.cu) && subPu.refIdx[refList] >= slice.getNumRefIdx(eRefPicList)), "Invalid reference index");
        m_iRefListIdx = refList;

        PelUnitBuf pcMbBuf = ( subPu.chromaFormat == CHROMA_400 ?
                               PelUnitBuf(subPu.chromaFormat, PelBuf(m_acYuvPred[refList][0], pcYuvPred.Y())) :
                               PelUnitBuf(subPu.chromaFormat, PelBuf(m_acYuvPred[refList][0], pcYuvPred.Y()), PelBuf(m_acYuvPred[refList][1], pcYuvPred.Cb()), PelBuf(m_acYuvPred[refList][2], pcYuvPred.Cr())) );
        pcMbBuf = pcMbBuf.subBuf(UnitAreaRelative(pu, subPu));
        if (subPu.refIdx[0] >= 0 && subPu.refIdx[1] >= 0)
        {
          bool isBdofMvRefineSkipChromaMC = (yuvPredTmp == NULL);
          xPredInterUni ( subPu, eRefPicList, pcMbBuf, true
            , bioApplied, luma, chroma, isBdofMvRefineSkipChromaMC);
        }
        else
        {
          if( ( (pps.getUseWP() && slice.getSliceType() == P_SLICE) || (pps.getWPBiPred() && slice.getSliceType() == B_SLICE) )
    #if INTER_LIC
            && !subPu.cu->LICFlag
    #endif
            )
          {
            xPredInterUni ( subPu, eRefPicList, pcMbBuf, true
              , bioApplied
              , luma, chroma
            );
          }
          else
          {
            xPredInterUni(subPu, eRefPicList, pcMbBuf, subPu.cu->geoFlag
              , bioApplied
              , luma, chroma
            );
          }
        }
      }
      // prepare dst sub buf
      PelUnitBuf subYuvPredBuf = pcYuvPred.subBuf(UnitAreaRelative(pu, subPu));
      int dstStride[MAX_NUM_COMPONENT] = { pcYuvPred.bufs[COMPONENT_Y].stride,
                                           isChromaEnabled(pu.chromaFormat) ? pcYuvPred.bufs[COMPONENT_Cb].stride : 0,
                                           isChromaEnabled(pu.chromaFormat) ? pcYuvPred.bufs[COMPONENT_Cr].stride : 0};
      subYuvPredBuf.bufs[COMPONENT_Y].buf = pcYuvPred.bufs[COMPONENT_Y].buf + xStart + yStart * dstStride[COMPONENT_Y];
      if (isChromaEnabled(pu.chromaFormat))
      {
        subYuvPredBuf.bufs[COMPONENT_Cb].buf = pcYuvPred.bufs[COMPONENT_Cb].buf + (xStart >> scaleX) + ((yStart >> scaleY) * dstStride[COMPONENT_Cb]);
        subYuvPredBuf.bufs[COMPONENT_Cr].buf = pcYuvPred.bufs[COMPONENT_Cr].buf + (xStart >> scaleX) + ((yStart >> scaleY) * dstStride[COMPONENT_Cr]);
      }
      // prepare src sub buf
      int srcStride[MAX_NUM_COMPONENT] = { srcPred0.bufs[COMPONENT_Y].stride,
                                           isChromaEnabled(pu.chromaFormat) ? srcPred0.bufs[COMPONENT_Cb].stride : 0,
                                           isChromaEnabled(pu.chromaFormat) ? srcPred0.bufs[COMPONENT_Cr].stride : 0};
      CPelUnitBuf srcSubPred0 = srcPred0.subBuf(UnitAreaRelative(pu, subPu));
      CPelUnitBuf srcSubPred1 = srcPred1.subBuf(UnitAreaRelative(pu, subPu));
      srcSubPred0.bufs[COMPONENT_Y].buf = srcPred0.bufs[COMPONENT_Y].buf + xStart + yStart * srcStride[COMPONENT_Y];
      if (isChromaEnabled(pu.chromaFormat))
      {
        srcSubPred0.bufs[COMPONENT_Cb].buf = srcPred0.bufs[COMPONENT_Cb].buf + (xStart >> scaleX) + ((yStart >> scaleY) * srcStride[COMPONENT_Cb]);
        srcSubPred0.bufs[COMPONENT_Cr].buf = srcPred0.bufs[COMPONENT_Cr].buf + (xStart >> scaleX) + ((yStart >> scaleY) * srcStride[COMPONENT_Cr]);
      }
      srcSubPred1.bufs[COMPONENT_Y].buf = srcPred1.bufs[COMPONENT_Y].buf + xStart + yStart * srcStride[COMPONENT_Y];
      if (isChromaEnabled(pu.chromaFormat))
      {
        srcSubPred1.bufs[COMPONENT_Cb].buf = srcPred1.bufs[COMPONENT_Cb].buf + (xStart >> scaleX) + ((yStart >> scaleY) * srcStride[COMPONENT_Cb]);
        srcSubPred1.bufs[COMPONENT_Cr].buf = srcPred1.bufs[COMPONENT_Cr].buf + (xStart >> scaleX) + ((yStart >> scaleY) * srcStride[COMPONENT_Cr]);
      }
      // generate the dst buf
      {
        const int bioSubPuOffset = (xStart >> BDOF_SUBPU_DIM_LOG2) + (yStart >> BDOF_SUBPU_DIM_LOG2) * BDOF_SUBPU_STRIDE;
        xWeightedAverage( true/*isBdofMvRefine*/, bioSubPuOffset/*bdofBlockOffset*/, subPu, srcSubPred0, srcSubPred1, subYuvPredBuf,
            slice.getSPS()->getBitDepths(), slice.clpRngs(), bioApplied, lumaOnly, chromaOnly, yuvPredTmp );
      }

      xStart = later - dx;
      x = puPos.x + xStart;
    }
    subPuIdx += dmvrSubPuStrideIncr;
  }
}
#endif

void InterPrediction::xPredInterBi(PredictionUnit &pu, PelUnitBuf &pcYuvPred, const bool luma, const bool chroma, PelUnitBuf *yuvPredTmp /*= NULL*/)
{
  const PPS   &pps = *pu.cs->pps;
  const Slice &slice = *pu.cs->slice;
#if MULTI_PASS_DMVR
  if ( pu.bdmvrRefine )
  {
    if (yuvPredTmp && (pu.lwidth() > DMVR_SUBCU_WIDTH || pu.lheight() > DMVR_SUBCU_HEIGHT)) // pre-do MC for yuvPredTmp to avoid MC for yuvPredTmp within the subblock loop
    {
      for (uint32_t refList = 0; refList < NUM_REF_PIC_LIST_01; refList++)
      {
        CHECK(pu.refIdx[refList] == NOT_VALID, "pu.refIdx[refList] shouldn't be NOT_VALID.")
          RefPicList eRefPicList = (refList ? REF_PIC_LIST_1 : REF_PIC_LIST_0);
        m_iRefListIdx = refList;

        PelUnitBuf pcMbBuf = (pu.chromaFormat == CHROMA_400 ?
          PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[refList][0], pcYuvPred.Y())) :
          PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[refList][0], pcYuvPred.Y()), PelBuf(m_acYuvPred[refList][1], pcYuvPred.Cb()), PelBuf(m_acYuvPred[refList][2], pcYuvPred.Cr())));

        xPredInterUni(pu, eRefPicList, pcMbBuf, true, false, luma, chroma);
      }
      CPelUnitBuf srcPred0 = (pu.chromaFormat == CHROMA_400 ?
        CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[0][0], pcYuvPred.Y())) :
        CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[0][0], pcYuvPred.Y()), PelBuf(m_acYuvPred[0][1], pcYuvPred.Cb()), PelBuf(m_acYuvPred[0][2], pcYuvPred.Cr())));
      CPelUnitBuf srcPred1 = (pu.chromaFormat == CHROMA_400 ?
        CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[1][0], pcYuvPred.Y())) :
        CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[1][0], pcYuvPred.Y()), PelBuf(m_acYuvPred[1][1], pcYuvPred.Cb()), PelBuf(m_acYuvPred[1][2], pcYuvPred.Cr())));
      const bool lumaOnly = luma && !chroma;
      const bool chromaOnly = !luma && chroma;
      if (pps.getWPBiPred() && slice.getSliceType() == B_SLICE && pu.cu->BcwIdx == BCW_DEFAULT)
      {
        xWeightedPredictionBi(pu, srcPred0, srcPred1, *yuvPredTmp, m_maxCompIDToPred, lumaOnly, chromaOnly);
      }
      else if (pps.getUseWP() && slice.getSliceType() == P_SLICE)
      {
        xWeightedPredictionUni(pu, srcPred0, REF_PIC_LIST_0, *yuvPredTmp, -1, m_maxCompIDToPred, lumaOnly, chromaOnly);
      }
      else
      {
        xWeightedAverage(false/*isBdofMvRefine*/, 0/*bioSubPuOffset*/, pu, srcPred0, srcPred1, *yuvPredTmp,
            slice.getSPS()->getBitDepths(), slice.clpRngs(), false, lumaOnly, chromaOnly);
      }

      yuvPredTmp = nullptr;
    }
    xPredInterBiBDMVR(pu, pcYuvPred, luma, chroma, yuvPredTmp);
    return;
  }
#endif
#if !INTER_RM_SIZE_CONSTRAINTS
#if ENABLE_OBMC
  if (pu.cu->isobmcMC == false)
#endif
  CHECK( !pu.cu->affine && pu.refIdx[0] >= 0 && pu.refIdx[1] >= 0 && ( pu.lwidth() + pu.lheight() == 12 ), "invalid 4x8/8x4 bi-predicted blocks" );
#endif
  int refIdx0 = pu.refIdx[REF_PIC_LIST_0];
  int refIdx1 = pu.refIdx[REF_PIC_LIST_1];

  const WPScalingParam *wp0 = pu.cs->slice->getWpScaling(REF_PIC_LIST_0, refIdx0);
  const WPScalingParam *wp1 = pu.cs->slice->getWpScaling(REF_PIC_LIST_1, refIdx1);

  bool bioApplied = false;
  if (pu.cs->sps->getBDOFEnabledFlag() && (!pu.cs->picHeader->getDisBdofFlag()))
  {
#if INTER_LIC
    if (pu.cu->affine || m_subPuMC || pu.cu->LICFlag)
#else
    if (pu.cu->affine || m_subPuMC)
#endif
    {
      bioApplied = false;
    }
    else
    {
      const bool biocheck0 =
        !((WPScalingParam::isWeighted(wp0) || WPScalingParam::isWeighted(wp1)) && slice.getSliceType() == B_SLICE);
      const bool biocheck1 = !(pps.getUseWP() && slice.getSliceType() == P_SLICE);      if (biocheck0
        && biocheck1
        && PU::isBiPredFromDifferentDirEqDistPoc(pu)
#if !BDOF_RM_CONSTRAINTS
        && (pu.Y().height >= 8)
        && (pu.Y().width >= 8)
        && ((pu.Y().height * pu.Y().width) >= 128)
#endif
       )
      {
        bioApplied = true;
      }
    }

    if (bioApplied && pu.ciipFlag)
    {
      bioApplied = false;
    }

    if (bioApplied && pu.cu->smvdMode)
    {
      bioApplied = false;
    }

    if (pu.cu->cs->sps->getUseBcw() && bioApplied && pu.cu->BcwIdx != BCW_DEFAULT)
    {
      bioApplied = false;
    }
  }
  if (pu.mmvdEncOptMode == 2 && pu.mmvdMergeFlag)
  {
    bioApplied = false;
  }
#if ENABLE_OBMC
  if (pu.cu->isobmcMC)
  {
    bioApplied = false;
  }
#endif
  bool dmvrApplied = false;
  dmvrApplied = (pu.mvRefine) && PU::checkDMVRCondition(pu);

  const bool isResamplingPossible = pu.cs->sps->getRprEnabledFlag();
  const bool refIsScaled = isResamplingPossible && ( ( refIdx0 < 0 ? false : pu.cu->slice->getRefPic( REF_PIC_LIST_0, refIdx0 )->isRefScaled( pu.cs->pps ) ) || ( refIdx1 < 0 ? false : pu.cu->slice->getRefPic( REF_PIC_LIST_1, refIdx1 )->isRefScaled( pu.cs->pps ) ) );
  dmvrApplied = dmvrApplied && !refIsScaled;
  bioApplied = bioApplied && !refIsScaled;

#if MULTI_PASS_DMVR
  if (yuvPredTmp && bioApplied && (pu.lwidth() > BDOF_SUBPU_DIM || pu.lheight() > BDOF_SUBPU_DIM)) // pre-do MC for yuvPredTmp to avoid MC for yuvPredTmp within the subblock loop
  {
    for (uint32_t refList = 0; refList < NUM_REF_PIC_LIST_01; refList++)
    {
      CHECK(pu.refIdx[refList] == NOT_VALID, "pu.refIdx[refList] shouldn't be NOT_VALID.")
        RefPicList eRefPicList = (refList ? REF_PIC_LIST_1 : REF_PIC_LIST_0);
      m_iRefListIdx = refList;

      PelUnitBuf pcMbBuf = (pu.chromaFormat == CHROMA_400 ?
        PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[refList][0], pcYuvPred.Y())) :
        PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[refList][0], pcYuvPred.Y()), PelBuf(m_acYuvPred[refList][1], pcYuvPred.Cb()), PelBuf(m_acYuvPred[refList][2], pcYuvPred.Cr())));

      xPredInterUni(pu, eRefPicList, pcMbBuf, true, false, luma, chroma);
    }
    CPelUnitBuf srcPred0 = (pu.chromaFormat == CHROMA_400 ?
      CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[0][0], pcYuvPred.Y())) :
      CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[0][0], pcYuvPred.Y()), PelBuf(m_acYuvPred[0][1], pcYuvPred.Cb()), PelBuf(m_acYuvPred[0][2], pcYuvPred.Cr())));
    CPelUnitBuf srcPred1 = (pu.chromaFormat == CHROMA_400 ?
      CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[1][0], pcYuvPred.Y())) :
      CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[1][0], pcYuvPred.Y()), PelBuf(m_acYuvPred[1][1], pcYuvPred.Cb()), PelBuf(m_acYuvPred[1][2], pcYuvPred.Cr())));
    const bool lumaOnly = luma && !chroma;
    const bool chromaOnly = !luma && chroma;
    if (pps.getWPBiPred() && slice.getSliceType() == B_SLICE && pu.cu->BcwIdx == BCW_DEFAULT)
    {
      xWeightedPredictionBi(pu, srcPred0, srcPred1, *yuvPredTmp, m_maxCompIDToPred, lumaOnly, chromaOnly);
    }
    else if (pps.getUseWP() && slice.getSliceType() == P_SLICE)
    {
      xWeightedPredictionUni(pu, srcPred0, REF_PIC_LIST_0, *yuvPredTmp, -1, m_maxCompIDToPred, lumaOnly, chromaOnly);
    }
    else
    {
      xWeightedAverage(false, 0/*bioSubPuOffset*/, pu, srcPred0, srcPred1, *yuvPredTmp, slice.getSPS()->getBitDepths(), slice.clpRngs(), false, lumaOnly, chromaOnly);
    }

    yuvPredTmp = nullptr;
  }
#endif

  for (uint32_t refList = 0; refList < NUM_REF_PIC_LIST_01; refList++)
  {
    if( pu.refIdx[refList] < 0)
    {
      continue;
    }

    RefPicList eRefPicList = (refList ? REF_PIC_LIST_1 : REF_PIC_LIST_0);

    CHECK(CU::isIBC(*pu.cu) && eRefPicList != REF_PIC_LIST_0, "Invalid interdir for ibc mode");
    CHECK(CU::isIBC(*pu.cu) && pu.refIdx[refList] != MAX_NUM_REF, "Invalid reference index for ibc mode");
    CHECK((CU::isInter(*pu.cu) && pu.refIdx[refList] >= slice.getNumRefIdx(eRefPicList)), "Invalid reference index");
    m_iRefListIdx = refList;

    PelUnitBuf pcMbBuf = ( pu.chromaFormat == CHROMA_400 ?
                           PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[refList][0], pcYuvPred.Y())) :
                           PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[refList][0], pcYuvPred.Y()), PelBuf(m_acYuvPred[refList][1], pcYuvPred.Cb()), PelBuf(m_acYuvPred[refList][2], pcYuvPred.Cr())) );

    if (pu.refIdx[0] >= 0 && pu.refIdx[1] >= 0)
    {
      if (dmvrApplied)
      {
        if (yuvPredTmp)
        {
          xPredInterUni(pu, eRefPicList, pcMbBuf, true, false, luma, chroma);
        }
        continue;
      }
#if MULTI_PASS_DMVR
      bool isBdofMvRefineSkipChromaMC = (yuvPredTmp == NULL);
      xPredInterUni(pu, eRefPicList, pcMbBuf, true, bioApplied, luma, chroma, isBdofMvRefineSkipChromaMC);
#else
      xPredInterUni(pu, eRefPicList, pcMbBuf, true, bioApplied, luma, chroma);
#endif
    }
    else
    {
      if( ( (pps.getUseWP() && slice.getSliceType() == P_SLICE) || (pps.getWPBiPred() && slice.getSliceType() == B_SLICE) )
#if INTER_LIC
        && !pu.cu->LICFlag
#endif
        )
      {
        xPredInterUni(pu, eRefPicList, pcMbBuf, true, bioApplied, luma, chroma);
      }
      else
      {
        xPredInterUni(pu, eRefPicList, pcMbBuf, pu.cu->geoFlag, bioApplied, luma, chroma);
      }
    }
  }
  CPelUnitBuf srcPred0 = ( pu.chromaFormat == CHROMA_400 ?
                           CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[0][0], pcYuvPred.Y())) :
                           CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[0][0], pcYuvPred.Y()), PelBuf(m_acYuvPred[0][1], pcYuvPred.Cb()), PelBuf(m_acYuvPred[0][2], pcYuvPred.Cr())) );
  CPelUnitBuf srcPred1 = ( pu.chromaFormat == CHROMA_400 ?
                           CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[1][0], pcYuvPred.Y())) :
                           CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[1][0], pcYuvPred.Y()), PelBuf(m_acYuvPred[1][1], pcYuvPred.Cb()), PelBuf(m_acYuvPred[1][2], pcYuvPred.Cr())) );
  const bool lumaOnly   = luma && !chroma;
  const bool chromaOnly = !luma && chroma;
  if( !pu.cu->geoFlag && (!dmvrApplied) && (!bioApplied) && pps.getWPBiPred() && slice.getSliceType() == B_SLICE && pu.cu->BcwIdx == BCW_DEFAULT)
  {
    xWeightedPredictionBi( pu, srcPred0, srcPred1, pcYuvPred, m_maxCompIDToPred, lumaOnly, chromaOnly );
    if (yuvPredTmp)
    {
      yuvPredTmp->copyFrom(pcYuvPred);
    }
  }
  else if( !pu.cu->geoFlag && pps.getUseWP() && slice.getSliceType() == P_SLICE )
  {
    xWeightedPredictionUni( pu, srcPred0, REF_PIC_LIST_0, pcYuvPred, -1, m_maxCompIDToPred, lumaOnly, chromaOnly );
    if (yuvPredTmp)
    {
      yuvPredTmp->copyFrom(pcYuvPred);
    }
  }
  else
  {
    if (dmvrApplied)
    {
      if (yuvPredTmp)
      {
        yuvPredTmp->addAvg(srcPred0, srcPred1, slice.clpRngs(), false);
      }
      xProcessDMVR(pu, pcYuvPred, slice.clpRngs(), bioApplied);
    }
    else
    {
#if MULTI_PASS_DMVR
      xWeightedAverage( true/*isBdofMvRefine*/, 0/*bioSubPuOffset*/, pu, srcPred0, srcPred1, pcYuvPred,
          slice.getSPS()->getBitDepths(), slice.clpRngs(), bioApplied, lumaOnly, chromaOnly, yuvPredTmp );
#else
      xWeightedAverage( pu, srcPred0, srcPred1, pcYuvPred,
          slice.getSPS()->getBitDepths(), slice.clpRngs(), bioApplied, lumaOnly, chromaOnly, yuvPredTmp );
#endif
    }
  }
}

void InterPrediction::xPredInterBlk ( const ComponentID& compID, const PredictionUnit& pu, const Picture* refPic, const Mv& _mv, PelUnitBuf& dstPic, const bool& bi, const ClpRng& clpRng
                                     , const bool& bioApplied
                                     , bool isIBC
                                     , const std::pair<int, int> scalingRatio
                                     , SizeType dmvrWidth
                                     , SizeType dmvrHeight
                                     , bool bilinearMC
                                     , Pel *srcPadBuf
                                     , int32_t srcPadStride
#if JVET_W0090_ARMC_TM
                                     , bool isAML
#if INTER_LIC
                                     , bool doLic
                                     , Mv   mvCurr
#endif
#endif
                                    )
{
#if JVET_W0090_ARMC_TM
#if  JVET_Y0067_ENHANCED_MMVD_MVD_SIGN_PRED
  int filterIdx = isAML && pu.mmvdMergeFlag ? 1 : 0;
#else
  int filterIdx = 0;
#endif
  if (bilinearMC)
  {
    filterIdx = 1;
  }
#endif
  JVET_J0090_SET_REF_PICTURE( refPic, compID );
  const ChromaFormat  chFmt = pu.chromaFormat;
  const bool          rndRes = !bi;

  int shiftHor = MV_FRACTIONAL_BITS_INTERNAL + ::getComponentScaleX(compID, chFmt);
  int shiftVer = MV_FRACTIONAL_BITS_INTERNAL + ::getComponentScaleY(compID, chFmt);

  bool  wrapRef = false;
  Mv    mv(_mv);
  if( !isIBC && refPic->isWrapAroundEnabled( pu.cs->pps ) )
  {
    wrapRef = wrapClipMv( mv, pu.blocks[0].pos(), pu.blocks[0].size(), pu.cs->sps, pu.cs->pps );
  }

  bool useAltHpelIf = pu.cu->imv == IMV_HPEL;

  const bool isResamplingPossible = pu.cs->sps->getRprEnabledFlag();

  if( isResamplingPossible && !isIBC && xPredInterBlkRPR( scalingRatio, *pu.cs->pps, CompArea( compID, chFmt, pu.blocks[compID], Size( dstPic.bufs[compID].width, dstPic.bufs[compID].height ) ), refPic, mv, dstPic.bufs[compID].buf, dstPic.bufs[compID].stride, bi, wrapRef, clpRng, 0, useAltHpelIf ) )
  {
    CHECK( bilinearMC, "DMVR should be disabled with RPR" );
    CHECK( bioApplied, "BDOF should be disabled with RPR" );
  }
  else
  {
    int xFrac = mv.hor & ((1 << shiftHor) - 1);
    int yFrac = mv.ver & ((1 << shiftVer) - 1);
    if (isIBC)
    {
      xFrac = yFrac = 0;
      JVET_J0090_SET_CACHE_ENABLE(false);
    }

    PelBuf & dstBuf = dstPic.bufs[compID];
    unsigned width  = dstBuf.width;
    unsigned height = dstBuf.height;

    CPelBuf refBuf;
    {
      Position offset = pu.blocks[compID].pos().offset(mv.getHor() >> shiftHor, mv.getVer() >> shiftVer);
#if MULTI_PASS_DMVR || SAMPLE_BASED_BDOF
      int refBufExtendSize = 0;
      if (bioApplied && compID == COMPONENT_Y)
      {
        refBufExtendSize = ((BIO_EXTEND_SIZE + 1) << 1);  // trick to use SIMD filter
        offset.x -= (BIO_EXTEND_SIZE + 1);
        offset.y -= (BIO_EXTEND_SIZE + 1);
      }
      if (dmvrWidth)
      {
        refBuf = refPic->getRecoBuf(CompArea(compID, chFmt, offset, Size(dmvrWidth + refBufExtendSize, dmvrHeight + refBufExtendSize)), wrapRef);
      }
      else
      {
        refBuf = refPic->getRecoBuf( CompArea( compID, chFmt, offset, Size(pu.blocks[compID].width + refBufExtendSize, pu.blocks[compID].height + refBufExtendSize) ), wrapRef);
      }
#else
      if (dmvrWidth)
      {
        refBuf = refPic->getRecoBuf(CompArea(compID, chFmt, offset, Size(dmvrWidth, dmvrHeight)), wrapRef);
      }
      else
      {
        refBuf = refPic->getRecoBuf(CompArea(compID, chFmt, offset, pu.blocks[compID].size()), wrapRef);
      }
#endif
    }

#if MULTI_PASS_DMVR || SAMPLE_BASED_BDOF
    if (NULL != srcPadBuf && bioApplied == false)
#else
    if (NULL != srcPadBuf)
#endif
    {
      refBuf.buf    = srcPadBuf;
      refBuf.stride = srcPadStride;
    }
    if (dmvrWidth)
    {
      width  = dmvrWidth;
      height = dmvrHeight;
    }
    // backup data
    int  backupWidth        = width;
    int  backupHeight       = height;
    Pel *backupDstBufPtr    = dstBuf.buf;
    int  backupDstBufStride = dstBuf.stride;

    if (bioApplied && compID == COMPONENT_Y)
    {
#if MULTI_PASS_DMVR || SAMPLE_BASED_BDOF
      backupWidth += ((BIO_EXTEND_SIZE + 1) << 1);
      backupHeight += ((BIO_EXTEND_SIZE + 1) << 1);
      dstBuf.stride = backupWidth;
      dstBuf.buf = m_filteredBlockTmp[2 + m_iRefListIdx][compID];
#else
      width  = width + 2 * BIO_EXTEND_SIZE + 2;
      height = height + 2 * BIO_EXTEND_SIZE + 2;

      // change MC output
      dstBuf.stride = width;
      dstBuf.buf = m_filteredBlockTmp[2 + m_iRefListIdx][compID] + 2 * dstBuf.stride + 2;
#endif
    }

    if( yFrac == 0 )
    {
#if JVET_W0090_ARMC_TM
      m_if.filterHor( compID, (Pel*)refBuf.buf, refBuf.stride, dstBuf.buf, dstBuf.stride, backupWidth, backupHeight, xFrac, rndRes, chFmt, clpRng, filterIdx, bilinearMC, useAltHpelIf );
#else
      m_if.filterHor( compID, ( Pel* ) refBuf.buf, refBuf.stride, dstBuf.buf, dstBuf.stride, backupWidth, backupHeight, xFrac, rndRes, chFmt, clpRng, bilinearMC, bilinearMC, useAltHpelIf);
#endif
    }
    else if( xFrac == 0 )
    {
#if JVET_W0090_ARMC_TM
      m_if.filterVer( compID, (Pel*)refBuf.buf, refBuf.stride, dstBuf.buf, dstBuf.stride, backupWidth, backupHeight, yFrac, true, rndRes, chFmt, clpRng, filterIdx, bilinearMC, useAltHpelIf );
#else
      m_if.filterVer( compID, ( Pel* ) refBuf.buf, refBuf.stride, dstBuf.buf, dstBuf.stride, backupWidth, backupHeight, yFrac, true, rndRes, chFmt, clpRng, bilinearMC, bilinearMC, useAltHpelIf);
#endif
    }
    else
    {
#if SIMD_4x4_12 && defined(TARGET_SIMD_X86)
      //use 4x4 if possible
      if( compID == COMPONENT_Y
          && backupWidth == 4
          && backupHeight == 4
          && !( (xFrac == 8 || yFrac == 8) && useAltHpelIf ) //to avoid (8,12 or 12,8 passes)
          && dmvrWidth == 0                                  //seems to conflict with DMVR, not sure //kolya
        )
        m_if.filter4x4(clpRng,  (Pel*)refBuf.buf, refBuf.stride,  dstBuf.buf, dstBuf.stride, xFrac, yFrac, rndRes);
      else
      {
#endif
        PelBuf tmpBuf = dmvrWidth ? PelBuf( m_filteredBlockTmp[0][compID], Size( dmvrWidth, dmvrHeight ) ) : PelBuf( m_filteredBlockTmp[0][compID], pu.blocks[compID] );
        if( dmvrWidth == 0 )
        {
          tmpBuf.stride = dstBuf.stride;
        }
#if MULTI_PASS_DMVR || SAMPLE_BASED_BDOF
        if (bioApplied && compID == COMPONENT_Y)
        {
          tmpBuf = PelBuf(m_filteredBlockTmp[0][compID], Size(backupWidth, backupWidth));
          tmpBuf.stride = dstBuf.stride;
        }
#endif
#if IF_12TAP
        int vFilterSize = isLuma( compID ) ? NTAPS_LUMA( 0 ) : NTAPS_CHROMA;
#else
        int vFilterSize = isLuma( compID ) ? NTAPS_LUMA : NTAPS_CHROMA;
#endif
#if JVET_W0090_ARMC_TM
        if (isLuma(compID) && filterIdx == 1)
#else
        if (bilinearMC)
#endif
        {
          vFilterSize = NTAPS_BILINEAR;
        }
#if JVET_W0090_ARMC_TM
        m_if.filterHor(compID, (Pel*)refBuf.buf - ((vFilterSize >> 1) - 1) * refBuf.stride, refBuf.stride, tmpBuf.buf, tmpBuf.stride, backupWidth, backupHeight + vFilterSize - 1, xFrac, false, chFmt, clpRng, filterIdx, bilinearMC, useAltHpelIf);
        JVET_J0090_SET_CACHE_ENABLE(false);
        m_if.filterVer(compID, (Pel*)tmpBuf.buf + ((vFilterSize >> 1) - 1) * tmpBuf.stride, tmpBuf.stride, dstBuf.buf, dstBuf.stride, backupWidth, backupHeight, yFrac, false, rndRes, chFmt, clpRng, filterIdx, bilinearMC, useAltHpelIf);
#else
        m_if.filterHor( compID, ( Pel* ) refBuf.buf - ( ( vFilterSize >> 1 ) - 1 ) * refBuf.stride, refBuf.stride, tmpBuf.buf, tmpBuf.stride, backupWidth, backupHeight + vFilterSize - 1, xFrac, false, chFmt, clpRng, bilinearMC, bilinearMC, useAltHpelIf);
        JVET_J0090_SET_CACHE_ENABLE( false );
        m_if.filterVer( compID, ( Pel* ) tmpBuf.buf + ( ( vFilterSize >> 1 ) - 1 ) * tmpBuf.stride, tmpBuf.stride, dstBuf.buf, dstBuf.stride, backupWidth, backupHeight, yFrac, false, rndRes, chFmt, clpRng, bilinearMC, bilinearMC, useAltHpelIf);
#endif
#if SIMD_4x4_12 && defined(TARGET_SIMD_X86)
      }
#endif
    }
    JVET_J0090_SET_CACHE_ENABLE(
      ( srcPadStride == 0 )
      && ( bioApplied
           == false ) );   // Enabled only in non-DMVR-non-BDOF process, In DMVR process, srcPadStride is always non-zero

    if( bioApplied && compID == COMPONENT_Y )
    {
#if !MULTI_PASS_DMVR && !SAMPLE_BASED_BDOF
#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT
      const int shift = IF_INTERNAL_FRAC_BITS( clpRng.bd );
#else
      const int shift = std::max<int>( 2, ( IF_INTERNAL_PREC - clpRng.bd ) );
#endif
      int xOffset = ( xFrac < 8 ) ? 1 : 0;
      int yOffset = ( yFrac < 8 ) ? 1 : 0;
      const Pel* refPel = refBuf.buf - yOffset * refBuf.stride - xOffset;
      Pel* dstPel = m_filteredBlockTmp[2 + m_iRefListIdx][compID] + dstBuf.stride + 1;
      for( int w = 0; w < ( width - 2 * BIO_EXTEND_SIZE ); w++ )
      {
        Pel val = leftShift_round( refPel[w], shift );
        dstPel[w] = val - ( Pel ) IF_INTERNAL_OFFS;
      }

      refPel = refBuf.buf + ( 1 - yOffset )*refBuf.stride - xOffset;
      dstPel = m_filteredBlockTmp[2 + m_iRefListIdx][compID] + 2 * dstBuf.stride + 1;
      for( int h = 0; h < ( height - 2 * BIO_EXTEND_SIZE - 2 ); h++ )
      {
        Pel val = leftShift_round( refPel[0], shift );
        dstPel[0] = val - ( Pel ) IF_INTERNAL_OFFS;

        val = leftShift_round( refPel[width - 3], shift );
        dstPel[width - 3] = val - ( Pel ) IF_INTERNAL_OFFS;

        refPel += refBuf.stride;
        dstPel += dstBuf.stride;
      }

      refPel = refBuf.buf + ( height - 2 * BIO_EXTEND_SIZE - 2 + 1 - yOffset )*refBuf.stride - xOffset;
      dstPel = m_filteredBlockTmp[2 + m_iRefListIdx][compID] + ( height - 2 * BIO_EXTEND_SIZE )*dstBuf.stride + 1;
      for( int w = 0; w < ( width - 2 * BIO_EXTEND_SIZE ); w++ )
      {
        Pel val = leftShift_round( refPel[w], shift );
        dstPel[w] = val - ( Pel ) IF_INTERNAL_OFFS;
      }

      // restore data
      width = backupWidth;
      height = backupHeight;
#endif
      dstBuf.buf = backupDstBufPtr;
      dstBuf.stride = backupDstBufStride;
    }
#if RPR_ENABLE
  }
#endif
#if INTER_LIC
#if RPR_ENABLE
  PelBuf& dstBuf = dstPic.bufs[compID];
#endif

    if( m_storeBeforeLIC )
    {
      m_predictionBeforeLIC.bufs[compID].copyFrom( dstBuf );
    }

#if JVET_W0090_ARMC_TM
    if (pu.cu->LICFlag && (!pu.ciipFlag || doLic))
#else
    if( pu.cu->LICFlag && !pu.ciipFlag )
#endif
    {
      CHECK( pu.cu->geoFlag, "Geometric mode is not used with LIC" );
      CHECK( CU::isIBC( *pu.cu ), "IBC mode is not used with LIC" );
      CHECK( pu.interDir == 3, "Bi-prediction is not used with LIC" );
#if !JVET_W0090_ARMC_TM
      CHECK( pu.ciipFlag, "CIIP mode is not used with LIC" );
#endif
#if RPR_ENABLE
      if (PU::checkRprLicCondition(pu))
      {
#endif
#if JVET_W0090_ARMC_TM
      if( isAML )
      {
        xLocalIlluComp(pu, compID, *refPic, mvCurr, bi, dstBuf);
      }
      else
#endif
      xLocalIlluComp( pu, compID, *refPic, _mv, bi, dstBuf );
#if RPR_ENABLE
      }
#endif
    }
#endif
#if !RPR_ENABLE
  }
#endif
}

#if !AFFINE_RM_CONSTRAINTS_AND_OPT
bool InterPrediction::isSubblockVectorSpreadOverLimit( int a, int b, int c, int d, int predType )
{
  int s4 = ( 4 << 11 );
  int filterTap = 6;

  if ( predType == 3 )
  {
    int refBlkWidth  = std::max( std::max( 0, 4 * a + s4 ), std::max( 4 * c, 4 * a + 4 * c + s4 ) ) - std::min( std::min( 0, 4 * a + s4 ), std::min( 4 * c, 4 * a + 4 * c + s4 ) );
    int refBlkHeight = std::max( std::max( 0, 4 * b ), std::max( 4 * d + s4, 4 * b + 4 * d + s4 ) ) - std::min( std::min( 0, 4 * b ), std::min( 4 * d + s4, 4 * b + 4 * d + s4 ) );
    refBlkWidth  = ( refBlkWidth >> 11 ) + filterTap + 3;
    refBlkHeight = ( refBlkHeight >> 11 ) + filterTap + 3;

    if ( refBlkWidth * refBlkHeight > ( filterTap + 9 ) * ( filterTap + 9 ) )
    {
      return true;
    }
  }
  else
  {
    int refBlkWidth  = std::max( 0, 4 * a + s4 ) - std::min( 0, 4 * a + s4 );
    int refBlkHeight = std::max( 0, 4 * b ) - std::min( 0, 4 * b );
    refBlkWidth  = ( refBlkWidth >> 11 ) + filterTap + 3;
    refBlkHeight = ( refBlkHeight >> 11 ) + filterTap + 3;
    if ( refBlkWidth * refBlkHeight > ( filterTap + 9 ) * ( filterTap + 5 ) )
    {
      return true;
    }

    refBlkWidth  = std::max( 0, 4 * c ) - std::min( 0, 4 * c );
    refBlkHeight = std::max( 0, 4 * d + s4 ) - std::min( 0, 4 * d + s4 );
    refBlkWidth  = ( refBlkWidth >> 11 ) + filterTap + 3;
    refBlkHeight = ( refBlkHeight >> 11 ) + filterTap + 3;
    if ( refBlkWidth * refBlkHeight > ( filterTap + 5 ) * ( filterTap + 9 ) )
    {
      return true;
    }
  }
  return false;
}
#endif

#if AFFINE_ENC_OPT
void InterPrediction::xPredAffineBlk(const ComponentID& compID, const PredictionUnit& pu, const Picture* refPic, const Mv* _mv, PelUnitBuf& dstPic, const bool& bi, const ClpRng& clpRng, const bool genChromaMv, const std::pair<int, int> scalingRatio, const bool calGradient)
#else
void InterPrediction::xPredAffineBlk(const ComponentID &compID, const PredictionUnit &pu, const Picture *refPic, const Mv *_mv, PelUnitBuf &dstPic, const bool &bi, const ClpRng &clpRng, bool genChromaMv, const std::pair<int, int> scalingRatio)
#endif
{

  JVET_J0090_SET_REF_PICTURE( refPic, compID );
  const ChromaFormat chFmt = pu.chromaFormat;
  int iScaleX = ::getComponentScaleX( compID, chFmt );
  int iScaleY = ::getComponentScaleY( compID, chFmt );

  Mv mvLT =_mv[0];
  Mv mvRT =_mv[1];
  Mv mvLB =_mv[2];
#if INTER_LIC
  Pel* refLeftTemplate = m_pcLICRefLeftTemplate;
  Pel* refAboveTemplate = m_pcLICRefAboveTemplate;
  Pel* recLeftTemplate = m_pcLICRecLeftTemplate;
  Pel* recAboveTemplate = m_pcLICRecAboveTemplate;
  int numTemplate[2] = { 0 , 0 }; // 0:Above, 1:Left
#endif

  // get affine sub-block width and height
  const int width  = pu.Y().width;
  const int height = pu.Y().height;
  int blockWidth = AFFINE_MIN_BLOCK_SIZE;
  int blockHeight = AFFINE_MIN_BLOCK_SIZE;

  CHECK(blockWidth  > (width >> iScaleX ), "Sub Block width  > Block width");
  CHECK(blockHeight > (height >> iScaleY), "Sub Block height > Block height");
#if !AFFINE_RM_CONSTRAINTS_AND_OPT
  const int MVBUFFER_SIZE = MAX_CU_SIZE / MIN_PU_SIZE;
#endif

  const int cxWidth  = width  >> iScaleX;
  const int cxHeight = height >> iScaleY;
#if !AFFINE_RM_CONSTRAINTS_AND_OPT
  const int iHalfBW  = blockWidth  >> 1;
  const int iHalfBH  = blockHeight >> 1;
#endif

  const int iBit = MAX_CU_DEPTH;
  int iDMvHorX, iDMvHorY, iDMvVerX, iDMvVerY;
#if AFFINE_RM_CONSTRAINTS_AND_OPT
  iDMvHorX = (mvRT - mvLT).getHor() << (iBit - floorLog2(width));
  iDMvHorY = (mvRT - mvLT).getVer() << (iBit - floorLog2(width));
  if ( pu.cu->affineType == AFFINEMODEL_6PARAM )
  {
    iDMvVerX = (mvLB - mvLT).getHor() << (iBit - floorLog2(height));
    iDMvVerY = (mvLB - mvLT).getVer() << (iBit - floorLog2(height));
  }
#else
  iDMvHorX = (mvRT - mvLT).getHor() << (iBit - floorLog2(cxWidth));
  iDMvHorY = (mvRT - mvLT).getVer() << (iBit - floorLog2(cxWidth));
  if ( pu.cu->affineType == AFFINEMODEL_6PARAM )
  {
    iDMvVerX = (mvLB - mvLT).getHor() << (iBit - floorLog2(cxHeight));
    iDMvVerY = (mvLB - mvLT).getVer() << (iBit - floorLog2(cxHeight));
  }
#endif
  else
  {
    iDMvVerX = -iDMvHorY;
    iDMvVerY = iDMvHorX;
  }

  int iMvScaleHor = mvLT.getHor() << iBit;
  int iMvScaleVer = mvLT.getVer() << iBit;
  const SPS &sps    = *pu.cs->sps;

#if IF_12TAP
  const int vFilterSize = isLuma(compID) ? NTAPS_LUMA( 0 ) : NTAPS_CHROMA;
#else
  const int vFilterSize = isLuma(compID) ? NTAPS_LUMA : NTAPS_CHROMA;
#endif

  const int shift = iBit - 4 + MV_FRACTIONAL_BITS_INTERNAL;
  bool      wrapRef = false;
#if !AFFINE_RM_CONSTRAINTS_AND_OPT
  const bool subblkMVSpreadOverLimit = isSubblockVectorSpreadOverLimit( iDMvHorX, iDMvHorY, iDMvVerX, iDMvVerY, pu.interDir );
#endif

  bool enablePROF = (sps.getUsePROF()) && (!m_skipPROF) && (compID == COMPONENT_Y);
  enablePROF &= (!pu.cs->picHeader->getDisProfFlag());
  enablePROF &= !((pu.cu->affineType == AFFINEMODEL_6PARAM && _mv[0] == _mv[1] && _mv[0] == _mv[2]) || (pu.cu->affineType == AFFINEMODEL_4PARAM && _mv[0] == _mv[1]));
#if !AFFINE_RM_CONSTRAINTS_AND_OPT
  enablePROF &= !subblkMVSpreadOverLimit;
#endif
  const int profThres = 1 << (iBit + (m_isBi ? 1 : 0));
  enablePROF &= !m_encOnly || pu.cu->slice->getCheckLDC() || iDMvHorX > profThres || iDMvHorY > profThres || iDMvVerX > profThres || iDMvVerY > profThres || iDMvHorX < -profThres || iDMvHorY < -profThres || iDMvVerX < -profThres || iDMvVerY < -profThres;
  enablePROF &= (refPic->isRefScaled( pu.cs->pps ) == false);
#if AFFINE_MMVD
  enablePROF &= ((pu.mmvdEncOptMode & 3) != 3); // encoder-only
#endif

#if AFFINE_ENC_OPT
  bool isLast = (enablePROF || calGradient) ? false : !bi;
#else
  bool isLast = enablePROF ? false : !bi;
#endif

#if AFFINE_RM_CONSTRAINTS_AND_OPT
  const int cuExtW = width + PROF_BORDER_EXT_W * 2;
  const int cuExtH = height + PROF_BORDER_EXT_H * 2;
#else
  const int cuExtW = AFFINE_MIN_BLOCK_SIZE + PROF_BORDER_EXT_W * 2;
  const int cuExtH = AFFINE_MIN_BLOCK_SIZE + PROF_BORDER_EXT_H * 2;
#endif

  PelBuf gradXExt(m_gradBuf[0], cuExtW, cuExtH);
  PelBuf gradYExt(m_gradBuf[1], cuExtW, cuExtH);
#if IF_12TAP
  const int MAX_FILTER_SIZE = std::max<int>(NTAPS_LUMA(0), NTAPS_CHROMA);
#else
  const int MAX_FILTER_SIZE = std::max<int>(NTAPS_LUMA, NTAPS_CHROMA);
#endif
#if AFFINE_RM_CONSTRAINTS_AND_OPT
  const int dstExtW = ((width + PROF_BORDER_EXT_W * 2 + 7) >> 3) << 3;
  const int dstExtH = cuExtH;
  PelBuf dstExtBuf(m_filteredBlockTmp[1][compID], cuExtW, cuExtH);
#else
  const int dstExtW = ((blockWidth + PROF_BORDER_EXT_W * 2 + 7) >> 3) << 3;
  const int dstExtH = blockHeight + PROF_BORDER_EXT_H * 2;
  PelBuf dstExtBuf(m_filteredBlockTmp[1][compID], dstExtW, dstExtH);
#endif

  const int refExtH = dstExtH + MAX_FILTER_SIZE - 1;
  PelBuf tmpBuf = PelBuf(m_filteredBlockTmp[0][compID], dstExtW, refExtH);

  PelBuf &dstBuf = dstPic.bufs[compID];

  int *dMvScaleHor = m_dMvBuf[m_iRefListIdx];
  int *dMvScaleVer = m_dMvBuf[m_iRefListIdx] + 16;

  if (enablePROF)
  {
    int* dMvH = dMvScaleHor;
    int* dMvV = dMvScaleVer;
    int quadHorX = iDMvHorX << 2;
    int quadHorY = iDMvHorY << 2;
    int quadVerX = iDMvVerX << 2;
    int quadVerY = iDMvVerY << 2;

    dMvH[0] = ((iDMvHorX + iDMvVerX) << 1) - ((quadHorX + quadVerX) << 1);
    dMvV[0] = ((iDMvHorY + iDMvVerY) << 1) - ((quadHorY + quadVerY) << 1);

    for (int w = 1; w < blockWidth; w++)
    {
      dMvH[w] = dMvH[w - 1] + quadHorX;
      dMvV[w] = dMvV[w - 1] + quadHorY;
    }

    dMvH += blockWidth;
    dMvV += blockWidth;
    for (int h = 1; h < blockHeight; h++)
    {
      for (int w = 0; w < blockWidth; w++)
      {
        dMvH[w] = dMvH[w - blockWidth] + quadVerX;
        dMvV[w] = dMvV[w - blockWidth] + quadVerY;
      }
      dMvH += blockWidth;
      dMvV += blockWidth;
    }

#if CTU_256
    const int mvShift = MAX_CU_DEPTH + 1;
    const int dmvLimit = (1 << 5) - 1; // this means the maximum magnitude of dmv is half pel. The target MV precision is 1/64, thus the bit shift is 5
#else
    const int mvShift  = 8;
    const int dmvLimit = ( 1 << 5 ) - 1;
#endif

    if (!g_pelBufOP.roundIntVector)
    {
      for (int idx = 0; idx < blockWidth * blockHeight; idx++)
      {
        roundAffineMv(dMvScaleHor[idx], dMvScaleVer[idx], mvShift);
        dMvScaleHor[idx] = Clip3( -dmvLimit, dmvLimit, dMvScaleHor[idx] );
        dMvScaleVer[idx] = Clip3( -dmvLimit, dmvLimit, dMvScaleVer[idx] );
      }
    }
    else
    {
      int sz = blockWidth * blockHeight;
      g_pelBufOP.roundIntVector(dMvScaleHor, sz, mvShift, dmvLimit);
      g_pelBufOP.roundIntVector(dMvScaleVer, sz, mvShift, dmvLimit);
    }
  }
#if AFFINE_ENC_OPT
  else if (calGradient)
  {
    ::memset(m_dMvBuf, 0, sizeof(m_dMvBuf));
  }
#endif
#if AFFINE_RM_CONSTRAINTS_AND_OPT
  if (compID == COMPONENT_Y)
  {
    if (iDMvHorX == 0 && iDMvHorY == 0)
      blockWidth = width;
    else
    {
      int maxDmv = std::max(abs(iDMvHorX), abs(iDMvHorY)) * blockWidth;
      int TH = 1 << (iBit - 1); // Half pel
      while (maxDmv < TH && blockWidth < width)
      {
        blockWidth <<= 1;
        maxDmv <<= 1;
      }
    }
    if (iDMvVerX == 0 && iDMvVerY == 0)
      blockHeight = height;
    else
    {
      int maxDmv = std::max(abs(iDMvVerX), abs(iDMvVerY)) * blockHeight;
      int TH = 1 << (iBit - 1); // Half pel
      while (maxDmv < TH && blockHeight < height)
      {
        blockHeight <<= 1;
        maxDmv <<= 1;
      }
    }
  }
  CMotionBuf mb = pu.getMotionBuf();
  const MotionInfo  *miLine = mb.buf;
  const MotionInfo  *miLine2 = mb.buf + iScaleX + iScaleY * mb.stride;
  int stride = ((blockHeight << iScaleY) >> 2) * mb.stride;

  int iMvScaleTmpHor0 = iMvScaleHor + ((iDMvHorX * blockWidth + iDMvVerX * blockHeight) >> 1);
  int iMvScaleTmpVer0 = iMvScaleVer + ((iDMvHorY * blockWidth + iDMvVerY * blockHeight) >> 1);
#endif
#if !AFFINE_RM_CONSTRAINTS_AND_OPT
  int scaleXLuma = ::getComponentScaleX(COMPONENT_Y, chFmt);
  int scaleYLuma = ::getComponentScaleY(COMPONENT_Y, chFmt);

  if (genChromaMv && pu.chromaFormat != CHROMA_444)
  {
    CHECK(compID == COMPONENT_Y, "Chroma only subblock MV calculation should not apply to Luma");
    int lumaBlockWidth  = AFFINE_MIN_BLOCK_SIZE;
    int lumaBlockHeight = AFFINE_MIN_BLOCK_SIZE;

    CHECK(lumaBlockWidth > (width >> scaleXLuma), "Sub Block width  > Block width");
    CHECK(lumaBlockHeight > (height >> scaleYLuma), "Sub Block height > Block height");

    const int cxWidthLuma  = width >> scaleXLuma;
    const int cxHeightLuma = height >> scaleYLuma;
#if !AFFINE_RM_CONSTRAINTS_AND_OPT
    const int halfBWLuma  = lumaBlockWidth >> 1;
    const int halfBHLuma  = lumaBlockHeight >> 1;

    int dMvHorXLuma, dMvHorYLuma, dMvVerXLuma, dMvVerYLuma;
    dMvHorXLuma = (mvRT - mvLT).getHor() << (iBit - floorLog2(cxWidthLuma));
    dMvHorYLuma = (mvRT - mvLT).getVer() << (iBit - floorLog2(cxWidthLuma));
    if (pu.cu->affineType == AFFINEMODEL_6PARAM)
    {
      dMvVerXLuma = (mvLB - mvLT).getHor() << (iBit - floorLog2(cxHeightLuma));
      dMvVerYLuma = (mvLB - mvLT).getVer() << (iBit - floorLog2(cxHeightLuma));
    }
    else
    {
      dMvVerXLuma = -dMvHorYLuma;
      dMvVerYLuma = dMvHorXLuma;
    }
#endif

#if !AFFINE_RM_CONSTRAINTS_AND_OPT
    const bool subblkMVSpreadOverLimitLuma = isSubblockVectorSpreadOverLimit(dMvHorXLuma, dMvHorYLuma, dMvVerXLuma, dMvVerYLuma, pu.interDir);
#endif

    // get luma MV block by block
    for (int h = 0; h < cxHeightLuma; h += lumaBlockHeight)
    {
      for (int w = 0; w < cxWidthLuma; w += lumaBlockWidth)
      {
        int mvScaleTmpHor, mvScaleTmpVer;
#if !AFFINE_RM_CONSTRAINTS_AND_OPT
        if (!subblkMVSpreadOverLimitLuma)
#endif
        {
#if AFFINE_RM_CONSTRAINTS_AND_OPT
          mvScaleTmpHor = iMvScaleTmpHor0 + iDMvHorX * w + iDMvVerX * h;
          mvScaleTmpVer = iMvScaleTmpVer0 + iDMvHorY * w + iDMvVerY * h;
#else
          mvScaleTmpHor = iMvScaleHor + dMvHorXLuma * (halfBWLuma + w) + dMvVerXLuma * (halfBHLuma + h);
          mvScaleTmpVer = iMvScaleVer + dMvHorYLuma * (halfBWLuma + w) + dMvVerYLuma * (halfBHLuma + h);
#endif
        }
#if !AFFINE_RM_CONSTRAINTS_AND_OPT
        else
        {
          mvScaleTmpHor = iMvScaleHor + dMvHorXLuma * (cxWidthLuma >> 1) + dMvVerXLuma * (cxHeightLuma >> 1);
          mvScaleTmpVer = iMvScaleVer + dMvHorYLuma * (cxWidthLuma >> 1) + dMvVerYLuma * (cxHeightLuma >> 1);
        }
#endif

        roundAffineMv(mvScaleTmpHor, mvScaleTmpVer, shift);
        Mv tmpMv(mvScaleTmpHor, mvScaleTmpVer);
        tmpMv.clipToStorageBitDepth();
        mvScaleTmpHor = tmpMv.getHor();
        mvScaleTmpVer = tmpMv.getVer();

        m_storedMv[h / AFFINE_MIN_BLOCK_SIZE * MVBUFFER_SIZE + w / AFFINE_MIN_BLOCK_SIZE].set(mvScaleTmpHor, mvScaleTmpVer);
      }
    }
  }
#endif
#if AFFINE_ENC_OPT
  int gradLineOffset = 0, gradOffset = 0;
  int gradSubBlkStride = blockHeight * width;
#elif AFFINE_RM_CONSTRAINTS_AND_OPT
  int gradLineOffset = 0, gradOffset = 0;
  int gradSubBlkStride = blockHeight * cuExtW;
#endif
  // get prediction block by block
  for ( int h = 0; h < cxHeight; h += blockHeight )
  {
    for ( int w = 0; w < cxWidth; w += blockWidth )
    {

      int iMvScaleTmpHor, iMvScaleTmpVer;
      if (compID == COMPONENT_Y || pu.chromaFormat == CHROMA_444)
      {
#if !AFFINE_RM_CONSTRAINTS_AND_OPT
        if ( !subblkMVSpreadOverLimit )
#endif
        {
#if AFFINE_RM_CONSTRAINTS_AND_OPT
          iMvScaleTmpHor = iMvScaleTmpHor0 + iDMvHorX * w + iDMvVerX * h;
          iMvScaleTmpVer = iMvScaleTmpVer0 + iDMvHorY * w + iDMvVerY * h;
#else
          iMvScaleTmpHor = iMvScaleHor + iDMvHorX * (iHalfBW + w) + iDMvVerX * (iHalfBH + h);
          iMvScaleTmpVer = iMvScaleVer + iDMvHorY * (iHalfBW + w) + iDMvVerY * (iHalfBH + h);
#endif
        }
#if !AFFINE_RM_CONSTRAINTS_AND_OPT
        else
        {
          iMvScaleTmpHor = iMvScaleHor + iDMvHorX * ( cxWidth >> 1 ) + iDMvVerX * ( cxHeight >> 1 );
          iMvScaleTmpVer = iMvScaleVer + iDMvHorY * ( cxWidth >> 1 ) + iDMvVerY * ( cxHeight >> 1 );
        }
#endif
        roundAffineMv(iMvScaleTmpHor, iMvScaleTmpVer, shift);
        Mv tmpMv(iMvScaleTmpHor, iMvScaleTmpVer);
        tmpMv.clipToStorageBitDepth();
        iMvScaleTmpHor = tmpMv.getHor();
        iMvScaleTmpVer = tmpMv.getVer();

        // clip and scale
        if ( refPic->isWrapAroundEnabled( pu.cs->pps ) )
        {
#if !AFFINE_RM_CONSTRAINTS_AND_OPT
          m_storedMv[h / AFFINE_MIN_BLOCK_SIZE * MVBUFFER_SIZE + w / AFFINE_MIN_BLOCK_SIZE].set(iMvScaleTmpHor, iMvScaleTmpVer);
#endif
          Mv tmpMv(iMvScaleTmpHor, iMvScaleTmpVer);
          wrapRef = wrapClipMv( tmpMv, Position( pu.Y().x + w, pu.Y().y + h ), Size( blockWidth, blockHeight ), &sps, pu.cs->pps );
          iMvScaleTmpHor = tmpMv.getHor();
          iMvScaleTmpVer = tmpMv.getVer();
        }
        else
        {
          wrapRef = false;
#if !AFFINE_RM_CONSTRAINTS_AND_OPT
          m_storedMv[h / AFFINE_MIN_BLOCK_SIZE * MVBUFFER_SIZE + w / AFFINE_MIN_BLOCK_SIZE].set(iMvScaleTmpHor, iMvScaleTmpVer);
#endif
          if (refPic->isRefScaled(pu.cs->pps) == false)
          {
            clipMv(tmpMv, pu.lumaPos(), pu.lumaSize(), *pu.cs->sps, *pu.cs->pps);
            iMvScaleTmpHor = tmpMv.getHor();
            iMvScaleTmpVer = tmpMv.getVer();
          }
        }
      }
      else
      {
#if AFFINE_RM_CONSTRAINTS_AND_OPT
        Mv curMv = miLine[(w << iScaleX) >> 2].mv[m_iRefListIdx] + miLine2[(w << iScaleX) >> 2].mv[m_iRefListIdx];
#else
        Mv curMv = m_storedMv[((h << iScaleY) / AFFINE_MIN_BLOCK_SIZE) * MVBUFFER_SIZE + ((w << iScaleX) / AFFINE_MIN_BLOCK_SIZE)] +
          m_storedMv[((h << iScaleY) / AFFINE_MIN_BLOCK_SIZE + iScaleY)* MVBUFFER_SIZE + ((w << iScaleX) / AFFINE_MIN_BLOCK_SIZE + iScaleX)];
#endif
        roundAffineMv(curMv.hor, curMv.ver, 1);
        if ( refPic->isWrapAroundEnabled( pu.cs->pps ) )
        {
          wrapRef = wrapClipMv( curMv, Position( pu.Y().x + ( w << iScaleX ), pu.Y().y + ( h << iScaleY ) ), Size( blockWidth << iScaleX, blockHeight << iScaleY ), &sps, pu.cs->pps );
        }
        else
        {
          wrapRef = false;
          if (refPic->isRefScaled(pu.cs->pps) == false)
          {
            clipMv(curMv, pu.lumaPos(), pu.lumaSize(), *pu.cs->sps, *pu.cs->pps);
          }
        }
        iMvScaleTmpHor = curMv.hor;
        iMvScaleTmpVer = curMv.ver;
      }

      if( xPredInterBlkRPR( scalingRatio, *pu.cs->pps, CompArea( compID, chFmt, pu.blocks[compID].offset( w, h ), Size( blockWidth, blockHeight ) ), refPic, Mv( iMvScaleTmpHor, iMvScaleTmpVer ), dstBuf.buf + w + h * dstBuf.stride, dstBuf.stride, bi, wrapRef, clpRng, 2 ) )
      {
        CHECK( enablePROF, "PROF should be disabled with RPR" );
      }
      else
      {
#if INTER_LIC
        if (pu.cu->LICFlag && (w == 0 || h == 0))
        {
          xGetSublkTemplate(*pu.cu, compID, *refPic, Mv(iMvScaleTmpHor, iMvScaleTmpVer), blockWidth, blockHeight, w, h, numTemplate, refLeftTemplate, refAboveTemplate, recLeftTemplate, recAboveTemplate);
        }
#endif
        // get the MV in high precision
        int xFrac, yFrac, xInt, yInt;

        if (!iScaleX)
        {
          xInt  = iMvScaleTmpHor >> 4;
          xFrac = iMvScaleTmpHor & 15;
        }
        else
        {
          xInt  = iMvScaleTmpHor >> 5;
          xFrac = iMvScaleTmpHor & 31;
        }
        if (!iScaleY)
        {
          yInt  = iMvScaleTmpVer >> 4;
          yFrac = iMvScaleTmpVer & 15;
        }
        else
        {
          yInt  = iMvScaleTmpVer >> 5;
          yFrac = iMvScaleTmpVer & 31;
        }

        const CPelBuf refBuf = refPic->getRecoBuf(
          CompArea(compID, chFmt, pu.blocks[compID].offset(xInt + w, yInt + h), pu.blocks[compID]), wrapRef);

        Pel *ref = (Pel *) refBuf.buf;
        Pel *dst = dstBuf.buf + w + h * dstBuf.stride;

        int refStride = refBuf.stride;
        int dstStride = dstBuf.stride;

        int bw = blockWidth;
        int bh = blockHeight;

#if AFFINE_ENC_OPT
        if (enablePROF || calGradient)
#else
        if (enablePROF)
#endif
        {
          dst       = dstExtBuf.bufAt(PROF_BORDER_EXT_W, PROF_BORDER_EXT_H);
          dstStride = dstExtBuf.stride;
        }

#if IF_12TAP
        if (yFrac == 0)
        {
          m_if.filterHor(compID, (Pel*)ref, refStride, dst, dstStride, bw, bh, xFrac, isLast, chFmt, clpRng
            , 0, false, false);
        }
        else if (xFrac == 0)
        {
          m_if.filterVer(compID, (Pel*)ref, refStride, dst, dstStride, bw, bh, yFrac, true, isLast, chFmt, clpRng
            , 0, false, false);
        }
        else
        {
#if SIMD_4x4_12 && defined(TARGET_SIMD_X86)
#if AFFINE_RM_CONSTRAINTS_AND_OPT
          if (compID == COMPONENT_Y && bw == 4 && bh == 4)
#else
          if (compID == COMPONENT_Y)
#endif
            m_if.filter4x4(clpRng, (Pel*)ref, refStride, dst, dstStride, xFrac, yFrac, isLast);
          else {
#endif
            m_if.filterHor(compID, (Pel*)ref - ((vFilterSize >> 1) - 1)*refStride, refStride, tmpBuf.buf, tmpBuf.stride, bw, bh + vFilterSize - 1, xFrac, false, chFmt, clpRng
              , 0, false, false);
            JVET_J0090_SET_CACHE_ENABLE(false);
            m_if.filterVer(compID, tmpBuf.buf + ((vFilterSize >> 1) - 1)*tmpBuf.stride, tmpBuf.stride, dst, dstStride, bw, bh, yFrac, false, isLast, chFmt, clpRng
              , 0, false, false);
            JVET_J0090_SET_CACHE_ENABLE(true);
#if SIMD_4x4_12 && defined(TARGET_SIMD_X86)
          }
#endif
        }
#else
        if (yFrac == 0)
        {
          m_if.filterHor(compID, (Pel *) ref, refStride, dst, dstStride, bw, bh, xFrac, isLast, chFmt, clpRng);
        }
        else if (xFrac == 0)
        {
          m_if.filterVer(compID, (Pel *) ref, refStride, dst, dstStride, bw, bh, yFrac, true, isLast, chFmt, clpRng);
        }
        else
        {
          m_if.filterHor(compID, (Pel *) ref - ((vFilterSize >> 1) - 1) * refStride, refStride, tmpBuf.buf,
                         tmpBuf.stride, bw, bh + vFilterSize - 1, xFrac, false, chFmt, clpRng);
          JVET_J0090_SET_CACHE_ENABLE(false);
          m_if.filterVer(compID, tmpBuf.buf + ((vFilterSize >> 1) - 1) * tmpBuf.stride, tmpBuf.stride, dst, dstStride,
                         bw, bh, yFrac, false, isLast, chFmt, clpRng);
          JVET_J0090_SET_CACHE_ENABLE(true);
        }
#endif
#if AFFINE_ENC_OPT
        if (enablePROF || calGradient)
#else
        if (enablePROF)
#endif
        {
#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT
          const int shift = IF_INTERNAL_FRAC_BITS(clpRng.bd);
#else
          const int shift    = std::max<int>(2, (IF_INTERNAL_PREC - clpRng.bd));
#endif
          const int xOffset = xFrac >> 3;
          const int yOffset = yFrac >> 3;

          const int refOffset = (blockHeight + 1) * refStride;
          const int dstOffset = (blockHeight + 1) * dstStride;

          const Pel *refPel = ref - (1 - yOffset) * refStride + xOffset - 1;
          Pel *      dstPel = dst - dstStride - 1;
          for (int pw = 0; pw < blockWidth + 2; pw++)
          {
            dstPel[pw]             = leftShift_round(refPel[pw], shift) - (Pel) IF_INTERNAL_OFFS;
            dstPel[pw + dstOffset] = leftShift_round(refPel[pw + refOffset], shift) - (Pel) IF_INTERNAL_OFFS;
          }

          refPel = ref + yOffset * refBuf.stride + xOffset;
          dstPel = dst;
          for (int ph = 0; ph < blockHeight; ph++, refPel += refStride, dstPel += dstStride)
          {
            dstPel[-1]         = leftShift_round(refPel[-1], shift) - (Pel) IF_INTERNAL_OFFS;
            dstPel[blockWidth] = leftShift_round(refPel[blockWidth], shift) - (Pel) IF_INTERNAL_OFFS;
          }

#if AFFINE_RM_CONSTRAINTS_AND_OPT
          gradOffset = gradLineOffset + w;
#if AFFINE_ENC_OPT
          g_pelBufOP.profGradFilter(dstExtBuf.buf, dstExtBuf.stride, blockWidth + 2, blockHeight + 2, width, m_gradX0 + gradOffset, m_gradY0 + gradOffset, clpRng.bd);
#else
          g_pelBufOP.profGradFilter(dstExtBuf.buf, dstExtBuf.stride, blockWidth + 2, blockHeight + 2, cuExtW, m_gradX0 + gradOffset, m_gradY0 + gradOffset, clpRng.bd);
#endif
#else
#if AFFINE_ENC_OPT
          gradOffset = gradLineOffset + w;
          g_pelBufOP.profGradFilter(dstExtBuf.buf, dstExtBuf.stride, blockWidth + 2, blockHeight + 2, width, m_gradX0 + gradOffset, m_gradY0 + gradOffset, clpRng.bd);
#else
          PelBuf gradXBuf = gradXExt.subBuf(0, 0, blockWidth + 2, blockHeight + 2);
          PelBuf gradYBuf = gradYExt.subBuf(0, 0, blockWidth + 2, blockHeight + 2);
          g_pelBufOP.profGradFilter(dstExtBuf.buf, dstExtBuf.stride, blockWidth + 2, blockHeight + 2, gradXBuf.stride,
                                    gradXBuf.buf, gradYBuf.buf, clpRng.bd);
#endif
#endif

#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT
          const Pel offset = (1 << (shift - 1)) + IF_INTERNAL_OFFS;
#else
          const int shiftNum = std::max<int>(2, (IF_INTERNAL_PREC - clpRng.bd));
          const Pel offset   = (1 << (shiftNum - 1)) + IF_INTERNAL_OFFS;
#endif
#if AFFINE_RM_CONSTRAINTS_AND_OPT
          Pel* src = dst;
#if AFFINE_ENC_OPT
          Pel* gX = m_gradX0 + gradOffset + width + 1;
          Pel* gY = m_gradY0 + gradOffset + width + 1;
#else
          Pel* gX = m_gradX0 + gradOffset + cuExtW + 1;
          Pel* gY = m_gradY0 + gradOffset + cuExtW + 1;
#endif
          Pel * dstY = dstBuf.buf + w + h * dstBuf.stride;
          for (int sh = 0; sh < blockHeight; sh += AFFINE_MIN_BLOCK_SIZE)
          {
            for (int sw = 0; sw < blockWidth; sw += AFFINE_MIN_BLOCK_SIZE)
            {
#if AFFINE_ENC_OPT
              g_pelBufOP.applyPROF(dstY + sw, dstBuf.stride, src + sw, dstExtBuf.stride, AFFINE_MIN_BLOCK_SIZE, AFFINE_MIN_BLOCK_SIZE, gX + sw, gY + sw, width, dMvScaleHor, dMvScaleVer, AFFINE_MIN_BLOCK_SIZE, bi, shift, offset, clpRng);
#else
              g_pelBufOP.applyPROF(dstY + sw, dstBuf.stride, src + sw, dstExtBuf.stride, AFFINE_MIN_BLOCK_SIZE, AFFINE_MIN_BLOCK_SIZE, gX + sw, gY + sw, cuExtW, dMvScaleHor, dMvScaleVer, AFFINE_MIN_BLOCK_SIZE, bi, shift, offset, clpRng);
#endif
            }
            src += (dstStride << 2);
#if AFFINE_ENC_OPT
            gX += (width << 2);
            gY += (width << 2);
#else
            gX += (cuExtW << 2);
            gY += (cuExtW << 2);
#endif
            dstY += (dstBuf.stride << 2);
          }
#else
          Pel *src = dstExtBuf.bufAt(PROF_BORDER_EXT_W, PROF_BORDER_EXT_H);
#if AFFINE_ENC_OPT
          Pel* gX = m_gradX0 + gradOffset + width + 1;
          Pel* gY = m_gradY0 + gradOffset + width + 1;
          Pel * dstY = dstBuf.bufAt(w, h);
          g_pelBufOP.applyPROF(dstY, dstBuf.stride, src, dstExtBuf.stride, blockWidth, blockHeight, gX, gY, width, dMvScaleHor, dMvScaleVer, blockWidth, bi, shift, offset, clpRng);
#else
          Pel *gX  = gradXBuf.bufAt(PROF_BORDER_EXT_W, PROF_BORDER_EXT_H);
          Pel *gY  = gradYBuf.bufAt(PROF_BORDER_EXT_W, PROF_BORDER_EXT_H);

          Pel *dstY = dstBuf.bufAt(w, h);

#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT
          g_pelBufOP.applyPROF(dstY, dstBuf.stride, src, dstExtBuf.stride, blockWidth, blockHeight, gX, gY,
                               gradXBuf.stride, dMvScaleHor, dMvScaleVer, blockWidth, bi, shift, offset, clpRng);
#else
          g_pelBufOP.applyPROF(dstY, dstBuf.stride, src, dstExtBuf.stride, blockWidth, blockHeight, gX, gY,
                               gradXBuf.stride, dMvScaleHor, dMvScaleVer, blockWidth, bi, shiftNum, offset, clpRng);
#endif
#endif
#endif
        }
      }
    }
#if AFFINE_RM_CONSTRAINTS_AND_OPT || AFFINE_ENC_OPT
    gradLineOffset += gradSubBlkStride;
#endif
#if AFFINE_RM_CONSTRAINTS_AND_OPT
    miLine += stride;
    miLine2 += stride;
#endif
  }
#if INTER_LIC
  if (m_storeBeforeLIC)
  {
    m_predictionBeforeLIC.bufs[compID].copyFrom(dstBuf);
  }

#if RPR_ENABLE
  if( pu.cu->LICFlag && PU::checkRprLicCondition( pu ) )
#else
  if (pu.cu->LICFlag)
#endif
  {
    PelBuf &dstBuf = dstPic.bufs[compID];
    int LICshift = 0, scale = 0, offset = 0;
    xGetLICParamGeneral(*pu.cu, compID, numTemplate, refLeftTemplate, refAboveTemplate, recLeftTemplate, recAboveTemplate, LICshift, scale, offset);

    const ClpRng& clpRng = pu.cu->cs->slice->clpRng(compID);
    dstBuf.linearTransform(scale, LICshift, offset, true, clpRng);
  }
#endif
}

#if MULTI_PASS_DMVR
void InterPrediction::applyBiOptFlow(const bool isBdofMvRefine, const int bdofBlockOffset, const PredictionUnit &pu, const CPelUnitBuf &yuvSrc0, const CPelUnitBuf &yuvSrc1, const int &refIdx0, const int &refIdx1, PelUnitBuf &yuvDst, const BitDepths &clipBitDepths)
#else
void InterPrediction::applyBiOptFlow(const PredictionUnit &pu, const CPelUnitBuf &yuvSrc0, const CPelUnitBuf &yuvSrc1, const int &refIdx0, const int &refIdx1, PelUnitBuf &yuvDst, const BitDepths &clipBitDepths)
#endif
{
  const int     height = yuvDst.Y().height;
  const int     width = yuvDst.Y().width;
  int           heightG = height + 2 * BIO_EXTEND_SIZE;
  int           widthG = width + 2 * BIO_EXTEND_SIZE;
  int           offsetPos = widthG*BIO_EXTEND_SIZE + BIO_EXTEND_SIZE;

  Pel*          gradX0 = m_gradX0;
  Pel*          gradX1 = m_gradX1;
  Pel*          gradY0 = m_gradY0;
  Pel*          gradY1 = m_gradY1;

  int           stridePredMC = widthG + 2;
  const Pel*    srcY0 = m_filteredBlockTmp[2][COMPONENT_Y] + stridePredMC + 1;
  const Pel*    srcY1 = m_filteredBlockTmp[3][COMPONENT_Y] + stridePredMC + 1;
  const int     src0Stride = stridePredMC;
  const int     src1Stride = stridePredMC;

  Pel*          dstY = yuvDst.Y().buf;
  const int     dstStride = yuvDst.Y().stride;
  const Pel*    srcY0Temp = srcY0;
  const Pel*    srcY1Temp = srcY1;

  for (int refList = 0; refList < NUM_REF_PIC_LIST_01; refList++)
  {
    Pel* dstTempPtr = m_filteredBlockTmp[2 + refList][COMPONENT_Y] + stridePredMC + 1;
    Pel* gradY = (refList == 0) ? m_gradY0 : m_gradY1;
    Pel* gradX = (refList == 0) ? m_gradX0 : m_gradX1;

    xBioGradFilter(dstTempPtr, stridePredMC, widthG, heightG, widthG, gradX, gradY, clipBitDepths.recon[toChannelType(COMPONENT_Y)]);
#if !MULTI_PASS_DMVR && !SAMPLE_BASED_BDOF
    Pel* padStr = m_filteredBlockTmp[2 + refList][COMPONENT_Y] + 2 * stridePredMC + 2;
    for (int y = 0; y< height; y++)
    {
      padStr[-1] = padStr[0];
      padStr[width] = padStr[width - 1];
      padStr += stridePredMC;
    }

    padStr = m_filteredBlockTmp[2 + refList][COMPONENT_Y] + 2 * stridePredMC + 1;
    ::memcpy(padStr - stridePredMC, padStr, sizeof(Pel)*(widthG));
    ::memcpy(padStr + height*stridePredMC, padStr + (height - 1)*stridePredMC, sizeof(Pel)*(widthG));
#endif
  }

  const ClpRng& clpRng = pu.cu->cs->slice->clpRng(COMPONENT_Y);
  const int   bitDepth = clipBitDepths.recon[toChannelType(COMPONENT_Y)];
#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT
  const int   shiftNum = IF_INTERNAL_FRAC_BITS(bitDepth) + 1;
#else
  const int   shiftNum = IF_INTERNAL_PREC + 1 - bitDepth;
#endif
  const int   offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
  const int   limit = ( 1 << 4 ) - 1;

#if MULTI_PASS_DMVR || SAMPLE_BASED_BDOF
  int srcBlockOffset = (stridePredMC + 1) * BIO_EXTEND_SIZE;
  int bioBlockParamOffset = (widthG + 1);
  int dstBlockOffset = 0;
  const int bioDx = (width < BDOF_SUBPU_DIM) ? width : BDOF_SUBPU_DIM;
  const int bioDy = (height < BDOF_SUBPU_DIM) ? height : BDOF_SUBPU_DIM;
  const int srcBlockOffsetIncrementY = (stridePredMC << BDOF_SUBPU_DIM_LOG2) - width;
  const int dstBlockOffsetIncrementY = (dstStride << BDOF_SUBPU_DIM_LOG2) - width;
  const int bioBlockParamOffsetIncrementY = (widthG << BDOF_SUBPU_DIM_LOG2) - width;
#endif
#if MULTI_PASS_DMVR
  if (isBdofMvRefine)
  {
    g_pelBufOP.calcBIOParameter(srcY0, srcY1, gradX0, gradX1, gradY0, gradY1, widthG, heightG, src0Stride, src1Stride, widthG,
                                bitDepth, m_absGx, m_absGy, m_dIx, m_dIy, m_signGxGy, m_dI);
    m_bdofMvRefined = true;
    int bioSubPuMvIndex = 0;
    const int bioSubPuMvIndexIncrementY = BDOF_SUBPU_STRIDE - std::max(1, (width >> BDOF_SUBPU_DIM_LOG2));
    const int   bioBlockDistTh = (bioDx * bioDy) << (5 - 4); //4 is to compensate the shift4 of dI in calcBIOParameter
    Pel* dI = m_dI + 2 + 2 * widthG;
    for (int yBlock = 0; yBlock < height; yBlock += bioDy)
    {
      for (int xBlock = 0; xBlock < width; xBlock += bioDx)
      {
        srcY0Temp = srcY0 + srcBlockOffset;
        srcY1Temp = srcY1 + srcBlockOffset;

        int costSubblockSAD = 0;
        Pel* tmp = dI + bioBlockParamOffset;
        g_pelBufOP.calAbsSum(tmp, widthG, bioDx, bioDy, &costSubblockSAD);

        if (costSubblockSAD < bioBlockDistTh)
        {
          m_bdofSubPuMvOffset[bdofBlockOffset + bioSubPuMvIndex].setZero();
          if (bioDx == 4)
          {
            g_pelBufOP.addAvg4(srcY0Temp, src0Stride, srcY1Temp, src1Stride, dstY + dstBlockOffset,
                               dstStride, bioDx, bioDy, shiftNum, offset, clpRng);
          }
          else
          {
            g_pelBufOP.addAvg8(srcY0Temp, src0Stride, srcY1Temp, src1Stride, dstY + dstBlockOffset,
                               dstStride, bioDx, bioDy, shiftNum, offset, clpRng);
          }
          srcBlockOffset += bioDx;
          dstBlockOffset += bioDx;
          bioBlockParamOffset += bioDx;
          bioSubPuMvIndex += 1;
          continue;
        }
        if (!pu.bdmvrRefine)
        {
          m_bdofSubPuMvOffset[bdofBlockOffset + bioSubPuMvIndex].setZero();
          subBlockBiOptFlow(dstY + dstBlockOffset, dstStride, srcY0Temp, src0Stride, srcY1Temp, src1Stride,
                            bioBlockParamOffset, widthG, bioDx, bioDy, clpRng, shiftNum, offset, limit);
          srcBlockOffset += bioDx;
          dstBlockOffset += bioDx;
          bioBlockParamOffset += bioDx;
          bioSubPuMvIndex += 1;
          continue;
        }

        int sumAbsGX_block = 0, sumAbsGY_block = 0, sumDIX_block = 0, sumDIY_block = 0, sumSignGY_GX_block = 0;
        g_pelBufOP.calcBIOParamSum4(m_absGx + bioBlockParamOffset, m_absGy + bioBlockParamOffset, m_dIx + bioBlockParamOffset,
                                    m_dIy + bioBlockParamOffset, m_signGxGy + bioBlockParamOffset, bioDx + 4, bioDy + 4, widthG,
                                    &sumAbsGX_block, &sumAbsGY_block, &sumDIX_block, &sumDIY_block, &sumSignGY_GX_block);

        int tmpx_block = (sumAbsGX_block == 0 ? 0 : rightShiftMSB(sumDIX_block << 3, sumAbsGX_block));
        int tmpData_block = ((tmpx_block * sumSignGY_GX_block) >> 1);
        int tmpy_block = (sumAbsGY_block == 0 ? 0 : rightShiftMSB(((sumDIY_block << 3) - tmpData_block), sumAbsGY_block));
        tmpx_block = Clip3(-256, 256, tmpx_block);
        tmpy_block = Clip3(-256, 256, tmpy_block);

        Mv bioMv;
        if (tmpx_block >= 0)
          bioMv.hor = ((tmpx_block + 4) >> 3);
        else
          bioMv.hor = (-1) * ((((-1) * tmpx_block) + 4) >> 3);
        if (tmpy_block >= 0)
          bioMv.ver = ((tmpy_block + 4) >> 3);
        else
          bioMv.ver = (-1) * ((((-1) * tmpy_block) + 4) >> 3);

        m_bdofSubPuMvOffset[bdofBlockOffset + bioSubPuMvIndex] = bioMv;
        if (bioMv.hor == 0 && bioMv.ver == 0)
        {
          // by doing this, we do not need to do second LUMA MC
          subBlockBiOptFlow(dstY + dstBlockOffset, dstStride, srcY0Temp, src0Stride, srcY1Temp, src1Stride,
                            bioBlockParamOffset, widthG, bioDx, bioDy, clpRng, shiftNum, offset, limit);
        }
        srcBlockOffset += bioDx;
        dstBlockOffset += bioDx;
        bioBlockParamOffset += bioDx;
        bioSubPuMvIndex += 1;
      }
      srcBlockOffset += srcBlockOffsetIncrementY;
      dstBlockOffset += dstBlockOffsetIncrementY;
      bioBlockParamOffset += bioBlockParamOffsetIncrementY;
      bioSubPuMvIndex += bioSubPuMvIndexIncrementY;
    }
    return;
  }
#endif
#if MULTI_PASS_DMVR || SAMPLE_BASED_BDOF
  g_pelBufOP.calcBIOParameter(srcY0, srcY1, gradX0, gradX1, gradY0, gradY1, widthG, heightG, src0Stride, src1Stride, widthG,
                              bitDepth, m_absGx, m_absGy, m_dIx, m_dIy, m_signGxGy, nullptr);
  for (int yBlock = 0; yBlock < height; yBlock += bioDy)
  {
    for (int xBlock = 0; xBlock < width; xBlock += bioDx)
    {
      srcY0Temp = srcY0 + srcBlockOffset;
      srcY1Temp = srcY1 + srcBlockOffset;
      subBlockBiOptFlow(dstY + dstBlockOffset, dstStride, srcY0Temp, src0Stride, srcY1Temp, src1Stride,
                        bioBlockParamOffset, widthG, bioDx, bioDy, clpRng, shiftNum, offset, limit);
      srcBlockOffset += bioDx;
      dstBlockOffset += bioDx;
      bioBlockParamOffset += bioDx;
    }
    srcBlockOffset += srcBlockOffsetIncrementY;
    dstBlockOffset += dstBlockOffsetIncrementY;
    bioBlockParamOffset += bioBlockParamOffsetIncrementY;
  }
  return;
#endif

  int xUnit = (width >> 2);
  int yUnit = (height >> 2);

  Pel *dstY0 = dstY;
  gradX0 = m_gradX0; gradX1 = m_gradX1;
  gradY0 = m_gradY0; gradY1 = m_gradY1;

  Pel *pGradX0Tmp, *pGradX1Tmp, *pGradY0Tmp, *pGradY1Tmp;
  const Pel *SrcY0Tmp, *SrcY1Tmp;
  int tmpx = 0, tmpy = 0;
  int sumAbsGX = 0, sumAbsGY = 0, sumDIX = 0, sumDIY = 0, sumSignGY_GX = 0;
  int gradOfst, srcOfst, dstOfst, gradLineOfst = 0, srcLineOfst = 0, dstLineOfst = 0;

  for (int yu = 0; yu < yUnit; yu++)
  {
    gradOfst = gradLineOfst;
    srcOfst = srcLineOfst;
    dstOfst = dstLineOfst;

    for (int xu = 0; xu < xUnit; xu++)
    {
      sumAbsGX = 0; sumAbsGY = 0; sumDIX = 0; sumDIY = 0, sumSignGY_GX = 0;
      pGradX0Tmp = m_gradX0 + gradOfst;
      pGradX1Tmp = m_gradX1 + gradOfst;
      pGradY0Tmp = m_gradY0 + gradOfst;
      pGradY1Tmp = m_gradY1 + gradOfst;
      SrcY1Tmp = srcY1 + srcOfst;
      SrcY0Tmp = srcY0 + srcOfst;

      g_pelBufOP.calcBIOSums(SrcY0Tmp, SrcY1Tmp, pGradX0Tmp, pGradX1Tmp, pGradY0Tmp, pGradY1Tmp, xu, yu, src0Stride, src1Stride, widthG, bitDepth, &sumAbsGX, &sumAbsGY, &sumDIX, &sumDIY, &sumSignGY_GX);
      tmpx = (sumAbsGX == 0 ? 0 : rightShiftMSB(sumDIX << 2, sumAbsGX));
      tmpx = Clip3(-limit, limit, tmpx);

      int     mainsGxGy = sumSignGY_GX >> 12;
      int     secsGxGy = sumSignGY_GX & ((1 << 12) - 1);
      int     tmpData = tmpx * mainsGxGy;
      tmpData = ((tmpData << 12) + tmpx*secsGxGy) >> 1;
      tmpy = (sumAbsGY == 0 ? 0 : rightShiftMSB(((sumDIY << 2) - tmpData), sumAbsGY));
      tmpy = Clip3(-limit, limit, tmpy);

      srcY0Temp = SrcY0Tmp + ( stridePredMC + 1 );
      srcY1Temp = SrcY1Tmp + ( stridePredMC + 1 );
      gradX0 = pGradX0Tmp + offsetPos;
      gradX1 = pGradX1Tmp + offsetPos;
      gradY0 = pGradY0Tmp + offsetPos;
      gradY1 = pGradY1Tmp + offsetPos;
      dstY0 = dstY + dstOfst;

      gradOfst += 4;
      srcOfst += 4;
      dstOfst += 4;

      xAddBIOAvg4(srcY0Temp, src0Stride, srcY1Temp, src1Stride, dstY0, dstStride, gradX0, gradX1, gradY0, gradY1, widthG, (1 << 2), (1 << 2), (int)tmpx, (int)tmpy, shiftNum, offset, clpRng);
    }  // xu

    gradLineOfst += ( widthG << 2 );
    srcLineOfst += ( src0Stride << 2 );
    dstLineOfst += ( dstStride << 2 );
  }  // yu
}

#if MULTI_PASS_DMVR || SAMPLE_BASED_BDOF
void InterPrediction::subBlockBiOptFlow(Pel* dstY, const int dstStride, const Pel* src0, const int src0Stride, const Pel* src1, const int src1Stride, int bioParamOffset, const int bioParamStride, int width, int height, const ClpRng& clpRng, const int shiftNum, const int offset, const int limit)
{
#if SAMPLE_BASED_BDOF
  g_pelBufOP.calcBIOParamSum5(m_absGx + bioParamOffset, m_absGy + bioParamOffset, m_dIx + bioParamOffset,
                              m_dIy + bioParamOffset, m_signGxGy + bioParamOffset, bioParamStride, width, height,
                              m_sumAbsGX_pixel_32bit, m_sumAbsGY_pixel_32bit, m_sumDIX_pixel_32bit, m_sumDIY_pixel_32bit, m_sumSignGY_GX_pixel_32bit);
  // sumDIX and sumDIY left shift by 2 is calculated in previous step
  const int bioSubblockSize = width * height;
  for (int pixel_index = 0; pixel_index < bioSubblockSize; pixel_index++)
  {
    if (m_sumAbsGX_pixel_32bit[pixel_index] == 0)
    {
      m_sumDIX_pixel_32bit[pixel_index] = 0;
      m_sumAbsGX_pixel_32bit[pixel_index] = 32;
    }
    else
    {
      m_sumAbsGX_pixel_32bit[pixel_index] = floorLog2(m_sumAbsGX_pixel_32bit[pixel_index]);
    }
    if (m_sumAbsGY_pixel_32bit[pixel_index] == 0)
    {
      m_sumDIY_pixel_32bit[pixel_index] = 0;
      m_sumSignGY_GX_pixel_32bit[pixel_index] = 0;
      m_sumAbsGY_pixel_32bit[pixel_index] = 32;
    }
    else
    {
      m_sumAbsGY_pixel_32bit[pixel_index] = floorLog2(m_sumAbsGY_pixel_32bit[pixel_index]);
    }
  }
  g_pelBufOP.calcBIOClippedVxVy(m_sumDIX_pixel_32bit, m_sumAbsGX_pixel_32bit, m_sumDIY_pixel_32bit, m_sumAbsGY_pixel_32bit, m_sumSignGY_GX_pixel_32bit, limit, bioSubblockSize, m_tmpx_pixel_32bit, m_tmpy_pixel_32bit);
  bioParamOffset += ((bioParamStride + 1) << 1);
#else
  bioParamOffset += ((bioParamStride + 1) << 1);
  int unitSize = 4, extendSize = 1; // unitSize = 1, extendSize = 2 gives same results as per-pixel BDOF
  for (int yUnit = 0; yUnit < height; yUnit += unitSize)
  {
    for (int xUnit = 0; xUnit < width; xUnit += unitSize)
    {
      int subTmpx = 0, subTmpy = 0;
      int subSumGx = 0, subSumGy = 0, subSumDIX = 0, subSumDIY = 0, subSumSignGY_GX = 0;
      int subBioParamOffset = bioParamOffset + (yUnit - extendSize) * bioParamStride + xUnit;
      for (int ySub = -extendSize; ySub < (extendSize + unitSize); ySub++)
      {
        for (int xSub = -extendSize; xSub < (extendSize + unitSize); xSub++)
        {
          subSumGx += m_absGx[subBioParamOffset + xSub];
          subSumGy += m_absGy[subBioParamOffset + xSub];
          subSumDIX += m_dIx[subBioParamOffset + xSub];
          subSumDIY += m_dIy[subBioParamOffset + xSub];
          subSumSignGY_GX += m_signGxGy[subBioParamOffset + xSub];
        }
        subBioParamOffset += bioParamStride;
      }
      subTmpx = (subSumGx == 0 ? 0 : rightShiftMSB(subSumDIX << 2, subSumGx));
      subTmpx = Clip3(-limit, limit, subTmpx);

      int     mainsGxGy = subSumSignGY_GX >> 12;
      int     secsGxGy = subSumSignGY_GX & ((1 << 12) - 1);
      int     tmpData = subTmpx * mainsGxGy;
      tmpData = ((tmpData << 12) + subTmpx*secsGxGy) >> 1;
      subTmpy = (subSumGy == 0 ? 0 : rightShiftMSB(((subSumDIY << 2) - tmpData), subSumGy));
      subTmpy = Clip3(-limit, limit, subTmpy);
      int curSubIdx = yUnit * width + xUnit;
      for (int ySub = 0; ySub < unitSize; ySub++)
      {
        for (int xSub = 0; xSub < unitSize; xSub++)
        {
          m_tmpx_pixel_32bit[curSubIdx + xSub] = subTmpx;
          m_tmpy_pixel_32bit[curSubIdx + xSub] = subTmpy;
        }
        curSubIdx += width;
      }
    }
  }
#endif
  g_pelBufOP.addBIOAvgN(src0, src0Stride, src1, src1Stride, dstY, dstStride, m_gradX0 + bioParamOffset, m_gradX1 + bioParamOffset, m_gradY0 + bioParamOffset, m_gradY1 + bioParamOffset, bioParamStride, width, height, m_tmpx_pixel_32bit, m_tmpy_pixel_32bit, shiftNum, offset, clpRng);
}
#endif

void InterPrediction::xAddBIOAvg4(const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, const Pel *gradX0, const Pel *gradX1, const Pel *gradY0, const Pel*gradY1, int gradStride, int width, int height, int tmpx, int tmpy, int shift, int offset, const ClpRng& clpRng)
{
  g_pelBufOP.addBIOAvg4(src0, src0Stride, src1, src1Stride, dst, dstStride, gradX0, gradX1, gradY0, gradY1, gradStride, width, height, tmpx, tmpy, shift, offset, clpRng);
}

void InterPrediction::xBioGradFilter(Pel* pSrc, int srcStride, int width, int height, int gradStride, Pel* gradX, Pel* gradY, int bitDepth)
{
  g_pelBufOP.bioGradFilter(pSrc, srcStride, width, height, gradStride, gradX, gradY, bitDepth);
}

void InterPrediction::xCalcBIOPar(const Pel* srcY0Temp, const Pel* srcY1Temp, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0, const Pel* gradY1, int* dotProductTemp1, int* dotProductTemp2, int* dotProductTemp3, int* dotProductTemp5, int* dotProductTemp6, const int src0Stride, const int src1Stride, const int gradStride, const int widthG, const int heightG, int bitDepth)
{
  g_pelBufOP.calcBIOPar(srcY0Temp, srcY1Temp, gradX0, gradX1, gradY0, gradY1, dotProductTemp1, dotProductTemp2, dotProductTemp3, dotProductTemp5, dotProductTemp6, src0Stride, src1Stride, gradStride, widthG, heightG, bitDepth);
}

void InterPrediction::xCalcBlkGradient(int sx, int sy, int    *arraysGx2, int     *arraysGxGy, int     *arraysGxdI, int     *arraysGy2, int     *arraysGydI, int     &sGx2, int     &sGy2, int     &sGxGy, int     &sGxdI, int     &sGydI, int width, int height, int unitSize)
{
  g_pelBufOP.calcBlkGradient(sx, sy, arraysGx2, arraysGxGy, arraysGxdI, arraysGy2, arraysGydI, sGx2, sGy2, sGxGy, sGxdI, sGydI, width, height, unitSize);
}

#if MULTI_PASS_DMVR
void InterPrediction::xWeightedAverage(const bool isBdofMvRefine, const int bdofBlockOffset,
#else
void InterPrediction::xWeightedAverage(
#endif
    const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths,
    const ClpRngs& clpRngs, const bool& bioApplied, bool lumaOnly, bool chromaOnly, PelUnitBuf* yuvDstTmp /*= NULL*/)
{
  CHECK( (chromaOnly && lumaOnly), "should not happen" );

  const int iRefIdx0 = pu.refIdx[0];
  const int iRefIdx1 = pu.refIdx[1];

  if( iRefIdx0 >= 0 && iRefIdx1 >= 0 )
  {
    if( pu.cu->BcwIdx != BCW_DEFAULT && (yuvDstTmp || !pu.ciipFlag) )
    {
      CHECK(bioApplied, "Bcw is disallowed with BIO");
      pcYuvDst.addWeightedAvg(pcYuvSrc0, pcYuvSrc1, clpRngs, pu.cu->BcwIdx, chromaOnly, lumaOnly);
      if (yuvDstTmp)
        yuvDstTmp->addAvg(pcYuvSrc0, pcYuvSrc1, clpRngs, chromaOnly, lumaOnly);
      return;
    }
    if (bioApplied)
    {
      const int  src0Stride = pu.lwidth() + 2 * BIO_EXTEND_SIZE + 2;
      const int  src1Stride = pu.lwidth() + 2 * BIO_EXTEND_SIZE + 2;
#if MULTI_PASS_DMVR || SAMPLE_BASED_BDOF
      const Pel* pSrcY0 = m_filteredBlockTmp[2][COMPONENT_Y] + (1 + BIO_EXTEND_SIZE) * (src0Stride + 1);
      const Pel* pSrcY1 = m_filteredBlockTmp[3][COMPONENT_Y] + (1 + BIO_EXTEND_SIZE) * (src1Stride + 1);
#else
      const Pel* pSrcY0 = m_filteredBlockTmp[2][COMPONENT_Y] + 2 * src0Stride + 2;
      const Pel* pSrcY1 = m_filteredBlockTmp[3][COMPONENT_Y] + 2 * src1Stride + 2;
#endif

      bool bioEnabled = true;
      if (bioEnabled)
      {
#if MULTI_PASS_DMVR
        applyBiOptFlow(isBdofMvRefine, bdofBlockOffset, pu, pcYuvSrc0, pcYuvSrc1, iRefIdx0, iRefIdx1, pcYuvDst, clipBitDepths);
#else
        applyBiOptFlow(pu, pcYuvSrc0, pcYuvSrc1, iRefIdx0, iRefIdx1, pcYuvDst, clipBitDepths);
#endif
        if (yuvDstTmp)
          yuvDstTmp->bufs[0].addAvg(CPelBuf(pSrcY0, src0Stride, pu.lumaSize()), CPelBuf(pSrcY1, src1Stride, pu.lumaSize()), clpRngs.comp[0]);
      }
      else
      {
        pcYuvDst.bufs[0].addAvg(CPelBuf(pSrcY0, src0Stride, pu.lumaSize()), CPelBuf(pSrcY1, src1Stride, pu.lumaSize()), clpRngs.comp[0]);
        if (yuvDstTmp)
          yuvDstTmp->bufs[0].copyFrom(pcYuvDst.bufs[0]);
      }
    }
    if (!bioApplied && (lumaOnly || chromaOnly))
    {
      pcYuvDst.addAvg(pcYuvSrc0, pcYuvSrc1, clpRngs, chromaOnly, lumaOnly);
    }
#if MULTI_PASS_DMVR
      // this part is to derive the chroma dst pred
    else if (!isBdofMvRefine || !bioApplied || yuvDstTmp != NULL)
#else
    else
#endif
    {
      pcYuvDst.addAvg(pcYuvSrc0, pcYuvSrc1, clpRngs, bioApplied);
    }
    if (yuvDstTmp)
    {
      if (bioApplied)
      {
        if (isChromaEnabled(yuvDstTmp->chromaFormat))
        {
          yuvDstTmp->bufs[1].copyFrom(pcYuvDst.bufs[1]);
          yuvDstTmp->bufs[2].copyFrom(pcYuvDst.bufs[2]);
        }
      }
      else
      {
        yuvDstTmp->copyFrom(pcYuvDst, lumaOnly, chromaOnly);
      }
    }
  }
  else if( iRefIdx0 >= 0 && iRefIdx1 < 0 )
  {
    if( pu.cu->geoFlag )
    {
#if JVET_W0097_GPM_MMVD_TM
      pcYuvDst.copyFrom(pcYuvSrc0, lumaOnly, chromaOnly);
#else
      pcYuvDst.copyFrom( pcYuvSrc0 );
#endif
    }
    else
    {
      pcYuvDst.copyClip( pcYuvSrc0, clpRngs, lumaOnly, chromaOnly );
    }
    if (yuvDstTmp)
    {
      yuvDstTmp->copyFrom( pcYuvDst, lumaOnly, chromaOnly );
    }
  }
  else if( iRefIdx0 < 0 && iRefIdx1 >= 0 )
  {
    if( pu.cu->geoFlag )
    {
#if JVET_W0097_GPM_MMVD_TM
      pcYuvDst.copyFrom(pcYuvSrc1, lumaOnly, chromaOnly);
#else
      pcYuvDst.copyFrom( pcYuvSrc1 );
#endif
    }
    else
    {
      pcYuvDst.copyClip( pcYuvSrc1, clpRngs, lumaOnly, chromaOnly );
    }
    if (yuvDstTmp)
    {
      yuvDstTmp->copyFrom(pcYuvDst, lumaOnly, chromaOnly);
    }
  }
}

#if JVET_W0090_ARMC_TM
#if !INTER_LIC
template <bool TrueA_FalseL>
void InterPrediction::xGetPredBlkTpl(const CodingUnit& cu, const ComponentID compID, const CPelBuf& refBuf, const Mv& mv, const int posW, const int posH, const int tplSize, Pel* predBlkTpl
#if JVET_Y0067_ENHANCED_MMVD_MVD_SIGN_PRED
                                     , bool AML
#endif
                                     )
{
  const int lumaShift = 2 + MV_FRACTIONAL_BITS_DIFF;
  const int horShift = (lumaShift + ::getComponentScaleX(compID, cu.chromaFormat));
  const int verShift = (lumaShift + ::getComponentScaleY(compID, cu.chromaFormat));

  const int xInt = mv.getHor() >> horShift;
  const int yInt = mv.getVer() >> verShift;
  const int xFrac = mv.getHor() & ((1 << horShift) - 1);
  const int yFrac = mv.getVer() & ((1 << verShift) - 1);

  const Pel* ref;
  Pel* dst;
  int refStride, dstStride, bw, bh;
  if (TrueA_FalseL)
  {
    ref = refBuf.bufAt(cu.blocks[compID].pos().offset(xInt + posW, yInt + posH - 1));
    dst = predBlkTpl + posW;
    refStride = refBuf.stride;
    dstStride = tplSize;
    bw = tplSize;
    bh = 1;
  }
  else
  {
    ref = refBuf.bufAt(cu.blocks[compID].pos().offset(xInt + posW - 1, yInt + posH));
    dst = predBlkTpl + posH;
    refStride = refBuf.stride;
    dstStride = 1;
    bw = 1;
    bh = tplSize;
  }

#if JVET_Y0067_ENHANCED_MMVD_MVD_SIGN_PRED
  int   nFilterIdx =  AML  ? 1 : 0;
#else
  const int  nFilterIdx   = 0;
#endif
  const bool useAltHpelIf = false;

  if (yFrac == 0)
  {
    m_if.filterHor(compID, (Pel*)ref, refStride, dst, dstStride, bw, bh, xFrac, true, cu.chromaFormat, cu.slice->clpRng(compID), nFilterIdx, false, useAltHpelIf);
  }
  else if (xFrac == 0)
  {
    m_if.filterVer(compID, (Pel*)ref, refStride, dst, dstStride, bw, bh, yFrac, true, true, cu.chromaFormat, cu.slice->clpRng(compID), nFilterIdx, false, useAltHpelIf);
  }
  else
  {
      
#if JVET_Y0067_ENHANCED_MMVD_MVD_SIGN_PRED
#if IF_12TAP
    int vFilterSize = isLuma(compID) ? NTAPS_LUMA(0) : NTAPS_CHROMA;
#else
    int vFilterSize = isLuma(compID) ? NTAPS_LUMA : NTAPS_CHROMA;
#endif
    if (isLuma(compID) && nFilterIdx == 1)
    {
      vFilterSize = NTAPS_BILINEAR;
    }
#else
#if IF_12TAP
    const int vFilterSize = isLuma(compID) ? NTAPS_LUMA(0) : NTAPS_CHROMA;
#else
    const int vFilterSize = isLuma(compID) ? NTAPS_LUMA : NTAPS_CHROMA;
#endif
#endif
    PelBuf tmpBuf = PelBuf(m_filteredBlockTmp[0][compID], Size(bw, bh + vFilterSize - 1));

    m_if.filterHor(compID, (Pel*)ref - ((vFilterSize >> 1) - 1)*refStride, refStride, tmpBuf.buf, tmpBuf.stride, bw, bh + vFilterSize - 1, xFrac, false, cu.chromaFormat, cu.slice->clpRng(compID), nFilterIdx, false, useAltHpelIf);
    JVET_J0090_SET_CACHE_ENABLE(false);
    m_if.filterVer(compID, tmpBuf.buf + ((vFilterSize >> 1) - 1)*tmpBuf.stride, tmpBuf.stride, dst, dstStride, bw, bh, yFrac, false, true, cu.chromaFormat, cu.slice->clpRng(compID), nFilterIdx, false, useAltHpelIf);
    JVET_J0090_SET_CACHE_ENABLE(true);
  }
}
#endif
void InterPrediction::xWeightedAverageY(const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths, const ClpRngs& clpRngs)
{
  const int iRefIdx0 = pu.refIdx[0];
  const int iRefIdx1 = pu.refIdx[1];

  if (iRefIdx0 >= 0 && iRefIdx1 >= 0)
  {
    if (pu.cu->BcwIdx != BCW_DEFAULT)
    {
      pcYuvDst.addWeightedAvg(pcYuvSrc0, pcYuvSrc1, clpRngs, pu.cu->BcwIdx, false, true);
    }
    else
    {
      pcYuvDst.addAvg(pcYuvSrc0, pcYuvSrc1, clpRngs, false, true);
    }
  }
  else if (iRefIdx0 >= 0 && iRefIdx1 < 0)
  {
    pcYuvDst.copyClip(pcYuvSrc0, clpRngs, true);
  }
  else if (iRefIdx0 < 0 && iRefIdx1 >= 0)
  {
    pcYuvDst.copyClip(pcYuvSrc1, clpRngs, true);
  }
}
void InterPrediction::xPredAffineTpl(const PredictionUnit &pu, const RefPicList &eRefPicList, int* numTemplate, Pel* refLeftTemplate, Pel* refAboveTemplate)
{
  int iRefIdx = pu.refIdx[eRefPicList];
  CHECK(iRefIdx < 0, "iRefIdx incorrect.");
  const Picture* refPic = pu.cu->slice->getRefPic(eRefPicList, iRefIdx)->unscaledPic;
  Mv mvLT = pu.mvAffi[eRefPicList][0];
  Mv mvRT = pu.mvAffi[eRefPicList][1];
  Mv mvLB = pu.mvAffi[eRefPicList][2];
  // get affine sub-block width and height
  const int width = pu.Y().width;
  const int height = pu.Y().height;
  int blockWidth = AFFINE_MIN_BLOCK_SIZE;
  int blockHeight = AFFINE_MIN_BLOCK_SIZE;

  CHECK(blockWidth > width, "Sub Block width  > Block width");
  CHECK(blockHeight > height, "Sub Block height > Block height");

  const int cxWidth = width;
  const int cxHeight = height;
  const int iBit = MAX_CU_DEPTH;
  int iDMvHorX, iDMvHorY, iDMvVerX, iDMvVerY;
  iDMvHorX = (mvRT - mvLT).getHor() << (iBit - floorLog2(width));
  iDMvHorY = (mvRT - mvLT).getVer() << (iBit - floorLog2(width));
  if (pu.cu->affineType == AFFINEMODEL_6PARAM)
  {
    iDMvVerX = (mvLB - mvLT).getHor() << (iBit - floorLog2(height));
    iDMvVerY = (mvLB - mvLT).getVer() << (iBit - floorLog2(height));
  }
  else
  {
    iDMvVerX = -iDMvHorY;
    iDMvVerY = iDMvHorX;
  }
  int iMvScaleHor = mvLT.getHor() << iBit;
  int iMvScaleVer = mvLT.getVer() << iBit;

  const int shift = iBit - 4 + MV_FRACTIONAL_BITS_INTERNAL;
#if !AFFINE_RM_CONSTRAINTS_AND_OPT
  const bool subblkMVSpreadOverLimit = isSubblockVectorSpreadOverLimit(iDMvHorX, iDMvHorY, iDMvVerX, iDMvVerY, pu.interDir);
#endif
#if AFFINE_RM_CONSTRAINTS_AND_OPT
  if (iDMvHorX == 0 && iDMvHorY == 0)
    blockWidth = width;
  else
  {
    int maxDmv = std::max(abs(iDMvHorX), abs(iDMvHorY)) * blockWidth;
    int TH = 1 << (iBit - 1); // Half pel
    while (maxDmv < TH && blockWidth < width)
    {
      blockWidth <<= 1;
      maxDmv <<= 1;
    }
  }
  if (iDMvVerX == 0 && iDMvVerY == 0)
    blockHeight = height;
  else
  {
    int maxDmv = std::max(abs(iDMvVerX), abs(iDMvVerY)) * blockHeight;
    int TH = 1 << (iBit - 1); // Half pel
    while (maxDmv < TH && blockHeight < height)
    {
      blockHeight <<= 1;
      maxDmv <<= 1;
    }
  }
#endif
  int iMvScaleTmpHor0 = iMvScaleHor + ((iDMvHorX * blockWidth + iDMvVerX * blockHeight) >> 1);
  int iMvScaleTmpVer0 = iMvScaleVer + ((iDMvHorY * blockWidth + iDMvVerY * blockHeight) >> 1);
  // get prediction block by block
  for (int h = 0; h < cxHeight; h += blockHeight)
  {
    for (int w = 0; w < cxWidth; w += blockWidth)
    {
      if (w == 0 || h == 0)
      {
        int iMvScaleTmpHor, iMvScaleTmpVer;

#if !AFFINE_RM_CONSTRAINTS_AND_OPT
        if (!subblkMVSpreadOverLimit)
#endif
        {
          iMvScaleTmpHor = iMvScaleTmpHor0 + iDMvHorX * w + iDMvVerX * h;
          iMvScaleTmpVer = iMvScaleTmpVer0 + iDMvHorY * w + iDMvVerY * h;
        }
#if !AFFINE_RM_CONSTRAINTS_AND_OPT
        else
        {
          iMvScaleTmpHor = iMvScaleHor + iDMvHorX * (cxWidth >> 1) + iDMvVerX * (cxHeight >> 1);
          iMvScaleTmpVer = iMvScaleVer + iDMvHorY * (cxWidth >> 1) + iDMvVerY * (cxHeight >> 1);
        }
#endif
        roundAffineMv(iMvScaleTmpHor, iMvScaleTmpVer, shift);
        Mv tmpMv(iMvScaleTmpHor, iMvScaleTmpVer);
        tmpMv.clipToStorageBitDepth();
        iMvScaleTmpHor = tmpMv.getHor();
        iMvScaleTmpVer = tmpMv.getVer();

        // clip and scale
        if (refPic->isRefScaled(pu.cs->pps) == false)
        {
          clipMv(tmpMv, pu.lumaPos(), pu.lumaSize(), *pu.cs->sps, *pu.cs->pps);
          iMvScaleTmpHor = tmpMv.getHor();
          iMvScaleTmpVer = tmpMv.getVer();
        }
        xGetSublkAMLTemplate(*pu.cu, COMPONENT_Y, *refPic, Mv(iMvScaleTmpHor, iMvScaleTmpVer), blockWidth, blockHeight, w, h, numTemplate, refLeftTemplate, refAboveTemplate
#if JVET_Y0067_ENHANCED_MMVD_MVD_SIGN_PRED
                             , pu.afMmvdFlag
#endif
                             );
      }
    }
  }
}
#endif

void InterPrediction::motionCompensation( PredictionUnit &pu, PelUnitBuf &predBuf, const RefPicList &eRefPicList
  , const bool luma, const bool chroma
  , PelUnitBuf* predBufWOBIO /*= NULL*/
)
{
  // Note: there appears to be an interaction with weighted prediction that
  // makes the code follow different paths if chroma is on or off (in the encoder).
  // Therefore for 4:0:0, "chroma" is not changed to false.
#if MULTI_HYP_PRED
  if (!pu.addHypData.empty())
  {
    CHECK(eRefPicList != REF_PIC_LIST_X, "Multi Hyp: eRefPicList != REF_PIC_LIST_X");
    CHECK(!luma, "Multi Hyp: !luma");
    xAddHypMC(pu, predBuf, predBufWOBIO, !chroma);
    return;
  }
#endif
  CHECK(predBufWOBIO && pu.ciipFlag, "the case should not happen!");

  if (!pu.cs->pcv->isEncoder)
  {
    if (CU::isIBC(*pu.cu))
    {
      CHECK(!luma, "IBC only for Chroma is not allowed.");
      xIntraBlockCopy(pu, predBuf, COMPONENT_Y);
      if (chroma && isChromaEnabled(pu.chromaFormat))
      {
        xIntraBlockCopy(pu, predBuf, COMPONENT_Cb);
        xIntraBlockCopy(pu, predBuf, COMPONENT_Cr);
      }
      return;
    }
  }
  // dual tree handling for IBC as the only ref
  if ((!luma || !chroma) && eRefPicList == REF_PIC_LIST_0)
  {
    xPredInterUni(pu, eRefPicList, predBuf, false, false, luma, chroma);
    return;
  }
  // else, go with regular MC below
        CodingStructure &cs = *pu.cs;
  const PPS &pps            = *cs.pps;
  const SliceType sliceType =  cs.slice->getSliceType();

  if( eRefPicList != REF_PIC_LIST_X )
  {
    CHECK(predBufWOBIO != NULL, "the case should not happen!");
    if ((CU::isIBC(*pu.cu) == false) && ((sliceType == P_SLICE && pps.getUseWP()) || (sliceType == B_SLICE && pps.getWPBiPred()))
#if INTER_LIC
      && !pu.cu->LICFlag
#endif
      )
    {
      xPredInterUni(pu, eRefPicList, predBuf, true, false, luma, chroma);
      xWeightedPredictionUni(pu, predBuf, eRefPicList, predBuf, -1, m_maxCompIDToPred, (luma && !chroma),
                             (!luma && chroma));
    }
    else
    {
      xPredInterUni(pu, eRefPicList, predBuf, false, false, luma, chroma);
    }
  }
  else
  {
#if !INTER_RM_SIZE_CONSTRAINTS
#if ENABLE_OBMC
  if (pu.cu->isobmcMC == false)
#endif
    CHECK( !pu.cu->affine && pu.refIdx[0] >= 0 && pu.refIdx[1] >= 0 && ( pu.lwidth() + pu.lheight() == 12 ), "invalid 4x8/8x4 bi-predicted blocks" );
#endif
#if !BDOF_RM_CONSTRAINTS
    int refIdx0 = pu.refIdx[REF_PIC_LIST_0];
    int refIdx1 = pu.refIdx[REF_PIC_LIST_1];

    const WPScalingParam *wp0 = pu.cs->slice->getWpScaling(REF_PIC_LIST_0, refIdx0);
    const WPScalingParam *wp1 = pu.cs->slice->getWpScaling(REF_PIC_LIST_1, refIdx1);

    bool bioApplied = false;
    const Slice &slice = *pu.cs->slice;
    if (pu.cs->sps->getBDOFEnabledFlag() && (!pu.cs->picHeader->getDisBdofFlag()))
    {
      if (pu.cu->affine || m_subPuMC)
      {
        bioApplied = false;
      }
      else
      {
        const bool biocheck0 =
          !((WPScalingParam::isWeighted(wp0) || WPScalingParam::isWeighted(wp1)) && slice.getSliceType() == B_SLICE);
        const bool biocheck1 = !(pps.getUseWP() && slice.getSliceType() == P_SLICE);
        if (biocheck0
          && biocheck1
          && PU::isBiPredFromDifferentDirEqDistPoc(pu)
          && (pu.Y().height >= 8)
          && (pu.Y().width >= 8)
          && ((pu.Y().height * pu.Y().width) >= 128)
          )
        {
          bioApplied = true;
        }
      }

      if (bioApplied && pu.ciipFlag)
      {
        bioApplied = false;
      }

      if (bioApplied && pu.cu->smvdMode)
      {
        bioApplied = false;
      }
      if (pu.cu->cs->sps->getUseBcw() && bioApplied && pu.cu->BcwIdx != BCW_DEFAULT)
      {
        bioApplied = false;
      }
      if (pu.mmvdEncOptMode == 2 && pu.mmvdMergeFlag)
      {
        bioApplied = false;
      }
    }
#if ENABLE_OBMC
      if (pu.cu->isobmcMC)
      {
        bioApplied = false;
      }
#endif
    bool refIsScaled = ( refIdx0 < 0 ? false : pu.cu->slice->getRefPic( REF_PIC_LIST_0, refIdx0 )->isRefScaled( pu.cs->pps ) ) ||
                       ( refIdx1 < 0 ? false : pu.cu->slice->getRefPic( REF_PIC_LIST_1, refIdx1 )->isRefScaled( pu.cs->pps ) );
    bioApplied = refIsScaled ? false : bioApplied;

    bool dmvrApplied = false;
    dmvrApplied = (pu.mvRefine) && PU::checkDMVRCondition(pu);
#if MULTI_PASS_DMVR
    if ((pu.lumaSize().width > MAX_BDOF_APPLICATION_REGION || pu.lumaSize().height > MAX_BDOF_APPLICATION_REGION) && pu.mergeType != MRG_TYPE_SUBPU_ATMVP && (bioApplied && !dmvrApplied && !pu.bdmvrRefine))
#else
    if ((pu.lumaSize().width > MAX_BDOF_APPLICATION_REGION || pu.lumaSize().height > MAX_BDOF_APPLICATION_REGION) && pu.mergeType != MRG_TYPE_SUBPU_ATMVP && (bioApplied && !dmvrApplied))
#endif
    {
      xSubPuBio(pu, predBuf, eRefPicList, predBufWOBIO);
    }
    else
#endif
      if( pu.mergeType != MRG_TYPE_DEFAULT_N && pu.mergeType != MRG_TYPE_IBC )
      {
        CHECK( predBufWOBIO != NULL, "the case should not happen!" );
        xSubPuMC( pu, predBuf, eRefPicList, luma, chroma );
      }
      else if( xCheckIdenticalMotion( pu ) )
      {
        xPredInterUni( pu, REF_PIC_LIST_0, predBuf, false, false, luma, chroma );
        if( predBufWOBIO )
        {
          predBufWOBIO->copyFrom( predBuf, ( luma && !chroma ), ( chroma && !luma ) );
        }
      }
      else
      {
#if MULTI_PASS_DMVR
        m_bdofMvRefined = false;
#if !BDOF_RM_CONSTRAINTS
        if (pu.bdmvrRefine && !bioApplied)
        {
          for (int bdofSubPuIdx = 0; bdofSubPuIdx < BDOF_SUBPU_MAX_NUM; bdofSubPuIdx++)
          {
            m_bdofSubPuMvOffset[bdofSubPuIdx].setZero();
          }
        }
#endif
        xPredInterBi(pu, predBuf, luma, chroma, predBufWOBIO);
        if (m_bdofMvRefined)
        {
          xPredInterBiSubPuBDOF(pu, predBuf, luma, chroma);  // do not change the predBufWOBIO
          m_bdofMvRefined = false;
        }
#else
        xPredInterBi( pu, predBuf, luma, chroma, predBufWOBIO );
#endif
      }
  }
  return;
}

void InterPrediction::motionCompensation( CodingUnit &cu, const RefPicList &eRefPicList
  , const bool luma, const bool chroma
)
{
  for( auto &pu : CU::traversePUs( cu ) )
  {
    PelUnitBuf predBuf = cu.cs->getPredBuf( pu );
    pu.mvRefine = true;
    motionCompensation(pu, predBuf, eRefPicList, luma, chroma);
    pu.mvRefine = false;
  }
}

void InterPrediction::motionCompensation( PredictionUnit &pu, const RefPicList &eRefPicList /*= REF_PIC_LIST_X*/
  , const bool luma, const bool chroma
)
{
  PelUnitBuf predBuf = pu.cs->getPredBuf( pu );
  motionCompensation(pu, predBuf, eRefPicList, luma, chroma);
}

#if ENABLE_OBMC
/** Function for sub-block based Overlapped Block Motion Compensation (OBMC).
*
* This function can:
* 1. Perform sub-block OBMC for a CU.
* 2. Before motion estimation, subtract (scaled) predictors generated by applying neighboring motions to current CU/PU from the original signal of current CU/PU,
*    to make the motion estimation biased to OBMC.
*/
void InterPrediction::subBlockOBMC(PredictionUnit  &pu, PelUnitBuf* pDst)
{
  if (
    pu.cs->sps->getUseOBMC() == false
    || pu.cu->obmcFlag == false
#if INTER_LIC
    || pu.cu->LICFlag
#endif
    || pu.lwidth() * pu.lheight() < 32
    )
  {
    return;
  }

  const UnitArea   orgPuArea = pu;
  PredictionUnit subPu = pu;

  const uint32_t uiWidth = pu.lwidth();
  const uint32_t uiHeight = pu.lheight();

  const uint32_t uiMinCUW = pu.cs->pcv->minCUWidth;

  const uint32_t uiHeightInBlock = uiHeight / uiMinCUW;
  const uint32_t uiWidthInBlock = uiWidth / uiMinCUW;

#if MULTI_PASS_DMVR
  const bool bSubMotion = pu.cu->affine || pu.bdmvrRefine;
#else
  const bool bSubMotion = pu.cu->affine || PU::checkDMVRCondition(pu);
#endif

  MotionInfo NeighMi = MotionInfo();

  int BcwIdx = pu.cu->BcwIdx;
  bool affine = pu.cu->affine;
  bool geo = pu.cu->geoFlag;
  subPu.cu->affine = false;
  subPu.cu->BcwIdx = BCW_DEFAULT;
  subPu.cu->geoFlag = false;
#if INTER_LIC
  subPu.cu->LICFlag = false;
#endif
  subPu.ciipFlag = false;
#if TM_MRG
  subPu.tmMergeFlag = false;
#endif
#if MULTI_PASS_DMVR
  subPu.bdmvrRefine = false;
#endif
  subPu.mvRefine = false;
  subPu.mmvdMergeFlag = false;
  PelUnitBuf pcYuvPred = pDst == nullptr ? pu.cs->getPredBuf(pu) : *pDst;

  PelUnitBuf pcYuvTmpPredL0 = m_tmpObmcBufL0.subBuf(UnitAreaRelative(*pu.cu, pu));
  PelUnitBuf pcYuvTmpPredT0 = m_tmpObmcBufT0.subBuf(UnitAreaRelative(*pu.cu, pu));

  for (int iBlkBoundary = 0; iBlkBoundary < 2; iBlkBoundary++)  // 0 - top; 1 - left
  {
    unsigned int uiLengthInBlock = ((iBlkBoundary == 0) ? uiWidthInBlock : uiHeightInBlock);

    int iSub = 0, iState = 0;

    while (iSub < uiLengthInBlock)
    {
      int      iLength = 0;
      Position curOffset = (iBlkBoundary == 0) ? Position(iSub * uiMinCUW, 0) : Position(0, iSub * uiMinCUW);

      iState = PU::getSameNeigMotion(pu, NeighMi, curOffset, iBlkBoundary, iLength, uiLengthInBlock - iSub);

      if (iState == 2)  // do OBMC
      {
        subPu = NeighMi;
        if (iBlkBoundary == 0)
        {
          subPu.UnitArea::operator=(UnitArea(pu.chromaFormat, Area(orgPuArea.lumaPos().offset(iSub * uiMinCUW, 0), Size{ iLength*uiMinCUW, uiMinCUW })));
        }
        else
        {
          subPu.UnitArea::operator=(UnitArea(pu.chromaFormat, Area(orgPuArea.lumaPos().offset(0, iSub * uiMinCUW), Size{ uiMinCUW, iLength*uiMinCUW })));
        }

        const UnitArea predArea = UnitAreaRelative(orgPuArea, subPu);
        PelUnitBuf     cPred = pcYuvPred.subBuf(predArea);
        PelUnitBuf cTmp1;

        if (iBlkBoundary == 0)//above
        {
          cTmp1 = pcYuvTmpPredT0.subBuf(predArea);
        }
        else//left
        {
          cTmp1 = pcYuvTmpPredL0.subBuf(predArea);
        }
        xSubBlockMotionCompensation(subPu, cTmp1);

        for (int compID = 0; compID < MAX_NUM_COMPONENT; compID++)
        {
          xSubblockOBMC(ComponentID(compID), subPu, cPred, cTmp1, iBlkBoundary);
        }

        iSub += iLength;
      }
      else if (iState == 1 || iState == 3)   // consecutive intra neighbors or   skip OBMC based on MV similarity
      {
        iSub += iLength;
      }
      else                    // unavailable neighbors
      {
        iSub += uiLengthInBlock;
        break;
      }
    }
    CHECK(iSub != uiLengthInBlock, "not all sub-blocks are merged");
  }


  if (!bSubMotion)
  {
    pu.cu->BcwIdx = BcwIdx;
    pu.cu->affine = affine;
    pu.cu->geoFlag = geo;
    return;
  }

  PelUnitBuf pcYuvTmpPred = m_tmpSubObmcBuf;

  PelUnitBuf cTmp1 = pcYuvTmpPred.subBuf(UnitArea(pu.chromaFormat, Area(0, 0, uiMinCUW, uiMinCUW)));
  PelUnitBuf cTmp2 = pcYuvTmpPred.subBuf(UnitArea(pu.chromaFormat, Area(4, 0, uiMinCUW, uiMinCUW)));
  PelUnitBuf cTmp3 = pcYuvTmpPred.subBuf(UnitArea(pu.chromaFormat, Area(8, 0, uiMinCUW, uiMinCUW)));
  PelUnitBuf cTmp4 = pcYuvTmpPred.subBuf(UnitArea(pu.chromaFormat, Area(12, 0, uiMinCUW, uiMinCUW)));
  PelUnitBuf zero  = pcYuvTmpPred.subBuf(UnitArea(pu.chromaFormat, Area(16, 0, uiMinCUW, uiMinCUW)));

  for (int iSubX = 0; iSubX < uiWidthInBlock; iSubX += 1)
  {
    for (int iSubY = 0; iSubY < uiHeightInBlock; iSubY += 1)
    {
      bool bCURBoundary = (iSubX == uiWidthInBlock - 1);
      bool bCUBBoundary = (iSubY == uiHeightInBlock - 1);

      subPu.UnitArea::operator=(UnitArea(pu.chromaFormat, Area(orgPuArea.lumaPos().offset(iSubX * uiMinCUW, iSubY * uiMinCUW), Size{ uiMinCUW, uiMinCUW })));
      const UnitArea predArea = UnitAreaRelative(orgPuArea, subPu);
      PelUnitBuf     cPred = pcYuvPred.subBuf(predArea);

      bool isAboveAvail = false, isLeftAvail = false, isBelowAvail = false, isRightAvail = false;

      // above
      if (iSubY)
      {
        isAboveAvail = PU::getNeighborMotion(pu, NeighMi, Position(iSubX * uiMinCUW, iSubY * uiMinCUW), Size(uiMinCUW, uiMinCUW), 0);
        if (isAboveAvail)
        {
          subPu = NeighMi;
          xSubBlockMotionCompensation(subPu, cTmp1);
        }
      }

      // left
      if (iSubX)
      {
        isLeftAvail = PU::getNeighborMotion(pu, NeighMi, Position(iSubX * uiMinCUW, iSubY * uiMinCUW), Size(uiMinCUW, uiMinCUW), 1);
        if (isLeftAvail)
        {
          subPu = NeighMi;
          xSubBlockMotionCompensation(subPu, cTmp2);
        }
      }

      // below
      if (!bCUBBoundary)
      {
        isBelowAvail = PU::getNeighborMotion(pu, NeighMi, Position(iSubX * uiMinCUW, iSubY * uiMinCUW), Size(uiMinCUW, uiMinCUW), 2);
        if (isBelowAvail)
        {
          subPu = NeighMi;
          xSubBlockMotionCompensation(subPu, cTmp3);
        }
      }

      // right
      if (!bCURBoundary)
      {
        isRightAvail = PU::getNeighborMotion(pu, NeighMi, Position(iSubX * uiMinCUW, iSubY * uiMinCUW), Size(uiMinCUW, uiMinCUW), 3);
        if (isRightAvail)
        {
          subPu = NeighMi;
          xSubBlockMotionCompensation(subPu, cTmp4);
        }
      }

      if( isAboveAvail || isLeftAvail || isBelowAvail || isRightAvail )
      {
        for( int compID = 0; compID < MAX_NUM_COMPONENT; compID++ )
        {
          xSubblockOBMCBlending( ComponentID( compID ), subPu, cPred, isAboveAvail ? cTmp1: zero, isLeftAvail ? cTmp2: zero, isBelowAvail ? cTmp3: zero, isRightAvail ? cTmp4: zero, isAboveAvail, isLeftAvail, isBelowAvail, isRightAvail, true );
        }
      }
    }
  }
  pu.cu->BcwIdx = BcwIdx;
  pu.cu->affine = affine;
  pu.cu->geoFlag = geo;

  return;
}

// Function for (weighted) averaging predictors of current block and predictors generated by applying neighboring motions to current block.
void InterPrediction::xSubblockOBMC(const ComponentID eComp, PredictionUnit &pu, PelUnitBuf &pcYuvPredDst, PelUnitBuf &pcYuvPredSrc, int iDir, bool bSubMotion)
{
  int iWidth = pu.blocks[eComp].width;
  int iHeight = pu.blocks[eComp].height;

  if (iWidth == 0 || iHeight == 0)
  {
    return;
  }

  Pel* pOrgDst = pcYuvPredDst.bufs[eComp].buf;
  Pel* pOrgSrc = pcYuvPredSrc.bufs[eComp].buf;
  const int strideDst = pcYuvPredDst.bufs[eComp].stride;
  const int strideSrc = pcYuvPredSrc.bufs[eComp].stride;

  if (iDir == 0) //above
  {
    for (int i = 0; i < iWidth; i++)
    {
      Pel* pDst = pOrgDst;
      Pel* pSrc = pOrgSrc;

      pDst[i] = bSubMotion ? (3 * pDst[i] + pSrc[i] + 2) >> 2 : (26 * pDst[i] + 6 * pSrc[i] + 16) >> 5;

      if (eComp == COMPONENT_Y)
      {
        pDst += strideDst;
        pSrc += strideSrc;
        pDst[i] = (7 * pDst[i] + pSrc[i] + 4) >> 3;
        pDst += strideDst;
        pSrc += strideSrc;
        pDst[i] = (15 * pDst[i] + pSrc[i] + 8) >> 4;
        if (!bSubMotion)
        {
          pDst += strideDst;
          pSrc += strideSrc;
          pDst[i] = (31 * pDst[i] + pSrc[i] + 16) >> 5;
        }
      }
    }
  }

  if (iDir == 1) //left
  {
    Pel* pDst = pOrgDst;
    Pel* pSrc = pOrgSrc;
    for (int i = 0; i < iHeight; i++)
    {
      pDst[0] = bSubMotion ? (3 * pDst[0] + pSrc[0] + 2) >> 2 : (26 * pDst[0] + 6 * pSrc[0] + 16) >> 5;

      if (eComp == COMPONENT_Y)
      {
        pDst[1] = (7 * pDst[1] + pSrc[1] + 4) >> 3;
        pDst[2] = (15 * pDst[2] + pSrc[2] + 8) >> 4;
        if (!bSubMotion)
        {
          pDst[3] = (31 * pDst[3] + pSrc[3] + 16) >> 5;
        }
      }

      pDst += strideDst;
      pSrc += strideSrc;
    }
  }

  if (iDir == 2) //below
  {
    for (int i = 0; i < iWidth; i++)
    {
      Pel* pDst = pOrgDst + (iHeight - 1) * strideDst;
      Pel* pSrc = pOrgSrc + (iHeight - 1) * strideSrc;

      pDst[i] = (3 * pDst[i] + pSrc[i] + 2) >> 2;

      if (eComp == COMPONENT_Y)
      {
        pDst -= strideDst;
        pSrc -= strideSrc;
        pDst[i] = (7 * pDst[i] + pSrc[i] + 4) >> 3;
        pDst -= strideDst;
        pSrc -= strideSrc;
        pDst[i] = (15 * pDst[i] + pSrc[i] + 8) >> 4;
      }
    }
  }

  if (iDir == 3) //right
  {
    Pel* pDst = pOrgDst + (iWidth - 4);
    Pel* pSrc = pOrgSrc + (iWidth - 4);
    for (int i = 0; i < iHeight; i++)
    {
      pDst[3] = (3 * pDst[3] + pSrc[3] + 2) >> 2;
      if (eComp == COMPONENT_Y)
      {
        pDst[2] = (7 * pDst[2] + pSrc[2] + 4) >> 3;
        pDst[1] = (15 * pDst[1] + pSrc[1] + 8) >> 4;
      }
      pDst += strideDst;
      pSrc += strideSrc;
    }
  }
}

#if ENABLE_OBMC
void InterPrediction::xSubblockOBMCBlending(const ComponentID eComp, PredictionUnit &pu, PelUnitBuf &pcYuvPredDst, PelUnitBuf &pcYuvPredSrc1, PelUnitBuf &pcYuvPredSrc2, PelUnitBuf &pcYuvPredSrc3, PelUnitBuf &pcYuvPredSrc4, bool isAboveAvail, bool isLeftAvail, bool isBelowAvail, bool isRightAvail, bool bSubMotion)
{
  int iWidth = pu.blocks[eComp].width;
  int iHeight = pu.blocks[eComp].height;

  if (iWidth == 0 || iHeight == 0)
  {
    return;
  }

  Pel* pOrgDst = pcYuvPredDst.bufs[eComp].buf;
  Pel* pOrgSrc1 = pcYuvPredSrc1.bufs[eComp].buf;
  Pel* pOrgSrc2 = pcYuvPredSrc2.bufs[eComp].buf;
  Pel* pOrgSrc3 = pcYuvPredSrc3.bufs[eComp].buf;
  Pel* pOrgSrc4 = pcYuvPredSrc4.bufs[eComp].buf;

  const int strideDst = pcYuvPredDst.bufs[eComp].stride;
  const int strideSrc = pcYuvPredSrc1.bufs[eComp].stride;

  unsigned int isChroma = !isLuma( eComp );
  unsigned int aboveWeight[4], leftWeight[4], belowWeight[4], rightWeight[4];

  if( isAboveAvail )
  {
    memcpy( aboveWeight, defaultWeight[isChroma], sizeof( aboveWeight ) );
  }
  else
  {
    memset( aboveWeight, 0, sizeof( aboveWeight ) );
  }

  if( isLeftAvail )
  {
    memcpy( leftWeight, defaultWeight[isChroma], sizeof( leftWeight ) );
  }
  else
  {
    memset( leftWeight, 0, sizeof( leftWeight ) );
  }

  if( isBelowAvail )
  {
    memcpy( belowWeight, defaultWeight[isChroma], sizeof( belowWeight ) );
  }
  else
  {
    memset( belowWeight, 0, sizeof( belowWeight ) );
  }

  if( isRightAvail )
  {
    memcpy( rightWeight, defaultWeight[isChroma], sizeof( rightWeight ) );
  }
  else
  {
    memset( rightWeight, 0, sizeof( rightWeight ) );
  }

  unsigned int shift = 7;
  unsigned int sumWeight = 1 << shift;
  unsigned int add = 1 << (shift - 1);

  Pel* pDst = pOrgDst;
  Pel* pSrc1 = pOrgSrc1;
  Pel* pSrc2 = pOrgSrc2;
  Pel* pSrc3 = pOrgSrc3;
  Pel* pSrc4 = pOrgSrc4;

  if( isLuma( eComp ) )
  {
    for( int j = 0; j < iHeight; j++ )
    {
      unsigned int idx_h = iHeight - 1 - j;

      for( int i = 0; i < iWidth; i++ )
      {
        unsigned int idx_w = iWidth - 1 - i;

        unsigned int sumOBMCWeight = aboveWeight[j] + leftWeight[i] + belowWeight[idx_h] + rightWeight[idx_w];

        if( sumOBMCWeight == 0 )
        {
          continue;
        }

        unsigned int currentWeight = sumWeight - sumOBMCWeight;

        pDst[i] = (currentWeight * pDst[i] + aboveWeight[j] * pSrc1[i] + leftWeight[i] * pSrc2[i] + belowWeight[idx_h] * pSrc3[i] + rightWeight[idx_w] * pSrc4[i] + add) >> shift;
      }
      pDst += strideDst;
      pSrc1 += strideSrc;
      pSrc2 += strideSrc;
      pSrc3 += strideSrc;
      pSrc4 += strideSrc;
    }
  }
  else
  {
    pDst[0] = ((sumWeight - aboveWeight[0] - leftWeight[0]) * pDst[0] + aboveWeight[0] * pSrc1[0] + leftWeight[0] * pSrc2[0] + add) >> shift;
    pDst[1] = ((sumWeight - aboveWeight[0] - rightWeight[0]) * pDst[1] + aboveWeight[0] * pSrc1[1] + rightWeight[0] * pSrc4[1] + add) >> shift;

    pDst += strideDst;
    pSrc2 += strideSrc;
    pSrc3 += strideSrc;
    pSrc4 += strideSrc;

    pDst[0] = ((sumWeight - leftWeight[0] - belowWeight[0]) * pDst[0] + leftWeight[0] * pSrc2[0] + belowWeight[0] * pSrc3[0] + add) >> shift;
    pDst[1] = ((sumWeight - belowWeight[0] - rightWeight[0]) * pDst[1] + belowWeight[0] * pSrc3[1] + rightWeight[0] * pSrc4[1] + add) >> shift;
  }
}
#endif

void InterPrediction::xSubBlockMotionCompensation(PredictionUnit &pu, PelUnitBuf &pcYuvPred)
{
  if (xCheckIdenticalMotion(pu))
  {
    xPredInterUni(pu, REF_PIC_LIST_0, pcYuvPred, false, false, true, true);
  }
  else
  {
    xPredInterBi(pu, pcYuvPred, true);
  }
}
#endif

int InterPrediction::rightShiftMSB(int numer, int denom)
{
  return numer >> floorLog2(denom);
}

#if JVET_W0097_GPM_MMVD_TM && TM_MRG
#if JVET_Y0065_GPM_INTRA
void InterPrediction::motionCompensationGeo( CodingUnit &cu, MergeCtx &geoMrgCtx, MergeCtx &geoTmMrgCtx0, MergeCtx &geoTmMrgCtx1, IntraPrediction* pcIntraPred, std::vector<Pel>* reshapeLUT )
#else
void InterPrediction::motionCompensationGeo(CodingUnit &cu, MergeCtx &geoMrgCtx, MergeCtx &geoTmMrgCtx0, MergeCtx &geoTmMrgCtx1)
#endif
#else
#if JVET_Y0065_GPM_INTRA
void InterPrediction::motionCompensationGeo( CodingUnit &cu, MergeCtx &geoMrgCtx, IntraPrediction* pcIntraPred, std::vector<Pel>* reshapeLUT )
#else
void InterPrediction::motionCompensationGeo( CodingUnit &cu, MergeCtx &geoMrgCtx )
#endif
#endif
{
  const uint8_t splitDir = cu.firstPU->geoSplitDir;
  const uint8_t candIdx0 = cu.firstPU->geoMergeIdx0;
  const uint8_t candIdx1 = cu.firstPU->geoMergeIdx1;
#if JVET_W0097_GPM_MMVD_TM
  const bool    geoMMVDFlag0 = cu.firstPU->geoMMVDFlag0;
  const uint8_t geoMMVDIdx0 = cu.firstPU->geoMMVDIdx0;
  const bool    geoMMVDFlag1 = cu.firstPU->geoMMVDFlag1;
  const uint8_t geoMMVDIdx1 = cu.firstPU->geoMMVDIdx1;
#if TM_MRG
  const bool    geoTmFlag0 = cu.firstPU->geoTmFlag0;
  const bool    geoTmFlag1 = cu.firstPU->geoTmFlag1;
#endif
#endif
  for( auto &pu : CU::traversePUs( cu ) )
  {
    const UnitArea localUnitArea( cu.cs->area.chromaFormat, Area( 0, 0, pu.lwidth(), pu.lheight() ) );
    PelUnitBuf tmpGeoBuf0 = m_geoPartBuf[0].getBuf( localUnitArea );
    PelUnitBuf tmpGeoBuf1 = m_geoPartBuf[1].getBuf( localUnitArea );
    PelUnitBuf predBuf    = cu.cs->getPredBuf( pu );
#if JVET_Y0065_GPM_INTRA
    bool isIntra0 = candIdx0 >= GEO_MAX_NUM_UNI_CANDS;
    bool isIntra1 = candIdx1 >= GEO_MAX_NUM_UNI_CANDS;
    if (isIntra0)
    {
      PU::getGeoIntraMPMs(pu, pu.intraMPM, splitDir, g_geoTmShape[0][g_GeoParams[pu.geoSplitDir][0]]);
      pu.intraDir[0] = pu.intraMPM[candIdx0 - GEO_MAX_NUM_UNI_CANDS];
      pcIntraPred->initIntraPatternChType(cu, pu.Y());
      pcIntraPred->predIntraAng(COMPONENT_Y, tmpGeoBuf0.Y(), pu);
      if (isChromaEnabled(pu.chromaFormat))
      {
        pu.intraDir[1] = pu.intraDir[0];
        pcIntraPred->initIntraPatternChType(cu, pu.Cb());
        pcIntraPred->predIntraAng(COMPONENT_Cb, tmpGeoBuf0.Cb(), pu);
        pcIntraPred->initIntraPatternChType(cu, pu.Cr());
        pcIntraPred->predIntraAng(COMPONENT_Cr, tmpGeoBuf0.Cr(), pu);
      }
    }
    else
    {
#endif
#if JVET_W0097_GPM_MMVD_TM
#if TM_MRG
    if (geoTmFlag0)
    {
      geoTmMrgCtx0.setMergeInfo(pu, candIdx0);
    }
    else
#endif
    if (geoMMVDFlag0)
    {
      geoMrgCtx.setGeoMmvdMergeInfo(pu, candIdx0, geoMMVDIdx0);
    }
    else
#endif
    geoMrgCtx.setMergeInfo( pu, candIdx0 );

    motionCompensation(pu, tmpGeoBuf0, REF_PIC_LIST_X, true, isChromaEnabled(pu.chromaFormat)); // TODO: check 4:0:0 interaction with weighted prediction.
    if( g_mctsDecCheckEnabled && !MCTSHelper::checkMvBufferForMCTSConstraint( pu, true ) )
    {
      printf( "DECODER_GEO_PU: pu motion vector across tile boundaries (%d,%d,%d,%d)\n", pu.lx(), pu.ly(), pu.lwidth(), pu.lheight() );
    }
#if JVET_Y0065_GPM_INTRA
      if (isIntra1)
      {
        tmpGeoBuf0.roundToOutputBitdepth(tmpGeoBuf0, cu.slice->clpRngs());
#if ENABLE_OBMC
#if JVET_W0123_TIMD_FUSION
        PU::spanMotionInfo2(pu);
#else
        PU::spanMotionInfo(pu);
#endif
        cu.isobmcMC = true;
        subBlockOBMC(pu, &tmpGeoBuf0);
        cu.isobmcMC = false;
#endif
      }
    }

    if (isIntra1)
    {
      PU::getGeoIntraMPMs(pu, pu.intraMPM+GEO_MAX_NUM_INTRA_CANDS, splitDir, g_geoTmShape[1][g_GeoParams[pu.geoSplitDir][0]]);
      pu.intraDir[0] = pu.intraMPM[candIdx1 - GEO_MAX_NUM_UNI_CANDS + GEO_MAX_NUM_INTRA_CANDS];
      pcIntraPred->initIntraPatternChType(cu, pu.Y());
      pcIntraPred->predIntraAng(COMPONENT_Y, tmpGeoBuf1.Y(), pu);
      if (isChromaEnabled(pu.chromaFormat))
      {
        pu.intraDir[1] = pu.intraDir[0];
        pcIntraPred->initIntraPatternChType(cu, pu.Cb());
        pcIntraPred->predIntraAng(COMPONENT_Cb, tmpGeoBuf1.Cb(), pu);
        pcIntraPred->initIntraPatternChType(cu, pu.Cr());
        pcIntraPred->predIntraAng(COMPONENT_Cr, tmpGeoBuf1.Cr(), pu);
      }
    }
    else
    {
#endif
#if JVET_W0097_GPM_MMVD_TM
#if TM_MRG
    if (geoTmFlag1)
    {
      geoTmMrgCtx1.setMergeInfo(pu, candIdx1);
    }
    else
#endif
    if (geoMMVDFlag1)
    {
      geoMrgCtx.setGeoMmvdMergeInfo(pu, candIdx1, geoMMVDIdx1);
    }
    else
#endif
    geoMrgCtx.setMergeInfo( pu, candIdx1 );

    motionCompensation(pu, tmpGeoBuf1, REF_PIC_LIST_X, true, isChromaEnabled(pu.chromaFormat)); // TODO: check 4:0:0 interaction with weighted prediction.
    if( g_mctsDecCheckEnabled && !MCTSHelper::checkMvBufferForMCTSConstraint( pu, true ) )
    {
      printf( "DECODER_GEO_PU: pu motion vector across tile boundaries (%d,%d,%d,%d)\n", pu.lx(), pu.ly(), pu.lwidth(), pu.lheight() );
    }
#if JVET_Y0065_GPM_INTRA
      if (isIntra0)
      {
        tmpGeoBuf1.roundToOutputBitdepth(tmpGeoBuf1, cu.slice->clpRngs());
#if ENABLE_OBMC
#if JVET_W0123_TIMD_FUSION
        PU::spanMotionInfo2(pu);
#else
        PU::spanMotionInfo(pu);
#endif
        cu.isobmcMC = true;
        subBlockOBMC(pu, &tmpGeoBuf1);
        cu.isobmcMC = false;
#endif
      }
    }
    if (pu.gpmIntraFlag)
    {
      if (reshapeLUT)
      {
        if (!isIntra1)
        {
          tmpGeoBuf1.Y().rspSignal(*reshapeLUT);
        }
        else if (!isIntra0)
        {
          tmpGeoBuf0.Y().rspSignal(*reshapeLUT);
        }
      }
      weightedGeoBlkRounded(pu, splitDir, isChromaEnabled(pu.chromaFormat)? MAX_NUM_CHANNEL_TYPE : CHANNEL_TYPE_LUMA, predBuf, tmpGeoBuf0, tmpGeoBuf1);
    }
    else
#endif
    weightedGeoBlk(pu, splitDir, isChromaEnabled(pu.chromaFormat)? MAX_NUM_CHANNEL_TYPE : CHANNEL_TYPE_LUMA, predBuf, tmpGeoBuf0, tmpGeoBuf1);
  }
}

void InterPrediction::weightedGeoBlk( PredictionUnit &pu, const uint8_t splitDir, int32_t channel, PelUnitBuf& predDst, PelUnitBuf& predSrc0, PelUnitBuf& predSrc1)
{
  if( channel == CHANNEL_TYPE_LUMA )
  {
    m_if.weightedGeoBlk( pu, pu.lumaSize().width, pu.lumaSize().height, COMPONENT_Y, splitDir, predDst, predSrc0, predSrc1 );
  }
  else if( channel == CHANNEL_TYPE_CHROMA )
  {
    m_if.weightedGeoBlk( pu, pu.chromaSize().width, pu.chromaSize().height, COMPONENT_Cb, splitDir, predDst, predSrc0, predSrc1 );
    m_if.weightedGeoBlk( pu, pu.chromaSize().width, pu.chromaSize().height, COMPONENT_Cr, splitDir, predDst, predSrc0, predSrc1 );
  }
  else
  {
    m_if.weightedGeoBlk( pu, pu.lumaSize().width,   pu.lumaSize().height,   COMPONENT_Y,  splitDir, predDst, predSrc0, predSrc1 );
    if (isChromaEnabled(pu.chromaFormat))
    {
      m_if.weightedGeoBlk(pu, pu.chromaSize().width, pu.chromaSize().height, COMPONENT_Cb, splitDir, predDst, predSrc0,
                          predSrc1);
      m_if.weightedGeoBlk(pu, pu.chromaSize().width, pu.chromaSize().height, COMPONENT_Cr, splitDir, predDst, predSrc0,
                          predSrc1);
    }
  }
}

#if JVET_Y0065_GPM_INTRA
void InterPrediction::weightedGeoBlkRounded( PredictionUnit &pu, const uint8_t splitDir, int32_t channel, PelUnitBuf& predDst, PelUnitBuf& predSrc0, PelUnitBuf& predSrc1)
{
  if( channel == CHANNEL_TYPE_LUMA )
  {
    m_if.weightedGeoBlkRounded( pu, pu.lumaSize().width, pu.lumaSize().height, COMPONENT_Y, splitDir, predDst, predSrc0, predSrc1 );
  }
  else if( channel == CHANNEL_TYPE_CHROMA )
  {
    m_if.weightedGeoBlkRounded( pu, pu.chromaSize().width, pu.chromaSize().height, COMPONENT_Cb, splitDir, predDst, predSrc0, predSrc1 );
    m_if.weightedGeoBlkRounded( pu, pu.chromaSize().width, pu.chromaSize().height, COMPONENT_Cr, splitDir, predDst, predSrc0, predSrc1 );
  }
  else
  {
    m_if.weightedGeoBlkRounded( pu, pu.lumaSize().width,   pu.lumaSize().height,   COMPONENT_Y,  splitDir, predDst, predSrc0, predSrc1 );
    if (isChromaEnabled(pu.chromaFormat))
    {
      m_if.weightedGeoBlkRounded(pu, pu.chromaSize().width, pu.chromaSize().height, COMPONENT_Cb, splitDir, predDst, predSrc0,
                          predSrc1);
      m_if.weightedGeoBlkRounded(pu, pu.chromaSize().width, pu.chromaSize().height, COMPONENT_Cr, splitDir, predDst, predSrc0,
                          predSrc1);
    }
  }
}
#endif

void InterPrediction::xPrefetch(PredictionUnit& pu, PelUnitBuf &pcPad, RefPicList refId, bool forLuma)
{
  int offset, width, height;
  Mv cMv;
  const Picture* refPic = pu.cu->slice->getRefPic( refId, pu.refIdx[refId] )->unscaledPic;
  int mvShift = (MV_FRACTIONAL_BITS_INTERNAL);

  int start = 0;
  int end = MAX_NUM_COMPONENT;

  start = forLuma ? 0 : 1;
  end = forLuma ? 1 : MAX_NUM_COMPONENT;

  for (int compID = start; compID < end; compID++)
  {
    cMv = Mv(pu.mv[refId].getHor(), pu.mv[refId].getVer());
#if IF_12TAP
    pcPad.bufs[compID].stride = (pcPad.bufs[compID].width + (2 * DMVR_NUM_ITERATION) + NTAPS_LUMA(0));
    int filtersize = (compID == (COMPONENT_Y)) ? NTAPS_LUMA(0) : NTAPS_CHROMA;
#else
    pcPad.bufs[compID].stride = (pcPad.bufs[compID].width + (2 * DMVR_NUM_ITERATION) + NTAPS_LUMA);
    int filtersize = (compID == (COMPONENT_Y)) ? NTAPS_LUMA : NTAPS_CHROMA;
#endif
    width = pcPad.bufs[compID].width;
    height = pcPad.bufs[compID].height;
    offset = (DMVR_NUM_ITERATION) * (pcPad.bufs[compID].stride + 1);

    int mvshiftTempHor = mvShift + getComponentScaleX((ComponentID)compID, pu.chromaFormat);
    int mvshiftTempVer = mvShift + getComponentScaleY((ComponentID)compID, pu.chromaFormat);
    width += (filtersize - 1);
    height += (filtersize - 1);
    cMv += Mv(-(((filtersize >> 1) - 1) << mvshiftTempHor),
      -(((filtersize >> 1) - 1) << mvshiftTempVer));
    bool wrapRef = false;
    if( refPic->isWrapAroundEnabled( pu.cs->pps ) )
    {
      wrapRef = wrapClipMv( cMv, pu.blocks[0].pos(), pu.blocks[0].size(), pu.cs->sps, pu.cs->pps );
    }
    else
    {
      clipMv( cMv, pu.lumaPos(), pu.lumaSize(), *pu.cs->sps, *pu.cs->pps );
    }
    /* Pre-fetch similar to HEVC*/
    {
      CPelBuf refBuf;
      Position Rec_offset = pu.blocks[compID].pos().offset(cMv.getHor() >> mvshiftTempHor, cMv.getVer() >> mvshiftTempVer);
      refBuf = refPic->getRecoBuf(CompArea((ComponentID)compID, pu.chromaFormat, Rec_offset, pu.blocks[compID].size()), wrapRef);
      PelBuf &dstBuf = pcPad.bufs[compID];
      g_pelBufOP.copyBuffer((Pel *)refBuf.buf, refBuf.stride, ((Pel *)dstBuf.buf) + offset, dstBuf.stride, width, height);
    }
  }
}

void InterPrediction::xPad(PredictionUnit& pu, PelUnitBuf &pcPad, RefPicList refId)
{
  int offset = 0, width, height;
  int padsize;
  Mv cMv;
  for (int compID = 0; compID < getNumberValidComponents(pu.chromaFormat); compID++)
  {
#if IF_12TAP
    int filtersize = (compID == (COMPONENT_Y)) ? NTAPS_LUMA(0) : NTAPS_CHROMA;
#else
    int filtersize = (compID == (COMPONENT_Y)) ? NTAPS_LUMA : NTAPS_CHROMA;
#endif
    width = pcPad.bufs[compID].width;
    height = pcPad.bufs[compID].height;
    offset = (DMVR_NUM_ITERATION) * (pcPad.bufs[compID].stride + 1);
    /*using the larger padsize for 422*/
    padsize = (DMVR_NUM_ITERATION) >> getComponentScaleY((ComponentID)compID, pu.chromaFormat);
    width += (filtersize - 1);
    height += (filtersize - 1);
    /*padding on all side of size DMVR_PAD_LENGTH*/
    g_pelBufOP.padding(pcPad.bufs[compID].buf + offset, pcPad.bufs[compID].stride, width, height, padsize);
  }
}

inline int32_t div_for_maxq7(int64_t N, int64_t D)
{
  int32_t sign, q;
  sign = 0;
  if (N < 0)
  {
    sign = 1;
    N = -N;
  }

  q = 0;
  D = (D << 3);
  if (N >= D)
  {
    N -= D;
    q++;
  }
  q = (q << 1);

  D = (D >> 1);
  if (N >= D)
  {
    N -= D;
    q++;
  }
  q = (q << 1);

  if (N >= (D >> 1))
  {
    q++;
  }
  if (sign)
  {
    return (-q);
  }
  return(q);
}

void xSubPelErrorSrfc(uint64_t *sadBuffer, int32_t *deltaMv)
{
  int64_t numerator, denominator;
  int32_t mvDeltaSubPel;
  int32_t mvSubPelLvl = 4;/*1: half pel, 2: Qpel, 3:1/8, 4: 1/16*/
                                                        /*horizontal*/
  numerator   = (int64_t)((sadBuffer[1] - sadBuffer[3]) << mvSubPelLvl);
  denominator = (int64_t)((sadBuffer[1] + sadBuffer[3] - (sadBuffer[0] << 1)));

#if MULTI_PASS_DMVR
    if (denominator > 0)
    {
      if ((sadBuffer[1] != sadBuffer[0]) && (sadBuffer[3] != sadBuffer[0]))
      {
        mvDeltaSubPel = div_for_maxq7(numerator, denominator);
        deltaMv[0] = (mvDeltaSubPel);
      }
      else
      {
        if (sadBuffer[1] == sadBuffer[0])
        {
          deltaMv[0] = -8;// half pel
        }
        else
        {
          deltaMv[0] = 8;// half pel
        }
      }
    }
    else
    {
      if (sadBuffer[1] < sadBuffer[3])
      {
        deltaMv[0] = -8;
      }
      else if (sadBuffer[1] == sadBuffer[3])
      {
        deltaMv[0] = 0;
      }
      else
      {
        deltaMv[0] = 8;
      }
    }
#else
    if (0 != denominator)
    {
      if ((sadBuffer[1] != sadBuffer[0]) && (sadBuffer[3] != sadBuffer[0]))
      {
        mvDeltaSubPel = div_for_maxq7(numerator, denominator);
        deltaMv[0] = (mvDeltaSubPel);
      }
      else
      {
        if (sadBuffer[1] == sadBuffer[0])
        {
          deltaMv[0] = -8;// half pel
        }
        else
        {
          deltaMv[0] = 8;// half pel
        }
      }
    }
#endif

    /*vertical*/
    numerator = (int64_t)((sadBuffer[2] - sadBuffer[4]) << mvSubPelLvl);
    denominator = (int64_t)((sadBuffer[2] + sadBuffer[4] - (sadBuffer[0] << 1)));
#if MULTI_PASS_DMVR
    if (denominator > 0)
    {
      if ((sadBuffer[2] != sadBuffer[0]) && (sadBuffer[4] != sadBuffer[0]))
      {
        mvDeltaSubPel = div_for_maxq7(numerator, denominator);
        deltaMv[1] = (mvDeltaSubPel);
      }
      else
      {
        if (sadBuffer[2] == sadBuffer[0])
        {
          deltaMv[1] = -8;// half pel
        }
        else
        {
          deltaMv[1] = 8;// half pel
        }
      }
    }
    else
    {
      if (sadBuffer[2] < sadBuffer[4])
      {
        deltaMv[1] = -8;
      }
      else if (sadBuffer[2] == sadBuffer[4])
      {
        deltaMv[1] = 0;
      }
      else
      {
        deltaMv[1] = 8;
      }
    }
#else
    if (0 != denominator)
    {
      if ((sadBuffer[2] != sadBuffer[0]) && (sadBuffer[4] != sadBuffer[0]))
      {
        mvDeltaSubPel = div_for_maxq7(numerator, denominator);
        deltaMv[1] = (mvDeltaSubPel);
      }
      else
      {
        if (sadBuffer[2] == sadBuffer[0])
        {
          deltaMv[1] = -8;// half pel
        }
        else
        {
          deltaMv[1] = 8;// half pel
        }
      }
    }
#endif
  return;
}

void InterPrediction::xBIPMVRefine(int bd, Pel *pRefL0, Pel *pRefL1, uint64_t& minCost, int16_t *deltaMV, uint64_t *pSADsArray, int width, int height)
{
  const int32_t refStrideL0 = m_biLinearBufStride;
  const int32_t refStrideL1 = m_biLinearBufStride;
  Pel *pRefL0Orig = pRefL0;
  Pel *pRefL1Orig = pRefL1;
  for (int nIdx = 0; (nIdx < 25); ++nIdx)
  {
    int32_t sadOffset = ((m_pSearchOffset[nIdx].getVer() * ((2 * DMVR_NUM_ITERATION) + 1)) + m_pSearchOffset[nIdx].getHor());
    pRefL0 = pRefL0Orig + m_pSearchOffset[nIdx].hor + (m_pSearchOffset[nIdx].ver * refStrideL0);
    pRefL1 = pRefL1Orig - m_pSearchOffset[nIdx].hor - (m_pSearchOffset[nIdx].ver * refStrideL1);
    if (*(pSADsArray + sadOffset) == MAX_UINT64)
    {
      const uint64_t cost = xDMVRCost(bd, pRefL0, refStrideL0, pRefL1, refStrideL1, width, height);
      *(pSADsArray + sadOffset) = cost;
    }
    if (*(pSADsArray + sadOffset) < minCost)
    {
      minCost = *(pSADsArray + sadOffset);
      deltaMV[0] = m_pSearchOffset[nIdx].getHor();
      deltaMV[1] = m_pSearchOffset[nIdx].getVer();
    }
  }
}

void InterPrediction::xFinalPaddedMCForDMVR(PredictionUnit &pu, PelUnitBuf &pcYuvSrc0, PelUnitBuf &pcYuvSrc1,
                                            PelUnitBuf &pcPad0, PelUnitBuf &pcPad1, const bool bioApplied,
                                            const Mv mergeMV[NUM_REF_PIC_LIST_01], bool blockMoved)
{
  int offset, deltaIntMvX, deltaIntMvY;

  PelUnitBuf pcYUVTemp = pcYuvSrc0;
  PelUnitBuf pcPadTemp = pcPad0;
  /*always high precision MVs are used*/
  int mvShift = MV_FRACTIONAL_BITS_INTERNAL;

  for (int k = 0; k < NUM_REF_PIC_LIST_01; k++)
  {
    RefPicList refId = (RefPicList)k;
    Mv cMv = pu.mv[refId];
    m_iRefListIdx = refId;
    const Picture* refPic = pu.cu->slice->getRefPic( refId, pu.refIdx[refId] )->unscaledPic;
    Mv cMvClipped = cMv;
    if( !pu.cs->pps->getWrapAroundEnabledFlag() )
    {
      clipMv( cMvClipped, pu.lumaPos(), pu.lumaSize(), *pu.cs->sps, *pu.cs->pps );
    }

    Mv startMv = mergeMV[refId];

    if( g_mctsDecCheckEnabled && !MCTSHelper::checkMvForMCTSConstraint( pu, startMv, MV_PRECISION_INTERNAL ) )
    {
      const Area& tileArea = pu.cs->picture->mctsInfo.getTileArea();
      printf( "Attempt an access over tile boundary at block %d,%d %d,%d with MV %d,%d (in Tile TL: %d,%d BR: %d,%d)\n",
        pu.lx(), pu.ly(), pu.lwidth(), pu.lheight(), startMv.getHor(), startMv.getVer(), tileArea.topLeft().x, tileArea.topLeft().y, tileArea.bottomRight().x, tileArea.bottomRight().y );
      THROW( "MCTS constraint failed!" );
    }
    for (int compID = 0; compID < getNumberValidComponents(pu.chromaFormat); compID++)
    {
      Pel *srcBufPelPtr = NULL;
      int pcPadstride = 0;
      if (blockMoved || (compID == 0))
      {
        pcPadstride = pcPadTemp.bufs[compID].stride;
        int mvshiftTempHor = mvShift + getComponentScaleX((ComponentID)compID, pu.chromaFormat);
        int mvshiftTempVer = mvShift + getComponentScaleY((ComponentID)compID, pu.chromaFormat);
        int leftPixelExtra;
        if (compID == COMPONENT_Y)
        {
#if IF_12TAP
          leftPixelExtra = (NTAPS_LUMA(0) >> 1) - 1;
#else
          leftPixelExtra = (NTAPS_LUMA >> 1) - 1;
#endif
        }
        else
        {
          leftPixelExtra = (NTAPS_CHROMA >> 1) - 1;
        }
        PelBuf &srcBuf = pcPadTemp.bufs[compID];
        deltaIntMvX = (cMv.getHor() >> mvshiftTempHor) -
          (startMv.getHor() >> mvshiftTempHor);
        deltaIntMvY = (cMv.getVer() >> mvshiftTempVer) -
          (startMv.getVer() >> mvshiftTempVer);

        CHECK((abs(deltaIntMvX) > DMVR_NUM_ITERATION) || (abs(deltaIntMvY) > DMVR_NUM_ITERATION), "not expected DMVR movement");
        offset = (DMVR_NUM_ITERATION + leftPixelExtra) * (pcPadTemp.bufs[compID].stride + 1);
        offset += (deltaIntMvY)* pcPadTemp.bufs[compID].stride;
        offset += (deltaIntMvX);
        srcBufPelPtr = (srcBuf.buf + offset);
      }
      JVET_J0090_SET_CACHE_ENABLE(false);
      xPredInterBlk((ComponentID) compID, pu, refPic, cMvClipped, pcYUVTemp, true,
                    pu.cs->slice->getClpRngs().comp[compID], bioApplied, false,
                    pu.cu->slice->getScalingRatio(refId, pu.refIdx[refId]), 0, 0, 0, srcBufPelPtr, pcPadstride);
      JVET_J0090_SET_CACHE_ENABLE(false);
    }
    pcYUVTemp = pcYuvSrc1;
    pcPadTemp = pcPad1;
  }
}

uint64_t InterPrediction::xDMVRCost(int bitDepth, Pel* pOrg, uint32_t refStride, const Pel* pRef, uint32_t orgStride, int width, int height)
{
  DistParam cDistParam;
  cDistParam.applyWeight = false;
  cDistParam.useMR = false;
  m_pcRdCost->setDistParam(cDistParam, pOrg, pRef, orgStride, refStride, bitDepth, COMPONENT_Y, width, height, 1);
  uint64_t uiCost = cDistParam.distFunc(cDistParam);
  return uiCost>>1;
}

void xDMVRSubPixelErrorSurface(bool notZeroCost, int16_t *totalDeltaMV, int16_t *deltaMV, uint64_t *pSADsArray)
{
  int sadStride = (((2 * DMVR_NUM_ITERATION) + 1));
  uint64_t sadbuffer[5];
  if (notZeroCost && (abs(totalDeltaMV[0]) != (2 << MV_FRACTIONAL_BITS_INTERNAL))
    && (abs(totalDeltaMV[1]) != (2 << MV_FRACTIONAL_BITS_INTERNAL)))
  {
    int32_t tempDeltaMv[2] = { 0,0 };
    sadbuffer[0] = pSADsArray[0];
    sadbuffer[1] = pSADsArray[-1];
    sadbuffer[2] = pSADsArray[-sadStride];
    sadbuffer[3] = pSADsArray[1];
    sadbuffer[4] = pSADsArray[sadStride];
    xSubPelErrorSrfc(sadbuffer, tempDeltaMv);
    totalDeltaMV[0] += tempDeltaMv[0];
    totalDeltaMV[1] += tempDeltaMv[1];
  }
}

void InterPrediction::xinitMC(PredictionUnit& pu, const ClpRngs &clpRngs)
{
  const int refIdx0 = pu.refIdx[0];
  const int refIdx1 = pu.refIdx[1];
  /*use merge MV as starting MV*/
  Mv mergeMVL0(pu.mv[REF_PIC_LIST_0]);
  Mv mergeMVL1(pu.mv[REF_PIC_LIST_1]);

  /*Clip the starting MVs*/
  if( !pu.cs->pps->getWrapAroundEnabledFlag() )
  {
    clipMv( mergeMVL0, pu.lumaPos(), pu.lumaSize(), *pu.cs->sps, *pu.cs->pps );
    clipMv( mergeMVL1, pu.lumaPos(), pu.lumaSize(), *pu.cs->sps, *pu.cs->pps );
  }

  /*L0 MC for refinement*/
  {
    int offset;
#if IF_12TAP
    int leftPixelExtra = (NTAPS_LUMA(0) >> 1) - 1;
#else
    int leftPixelExtra = (NTAPS_LUMA >> 1) - 1;
#endif
    offset = (DMVR_NUM_ITERATION + leftPixelExtra) * (m_cYuvRefBuffDMVRL0.bufs[COMPONENT_Y].stride + 1);
    offset += (-(int)DMVR_NUM_ITERATION)* (int)m_cYuvRefBuffDMVRL0.bufs[COMPONENT_Y].stride;
    offset += (-(int)DMVR_NUM_ITERATION);
    PelBuf srcBuf = m_cYuvRefBuffDMVRL0.bufs[COMPONENT_Y];
    PelUnitBuf yuvPredTempL0 = PelUnitBuf(pu.chromaFormat, PelBuf(m_cYuvPredTempDMVRL0,
      m_biLinearBufStride
      , pu.lwidth() + (2 * DMVR_NUM_ITERATION), pu.lheight() + (2 * DMVR_NUM_ITERATION)));

    xPredInterBlk( COMPONENT_Y, pu, pu.cu->slice->getRefPic( REF_PIC_LIST_0, refIdx0 )->unscaledPic, mergeMVL0, yuvPredTempL0, true, clpRngs.comp[COMPONENT_Y],
      false, false, pu.cu->slice->getScalingRatio( REF_PIC_LIST_0, refIdx0 ), pu.lwidth() + ( 2 * DMVR_NUM_ITERATION ), pu.lheight() + ( 2 * DMVR_NUM_ITERATION ), true, ( (Pel *)srcBuf.buf ) + offset, srcBuf.stride );
  }

  /*L1 MC for refinement*/
  {
    int offset;
#if IF_12TAP
    int leftPixelExtra = (NTAPS_LUMA(0) >> 1) - 1;
#else
    int leftPixelExtra = (NTAPS_LUMA >> 1) - 1;
#endif
    offset = (DMVR_NUM_ITERATION + leftPixelExtra) * (m_cYuvRefBuffDMVRL1.bufs[COMPONENT_Y].stride + 1);
    offset += (-(int)DMVR_NUM_ITERATION)* (int)m_cYuvRefBuffDMVRL1.bufs[COMPONENT_Y].stride;
    offset += (-(int)DMVR_NUM_ITERATION);
    PelBuf srcBuf = m_cYuvRefBuffDMVRL1.bufs[COMPONENT_Y];
    PelUnitBuf yuvPredTempL1 = PelUnitBuf(pu.chromaFormat, PelBuf(m_cYuvPredTempDMVRL1,
      m_biLinearBufStride
      , pu.lwidth() + (2 * DMVR_NUM_ITERATION), pu.lheight() + (2 * DMVR_NUM_ITERATION)));

    xPredInterBlk( COMPONENT_Y, pu, pu.cu->slice->getRefPic( REF_PIC_LIST_1, refIdx1 )->unscaledPic, mergeMVL1, yuvPredTempL1, true, clpRngs.comp[COMPONENT_Y],
      false, false, pu.cu->slice->getScalingRatio( REF_PIC_LIST_1, refIdx1 ), pu.lwidth() + ( 2 * DMVR_NUM_ITERATION ), pu.lheight() + ( 2 * DMVR_NUM_ITERATION ), true, ( (Pel *)srcBuf.buf ) + offset, srcBuf.stride );
  }
}

void InterPrediction::xProcessDMVR(PredictionUnit& pu, PelUnitBuf &pcYuvDst, const ClpRngs &clpRngs, const bool bioApplied)
{
#if MULTI_PASS_DMVR
  CHECK( true, "DMVR is removed when MULTI_PASS_DMVR is turned on." );
#else
  int iterationCount = 1;
  /*Always High Precision*/
  int mvShift = MV_FRACTIONAL_BITS_INTERNAL;

  /*use merge MV as starting MV*/
  Mv mergeMv[] = { pu.mv[REF_PIC_LIST_0] , pu.mv[REF_PIC_LIST_1] };

  m_biLinearBufStride = (MAX_CU_SIZE + (2 * DMVR_NUM_ITERATION));
  int dy = std::min<int>(pu.lumaSize().height, DMVR_SUBCU_HEIGHT);
  int dx = std::min<int>(pu.lumaSize().width,  DMVR_SUBCU_WIDTH);
  Position puPos = pu.lumaPos();

  int bd = pu.cs->slice->getClpRngs().comp[COMPONENT_Y].bd;

  int            bioEnabledThres = 2 * dy * dx;
  bool           bioAppliedType[MAX_NUM_SUBCU_DMVR];

#if JVET_J0090_MEMORY_BANDWITH_MEASURE
  JVET_J0090_SET_CACHE_ENABLE(true);
  for (int k = 0; k < NUM_REF_PIC_LIST_01; k++)
  {
    RefPicList refId = (RefPicList)k;
    const Picture* refPic = pu.cu->slice->getRefPic(refId, pu.refIdx[refId]);
    for (int compID = 0; compID < MAX_NUM_COMPONENT; compID++)
    {
      Mv cMv = pu.mv[refId];
      int mvshiftTemp = mvShift + getComponentScaleX((ComponentID)compID, pu.chromaFormat);
      int filtersize = (compID == (COMPONENT_Y)) ? NTAPS_LUMA : NTAPS_CHROMA;
      cMv += Mv(-(((filtersize >> 1) - 1) << mvshiftTemp), -(((filtersize >> 1) - 1) << mvshiftTemp));
      bool wrapRef = false;
      if ( pu.cs->pps->getWrapAroundEnabledFlag() )
      {
        wrapRef = wrapClipMv(cMv, pu.blocks[0].pos(), pu.blocks[0].size(), pu.cs->sps, pu.cs->pps);
      }
      else
      {
        clipMv(cMv, pu.lumaPos(), pu.lumaSize(), *pu.cs->sps, *pu.cs->pps);
      }

      int width = pcYuvDst.bufs[compID].width + (filtersize - 1);
      int height = pcYuvDst.bufs[compID].height + (filtersize - 1);

      CPelBuf refBuf;
      Position recOffset = pu.blocks[compID].pos().offset(cMv.getHor() >> mvshiftTemp, cMv.getVer() >> mvshiftTemp);
      refBuf = refPic->getRecoBuf(CompArea((ComponentID)compID, pu.chromaFormat, recOffset, pu.blocks[compID].size()), wrapRef);

      JVET_J0090_SET_REF_PICTURE(refPic, (ComponentID)compID);
      for (int row = 0; row < height; row++)
      {
        for (int col = 0; col < width; col++)
        {
          JVET_J0090_CACHE_ACCESS(((Pel *)refBuf.buf) + row * refBuf.stride + col, __FILE__, __LINE__);
        }
      }
    }
  }
  JVET_J0090_SET_CACHE_ENABLE(false);
#endif

  {
    int num = 0;

    int scaleX = getComponentScaleX(COMPONENT_Cb, pu.chromaFormat);
    int scaleY = getComponentScaleY(COMPONENT_Cb, pu.chromaFormat);
    m_biLinearBufStride = (dx + (2 * DMVR_NUM_ITERATION));
    // point mc buffer to cetre point to avoid multiplication to reach each iteration to the begining
    Pel *biLinearPredL0 = m_cYuvPredTempDMVRL0 + (DMVR_NUM_ITERATION * m_biLinearBufStride) + DMVR_NUM_ITERATION;
    Pel *biLinearPredL1 = m_cYuvPredTempDMVRL1 + (DMVR_NUM_ITERATION * m_biLinearBufStride) + DMVR_NUM_ITERATION;

    PredictionUnit subPu = pu;
    subPu.UnitArea::operator=(UnitArea(pu.chromaFormat, Area(puPos.x, puPos.y, dx, dy)));
    m_cYuvRefBuffDMVRL0 = (pu.chromaFormat == CHROMA_400 ?
      PelUnitBuf(pu.chromaFormat, PelBuf(m_cRefSamplesDMVRL0[0], pcYuvDst.Y())) :
      PelUnitBuf(pu.chromaFormat, PelBuf(m_cRefSamplesDMVRL0[0], pcYuvDst.Y()),
        PelBuf(m_cRefSamplesDMVRL0[1], pcYuvDst.Cb()), PelBuf(m_cRefSamplesDMVRL0[2], pcYuvDst.Cr())));
    m_cYuvRefBuffDMVRL0 = m_cYuvRefBuffDMVRL0.subBuf(UnitAreaRelative(pu, subPu));

    m_cYuvRefBuffDMVRL1 = (pu.chromaFormat == CHROMA_400 ?
      PelUnitBuf(pu.chromaFormat, PelBuf(m_cRefSamplesDMVRL1[0], pcYuvDst.Y())) :
      PelUnitBuf(pu.chromaFormat, PelBuf(m_cRefSamplesDMVRL1[0], pcYuvDst.Y()), PelBuf(m_cRefSamplesDMVRL1[1], pcYuvDst.Cb()),
        PelBuf(m_cRefSamplesDMVRL1[2], pcYuvDst.Cr())));
    m_cYuvRefBuffDMVRL1 = m_cYuvRefBuffDMVRL1.subBuf(UnitAreaRelative(pu, subPu));

    PelUnitBuf srcPred0 = (pu.chromaFormat == CHROMA_400 ?
      PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[0][0], pcYuvDst.Y())) :
      PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[0][0], pcYuvDst.Y()), PelBuf(m_acYuvPred[0][1], pcYuvDst.Cb()), PelBuf(m_acYuvPred[0][2], pcYuvDst.Cr())));
    PelUnitBuf srcPred1 = (pu.chromaFormat == CHROMA_400 ?
      PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[1][0], pcYuvDst.Y())) :
      PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvPred[1][0], pcYuvDst.Y()), PelBuf(m_acYuvPred[1][1], pcYuvDst.Cb()), PelBuf(m_acYuvPred[1][2], pcYuvDst.Cr())));

    srcPred0 = srcPred0.subBuf(UnitAreaRelative(pu, subPu));
    srcPred1 = srcPred1.subBuf(UnitAreaRelative(pu, subPu));

    int yStart = 0;

    for (int y = puPos.y; y < (puPos.y + pu.lumaSize().height); y = y + dy, yStart = yStart + dy)
    {
      for (int x = puPos.x, xStart = 0; x < (puPos.x + pu.lumaSize().width); x = x + dx, xStart = xStart + dx)
      {
        subPu.UnitArea::operator=(UnitArea(pu.chromaFormat, Area(x, y, dx, dy)));
        subPu.mv[0] = mergeMv[0];
        subPu.mv[1] = mergeMv[1];
        xPrefetch(subPu, m_cYuvRefBuffDMVRL0, REF_PIC_LIST_0, 1);
        xPrefetch(subPu, m_cYuvRefBuffDMVRL1, REF_PIC_LIST_1, 1);

        xinitMC(subPu, clpRngs);

        uint64_t minCost = MAX_UINT64;
        bool notZeroCost = true;
        int16_t totalDeltaMV[2] = { 0,0 };
        int16_t deltaMV[2] = { 0, 0 };
        uint64_t  *pSADsArray;

        for (int i = 0; i < (((2 * DMVR_NUM_ITERATION) + 1) * ((2 * DMVR_NUM_ITERATION) + 1)); i++)
        {
          m_SADsArray[i] = MAX_UINT64;
        }
        pSADsArray = &m_SADsArray[(((2 * DMVR_NUM_ITERATION) + 1) * ((2 * DMVR_NUM_ITERATION) + 1)) >> 1];

        for (int i = 0; i < iterationCount; i++)
        {
          deltaMV[0] = 0;
          deltaMV[1] = 0;
          Pel *addrL0 = biLinearPredL0 + totalDeltaMV[0] + (totalDeltaMV[1] * m_biLinearBufStride);
          Pel *addrL1 = biLinearPredL1 - totalDeltaMV[0] - (totalDeltaMV[1] * m_biLinearBufStride);
          if (i == 0)
          {
            minCost = xDMVRCost(clpRngs.comp[COMPONENT_Y].bd, addrL0, m_biLinearBufStride, addrL1, m_biLinearBufStride, dx, dy);
            minCost -= (minCost >>2);
            pSADsArray[0] = minCost;
            if (minCost < (dx * dy))
            {
              notZeroCost = false;
              break;
            }
          }
          if (!minCost)
          {
            notZeroCost = false;
            break;
          }

          xBIPMVRefine(bd, addrL0, addrL1, minCost, deltaMV, pSADsArray, dx, dy);

          if (deltaMV[0] == 0 && deltaMV[1] == 0)
          {
            break;
          }
          totalDeltaMV[0] += deltaMV[0];
          totalDeltaMV[1] += deltaMV[1];
          pSADsArray += ((deltaMV[1] * (((2 * DMVR_NUM_ITERATION) + 1))) + deltaMV[0]);
        }

        bioAppliedType[num] = (minCost < bioEnabledThres) ? false : bioApplied;
        totalDeltaMV[0] = (totalDeltaMV[0] << mvShift);
        totalDeltaMV[1] = (totalDeltaMV[1] << mvShift);
        xDMVRSubPixelErrorSurface(notZeroCost, totalDeltaMV, deltaMV, pSADsArray);

        pu.mvdL0SubPu[num] = Mv(totalDeltaMV[0], totalDeltaMV[1]);
        PelUnitBuf subPredBuf = pcYuvDst.subBuf(UnitAreaRelative(pu, subPu));

        bool blockMoved = false;

        if (pu.mvdL0SubPu[num] != Mv(0, 0))
        {
          blockMoved = true;
          if (isChromaEnabled(pu.chromaFormat))
          {
          xPrefetch(subPu, m_cYuvRefBuffDMVRL0, REF_PIC_LIST_0, 0);
          xPrefetch(subPu, m_cYuvRefBuffDMVRL1, REF_PIC_LIST_1, 0);
          }
          xPad(subPu, m_cYuvRefBuffDMVRL0, REF_PIC_LIST_0);
          xPad(subPu, m_cYuvRefBuffDMVRL1, REF_PIC_LIST_1);
        }

        int dstStride[MAX_NUM_COMPONENT] = { pcYuvDst.bufs[COMPONENT_Y].stride,
                                             isChromaEnabled(pu.chromaFormat) ? pcYuvDst.bufs[COMPONENT_Cb].stride : 0,
                                             isChromaEnabled(pu.chromaFormat) ? pcYuvDst.bufs[COMPONENT_Cr].stride : 0};
        subPu.mv[0] = mergeMv[REF_PIC_LIST_0] + pu.mvdL0SubPu[num];
        subPu.mv[1] = mergeMv[REF_PIC_LIST_1] - pu.mvdL0SubPu[num];

        subPu.mv[0].clipToStorageBitDepth();
        subPu.mv[1].clipToStorageBitDepth();

        xFinalPaddedMCForDMVR(subPu, srcPred0, srcPred1, m_cYuvRefBuffDMVRL0, m_cYuvRefBuffDMVRL1, bioAppliedType[num],
                              mergeMv, blockMoved);

        subPredBuf.bufs[COMPONENT_Y].buf = pcYuvDst.bufs[COMPONENT_Y].buf + xStart + yStart * dstStride[COMPONENT_Y];

        if (isChromaEnabled(pu.chromaFormat))
        {
        subPredBuf.bufs[COMPONENT_Cb].buf = pcYuvDst.bufs[COMPONENT_Cb].buf + (xStart >> scaleX) + ((yStart >> scaleY) * dstStride[COMPONENT_Cb]);

        subPredBuf.bufs[COMPONENT_Cr].buf = pcYuvDst.bufs[COMPONENT_Cr].buf + (xStart >> scaleX) + ((yStart >> scaleY) * dstStride[COMPONENT_Cr]);
        }
        xWeightedAverage(subPu, srcPred0, srcPred1, subPredBuf, subPu.cu->slice->getSPS()->getBitDepths(), subPu.cu->slice->clpRngs(), bioAppliedType[num]);
        num++;
      }
    }
  }
  JVET_J0090_SET_CACHE_ENABLE(true);
#endif
}

#if JVET_J0090_MEMORY_BANDWITH_MEASURE
void InterPrediction::cacheAssign( CacheModel *cache )
{
  m_cacheModel = cache;
  m_if.cacheAssign( cache );
  m_if.initInterpolationFilter( !cache->isCacheEnable() );
}
#endif

#if JVET_Y0067_ENHANCED_MMVD_MVD_SIGN_PRED
void  InterPrediction::sortInterMergeMMVDCandidates(PredictionUnit &pu, MergeCtx& mrgCtx, uint32_t * mmvdLUT, uint32_t MMVDIdx)
{
  
  const int tempNum = (const int) (std::min<int>(MMVD_BASE_MV_NUM, mrgCtx.numValidMergeCand) * MMVD_MAX_REFINE_NUM);
  const int groupSize = std::min<int>(tempNum, ADAPTIVE_SUB_GROUP_SIZE_MMVD);
#if _WINDOWS
  Distortion candCostList[MMVD_BASE_MV_NUM* MMVD_MAX_REFINE_NUM];
#else
  Distortion candCostList[tempNum] ;
#endif
  
  for (uint32_t i = 0; i < tempNum; i++)
  {
    mmvdLUT[i] = i;
    candCostList[i] = MAX_UINT;
  }
  Distortion uiCost;
  DistParam cDistParam;
  cDistParam.applyWeight = false;
  int nWidth = pu.lumaSize().width;
  int nHeight = pu.lumaSize().height;
  if (!xAMLGetCurBlkTemplate(pu, nWidth, nHeight))
  {
    return;
  }
  
  int startMMVDIdx = 0;
  int endMMVDIdx = tempNum;
  if(MMVDIdx != -1)
  {
    uint32_t gpId = MMVDIdx/groupSize;
    startMMVDIdx = gpId * groupSize;
    endMMVDIdx = (gpId+1) * groupSize;
  }
  
  int shiftEnc = MMVD_SIZE_SHIFT;
  int encGrpSize = groupSize >> shiftEnc;
  for (int mmvdMergeCand = startMMVDIdx; mmvdMergeCand < endMMVDIdx; mmvdMergeCand++)
  {
    mrgCtx.setMmvdMergeCandiInfo(pu, mmvdMergeCand, mmvdMergeCand);
    
    for (int refList = 0; refList < 2; refList++)
    {
      if (pu.refIdx[refList] >= 0)
      {
        pu.mv[refList].roundToPrecision(MV_PRECISION_QUARTER, MV_PRECISION_INT);
      }
    }
    
    uiCost = 0;
    
    PelUnitBuf pcBufPredRefTop = (PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvRefAMLTemplate[0][0], nWidth, AML_MERGE_TEMPLATE_SIZE)));
    PelUnitBuf pcBufPredCurTop = (PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvCurAMLTemplate[0][0], nWidth, AML_MERGE_TEMPLATE_SIZE)));
    PelUnitBuf pcBufPredRefLeft = (PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvRefAMLTemplate[1][0], AML_MERGE_TEMPLATE_SIZE, nHeight)));
    PelUnitBuf pcBufPredCurLeft = (PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvCurAMLTemplate[1][0], AML_MERGE_TEMPLATE_SIZE, nHeight)));
    
    getBlkAMLRefTemplate(pu, pcBufPredRefTop, pcBufPredRefLeft);
    
    if (m_bAMLTemplateAvailabe[0])
    {
      m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
      
      uiCost += cDistParam.distFunc(cDistParam);
    }
    
    if (m_bAMLTemplateAvailabe[1])
    {
      m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
      
      uiCost += cDistParam.distFunc(cDistParam);
    }
    // update part
    uint32_t i;
    uint32_t shift = 0;
    uint32_t gpIdx = mmvdMergeCand/groupSize;
    uint32_t endIdx = gpIdx * groupSize + encGrpSize;
    while (shift < encGrpSize && uiCost < candCostList[endIdx - 1 - shift])
    {
      shift++;
    }
    if (shift != 0)
    {
      for (i = 1; i < shift; i++)
      {
        mmvdLUT[endIdx - i] = mmvdLUT[endIdx - 1 - i];
        candCostList[endIdx - i] = candCostList[endIdx - 1 - i];
      }
      mmvdLUT[endIdx - shift] = mmvdMergeCand;
      candCostList[endIdx - shift] = uiCost;
    }
  }
  
}
#endif

#if JVET_Y0067_ENHANCED_MMVD_MVD_SIGN_PRED
void  InterPrediction::sortAffineMergeCandidates(PredictionUnit pu, AffineMergeCtx& affMrgCtx, uint32_t * affMmvdLUT, uint32_t afMMVDIdx)
{
  const int tempNum = AF_MMVD_NUM;
  int baseIdxToMergeIdxOffset = (int)PU::getMergeIdxFromAfMmvdBaseIdx(affMrgCtx, 0);
  int baseCount               = std::min<int>((int)AF_MMVD_BASE_NUM, affMrgCtx.numValidMergeCand - baseIdxToMergeIdxOffset);
  const int groupSize = std::min<int>(tempNum, ADAPTIVE_SUB_GROUP_SIZE_MMVD_AFF);
  Distortion candCostList[tempNum];
  for (uint32_t i = 0; i < tempNum; i++)
  {
    affMmvdLUT[i] = i;
    candCostList[i] = MAX_UINT;
  }
  
  if (baseCount < 1)
  {
    return;
  }
  Distortion uiCost;
  
  DistParam cDistParam;
  cDistParam.applyWeight = false;
  
  int nWidth = pu.lumaSize().width;
  int nHeight = pu.lumaSize().height;
  
  if (!xAMLGetCurBlkTemplate(pu, nWidth, nHeight))
  {
    return;
  }
  
  int startMMVDIdx = 0;
  int endMMVDIdx = tempNum;
  if(afMMVDIdx != -1)
  {
    uint32_t gpId = afMMVDIdx/groupSize;
    startMMVDIdx = gpId * groupSize;
    endMMVDIdx = (gpId+1) * groupSize;
  }
  int shiftEnc = AFFINE_MMVD_SIZE_SHIFT;
  int encGrpSize = groupSize >> shiftEnc;
  for (int mmvdMergeCand = startMMVDIdx; mmvdMergeCand < endMMVDIdx; mmvdMergeCand++)
  {
    pu.afMmvdMergeIdx = (uint8_t)mmvdMergeCand;
    
    int baseIdx = (int)mmvdMergeCand / AF_MMVD_MAX_REFINE_NUM;
    int stepIdx = (int)mmvdMergeCand - baseIdx * AF_MMVD_MAX_REFINE_NUM;
    int dirIdx  = stepIdx % AF_MMVD_OFFSET_DIR;
    stepIdx = stepIdx / AF_MMVD_OFFSET_DIR;
    
    pu.cu->affine = true;
    pu.cu->imv    = IMV_OFF;
    pu.cu->mmvdSkip         = false;
    pu.regularMergeFlag = false;
    pu.mmvdMergeFlag    = false;
    pu.mergeFlag      = true;
    pu.afMmvdFlag     = true;
    pu.afMmvdBaseIdx  = (uint8_t)baseIdx;
    pu.afMmvdDir      = (uint8_t)dirIdx;
    pu.afMmvdStep     = (uint8_t)stepIdx;
    pu.mergeIdx       = (uint8_t)(baseIdxToMergeIdxOffset + baseIdx);
    pu.mergeType = affMrgCtx.mergeType[pu.mergeIdx];
    pu.cu->LICFlag = affMrgCtx.LICFlags[pu.mergeIdx];
    pu.cu->LICFlag = false;
    pu.interDir = affMrgCtx.interDirNeighbours[pu.mergeIdx];
    pu.cu->affineType = affMrgCtx.affineType[pu.mergeIdx];
    pu.cu->BcwIdx = affMrgCtx.BcwIdx[pu.mergeIdx];
    pu.ciipFlag = false;
    MvField mvfMmvd[2][3];
    PU::getAfMmvdMvf(pu, affMrgCtx, mvfMmvd, pu.mergeIdx, pu.afMmvdStep, pu.afMmvdDir);
    for (int i = 0; i < 2; i++)
    {
      if( pu.cs->slice->getNumRefIdx( RefPicList( i ) ) > 0 )
      {
        pu.mvpIdx[i] = 0;
        pu.mvpNum[i] = 0;
        pu.mvd[i]    = Mv();
        pu.refIdx[i] = mvfMmvd[i][0].refIdx;
        pu.mvAffi[i][0] = mvfMmvd[i][0].mv;
        pu.mvAffi[i][1] = mvfMmvd[i][1].mv;
        pu.mvAffi[i][2] = mvfMmvd[i][2].mv;
      }
    }
    uiCost = 0;
    PelUnitBuf pcBufPredRefTop = (PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvRefAMLTemplate[0][0], nWidth, AML_MERGE_TEMPLATE_SIZE)));
    PelUnitBuf pcBufPredCurTop = (PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvCurAMLTemplate[0][0], nWidth, AML_MERGE_TEMPLATE_SIZE)));
    PelUnitBuf pcBufPredRefLeft = (PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvRefAMLTemplate[1][0], AML_MERGE_TEMPLATE_SIZE, nHeight)));
    PelUnitBuf pcBufPredCurLeft = (PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvCurAMLTemplate[1][0], AML_MERGE_TEMPLATE_SIZE, nHeight)));
    getAffAMLRefTemplate(pu, pcBufPredRefTop, pcBufPredRefLeft);
    
    if (m_bAMLTemplateAvailabe[0])
    {
      m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
      
      uiCost += cDistParam.distFunc(cDistParam);
    }
    
    if (m_bAMLTemplateAvailabe[1])
    {
      m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
      
      uiCost += cDistParam.distFunc(cDistParam);
    }
    
    // update part
    uint32_t i;
    uint32_t shift = 0;
    uint32_t gpIdx = mmvdMergeCand/groupSize;
    uint32_t endIdx = gpIdx * groupSize + encGrpSize;
    while (shift < encGrpSize && uiCost < candCostList[endIdx - 1 - shift])
    {
      shift++;
    }
    
    if (shift != 0)
    {
      for (i = 1; i < shift; i++)
      {
        affMmvdLUT[endIdx - i] = affMmvdLUT[endIdx - 1 - i];
        candCostList[endIdx - i] = candCostList[endIdx - 1 - i];
      }
      affMmvdLUT[endIdx - shift] = mmvdMergeCand;
      candCostList[endIdx - shift] = uiCost;
    }
    
  }
}
#endif

#if JVET_W0090_ARMC_TM
#if JVET_Y0134_TMVP_NAMVP_CAND_REORDERING
void InterPrediction::adjustMergeCandidatesInOneCandidateGroup(PredictionUnit &pu, MergeCtx& mvpMergeCandCtx, int numRetrievedMergeCand, int mrgCandIdx)
{
  if (mvpMergeCandCtx.numValidMergeCand <= 1)
  {
    return;
  }

  const int numCandInCategory = std::min(numRetrievedMergeCand, mvpMergeCandCtx.numValidMergeCand);

  uint32_t RdCandList[MRG_MAX_NUM_CANDS];
  Distortion candCostList[MRG_MAX_NUM_CANDS];

  for (uint32_t j = 0; j < MRG_MAX_NUM_CANDS; j++)
  {
    RdCandList[j] = j;
    candCostList[j] = MAX_UINT;
  }

  Distortion uiCost;

  DistParam cDistParam;
  cDistParam.applyWeight = false;

  int nWidth = pu.lumaSize().width;
  int nHeight = pu.lumaSize().height;

  auto origMergeIdx = pu.mergeIdx;
  for (uint32_t uiMergeCand = 0; uiMergeCand < mvpMergeCandCtx.numValidMergeCand; uiMergeCand++)
  {
    if (mvpMergeCandCtx.candCost[uiMergeCand] == MAX_UINT64)
    {
      uiCost = 0;

      mvpMergeCandCtx.setMergeInfo(pu, uiMergeCand);

      PelUnitBuf pcBufPredRefTop = (PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvRefAMLTemplate[0][0], nWidth, AML_MERGE_TEMPLATE_SIZE)));
      PelUnitBuf pcBufPredCurTop = (PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvCurAMLTemplate[0][0], nWidth, AML_MERGE_TEMPLATE_SIZE)));
      PelUnitBuf pcBufPredRefLeft = (PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvRefAMLTemplate[1][0], AML_MERGE_TEMPLATE_SIZE, nHeight)));
      PelUnitBuf pcBufPredCurLeft = (PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvCurAMLTemplate[1][0], AML_MERGE_TEMPLATE_SIZE, nHeight)));

      getBlkAMLRefTemplate(pu, pcBufPredRefTop, pcBufPredRefLeft);

      if (m_bAMLTemplateAvailabe[0])
      {
        m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);

        uiCost += cDistParam.distFunc(cDistParam);
      }

      if (m_bAMLTemplateAvailabe[1])
      {
        m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);

        uiCost += cDistParam.distFunc(cDistParam);
      }
    }
    else
    {
      uiCost = mvpMergeCandCtx.candCost[uiMergeCand];
    }

    updateCandList(uiMergeCand, uiCost, numCandInCategory, RdCandList, candCostList);
  }
  pu.mergeIdx = origMergeIdx;

  updateCandInOneCandidateGroup(mvpMergeCandCtx, RdCandList, numCandInCategory);

  mvpMergeCandCtx.numValidMergeCand = numCandInCategory;

  for (int idx = 0; idx < numCandInCategory; idx++)
  {
    mvpMergeCandCtx.candCost[idx] = candCostList[idx];
  }
}

void  InterPrediction::updateCandInOneCandidateGroup(MergeCtx& mrgCtx, uint32_t* RdCandList, int numCandInCategory)
{
  MergeCtx mrgCtxTmp;
  for (uint32_t uiMergeCand = 0; uiMergeCand < mrgCtx.numValidMergeCand; uiMergeCand++)
  {
    mrgCtxTmp.BcwIdx[uiMergeCand] = mrgCtx.BcwIdx[uiMergeCand];
    mrgCtxTmp.interDirNeighbours[uiMergeCand] = mrgCtx.interDirNeighbours[uiMergeCand];
    mrgCtxTmp.mvFieldNeighbours[(uiMergeCand << 1)] = mrgCtx.mvFieldNeighbours[(uiMergeCand << 1)];
    mrgCtxTmp.mvFieldNeighbours[(uiMergeCand << 1) + 1] = mrgCtx.mvFieldNeighbours[(uiMergeCand << 1) + 1];
    mrgCtxTmp.useAltHpelIf[uiMergeCand] = mrgCtx.useAltHpelIf[uiMergeCand];
#if INTER_LIC 
    mrgCtxTmp.LICFlags[uiMergeCand] = mrgCtx.LICFlags[uiMergeCand];
#endif
#if MULTI_HYP_PRED
    mrgCtxTmp.addHypNeighbours[uiMergeCand] = mrgCtx.addHypNeighbours[uiMergeCand];
#endif
  }
  //update
  for (uint32_t uiMergeCand = 0; uiMergeCand < numCandInCategory; uiMergeCand++)
  {
    mrgCtx.BcwIdx[uiMergeCand] = mrgCtxTmp.BcwIdx[RdCandList[uiMergeCand]];
    mrgCtx.interDirNeighbours[uiMergeCand] = mrgCtxTmp.interDirNeighbours[RdCandList[uiMergeCand]];
    mrgCtx.mvFieldNeighbours[(uiMergeCand << 1)] = mrgCtxTmp.mvFieldNeighbours[(RdCandList[uiMergeCand] << 1)];
    mrgCtx.mvFieldNeighbours[(uiMergeCand << 1) + 1] = mrgCtxTmp.mvFieldNeighbours[(RdCandList[uiMergeCand] << 1) + 1];
    mrgCtx.useAltHpelIf[uiMergeCand] = mrgCtxTmp.useAltHpelIf[RdCandList[uiMergeCand]];
#if INTER_LIC
    mrgCtx.LICFlags[uiMergeCand] = mrgCtxTmp.LICFlags[RdCandList[uiMergeCand]];
#endif
#if MULTI_HYP_PRED
    mrgCtx.addHypNeighbours[uiMergeCand] = mrgCtxTmp.addHypNeighbours[RdCandList[uiMergeCand]];
#endif
  }
}
#endif

void  InterPrediction::adjustInterMergeCandidates(PredictionUnit &pu, MergeCtx& mrgCtx, int mrgCandIdx)
{
  uint32_t RdCandList[MRG_MAX_NUM_CANDS][MRG_MAX_NUM_CANDS];
  Distortion candCostList[MRG_MAX_NUM_CANDS][MRG_MAX_NUM_CANDS];

  for (uint32_t i = 0; i < MRG_MAX_NUM_CANDS; i++)
  {
    for (uint32_t j = 0; j < MRG_MAX_NUM_CANDS; j++)
    {
      RdCandList[i][j] = j;
      candCostList[i][j] = MAX_UINT;
    }
  }

  Distortion uiCost;

  DistParam cDistParam;
  cDistParam.applyWeight = false;

  /*const SPS &sps = *pu.cs->sps;
  Position puPos = pu.lumaPos();*/
  int nWidth = pu.lumaSize().width;
  int nHeight = pu.lumaSize().height;

  if (!xAMLGetCurBlkTemplate(pu, nWidth, nHeight))
  {
    return;
  }

#if JVET_X0049_ADAPT_DMVR
  uint8_t origMergeIdx = pu.mergeIdx;
#endif
  for (uint32_t uiMergeCand = ((mrgCandIdx < 0) ? 0 : (mrgCandIdx / ADAPTIVE_SUB_GROUP_SIZE)*ADAPTIVE_SUB_GROUP_SIZE); uiMergeCand < (((mrgCandIdx < 0) || ((mrgCandIdx / ADAPTIVE_SUB_GROUP_SIZE + 1)*ADAPTIVE_SUB_GROUP_SIZE > mrgCtx.numValidMergeCand)) ? mrgCtx.numValidMergeCand : ((mrgCandIdx / ADAPTIVE_SUB_GROUP_SIZE + 1)*ADAPTIVE_SUB_GROUP_SIZE)); ++uiMergeCand)
  {
    bool firstGroup = (uiMergeCand / ADAPTIVE_SUB_GROUP_SIZE) == 0 ? true : false;
    bool lastGroup = ((uiMergeCand / ADAPTIVE_SUB_GROUP_SIZE + 1)*ADAPTIVE_SUB_GROUP_SIZE >= mrgCtx.numValidMergeCand) ? true : false;
    if (lastGroup && !firstGroup)
    {
      break;
    }
    uiCost = 0;

    mrgCtx.setMergeInfo(pu, uiMergeCand);
    PU::spanMotionInfo(pu, mrgCtx);

    PelUnitBuf pcBufPredRefTop = (PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvRefAMLTemplate[0][0], nWidth, AML_MERGE_TEMPLATE_SIZE)));
    PelUnitBuf pcBufPredCurTop = (PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvCurAMLTemplate[0][0], nWidth, AML_MERGE_TEMPLATE_SIZE)));
    PelUnitBuf pcBufPredRefLeft = (PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvRefAMLTemplate[1][0], AML_MERGE_TEMPLATE_SIZE, nHeight)));
    PelUnitBuf pcBufPredCurLeft = (PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvCurAMLTemplate[1][0], AML_MERGE_TEMPLATE_SIZE, nHeight)));

#if JVET_Y0128_NON_CTC
    bool bRefIsRescaled = false;
    for (uint32_t refList = 0; refList < NUM_REF_PIC_LIST_01; refList++)
    {
      const RefPicList eRefPicList = refList ? REF_PIC_LIST_1 : REF_PIC_LIST_0;
      bRefIsRescaled |= (pu.refIdx[refList] >= 0) ? pu.cu->slice->getRefPic(eRefPicList, pu.refIdx[refList])->isRefScaled(pu.cs->pps) : false;
    }
    if ( !bRefIsRescaled )
    {
#endif
    getBlkAMLRefTemplate(pu, pcBufPredRefTop, pcBufPredRefLeft);

    if (m_bAMLTemplateAvailabe[0])
    {
      m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);

      uiCost += cDistParam.distFunc(cDistParam);
    }

    if (m_bAMLTemplateAvailabe[1])
    {
      m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);

      uiCost += cDistParam.distFunc(cDistParam);
    }
#if JVET_Y0128_NON_CTC
    }
#endif

    updateCandList(uiMergeCand, uiCost, ADAPTIVE_SUB_GROUP_SIZE, RdCandList[uiMergeCand / ADAPTIVE_SUB_GROUP_SIZE], candCostList[uiMergeCand / ADAPTIVE_SUB_GROUP_SIZE]);
  }
#if JVET_X0049_ADAPT_DMVR
  pu.mergeIdx = origMergeIdx;
#else
  pu.mergeIdx = mrgCandIdx;    //restore the merge index
#endif
  updateCandInfo(mrgCtx, RdCandList
    , mrgCandIdx
  );

}
bool InterPrediction::xAMLGetCurBlkTemplate(PredictionUnit& pu, int nCurBlkWidth, int nCurBlkHeight)
{
  m_bAMLTemplateAvailabe[0] = xAMLIsTopTempAvailable(pu);
  m_bAMLTemplateAvailabe[1] = xAMLIsLeftTempAvailable(pu);

  if (!m_bAMLTemplateAvailabe[0] && !m_bAMLTemplateAvailabe[1])
  {
    return false;
  }

  /* const int       lumaShift = 2 + MV_FRACTIONAL_BITS_DIFF;
   const int       horShift = (lumaShift + ::getComponentScaleX(COMPONENT_Y, pu.chromaFormat));
   const int       verShift = (lumaShift + ::getComponentScaleY(COMPONENT_Y, pu.chromaFormat));*/
  const Picture&  currPic = *pu.cs->picture;
  const CPelBuf recBuf = currPic.getRecoBuf(pu.cs->picture->blocks[COMPONENT_Y]);
  std::vector<Pel>& invLUT = m_pcReshape->getInvLUT();

  if (m_bAMLTemplateAvailabe[0])
  {
    const Pel*    rec = recBuf.bufAt(pu.blocks[COMPONENT_Y].pos().offset(0, -AML_MERGE_TEMPLATE_SIZE));
    PelBuf pcYBuf = PelBuf(m_acYuvCurAMLTemplate[0][0], nCurBlkWidth, AML_MERGE_TEMPLATE_SIZE);
    Pel*   pcY = pcYBuf.bufAt(0, 0);
    for (int k = 0; k < nCurBlkWidth; k++)
    {
      for (int l = 0; l < AML_MERGE_TEMPLATE_SIZE; l++)
      {
        int recVal = rec[k + l * recBuf.stride];

        if (m_pcReshape->getSliceReshaperInfo().getUseSliceReshaper() && m_pcReshape->getCTUFlag())
        {
          recVal = invLUT[recVal];
        }

        pcY[k + l * nCurBlkWidth] = recVal;
      }
    }
  }

  if (m_bAMLTemplateAvailabe[1])
  {
    PelBuf pcYBuf = PelBuf(m_acYuvCurAMLTemplate[1][0], AML_MERGE_TEMPLATE_SIZE, nCurBlkHeight);
    Pel*   pcY = pcYBuf.bufAt(0, 0);
    const Pel*    rec = recBuf.bufAt(pu.blocks[COMPONENT_Y].pos().offset(-AML_MERGE_TEMPLATE_SIZE, 0));
    for (int k = 0; k < nCurBlkHeight; k++)
    {
      for (int l = 0; l < AML_MERGE_TEMPLATE_SIZE; l++)
      {
        int recVal = rec[recBuf.stride * k + l];

        if (m_pcReshape->getSliceReshaperInfo().getUseSliceReshaper() && m_pcReshape->getCTUFlag())
        {
          recVal = invLUT[recVal];
        }

        pcY[AML_MERGE_TEMPLATE_SIZE * k + l] = recVal;
      }
    }
  }

  return true;
}

bool InterPrediction::xAMLIsTopTempAvailable(PredictionUnit& pu)
{
  const CodingStructure &cs = *pu.cs;
  Position posRT = pu.Y().topRight();
  const PredictionUnit *puAbove = cs.getPURestricted(posRT.offset(0, -1), pu, pu.chType);

  return (puAbove && pu.cu != puAbove->cu);
}

bool InterPrediction::xAMLIsLeftTempAvailable(PredictionUnit& pu)
{
  const CodingStructure &cs = *pu.cs;
  Position posLB = pu.Y().bottomLeft();
  const PredictionUnit *puLeft = cs.getPURestricted(posLB.offset(-1, 0), pu, pu.chType);

  return (puLeft && pu.cu != puLeft->cu);
}
void InterPrediction::updateCandList(uint32_t uiCand, Distortion uiCost, uint32_t uiMrgCandNum, uint32_t* RdCandList, Distortion* CandCostList)
{
  uint32_t i;
  uint32_t shift = 0;

  while (shift < uiMrgCandNum && uiCost < CandCostList[uiMrgCandNum - 1 - shift])
  {
    shift++;
  }

  if (shift != 0)
  {
    for (i = 1; i < shift; i++)
    {
      RdCandList[uiMrgCandNum - i] = RdCandList[uiMrgCandNum - 1 - i];
      CandCostList[uiMrgCandNum - i] = CandCostList[uiMrgCandNum - 1 - i];
    }
    RdCandList[uiMrgCandNum - shift] = uiCand;
    CandCostList[uiMrgCandNum - shift] = uiCost;
  }
}
void  InterPrediction::updateCandInfo(MergeCtx& mrgCtx, uint32_t(*RdCandList)[MRG_MAX_NUM_CANDS], int mrgCandIdx)
{
  MergeCtx mrgCtxTmp;
  for (uint32_t ui = 0; ui < MRG_MAX_NUM_CANDS; ++ui)
  {
    mrgCtxTmp.BcwIdx[ui] = BCW_DEFAULT;
    mrgCtxTmp.interDirNeighbours[ui] = 0;
    mrgCtxTmp.mvFieldNeighbours[(ui << 1)].refIdx = NOT_VALID;
    mrgCtxTmp.mvFieldNeighbours[(ui << 1) + 1].refIdx = NOT_VALID;
    mrgCtxTmp.useAltHpelIf[ui] = false;
#if INTER_LIC
    mrgCtxTmp.LICFlags[ui] = false;
#endif
#if MULTI_HYP_PRED
    mrgCtxTmp.addHypNeighbours[ui].clear();
#endif
  }
  for (uint32_t uiMergeCand = ((mrgCandIdx < 0) ? 0 : (mrgCandIdx / ADAPTIVE_SUB_GROUP_SIZE)*ADAPTIVE_SUB_GROUP_SIZE); uiMergeCand < (((mrgCandIdx < 0) || ((mrgCandIdx / ADAPTIVE_SUB_GROUP_SIZE + 1)*ADAPTIVE_SUB_GROUP_SIZE > mrgCtx.numValidMergeCand)) ? mrgCtx.numValidMergeCand : ((mrgCandIdx / ADAPTIVE_SUB_GROUP_SIZE + 1)*ADAPTIVE_SUB_GROUP_SIZE)); ++uiMergeCand)
  {
    bool firstGroup = (uiMergeCand / ADAPTIVE_SUB_GROUP_SIZE) == 0 ? true : false;
    bool lastGroup = ((uiMergeCand / ADAPTIVE_SUB_GROUP_SIZE + 1)*ADAPTIVE_SUB_GROUP_SIZE >= mrgCtx.numValidMergeCand) ? true : false;
    if (lastGroup && !firstGroup)
    {
      break;
    }
    mrgCtxTmp.BcwIdx[uiMergeCand] = mrgCtx.BcwIdx[uiMergeCand];
    mrgCtxTmp.interDirNeighbours[uiMergeCand] = mrgCtx.interDirNeighbours[uiMergeCand];
    mrgCtxTmp.mvFieldNeighbours[(uiMergeCand << 1)] = mrgCtx.mvFieldNeighbours[(uiMergeCand << 1)];
    mrgCtxTmp.mvFieldNeighbours[(uiMergeCand << 1) + 1] = mrgCtx.mvFieldNeighbours[(uiMergeCand << 1) + 1];
    mrgCtxTmp.useAltHpelIf[uiMergeCand] = mrgCtx.useAltHpelIf[uiMergeCand];
#if INTER_LIC 
    mrgCtxTmp.LICFlags[uiMergeCand] = mrgCtx.LICFlags[uiMergeCand];
#endif
#if MULTI_HYP_PRED
    mrgCtxTmp.addHypNeighbours[uiMergeCand] = mrgCtx.addHypNeighbours[uiMergeCand];
#endif
  }
  //update
  for (uint32_t uiMergeCand = ((mrgCandIdx < 0) ? 0 : (mrgCandIdx / ADAPTIVE_SUB_GROUP_SIZE)*ADAPTIVE_SUB_GROUP_SIZE); uiMergeCand < (((mrgCandIdx < 0) || ((mrgCandIdx / ADAPTIVE_SUB_GROUP_SIZE + 1)*ADAPTIVE_SUB_GROUP_SIZE > mrgCtx.numValidMergeCand)) ? mrgCtx.numValidMergeCand : ((mrgCandIdx / ADAPTIVE_SUB_GROUP_SIZE + 1)*ADAPTIVE_SUB_GROUP_SIZE)); ++uiMergeCand)
  {
    bool firstGroup = (uiMergeCand / ADAPTIVE_SUB_GROUP_SIZE) == 0 ? true : false;
    bool lastGroup = ((uiMergeCand / ADAPTIVE_SUB_GROUP_SIZE + 1)*ADAPTIVE_SUB_GROUP_SIZE >= mrgCtx.numValidMergeCand) ? true : false;
    if (lastGroup && !firstGroup)
    {
      break;
    }

    mrgCtx.BcwIdx[uiMergeCand] = mrgCtxTmp.BcwIdx[RdCandList[uiMergeCand / ADAPTIVE_SUB_GROUP_SIZE][uiMergeCand%ADAPTIVE_SUB_GROUP_SIZE]];
    mrgCtx.interDirNeighbours[uiMergeCand] = mrgCtxTmp.interDirNeighbours[RdCandList[uiMergeCand / ADAPTIVE_SUB_GROUP_SIZE][uiMergeCand%ADAPTIVE_SUB_GROUP_SIZE]];
    mrgCtx.mvFieldNeighbours[(uiMergeCand << 1)] = mrgCtxTmp.mvFieldNeighbours[(RdCandList[uiMergeCand / ADAPTIVE_SUB_GROUP_SIZE][uiMergeCand%ADAPTIVE_SUB_GROUP_SIZE] << 1)];
    mrgCtx.mvFieldNeighbours[(uiMergeCand << 1) + 1] = mrgCtxTmp.mvFieldNeighbours[(RdCandList[uiMergeCand / ADAPTIVE_SUB_GROUP_SIZE][uiMergeCand%ADAPTIVE_SUB_GROUP_SIZE] << 1) + 1];
    mrgCtx.useAltHpelIf[uiMergeCand] = mrgCtxTmp.useAltHpelIf[RdCandList[uiMergeCand / ADAPTIVE_SUB_GROUP_SIZE][uiMergeCand%ADAPTIVE_SUB_GROUP_SIZE]];
#if INTER_LIC
    mrgCtx.LICFlags[uiMergeCand] = mrgCtxTmp.LICFlags[RdCandList[uiMergeCand / ADAPTIVE_SUB_GROUP_SIZE][uiMergeCand%ADAPTIVE_SUB_GROUP_SIZE]];
#endif
#if MULTI_HYP_PRED
    mrgCtx.addHypNeighbours[uiMergeCand] = mrgCtxTmp.addHypNeighbours[RdCandList[uiMergeCand / ADAPTIVE_SUB_GROUP_SIZE][uiMergeCand%ADAPTIVE_SUB_GROUP_SIZE]];
#endif
  }
}
void InterPrediction::getBlkAMLRefTemplate(PredictionUnit &pu, PelUnitBuf &pcBufPredRefTop, PelUnitBuf &pcBufPredRefLeft)
{
  Mv mvCurr;
  const int lumaShift = 2 + MV_FRACTIONAL_BITS_DIFF;
  const int horShift  = (lumaShift + ::getComponentScaleX(COMPONENT_Y, pu.chromaFormat));
  const int verShift  = (lumaShift + ::getComponentScaleY(COMPONENT_Y, pu.chromaFormat));

  if (xCheckIdenticalMotion(pu))
  {
    mvCurr = pu.mv[0];
    /*const int horIntMv = (mvCurr.getHor() + ((1 << horShift) >> 1)) >> horShift;
    const int verIntMv = (mvCurr.getVer() + ((1 << verShift) >> 1)) >> verShift;
    Mv    subPelMv(horIntMv << horShift, verIntMv << verShift);*/
    Mv subPelMv = mvCurr;
    clipMv(mvCurr, pu.lumaPos(), pu.lumaSize(), *pu.cs->sps, *pu.cs->pps);
    CHECK(pu.refIdx[0] < 0, "invalid ref idx");

    if (m_bAMLTemplateAvailabe[0])
    {
      Mv mvTop(0, -(AML_MERGE_TEMPLATE_SIZE << verShift));
      mvTop += subPelMv;

      clipMv(mvTop, pu.lumaPos(), pu.lumaSize(), *pu.cs->sps, *pu.cs->pps);

#if RPR_ENABLE
      const Picture* picRef = pu.cu->slice->getRefPic(REF_PIC_LIST_0, pu.refIdx[0])->unscaledPic;
      const std::pair<int, int>& scalingRatio = pu.cu->slice->getScalingRatio(REF_PIC_LIST_0, pu.refIdx[0]);
#if INTER_LIC
      xPredInterBlk( COMPONENT_Y, pu, picRef, mvTop, pcBufPredRefTop,
                     false, pu.cu->slice->clpRng( COMPONENT_Y ), false, false, scalingRatio, 0, 0, false, NULL, 0, true, true, mvCurr );
#else
      xPredInterBlk( COMPONENT_Y, pu, picRef, mvTop, pcBufPredRefTop,
                     false, pu.cu->slice->clpRng( COMPONENT_Y ), false, false, scalingRatio, 0, 0, false, NULL, 0, true );
#endif
#else
#if INTER_LIC
      xPredInterBlk( COMPONENT_Y, pu, pu.cu->slice->getRefPic( REF_PIC_LIST_0, pu.refIdx[0] ), mvTop, pcBufPredRefTop,
                     false, pu.cu->slice->clpRng( COMPONENT_Y ), false, false, SCALE_1X, 0, 0, false, NULL, 0, true, true,
                     mvCurr );
#else
      xPredInterBlk( COMPONENT_Y, pu, pu.cu->slice->getRefPic( REF_PIC_LIST_0, pu.refIdx[0] ), mvTop, pcBufPredRefTop,
                     false, pu.cu->slice->clpRng( COMPONENT_Y ), false, false, SCALE_1X, 0, 0, false, NULL, 0, true );
#endif
#endif
    }
    if (m_bAMLTemplateAvailabe[1])
    {
      Mv mvLeft(-(AML_MERGE_TEMPLATE_SIZE << horShift), 0);
      mvLeft += subPelMv;

      clipMv(mvLeft, pu.lumaPos(), pu.lumaSize(), *pu.cs->sps, *pu.cs->pps);

#if RPR_ENABLE
      const Picture* picRef = pu.cu->slice->getRefPic(REF_PIC_LIST_0, pu.refIdx[0])->unscaledPic;
      const std::pair<int, int>& scalingRatio = pu.cu->slice->getScalingRatio(REF_PIC_LIST_0, pu.refIdx[0]);
#if INTER_LIC
      xPredInterBlk( COMPONENT_Y, pu, picRef, mvLeft, pcBufPredRefLeft,
                     false, pu.cu->slice->clpRng( COMPONENT_Y ), false, false, scalingRatio, 0, 0, false, NULL, 0, true, true, mvCurr );
#else
      xPredInterBlk( COMPONENT_Y, pu, picRef, mvLeft, pcBufPredRefLeft,
                     false, pu.cu->slice->clpRng( COMPONENT_Y ), false, false, scalingRatio, 0, 0, false, NULL, 0, true );
#endif
#else
#if INTER_LIC
      xPredInterBlk( COMPONENT_Y, pu, pu.cu->slice->getRefPic( REF_PIC_LIST_0, pu.refIdx[0] ), mvLeft, pcBufPredRefLeft,
                     false, pu.cu->slice->clpRng( COMPONENT_Y ), false, false, SCALE_1X, 0, 0, false, NULL, 0, true, true,
                     mvCurr );
#else
      xPredInterBlk( COMPONENT_Y, pu, pu.cu->slice->getRefPic( REF_PIC_LIST_0, pu.refIdx[0] ), mvLeft, pcBufPredRefLeft,
                     false, pu.cu->slice->clpRng( COMPONENT_Y ), false, false, SCALE_1X, 0, 0, false, NULL, 0, true );
#endif
#endif
    }
  }
  else
  {
    for (uint32_t refList = 0; refList < NUM_REF_PIC_LIST_01; refList++)
    {
      if (pu.refIdx[refList] < 0)
      {
        continue;
      }
      RefPicList eRefPicList = (refList ? REF_PIC_LIST_1 : REF_PIC_LIST_0);
      CHECK(pu.refIdx[refList] >= pu.cu->slice->getNumRefIdx(eRefPicList), "Invalid reference index");

      m_iRefListIdx = refList;
      mvCurr        = pu.mv[refList];
      /*const int horIntMv = (mvCurr.getHor() + ((1 << horShift) >> 1)) >> horShift;
      const int verIntMv = (mvCurr.getVer() + ((1 << verShift) >> 1)) >> verShift;
      Mv    subPelMv(horIntMv << horShift, verIntMv << verShift);*/
      Mv subPelMv = mvCurr;
      clipMv(mvCurr, pu.lumaPos(), pu.lumaSize(), *pu.cs->sps, *pu.cs->pps);

      if (m_bAMLTemplateAvailabe[0])
      {
        Mv mvTop(0, -(AML_MERGE_TEMPLATE_SIZE << verShift));
        mvTop += subPelMv;

        clipMv(mvTop, pu.lumaPos(), pu.lumaSize(), *pu.cs->sps, *pu.cs->pps);

        PelUnitBuf pcMbBuf =
          PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvRefAboveTemplate[refList][0], pcBufPredRefTop.Y()));

        if (pu.refIdx[0] >= 0 && pu.refIdx[1] >= 0)
        {
#if RPR_ENABLE
          const Picture* picRef = pu.cu->slice->getRefPic(eRefPicList, pu.refIdx[refList])->unscaledPic;
          const std::pair<int, int>& scalingRatio = pu.cu->slice->getScalingRatio(eRefPicList, pu.refIdx[refList]);
#if INTER_LIC
          xPredInterBlk( COMPONENT_Y, pu, picRef, mvTop, pcMbBuf, true,
                         pu.cu->slice->clpRng( COMPONENT_Y ), false, false, scalingRatio, 0, 0, false, NULL, 0, true, true, mvCurr );
#else
          xPredInterBlk( COMPONENT_Y, pu, picRef, mvTop, pcMbBuf, true,
                         pu.cu->slice->clpRng( COMPONENT_Y ), false, false, scalingRatio, 0, 0, false, NULL, 0, true );
#endif
#else
#if INTER_LIC
          xPredInterBlk( COMPONENT_Y, pu, pu.cu->slice->getRefPic( eRefPicList, pu.refIdx[refList] ), mvTop, pcMbBuf, true,
                         pu.cu->slice->clpRng( COMPONENT_Y ), false, false, SCALE_1X, 0, 0, false, NULL, 0, true, true,
                         mvCurr );
#else
          xPredInterBlk( COMPONENT_Y, pu, pu.cu->slice->getRefPic( eRefPicList, pu.refIdx[refList] ), mvTop, pcMbBuf, true,
                         pu.cu->slice->clpRng( COMPONENT_Y ), false, false, SCALE_1X, 0, 0, false, NULL, 0, true );
#endif
#endif
        }
        else
        {
#if RPR_ENABLE
          const Picture* picRef = pu.cu->slice->getRefPic(eRefPicList, pu.refIdx[refList])->unscaledPic;
          const std::pair<int, int>& scalingRatio = pu.cu->slice->getScalingRatio(eRefPicList, pu.refIdx[refList]);
#if INTER_LIC
          xPredInterBlk( COMPONENT_Y, pu, picRef, mvTop, pcMbBuf,
                         false, pu.cu->slice->clpRng( COMPONENT_Y ), false, false, scalingRatio, 0, 0, false, NULL, 0, true, true, mvCurr );
#else
          xPredInterBlk( COMPONENT_Y, pu, picRef, mvTop, pcMbBuf,
                         false, pu.cu->slice->clpRng( COMPONENT_Y ), false, false, scalingRatio, 0, 0, false, NULL, 0, true );
#endif
#else
#if INTER_LIC
          xPredInterBlk( COMPONENT_Y, pu, pu.cu->slice->getRefPic( eRefPicList, pu.refIdx[refList] ), mvTop, pcMbBuf,
                         false, pu.cu->slice->clpRng( COMPONENT_Y ), false, false, SCALE_1X, 0, 0, false, NULL, 0, true, true, mvCurr );
#else
          xPredInterBlk( COMPONENT_Y, pu, pu.cu->slice->getRefPic( eRefPicList, pu.refIdx[refList] ), mvTop, pcMbBuf,
                         false, pu.cu->slice->clpRng( COMPONENT_Y ), false, false, SCALE_1X, 0, 0, false, NULL, 0, true );
#endif
#endif
        }
      }
      if (m_bAMLTemplateAvailabe[1])
      {
        Mv mvLeft(-(AML_MERGE_TEMPLATE_SIZE << horShift), 0);
        mvLeft += subPelMv;

        clipMv(mvLeft, pu.lumaPos(), pu.lumaSize(), *pu.cs->sps, *pu.cs->pps);

        PelUnitBuf pcMbBuf =
          PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvRefLeftTemplate[refList][0], pcBufPredRefLeft.Y()));

        if (pu.refIdx[0] >= 0 && pu.refIdx[1] >= 0)
        {
#if RPR_ENABLE
          const Picture* picRef = pu.cu->slice->getRefPic(eRefPicList, pu.refIdx[refList])->unscaledPic;
          const std::pair<int, int>& scalingRatio = pu.cu->slice->getScalingRatio( eRefPicList, pu.refIdx[refList] );
#if INTER_LIC
          xPredInterBlk( COMPONENT_Y, pu, picRef, mvLeft, pcMbBuf,
                         true, pu.cu->slice->clpRng( COMPONENT_Y ), false, false, scalingRatio, 0, 0, false, NULL, 0, true, true, mvCurr );
#else
          xPredInterBlk( COMPONENT_Y, pu, picRef, mvLeft, pcMbBuf,
                         true, pu.cu->slice->clpRng( COMPONENT_Y ), false, false, scalingRatio, 0, 0, false, NULL, 0, true );
#endif
#else
  #if INTER_LIC
          xPredInterBlk(COMPONENT_Y, pu, pu.cu->slice->getRefPic(eRefPicList, pu.refIdx[refList]), mvLeft, pcMbBuf,
                        true, pu.cu->slice->clpRng(COMPONENT_Y), false, false, SCALE_1X, 0, 0, false, NULL, 0, true, true, mvCurr);
  #else
          xPredInterBlk(COMPONENT_Y, pu, pu.cu->slice->getRefPic(eRefPicList, pu.refIdx[refList]), mvLeft, pcMbBuf,
                        true, pu.cu->slice->clpRng(COMPONENT_Y), false, false, SCALE_1X, 0, 0, false, NULL, 0, true);
  #endif
#endif
        }
        else
        {
#if RPR_ENABLE
          const Picture* picRef = pu.cu->slice->getRefPic(eRefPicList, pu.refIdx[refList])->unscaledPic;
          const std::pair<int, int>& scalingRatio = pu.cu->slice->getScalingRatio(eRefPicList, pu.refIdx[refList]);
#if INTER_LIC
          xPredInterBlk( COMPONENT_Y, pu, picRef, mvLeft, pcMbBuf,
                         false, pu.cu->slice->clpRng( COMPONENT_Y ), false, false, scalingRatio, 0, 0, false, NULL, 0, true, true, mvCurr );
#else
          xPredInterBlk( COMPONENT_Y, pu, picRef, mvLeft, pcMbBuf,
                         false, pu.cu->slice->clpRng( COMPONENT_Y ), false, false, scalingRatio, 0, 0, false, NULL, 0, true );
#endif
#else
#if INTER_LIC
          xPredInterBlk( COMPONENT_Y, pu, pu.cu->slice->getRefPic( eRefPicList, pu.refIdx[refList] ), mvLeft, pcMbBuf,
                         false, pu.cu->slice->clpRng( COMPONENT_Y ), false, false, SCALE_1X, 0, 0, false, NULL, 0, true, true, mvCurr );
#else
          xPredInterBlk( COMPONENT_Y, pu, pu.cu->slice->getRefPic( eRefPicList, pu.refIdx[refList] ), mvLeft, pcMbBuf,
                         false, pu.cu->slice->clpRng( COMPONENT_Y ), false, false, SCALE_1X, 0, 0, false, NULL, 0, true );
#endif
#endif
        }
      }
    }
    if (m_bAMLTemplateAvailabe[0])
    {
      CPelUnitBuf srcPred0 = CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvRefAboveTemplate[0][0], pcBufPredRefTop.Y()));
      CPelUnitBuf srcPred1 = CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvRefAboveTemplate[1][0], pcBufPredRefTop.Y()));
      xWeightedAverageY(pu, srcPred0, srcPred1, pcBufPredRefTop, pu.cu->slice->getSPS()->getBitDepths(),
                        pu.cu->slice->clpRngs());
    }
    if (m_bAMLTemplateAvailabe[1])
    {
      CPelUnitBuf srcPred0 = CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvRefLeftTemplate[0][0], pcBufPredRefLeft.Y()));
      CPelUnitBuf srcPred1 = CPelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvRefLeftTemplate[1][0], pcBufPredRefLeft.Y()));
      xWeightedAverageY(pu, srcPred0, srcPred1, pcBufPredRefLeft, pu.cu->slice->getSPS()->getBitDepths(),
                        pu.cu->slice->clpRngs());
    }
  }
}

void  InterPrediction::adjustAffineMergeCandidates(PredictionUnit &pu, AffineMergeCtx& affMrgCtx, int mrgCandIdx)
{
  uint32_t RdCandList[AFFINE_MRG_MAX_NUM_CANDS][AFFINE_MRG_MAX_NUM_CANDS];
  Distortion candCostList[AFFINE_MRG_MAX_NUM_CANDS][AFFINE_MRG_MAX_NUM_CANDS];

  for (uint32_t i = 0; i < AFFINE_MRG_MAX_NUM_CANDS; i++)
  {
    for (uint32_t j = 0; j < AFFINE_MRG_MAX_NUM_CANDS; j++)
    {
      RdCandList[i][j] = j;
      candCostList[i][j] = MAX_UINT;
    }
  }

  Distortion uiCost;

  DistParam cDistParam;
  cDistParam.applyWeight = false;

  int nWidth = pu.lumaSize().width;
  int nHeight = pu.lumaSize().height;

  if (!xAMLGetCurBlkTemplate(pu, nWidth, nHeight))
  {
    return;
  }

  for (uint32_t uiMergeCand = ((mrgCandIdx < 0) ? 0 : (mrgCandIdx / ADAPTIVE_AFFINE_SUB_GROUP_SIZE)*ADAPTIVE_AFFINE_SUB_GROUP_SIZE); uiMergeCand < (((mrgCandIdx < 0) || ((mrgCandIdx / ADAPTIVE_AFFINE_SUB_GROUP_SIZE + 1)*ADAPTIVE_AFFINE_SUB_GROUP_SIZE > affMrgCtx.maxNumMergeCand)) ? affMrgCtx.maxNumMergeCand : ((mrgCandIdx / ADAPTIVE_AFFINE_SUB_GROUP_SIZE + 1)*ADAPTIVE_AFFINE_SUB_GROUP_SIZE)); ++uiMergeCand)
  {
    bool firstGroup = (uiMergeCand / ADAPTIVE_AFFINE_SUB_GROUP_SIZE) == 0 ? true : false;
    bool lastGroup = ((uiMergeCand / ADAPTIVE_AFFINE_SUB_GROUP_SIZE + 1)*ADAPTIVE_AFFINE_SUB_GROUP_SIZE >= affMrgCtx.maxNumMergeCand) ? true : false;
    if (lastGroup && !firstGroup)
    {
      break;
    }
    uiCost = 0;

    // set merge information
    pu.interDir = affMrgCtx.interDirNeighbours[uiMergeCand];
    pu.mergeFlag = true;
    pu.regularMergeFlag = false;
    pu.mergeIdx = uiMergeCand;
    pu.cu->affine = true;
    pu.cu->affineType = affMrgCtx.affineType[uiMergeCand];
#if AFFINE_MMVD
    pu.afMmvdFlag = false;
#endif
    pu.cu->BcwIdx = affMrgCtx.BcwIdx[uiMergeCand];
#if INTER_LIC
    pu.cu->LICFlag = affMrgCtx.LICFlags[uiMergeCand];
#endif

    pu.mergeType = affMrgCtx.mergeType[uiMergeCand];
    if (pu.mergeType == MRG_TYPE_SUBPU_ATMVP)
    {
      pu.refIdx[0] = affMrgCtx.mvFieldNeighbours[(uiMergeCand << 1) + 0][0].refIdx;
      pu.refIdx[1] = affMrgCtx.mvFieldNeighbours[(uiMergeCand << 1) + 1][0].refIdx;
      PU::spanMotionInfo(pu, *affMrgCtx.mrgCtx);
    }
    else
    {
      for (uint32_t refList = 0; refList < NUM_REF_PIC_LIST_01; refList++)
      {
        for (int i = 0; i < 3; i++)
        {
          pu.mvAffi[refList][i] = affMrgCtx.mvFieldNeighbours[(uiMergeCand << 1) + refList][i].mv;
        }
        pu.refIdx[refList] = affMrgCtx.mvFieldNeighbours[(uiMergeCand << 1) + refList][0].refIdx;
      }

      PelUnitBuf pcBufPredRefTop = (PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvRefAMLTemplate[0][0], nWidth, AML_MERGE_TEMPLATE_SIZE)));
      PelUnitBuf pcBufPredCurTop = (PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvCurAMLTemplate[0][0], nWidth, AML_MERGE_TEMPLATE_SIZE)));
      PelUnitBuf pcBufPredRefLeft = (PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvRefAMLTemplate[1][0], AML_MERGE_TEMPLATE_SIZE, nHeight)));
      PelUnitBuf pcBufPredCurLeft = (PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvCurAMLTemplate[1][0], AML_MERGE_TEMPLATE_SIZE, nHeight)));

#if RPR_ENABLE
      bool bRefIsRescaled = false;
      for (uint32_t refList = 0; refList < NUM_REF_PIC_LIST_01; refList++)
      {
        const RefPicList eRefPicList = refList ? REF_PIC_LIST_1 : REF_PIC_LIST_0;
        bRefIsRescaled |= (pu.refIdx[refList] >= 0) ? pu.cu->slice->getRefPic(eRefPicList, pu.refIdx[refList])->isRefScaled(pu.cs->pps) : false;
      }
      if ( !bRefIsRescaled )
      {
#endif

      getAffAMLRefTemplate(pu, pcBufPredRefTop, pcBufPredRefLeft);

      if (m_bAMLTemplateAvailabe[0])
      {
        m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);

        uiCost += cDistParam.distFunc(cDistParam);
      }

      if (m_bAMLTemplateAvailabe[1])
      {
        m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);

        uiCost += cDistParam.distFunc(cDistParam);
      }

#if RPR_ENABLE
      }
#endif
    }
    updateCandList(uiMergeCand, uiCost, ADAPTIVE_AFFINE_SUB_GROUP_SIZE, RdCandList[uiMergeCand / ADAPTIVE_AFFINE_SUB_GROUP_SIZE], candCostList[uiMergeCand / ADAPTIVE_AFFINE_SUB_GROUP_SIZE]);
  }
  pu.mergeIdx = mrgCandIdx;    //restore the merge index
  updateAffineCandInfo(pu, affMrgCtx, RdCandList
    , mrgCandIdx
  );

}
void  InterPrediction::updateAffineCandInfo(PredictionUnit &pu, AffineMergeCtx& affMrgCtx, uint32_t(*RdCandList)[AFFINE_MRG_MAX_NUM_CANDS], int mrgCandIdx)
{
  AffineMergeCtx affMrgCtxTmp;
  const uint32_t maxNumAffineMergeCand = pu.cs->slice->getPicHeader()->getMaxNumAffineMergeCand();
  for (int i = 0; i < maxNumAffineMergeCand; i++)
  {
    for (int mvNum = 0; mvNum < 3; mvNum++)
    {
      affMrgCtxTmp.mvFieldNeighbours[(i << 1) + 0][mvNum].setMvField(Mv(), -1);
      affMrgCtxTmp.mvFieldNeighbours[(i << 1) + 1][mvNum].setMvField(Mv(), -1);
    }
    affMrgCtxTmp.interDirNeighbours[i] = 0;
    affMrgCtxTmp.affineType[i] = AFFINEMODEL_4PARAM;
    affMrgCtxTmp.mergeType[i] = MRG_TYPE_DEFAULT_N;
    affMrgCtxTmp.BcwIdx[i] = BCW_DEFAULT;
#if INTER_LIC
    affMrgCtxTmp.LICFlags[i] = false;
#endif
  }
  for (uint32_t uiMergeCand = ((mrgCandIdx < 0) ? 0 : (mrgCandIdx / ADAPTIVE_AFFINE_SUB_GROUP_SIZE)*ADAPTIVE_AFFINE_SUB_GROUP_SIZE); uiMergeCand < (((mrgCandIdx < 0) || ((mrgCandIdx / ADAPTIVE_AFFINE_SUB_GROUP_SIZE + 1)*ADAPTIVE_AFFINE_SUB_GROUP_SIZE > affMrgCtx.maxNumMergeCand)) ? affMrgCtx.maxNumMergeCand : ((mrgCandIdx / ADAPTIVE_AFFINE_SUB_GROUP_SIZE + 1)*ADAPTIVE_AFFINE_SUB_GROUP_SIZE)); ++uiMergeCand)
  {
    bool firstGroup = (uiMergeCand / ADAPTIVE_AFFINE_SUB_GROUP_SIZE) == 0 ? true : false;
    bool lastGroup = ((uiMergeCand / ADAPTIVE_AFFINE_SUB_GROUP_SIZE + 1)*ADAPTIVE_AFFINE_SUB_GROUP_SIZE >= affMrgCtx.maxNumMergeCand) ? true : false;
    if (lastGroup && !firstGroup)
    {
      break;
    }
    for (int mvNum = 0; mvNum < 3; mvNum++)
    {
      affMrgCtxTmp.mvFieldNeighbours[(uiMergeCand << 1) + 0][mvNum] = affMrgCtx.mvFieldNeighbours[(uiMergeCand << 1) + 0][mvNum];
      affMrgCtxTmp.mvFieldNeighbours[(uiMergeCand << 1) + 1][mvNum] = affMrgCtx.mvFieldNeighbours[(uiMergeCand << 1) + 1][mvNum];
    }
    affMrgCtxTmp.interDirNeighbours[uiMergeCand] = affMrgCtx.interDirNeighbours[uiMergeCand];
    affMrgCtxTmp.affineType[uiMergeCand] = affMrgCtx.affineType[uiMergeCand];
    affMrgCtxTmp.mergeType[uiMergeCand] = affMrgCtx.mergeType[uiMergeCand];
    affMrgCtxTmp.BcwIdx[uiMergeCand] = affMrgCtx.BcwIdx[uiMergeCand];
#if INTER_LIC                                                   
    affMrgCtxTmp.LICFlags[uiMergeCand] = affMrgCtx.LICFlags[uiMergeCand];
#endif
  }
  //update
  for (uint32_t uiMergeCand = ((mrgCandIdx < 0) ? 0 : (mrgCandIdx / ADAPTIVE_AFFINE_SUB_GROUP_SIZE)*ADAPTIVE_AFFINE_SUB_GROUP_SIZE); uiMergeCand < (((mrgCandIdx < 0) || ((mrgCandIdx / ADAPTIVE_AFFINE_SUB_GROUP_SIZE + 1)*ADAPTIVE_AFFINE_SUB_GROUP_SIZE > affMrgCtx.maxNumMergeCand)) ? affMrgCtx.maxNumMergeCand : ((mrgCandIdx / ADAPTIVE_AFFINE_SUB_GROUP_SIZE + 1)*ADAPTIVE_AFFINE_SUB_GROUP_SIZE)); ++uiMergeCand)
  {
    bool firstGroup = (uiMergeCand / ADAPTIVE_AFFINE_SUB_GROUP_SIZE) == 0 ? true : false;
    bool lastGroup = ((uiMergeCand / ADAPTIVE_AFFINE_SUB_GROUP_SIZE + 1)*ADAPTIVE_AFFINE_SUB_GROUP_SIZE >= affMrgCtx.maxNumMergeCand) ? true : false;
    if (lastGroup && !firstGroup)
    {
      break;
    }
    for (int mvNum = 0; mvNum < 3; mvNum++)
    {
      affMrgCtx.mvFieldNeighbours[(uiMergeCand << 1) + 0][mvNum] = affMrgCtxTmp.mvFieldNeighbours[(RdCandList[uiMergeCand / ADAPTIVE_AFFINE_SUB_GROUP_SIZE][uiMergeCand%ADAPTIVE_AFFINE_SUB_GROUP_SIZE] << 1) + 0][mvNum];
      affMrgCtx.mvFieldNeighbours[(uiMergeCand << 1) + 1][mvNum] = affMrgCtxTmp.mvFieldNeighbours[(RdCandList[uiMergeCand / ADAPTIVE_AFFINE_SUB_GROUP_SIZE][uiMergeCand%ADAPTIVE_AFFINE_SUB_GROUP_SIZE] << 1) + 1][mvNum];
    }
    affMrgCtx.interDirNeighbours[uiMergeCand] = affMrgCtxTmp.interDirNeighbours[RdCandList[uiMergeCand / ADAPTIVE_AFFINE_SUB_GROUP_SIZE][uiMergeCand%ADAPTIVE_AFFINE_SUB_GROUP_SIZE]];
    affMrgCtx.affineType[uiMergeCand] = affMrgCtxTmp.affineType[RdCandList[uiMergeCand / ADAPTIVE_AFFINE_SUB_GROUP_SIZE][uiMergeCand%ADAPTIVE_AFFINE_SUB_GROUP_SIZE]];
    affMrgCtx.mergeType[uiMergeCand] = affMrgCtxTmp.mergeType[RdCandList[uiMergeCand / ADAPTIVE_AFFINE_SUB_GROUP_SIZE][uiMergeCand%ADAPTIVE_AFFINE_SUB_GROUP_SIZE]];
    affMrgCtx.BcwIdx[uiMergeCand] = affMrgCtxTmp.BcwIdx[RdCandList[uiMergeCand / ADAPTIVE_AFFINE_SUB_GROUP_SIZE][uiMergeCand%ADAPTIVE_AFFINE_SUB_GROUP_SIZE]];
#if INTER_LIC 
    affMrgCtx.LICFlags[uiMergeCand] = affMrgCtxTmp.LICFlags[RdCandList[uiMergeCand / ADAPTIVE_AFFINE_SUB_GROUP_SIZE][uiMergeCand%ADAPTIVE_AFFINE_SUB_GROUP_SIZE]];
#endif
  }
}
void InterPrediction::xGetSublkAMLTemplate(const CodingUnit& cu,
  const ComponentID compID,
  const Picture&    refPic,
  const Mv&         mv,
  const int         sublkWidth,
  const int         sublkHeight,
  const int         posW,
  const int         posH,
  int*              numTemplate,
  Pel*              refLeftTemplate,
  Pel*              refAboveTemplate
#if JVET_Y0067_ENHANCED_MMVD_MVD_SIGN_PRED
     , bool afMMVD
#endif
                                           )
{
  const int       bitDepth = cu.cs->sps->getBitDepth(toChannelType(compID));
  const int       precShift = std::max(0, bitDepth - 12);

  const CodingUnit* const cuAbove = cu.cs->getCU(cu.blocks[compID].pos().offset(0, -1), toChannelType(compID));
  const CodingUnit* const cuLeft = cu.cs->getCU(cu.blocks[compID].pos().offset(-1, 0), toChannelType(compID));
  const CPelBuf refBuf = cuAbove || cuLeft ? refPic.getRecoBuf(refPic.blocks[compID]) : CPelBuf();

  // above
  if (cuAbove && posH == 0)
  {
#if JVET_Y0067_ENHANCED_MMVD_MVD_SIGN_PRED
    xGetPredBlkTpl<true>(cu, compID, refBuf, mv, posW, posH, sublkWidth, refAboveTemplate, afMMVD);
#else
    xGetPredBlkTpl<true>(cu, compID, refBuf, mv, posW, posH, sublkWidth, refAboveTemplate);
#endif

    for (int k = posW; k < posW + sublkWidth; k++)
    {
      int refVal = refAboveTemplate[k];
      refVal >>= precShift;
      refAboveTemplate[k] = refVal;
      numTemplate[0]++;
    }
  }

  // left
  if (cuLeft && posW == 0)
  {
#if JVET_Y0067_ENHANCED_MMVD_MVD_SIGN_PRED
    xGetPredBlkTpl<false>(cu, compID, refBuf, mv, posW, posH, sublkHeight, refLeftTemplate, afMMVD);
#else
    xGetPredBlkTpl<false>(cu, compID, refBuf, mv, posW, posH, sublkHeight, refLeftTemplate);
#endif

    for (int k = posH; k < posH + sublkHeight; k++)
    {
      int refVal = refLeftTemplate[k];
      refVal >>= precShift;
      refLeftTemplate[k] = refVal;
      numTemplate[1]++;
    }
  }
}
void InterPrediction::getAffAMLRefTemplate(PredictionUnit &pu, PelUnitBuf &pcBufPredRefTop, PelUnitBuf &pcBufPredRefLeft)
{
#if INTER_LIC
  int LICshift[2] = { 0 };
  int scale[2]    = { 0 };
  int offset[2]   = { 0 };
#endif
  const int bitDepth = pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA);
  if (xCheckIdenticalMotion(pu))
  {
    Pel *            refLeftTemplate  = m_acYuvRefAMLTemplate[1][0];
    Pel *            refAboveTemplate = m_acYuvRefAMLTemplate[0][0];
    int              numTemplate[2]   = { 0, 0 };   // 0:Above, 1:Left
    const RefPicList eRefPicList      = REF_PIC_LIST_0;
    xPredAffineTpl(pu, eRefPicList, numTemplate, refLeftTemplate, refAboveTemplate);
#if INTER_LIC
    if (pu.cu->LICFlag)
    {
      Pel *recLeftTemplate  = m_acYuvCurAMLTemplate[1][0];
      Pel *recAboveTemplate = m_acYuvCurAMLTemplate[0][0];
      xGetLICParamGeneral(*pu.cu, COMPONENT_Y, numTemplate, refLeftTemplate, refAboveTemplate, recLeftTemplate,
                          recAboveTemplate, LICshift[0], scale[0], offset[0]);
      if (m_bAMLTemplateAvailabe[0])
      {
        PelBuf &      dstBuf = pcBufPredRefTop.bufs[0];
        const ClpRng &clpRng = pu.cu->cs->slice->clpRng(COMPONENT_Y);
        dstBuf.linearTransform(scale[0], LICshift[0], offset[0], true, clpRng);
      }
      if (m_bAMLTemplateAvailabe[1])
      {
        PelBuf &      dstBuf = pcBufPredRefLeft.bufs[0];
        const ClpRng &clpRng = pu.cu->cs->slice->clpRng(COMPONENT_Y);
        dstBuf.linearTransform(scale[0], LICshift[0], offset[0], true, clpRng);
      }
    }
#endif
  }
  else
  {
    for (uint32_t refList = 0; refList < NUM_REF_PIC_LIST_01; refList++)
    {
      if (pu.refIdx[refList] < 0)
      {
        continue;
      }
      RefPicList eRefPicList = (refList ? REF_PIC_LIST_1 : REF_PIC_LIST_0);
      CHECK(pu.refIdx[refList] >= pu.cu->slice->getNumRefIdx(eRefPicList), "Invalid reference index");
      Pel *refLeftTemplate  = m_acYuvRefLeftTemplate[refList][0];
      Pel *refAboveTemplate = m_acYuvRefAboveTemplate[refList][0];
      int  numTemplate[2]   = { 0, 0 };   // 0:Above, 1:Left
      xPredAffineTpl(pu, eRefPicList, numTemplate, refLeftTemplate, refAboveTemplate);
#if INTER_LIC
      if (pu.cu->LICFlag)
      {
        Pel *recLeftTemplate  = m_acYuvCurAMLTemplate[1][0];
        Pel *recAboveTemplate = m_acYuvCurAMLTemplate[0][0];
        xGetLICParamGeneral(*pu.cu, COMPONENT_Y, numTemplate, refLeftTemplate, refAboveTemplate, recLeftTemplate,
                            recAboveTemplate, LICshift[refList], scale[refList], offset[refList]);
      }
#endif
    }
    if (m_bAMLTemplateAvailabe[0])
    {
      PelUnitBuf srcPred[2];
      srcPred[0] = PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvRefAboveTemplate[0][0], pcBufPredRefTop.Y()));
      srcPred[1] = PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvRefAboveTemplate[1][0], pcBufPredRefTop.Y()));
#if INTER_LIC
      if (pu.cu->LICFlag)
      {
        for (int i = 0; i < 2; i++)
        {
          if (pu.refIdx[i] >= 0)
          {
            PelBuf &      dstBuf = srcPred[i].bufs[0];
            const ClpRng &clpRng = pu.cu->cs->slice->clpRng(COMPONENT_Y);
            dstBuf.linearTransform(scale[i], LICshift[i], offset[i], true, clpRng);
          }
        }
      }
#endif
      const int iRefIdx0 = pu.refIdx[0];
      const int iRefIdx1 = pu.refIdx[1];
      if (iRefIdx0 >= 0 && iRefIdx1 >= 0)
      {
        for (int i = 0; i < 2; i++)
        {
          PelBuf &  dstBuf   = srcPred[i].bufs[0];
          const int biShift  = IF_INTERNAL_PREC - bitDepth;
          const Pel biOffset = -IF_INTERNAL_OFFS;
          ClpRng    clpRngDummy;
          dstBuf.linearTransform(1, -biShift, biOffset, false, clpRngDummy);
        }
      }
      xWeightedAverageY(pu, srcPred[0], srcPred[1], pcBufPredRefTop, pu.cu->slice->getSPS()->getBitDepths(),
                        pu.cu->slice->clpRngs());
    }
    if (m_bAMLTemplateAvailabe[1])
    {
      PelUnitBuf srcPred[2];
      srcPred[0] = PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvRefLeftTemplate[0][0], pcBufPredRefLeft.Y()));
      srcPred[1] = PelUnitBuf(pu.chromaFormat, PelBuf(m_acYuvRefLeftTemplate[1][0], pcBufPredRefLeft.Y()));
#if INTER_LIC
      if (pu.cu->LICFlag)
      {
        for (int i = 0; i < 2; i++)
        {
          if (pu.refIdx[i] >= 0)
          {
            PelBuf &      dstBuf = srcPred[i].bufs[0];
            const ClpRng &clpRng = pu.cu->cs->slice->clpRng(COMPONENT_Y);
            dstBuf.linearTransform(scale[i], LICshift[i], offset[i], true, clpRng);
          }
        }
      }
#endif
      const int iRefIdx0 = pu.refIdx[0];
      const int iRefIdx1 = pu.refIdx[1];
      if (iRefIdx0 >= 0 && iRefIdx1 >= 0)
      {
        for (int i = 0; i < 2; i++)
        {
          PelBuf &  dstBuf   = srcPred[i].bufs[0];
          const int biShift  = IF_INTERNAL_PREC - bitDepth;
          const Pel biOffset = -IF_INTERNAL_OFFS;
          ClpRng    clpRngDummy;
          dstBuf.linearTransform(1, -biShift, biOffset, false, clpRngDummy);
        }
      }
      xWeightedAverageY(pu, srcPred[0], srcPred[1], pcBufPredRefLeft, pu.cu->slice->getSPS()->getBitDepths(),
                        pu.cu->slice->clpRngs());
    }
  }
}
#if JVET_Y0058_IBC_LIST_MODIFY
void  InterPrediction::adjustIBCMergeCandidates(PredictionUnit &pu, MergeCtx& mrgCtx, int mrgCandIdx)
{
  uint32_t RdCandList[IBC_MRG_MAX_NUM_CANDS][IBC_MRG_MAX_NUM_CANDS];
  Distortion candCostList[IBC_MRG_MAX_NUM_CANDS][IBC_MRG_MAX_NUM_CANDS];

  for (uint32_t i = 0; i < IBC_MRG_MAX_NUM_CANDS; i++)
  {
    for (uint32_t j = 0; j < IBC_MRG_MAX_NUM_CANDS; j++)
    {
      RdCandList[i][j] = j;
      candCostList[i][j] = MAX_UINT;
    }
  }

  Distortion uiCost;

  DistParam cDistParam;
  cDistParam.applyWeight = false;

  /*const SPS &sps = *pu.cs->sps;
  Position puPos = pu.lumaPos();*/
  int nWidth = pu.lumaSize().width;
  int nHeight = pu.lumaSize().height;

  if (!xAMLIBCGetCurBlkTemplate(pu, nWidth, nHeight))
  {
    return;
  }

  for (uint32_t uiMergeCand = ((mrgCandIdx < 0) ? 0 : (mrgCandIdx / ADAPTIVE_IBC_SUB_GROUP_SIZE)*ADAPTIVE_IBC_SUB_GROUP_SIZE); uiMergeCand < (((mrgCandIdx < 0) || ((mrgCandIdx / ADAPTIVE_IBC_SUB_GROUP_SIZE + 1)*ADAPTIVE_IBC_SUB_GROUP_SIZE > mrgCtx.numValidMergeCand)) ? mrgCtx.numValidMergeCand : ((mrgCandIdx / ADAPTIVE_IBC_SUB_GROUP_SIZE + 1)*ADAPTIVE_IBC_SUB_GROUP_SIZE)); ++uiMergeCand)
  {
    bool firstGroup = (uiMergeCand / ADAPTIVE_IBC_SUB_GROUP_SIZE) == 0 ? true : false;
    bool lastGroup = ((uiMergeCand / ADAPTIVE_IBC_SUB_GROUP_SIZE + 1)*ADAPTIVE_IBC_SUB_GROUP_SIZE >= mrgCtx.numValidMergeCand) ? true : false;
    if (lastGroup && !firstGroup)
    {
      break;
    }
    uiCost = 0;

    mrgCtx.setMergeInfo(pu, uiMergeCand);

    if (pu.bv == Mv(0, 0))
    {
      break;
    }

    PelBuf pcBufPredRefTop = PelBuf(m_acYuvRefAMLTemplate[0][0], nWidth, AML_MERGE_TEMPLATE_SIZE);
    PelBuf pcBufPredCurTop = PelBuf(m_acYuvCurAMLTemplate[0][0], nWidth, AML_MERGE_TEMPLATE_SIZE);
    PelBuf pcBufPredRefLeft = PelBuf(m_acYuvRefAMLTemplate[1][0], AML_MERGE_TEMPLATE_SIZE, nHeight);
    PelBuf pcBufPredCurLeft = PelBuf(m_acYuvCurAMLTemplate[1][0], AML_MERGE_TEMPLATE_SIZE, nHeight);

    getIBCAMLRefTemplate(pu, nWidth, nHeight);

    if (m_bAMLTemplateAvailabe[0])
    {
      m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop, pcBufPredRefTop, pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);

      uiCost += cDistParam.distFunc(cDistParam);
    }

    if (m_bAMLTemplateAvailabe[1])
    {
      m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft, pcBufPredRefLeft, pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);

      uiCost += cDistParam.distFunc(cDistParam);
    }
    updateCandList(uiMergeCand, uiCost, ADAPTIVE_IBC_SUB_GROUP_SIZE, RdCandList[uiMergeCand / ADAPTIVE_IBC_SUB_GROUP_SIZE], candCostList[uiMergeCand / ADAPTIVE_IBC_SUB_GROUP_SIZE]);
  }

  updateIBCCandInfo(pu, mrgCtx, RdCandList
    , mrgCandIdx
  );
  pu.mergeIdx = mrgCandIdx;    //restore the merge index
}
void  InterPrediction::updateIBCCandInfo(PredictionUnit &pu, MergeCtx& mrgCtx, uint32_t(*RdCandList)[IBC_MRG_MAX_NUM_CANDS], int mrgCandIdx)
{
  MergeCtx mrgCtxTmp;
  for (uint32_t ui = 0; ui < IBC_MRG_MAX_NUM_CANDS; ++ui)
  {
    mrgCtxTmp.BcwIdx[ui] = BCW_DEFAULT;
    mrgCtxTmp.interDirNeighbours[ui] = 0;
    mrgCtxTmp.mvFieldNeighbours[(ui << 1)].refIdx = NOT_VALID;
    mrgCtxTmp.mvFieldNeighbours[(ui << 1) + 1].refIdx = NOT_VALID;
    mrgCtxTmp.useAltHpelIf[ui] = false;
#if INTER_LIC
    mrgCtxTmp.LICFlags[ui] = false;
#endif
  }
  for (uint32_t uiMergeCand = ((mrgCandIdx < 0) ? 0 : (mrgCandIdx / ADAPTIVE_IBC_SUB_GROUP_SIZE)*ADAPTIVE_IBC_SUB_GROUP_SIZE); uiMergeCand < (((mrgCandIdx < 0) || ((mrgCandIdx / ADAPTIVE_IBC_SUB_GROUP_SIZE + 1)*ADAPTIVE_IBC_SUB_GROUP_SIZE > mrgCtx.numValidMergeCand)) ? mrgCtx.numValidMergeCand : ((mrgCandIdx / ADAPTIVE_IBC_SUB_GROUP_SIZE + 1)*ADAPTIVE_IBC_SUB_GROUP_SIZE)); ++uiMergeCand)
  {
    bool firstGroup = (uiMergeCand / ADAPTIVE_IBC_SUB_GROUP_SIZE) == 0 ? true : false;
    bool lastGroup = ((uiMergeCand / ADAPTIVE_IBC_SUB_GROUP_SIZE + 1)*ADAPTIVE_IBC_SUB_GROUP_SIZE >= mrgCtx.numValidMergeCand) ? true : false;
    if (lastGroup && !firstGroup)
    {
      break;
    }
    mrgCtx.setMergeInfo(pu, uiMergeCand);
    if (pu.bv == Mv(0, 0))
    {
      break;
    }
    mrgCtxTmp.BcwIdx[uiMergeCand] = mrgCtx.BcwIdx[uiMergeCand];
    mrgCtxTmp.interDirNeighbours[uiMergeCand] = mrgCtx.interDirNeighbours[uiMergeCand];
    mrgCtxTmp.mvFieldNeighbours[(uiMergeCand << 1)] = mrgCtx.mvFieldNeighbours[(uiMergeCand << 1)];
    mrgCtxTmp.mvFieldNeighbours[(uiMergeCand << 1) + 1] = mrgCtx.mvFieldNeighbours[(uiMergeCand << 1) + 1];
    mrgCtxTmp.useAltHpelIf[uiMergeCand] = mrgCtx.useAltHpelIf[uiMergeCand];
#if INTER_LIC 
    mrgCtxTmp.LICFlags[uiMergeCand] = mrgCtx.LICFlags[uiMergeCand];
#endif
  }
  //update
  for (uint32_t uiMergeCand = ((mrgCandIdx < 0) ? 0 : (mrgCandIdx / ADAPTIVE_IBC_SUB_GROUP_SIZE)*ADAPTIVE_IBC_SUB_GROUP_SIZE); uiMergeCand < (((mrgCandIdx < 0) || ((mrgCandIdx / ADAPTIVE_IBC_SUB_GROUP_SIZE + 1)*ADAPTIVE_IBC_SUB_GROUP_SIZE > mrgCtx.numValidMergeCand)) ? mrgCtx.numValidMergeCand : ((mrgCandIdx / ADAPTIVE_IBC_SUB_GROUP_SIZE + 1)*ADAPTIVE_IBC_SUB_GROUP_SIZE)); ++uiMergeCand)
  {
    bool firstGroup = (uiMergeCand / ADAPTIVE_IBC_SUB_GROUP_SIZE) == 0 ? true : false;
    bool lastGroup = ((uiMergeCand / ADAPTIVE_IBC_SUB_GROUP_SIZE + 1)*ADAPTIVE_IBC_SUB_GROUP_SIZE >= mrgCtx.numValidMergeCand) ? true : false;
    if (lastGroup && !firstGroup)
    {
      break;
    }
    mrgCtx.setMergeInfo(pu, uiMergeCand);
    if (pu.bv == Mv(0, 0))
    {
      break;
    }
    mrgCtx.BcwIdx[uiMergeCand] = mrgCtxTmp.BcwIdx[RdCandList[uiMergeCand / ADAPTIVE_IBC_SUB_GROUP_SIZE][uiMergeCand%ADAPTIVE_IBC_SUB_GROUP_SIZE]];
    mrgCtx.interDirNeighbours[uiMergeCand] = mrgCtxTmp.interDirNeighbours[RdCandList[uiMergeCand / ADAPTIVE_IBC_SUB_GROUP_SIZE][uiMergeCand%ADAPTIVE_IBC_SUB_GROUP_SIZE]];
    mrgCtx.mvFieldNeighbours[(uiMergeCand << 1)] = mrgCtxTmp.mvFieldNeighbours[(RdCandList[uiMergeCand / ADAPTIVE_IBC_SUB_GROUP_SIZE][uiMergeCand%ADAPTIVE_IBC_SUB_GROUP_SIZE] << 1)];
    mrgCtx.mvFieldNeighbours[(uiMergeCand << 1) + 1] = mrgCtxTmp.mvFieldNeighbours[(RdCandList[uiMergeCand / ADAPTIVE_IBC_SUB_GROUP_SIZE][uiMergeCand%ADAPTIVE_IBC_SUB_GROUP_SIZE] << 1) + 1];
    mrgCtx.useAltHpelIf[uiMergeCand] = mrgCtxTmp.useAltHpelIf[RdCandList[uiMergeCand / ADAPTIVE_IBC_SUB_GROUP_SIZE][uiMergeCand%ADAPTIVE_IBC_SUB_GROUP_SIZE]];
#if INTER_LIC
    mrgCtx.LICFlags[uiMergeCand] = mrgCtxTmp.LICFlags[RdCandList[uiMergeCand / ADAPTIVE_IBC_SUB_GROUP_SIZE][uiMergeCand%ADAPTIVE_IBC_SUB_GROUP_SIZE]];
#endif
  }
}
bool InterPrediction::xAMLIBCGetCurBlkTemplate(PredictionUnit& pu, int nCurBlkWidth, int nCurBlkHeight)
{
  m_bAMLTemplateAvailabe[0] = xAMLIsTopTempAvailable(pu);
  m_bAMLTemplateAvailabe[1] = xAMLIsLeftTempAvailable(pu);

  if (!m_bAMLTemplateAvailabe[0] && !m_bAMLTemplateAvailabe[1])
  {
    return false;
  }

  const Picture&  currPic = *pu.cs->picture;
  const CPelBuf recBuf = currPic.getRecoBuf(pu.cs->picture->blocks[COMPONENT_Y]);
  /* std::vector<Pel>& invLUT = m_pcReshape->getInvLUT();*/

  if (m_bAMLTemplateAvailabe[0])
  {
    const Pel*    rec = recBuf.bufAt(pu.blocks[COMPONENT_Y].pos().offset(0, -AML_MERGE_TEMPLATE_SIZE));
    PelBuf pcYBuf = PelBuf(m_acYuvCurAMLTemplate[0][0], nCurBlkWidth, AML_MERGE_TEMPLATE_SIZE);
    Pel*   pcY = pcYBuf.bufAt(0, 0);
    for (int k = 0; k < nCurBlkWidth; k++)
    {
      for (int l = 0; l < AML_MERGE_TEMPLATE_SIZE; l++)
      {
        int recVal = rec[k + l * recBuf.stride];

        //if (m_pcReshape->getSliceReshaperInfo().getUseSliceReshaper() && m_pcReshape->getCTUFlag())
        //{
        //  recVal = invLUT[recVal];
        //}

        pcY[k + l * nCurBlkWidth] = recVal;
      }
    }
  }

  if (m_bAMLTemplateAvailabe[1])
  {
    PelBuf pcYBuf = PelBuf(m_acYuvCurAMLTemplate[1][0], AML_MERGE_TEMPLATE_SIZE, nCurBlkHeight);
    Pel*   pcY = pcYBuf.bufAt(0, 0);
    const Pel*    rec = recBuf.bufAt(pu.blocks[COMPONENT_Y].pos().offset(-AML_MERGE_TEMPLATE_SIZE, 0));
    for (int k = 0; k < nCurBlkHeight; k++)
    {
      for (int l = 0; l < AML_MERGE_TEMPLATE_SIZE; l++)
      {
        int recVal = rec[recBuf.stride * k + l];

        //if (m_pcReshape->getSliceReshaperInfo().getUseSliceReshaper() && m_pcReshape->getCTUFlag())
        //{
        //  recVal = invLUT[recVal];
        //}

        pcY[AML_MERGE_TEMPLATE_SIZE * k + l] = recVal;
      }
    }
  }

  return true;
}
void InterPrediction::getIBCAMLRefTemplate(PredictionUnit &pu, int nCurBlkWidth, int nCurBlkHeight)
{
  Mv mvCurr;
  mvCurr = pu.bv;
  const int lumaShift = 2 + MV_FRACTIONAL_BITS_DIFF;
  const int horShift  = (lumaShift + ::getComponentScaleX(COMPONENT_Y, pu.chromaFormat));
  const int verShift  = (lumaShift + ::getComponentScaleY(COMPONENT_Y, pu.chromaFormat));
  const Picture&  currPic = *pu.cs->picture;
  const CPelBuf recBuf = currPic.getRecoBuf(pu.cs->picture->blocks[COMPONENT_Y]);
  /* std::vector<Pel>& invLUT = m_pcReshape->getInvLUT();*/
  if (m_bAMLTemplateAvailabe[0])
  {
    Mv mvTop(0, -AML_MERGE_TEMPLATE_SIZE);
    mvTop += mvCurr;

    MotionInfo miTop;
    miTop.mv[0] = Mv(mvTop.hor <<horShift , mvTop.ver<< verShift);
    miTop.refIdx[0] = MAX_NUM_REF;
    if (!PU::checkIsIBCCandidateValid(pu, miTop))
    {
      mvTop = mvCurr;
    }
    const Pel*    rec = recBuf.bufAt(pu.blocks[COMPONENT_Y].pos().offset(mvTop.hor, mvTop.ver));
    PelBuf pcYBuf = PelBuf(m_acYuvRefAMLTemplate[0][0], nCurBlkWidth, AML_MERGE_TEMPLATE_SIZE);
    Pel*   pcY = pcYBuf.bufAt(0, 0);
    for (int k = 0; k < nCurBlkWidth; k++)
    {
      for (int l = 0; l < AML_MERGE_TEMPLATE_SIZE; l++)
      {
        int recVal = rec[k + l * recBuf.stride];

        //if (m_pcReshape->getSliceReshaperInfo().getUseSliceReshaper() && m_pcReshape->getCTUFlag())
        //{
        //  recVal = invLUT[recVal];
        //}

        pcY[k + l * nCurBlkWidth] = recVal;
      }
    }
  }

  if (m_bAMLTemplateAvailabe[1])
  {
    Mv mvLeft(-AML_MERGE_TEMPLATE_SIZE, 0);
    mvLeft += mvCurr;

    MotionInfo miLeft;
    miLeft.mv[0] = Mv(mvLeft.hor <<horShift , mvLeft.ver<< verShift);
    miLeft.refIdx[0] = MAX_NUM_REF;
    if (!PU::checkIsIBCCandidateValid(pu, miLeft))
    {
      mvLeft = mvCurr;
    }
    PelBuf pcYBuf = PelBuf(m_acYuvRefAMLTemplate[1][0], AML_MERGE_TEMPLATE_SIZE, nCurBlkHeight);
    Pel*   pcY = pcYBuf.bufAt(0, 0);
    const Pel*    rec = recBuf.bufAt(pu.blocks[COMPONENT_Y].pos().offset( mvLeft.hor,  mvLeft.ver));
    for (int k = 0; k < nCurBlkHeight; k++)
    {
      for (int l = 0; l < AML_MERGE_TEMPLATE_SIZE; l++)
      {
        int recVal = rec[recBuf.stride * k + l];

        //if (m_pcReshape->getSliceReshaperInfo().getUseSliceReshaper() && m_pcReshape->getCTUFlag())
        //{
        //  recVal = invLUT[recVal];
        //}

        pcY[AML_MERGE_TEMPLATE_SIZE * k + l] = recVal;
      }
    }
  }
}
#endif
#endif
void InterPrediction::xFillIBCBuffer(CodingUnit &cu)
{
  for (auto &currPU : CU::traverseTUs(cu))
  {
    for (const CompArea &area : currPU.blocks)
    {
      if (!area.valid())
      {
        continue;
      }

      const unsigned int lcuWidth = cu.cs->slice->getSPS()->getMaxCUWidth();
      const int shiftSampleHor = ::getComponentScaleX(area.compID, cu.chromaFormat);
      const int shiftSampleVer = ::getComponentScaleY(area.compID, cu.chromaFormat);
      const int ctuSizeLog2Ver = floorLog2(lcuWidth) - shiftSampleVer;
      const int pux = area.x & ((m_IBCBufferWidth >> shiftSampleHor) - 1);
      const int puy = area.y & (( 1 << ctuSizeLog2Ver ) - 1);
      const CompArea dstArea = CompArea(area.compID, cu.chromaFormat, Position(pux, puy), Size(area.width, area.height));
      CPelBuf srcBuf = cu.cs->getRecoBuf(area);
      PelBuf dstBuf = m_IBCBuffer.getBuf(dstArea);

      dstBuf.copyFrom(srcBuf);
    }
  }
}

void InterPrediction::xIntraBlockCopy(PredictionUnit &pu, PelUnitBuf &predBuf, const ComponentID compID)
{
  const unsigned int lcuWidth = pu.cs->slice->getSPS()->getMaxCUWidth();
  const int shiftSampleHor = ::getComponentScaleX(compID, pu.chromaFormat);
  const int shiftSampleVer = ::getComponentScaleY(compID, pu.chromaFormat);
  const int ctuSizeLog2Ver = floorLog2(lcuWidth) - shiftSampleVer;
  pu.bv = pu.mv[REF_PIC_LIST_0];
  pu.bv.changePrecision(MV_PRECISION_INTERNAL, MV_PRECISION_INT);
  int refx, refy;
  if (compID == COMPONENT_Y)
  {
    refx = pu.Y().x + pu.bv.hor;
    refy = pu.Y().y + pu.bv.ver;
  }
  else
  {//Cb or Cr
    refx = pu.Cb().x + (pu.bv.hor >> shiftSampleHor);
    refy = pu.Cb().y + (pu.bv.ver >> shiftSampleVer);
  }
  refx &= ((m_IBCBufferWidth >> shiftSampleHor) - 1);
  refy &= ((1 << ctuSizeLog2Ver) - 1);

  if (refx + predBuf.bufs[compID].width <= (m_IBCBufferWidth >> shiftSampleHor))
  {
    const CompArea srcArea = CompArea(compID, pu.chromaFormat, Position(refx, refy), Size(predBuf.bufs[compID].width, predBuf.bufs[compID].height));
    const CPelBuf refBuf = m_IBCBuffer.getBuf(srcArea);
    predBuf.bufs[compID].copyFrom(refBuf);
  }
  else
  {//wrap around
    int width = (m_IBCBufferWidth >> shiftSampleHor) - refx;
    CompArea srcArea = CompArea(compID, pu.chromaFormat, Position(refx, refy), Size(width, predBuf.bufs[compID].height));
    CPelBuf srcBuf = m_IBCBuffer.getBuf(srcArea);
    PelBuf dstBuf = PelBuf(predBuf.bufs[compID].bufAt(Position(0, 0)), predBuf.bufs[compID].stride, Size(width, predBuf.bufs[compID].height));
    dstBuf.copyFrom(srcBuf);

    width = refx + predBuf.bufs[compID].width - (m_IBCBufferWidth >> shiftSampleHor);
    srcArea = CompArea(compID, pu.chromaFormat, Position(0, refy), Size(width, predBuf.bufs[compID].height));
    srcBuf = m_IBCBuffer.getBuf(srcArea);
    dstBuf = PelBuf(predBuf.bufs[compID].bufAt(Position((m_IBCBufferWidth >> shiftSampleHor) - refx, 0)), predBuf.bufs[compID].stride, Size(width, predBuf.bufs[compID].height));
    dstBuf.copyFrom(srcBuf);
  }
}

void InterPrediction::resetIBCBuffer(const ChromaFormat chromaFormatIDC, const int ctuSize)
{
  const UnitArea area = UnitArea(chromaFormatIDC, Area(0, 0, m_IBCBufferWidth, ctuSize));
  m_IBCBuffer.getBuf(area).fill(-1);
}

void InterPrediction::resetVPDUforIBC(const ChromaFormat chromaFormatIDC, const int ctuSize, const int vSize, const int xPos, const int yPos)
{
  const UnitArea area = UnitArea(chromaFormatIDC, Area(xPos & (m_IBCBufferWidth - 1), yPos & (ctuSize - 1), vSize, vSize));
  m_IBCBuffer.getBuf(area).fill(-1);
}

bool InterPrediction::isLumaBvValid(const int ctuSize, const int xCb, const int yCb, const int width, const int height, const int xBv, const int yBv)
{
  if(((yCb + yBv) & (ctuSize - 1)) + height > ctuSize)
  {
    return false;
  }
  int refTLx = xCb + xBv;
  int refTLy = (yCb + yBv) & (ctuSize - 1);
  PelBuf buf = m_IBCBuffer.Y();
  for(int x = 0; x < width; x += 4)
  {
    for(int y = 0; y < height; y += 4)
    {
      if(buf.at((x + refTLx) & (m_IBCBufferWidth - 1), y + refTLy) == -1) return false;
      if(buf.at((x + 3 + refTLx) & (m_IBCBufferWidth - 1), y + refTLy) == -1) return false;
      if(buf.at((x + refTLx) & (m_IBCBufferWidth - 1), y + 3 + refTLy) == -1) return false;
      if(buf.at((x + 3 + refTLx) & (m_IBCBufferWidth - 1), y + 3 + refTLy) == -1) return false;
    }
  }
  return true;
}

bool InterPrediction::xPredInterBlkRPR( const std::pair<int, int>& scalingRatio, const PPS& pps, const CompArea &blk, const Picture* refPic, const Mv& mv, Pel* dst, const int dstStride, const bool bi, const bool wrapRef, const ClpRng& clpRng, const int filterIndex, const bool useAltHpelIf )
{
  const ChromaFormat  chFmt = blk.chromaFormat;
  const ComponentID compID = blk.compID;
  const bool          rndRes = !bi;

  int shiftHor = MV_FRACTIONAL_BITS_INTERNAL + ::getComponentScaleX( compID, chFmt );
  int shiftVer = MV_FRACTIONAL_BITS_INTERNAL + ::getComponentScaleY( compID, chFmt );

  int width = blk.width;
  int height = blk.height;
  CPelBuf refBuf;

  const bool scaled = refPic->isRefScaled( &pps );

  if( scaled )
  {
    int row, col;
    int refPicWidth = refPic->getPicWidthInLumaSamples();
    int refPicHeight = refPic->getPicHeightInLumaSamples();

    int xFilter = filterIndex;
    int yFilter = filterIndex;
    const int rprThreshold1 = ( 1 << SCALE_RATIO_BITS ) * 5 / 4;
    const int rprThreshold2 = ( 1 << SCALE_RATIO_BITS ) * 7 / 4;
    if( filterIndex == 0 )
    {
      if( scalingRatio.first > rprThreshold2 )
      {
        xFilter = 4;
      }
      else if( scalingRatio.first > rprThreshold1 )
      {
        xFilter = 3;
      }

      if( scalingRatio.second > rprThreshold2 )
      {
        yFilter = 4;
      }
      else if( scalingRatio.second > rprThreshold1 )
      {
        yFilter = 3;
      }
    }
    if (filterIndex == 2)
    {
      if (isLuma(compID))
      {
        if (scalingRatio.first > rprThreshold2)
        {
          xFilter = 6;
        }
        else if (scalingRatio.first > rprThreshold1)
        {
          xFilter = 5;
        }

        if (scalingRatio.second > rprThreshold2)
        {
          yFilter = 6;
        }
        else if (scalingRatio.second > rprThreshold1)
        {
          yFilter = 5;
        }
      }
      else
      {
        if (scalingRatio.first > rprThreshold2)
        {
          xFilter = 4;
        }
        else if (scalingRatio.first > rprThreshold1)
        {
          xFilter = 3;
        }

        if (scalingRatio.second > rprThreshold2)
        {
          yFilter = 4;
        }
        else if (scalingRatio.second > rprThreshold1)
        {
          yFilter = 3;
        }
      }
    }

    const int posShift = SCALE_RATIO_BITS - 4;
    int stepX = ( scalingRatio.first + 8 ) >> 4;
    int stepY = ( scalingRatio.second + 8 ) >> 4;
    int64_t x0Int;
    int64_t y0Int;
    int offX = 1 << ( posShift - shiftHor - 1 );
    int offY = 1 << ( posShift - shiftVer - 1 );

    const int64_t posX = ( ( blk.pos().x << ::getComponentScaleX( compID, chFmt ) ) - ( pps.getScalingWindow().getWindowLeftOffset() * SPS::getWinUnitX( chFmt ) ) ) >> ::getComponentScaleX( compID, chFmt );
    const int64_t posY = ( ( blk.pos().y << ::getComponentScaleY( compID, chFmt ) ) - ( pps.getScalingWindow().getWindowTopOffset()  * SPS::getWinUnitY( chFmt ) ) ) >> ::getComponentScaleY( compID, chFmt );

    int addX = isLuma( compID ) ? 0 : int( 1 - refPic->cs->sps->getHorCollocatedChromaFlag() ) * 8 * ( scalingRatio.first - SCALE_1X.first );
    int addY = isLuma( compID ) ? 0 : int( 1 - refPic->cs->sps->getVerCollocatedChromaFlag() ) * 8 * ( scalingRatio.second - SCALE_1X.second );

    x0Int = ( ( posX << ( 4 + ::getComponentScaleX( compID, chFmt ) ) ) + mv.getHor() ) * (int64_t)scalingRatio.first + addX;
    x0Int = SIGN( x0Int ) * ( ( llabs( x0Int ) + ( (long long)1 << ( 7 + ::getComponentScaleX( compID, chFmt ) ) ) ) >> ( 8 + ::getComponentScaleX( compID, chFmt ) ) ) + ( ( refPic->getScalingWindow().getWindowLeftOffset() * SPS::getWinUnitX( chFmt ) ) << ( ( posShift - ::getComponentScaleX( compID, chFmt ) ) ) );

    y0Int = ( ( posY << ( 4 + ::getComponentScaleY( compID, chFmt ) ) ) + mv.getVer() ) * (int64_t)scalingRatio.second + addY;
    y0Int = SIGN( y0Int ) * ( ( llabs( y0Int ) + ( (long long)1 << ( 7 + ::getComponentScaleY( compID, chFmt ) ) ) ) >> ( 8 + ::getComponentScaleY( compID, chFmt ) ) ) + ( ( refPic->getScalingWindow().getWindowTopOffset() * SPS::getWinUnitY( chFmt ) ) << ( ( posShift - ::getComponentScaleY( compID, chFmt ) ) ) );

    const int extSize = isLuma( compID ) ? 1 : 2;
#if IF_12TAP
#if RPR_ENABLE
    const int iTap = 0;
#else
    const int iTap = 1;
#endif
    int vFilterSize = isLuma(compID) ? NTAPS_LUMA(iTap) : NTAPS_CHROMA;
#else
    int vFilterSize = isLuma( compID ) ? NTAPS_LUMA : NTAPS_CHROMA;
#endif

    int yInt0 = ( (int32_t)y0Int + offY ) >> posShift;
#if IF_12TAP
    yInt0 = std::min(std::max(-(NTAPS_LUMA(iTap) / 2), yInt0), (refPicHeight >> ::getComponentScaleY(compID, chFmt)) + (NTAPS_LUMA(iTap) / 2));
#else
    yInt0 = std::min( std::max( -(NTAPS_LUMA / 2), yInt0 ), ( refPicHeight >> ::getComponentScaleY( compID, chFmt ) ) + (NTAPS_LUMA / 2) );
#endif

    int xInt0 = ( (int32_t)x0Int + offX ) >> posShift;
#if IF_12TAP
    xInt0 = std::min(std::max(-(NTAPS_LUMA(iTap) / 2), xInt0), (refPicWidth >> ::getComponentScaleX(compID, chFmt)) + (NTAPS_LUMA(iTap) / 2));
#else
    xInt0 = std::min( std::max( -(NTAPS_LUMA / 2), xInt0 ), ( refPicWidth >> ::getComponentScaleX( compID, chFmt ) ) + (NTAPS_LUMA / 2) );
#endif

    int refHeight = ((((int32_t)y0Int + (height-1) * stepY) + offY ) >> posShift) - ((((int32_t)y0Int + 0 * stepY) + offY ) >> posShift) + 1;
    refHeight = std::max<int>( 1, refHeight );

    CHECK( MAX_CU_SIZE * MAX_SCALING_RATIO + 16 < refHeight + vFilterSize - 1 + extSize, "Buffer is not large enough, increase MAX_SCALING_RATIO" );

    Pel buffer[( MAX_CU_SIZE + 16 ) * ( MAX_CU_SIZE * MAX_SCALING_RATIO + 16 )];
    int tmpStride = width;
    int xInt = 0, yInt = 0;

    for( col = 0; col < width; col++ )
    {
      int posX = (int32_t)x0Int + col * stepX;
      xInt = ( posX + offX ) >> posShift;
#if IF_12TAP
      xInt = std::min(std::max(-(NTAPS_LUMA(iTap) / 2), xInt), (refPicWidth >> ::getComponentScaleX(compID, chFmt)) + (NTAPS_LUMA(iTap) / 2));
#else
      xInt = std::min( std::max( -(NTAPS_LUMA / 2), xInt ), ( refPicWidth >> ::getComponentScaleX( compID, chFmt ) ) + (NTAPS_LUMA / 2) );
#endif
      int xFrac = ( ( posX + offX ) >> ( posShift - shiftHor ) ) & ( ( 1 << shiftHor ) - 1 );

      CHECK( xInt0 > xInt, "Wrong horizontal starting point" );

      Position offset = Position( xInt, yInt0 );
      refBuf = refPic->getRecoBuf( CompArea( compID, chFmt, offset, Size( 1, refHeight ) ), wrapRef );
      Pel* tempBuf = buffer + col;

      m_if.filterHor( compID, (Pel*)refBuf.buf - ( ( vFilterSize >> 1 ) - 1 ) * refBuf.stride, refBuf.stride, tempBuf, tmpStride, 1, refHeight + vFilterSize - 1 + extSize, xFrac, false, chFmt, clpRng, xFilter, false, useAltHpelIf && scalingRatio.first == 1 << SCALE_RATIO_BITS );
    }

    for( row = 0; row < height; row++ )
    {
      int posY = (int32_t)y0Int + row * stepY;
      yInt = ( posY + offY ) >> posShift;
#if IF_12TAP
      yInt = std::min(std::max(-(NTAPS_LUMA(iTap) / 2), yInt), (refPicHeight >> ::getComponentScaleY(compID, chFmt)) + (NTAPS_LUMA(iTap) / 2));
#else
      yInt = std::min( std::max( -(NTAPS_LUMA / 2), yInt ), ( refPicHeight >> ::getComponentScaleY( compID, chFmt ) ) + (NTAPS_LUMA / 2) );
#endif
      int yFrac = ( ( posY + offY ) >> ( posShift - shiftVer ) ) & ( ( 1 << shiftVer ) - 1 );

      CHECK( yInt0 > yInt, "Wrong vertical starting point" );

      Pel* tempBuf = buffer + ( yInt - yInt0 ) * tmpStride;

      JVET_J0090_SET_CACHE_ENABLE( false );
      m_if.filterVer( compID, tempBuf + ( ( vFilterSize >> 1 ) - 1 ) * tmpStride, tmpStride, dst + row * dstStride, dstStride, width, 1, yFrac, false, rndRes, chFmt, clpRng, yFilter, false, useAltHpelIf && scalingRatio.second == 1 << SCALE_RATIO_BITS );
      JVET_J0090_SET_CACHE_ENABLE( true );
    }
  }

  return scaled;
}


#if INTER_LIC
void InterPrediction::xLocalIlluComp(const PredictionUnit& pu,
                                     const ComponentID     compID,
                                     const Picture&        refPic,
                                     const Mv&             mv,
                                     const bool            biPred,
                                     PelBuf&               dstBuf
)
{
  Pel* refLeftTemplate  = m_pcLICRefLeftTemplate;
  Pel* refAboveTemplate = m_pcLICRefAboveTemplate;
  Pel* recLeftTemplate  = m_pcLICRecLeftTemplate;
  Pel* recAboveTemplate = m_pcLICRecAboveTemplate;
  int numTemplate[2] = { 0 , 0 }; // 0:Above, 1:Left
  xGetSublkTemplate(*pu.cu, compID, refPic, mv, pu.blocks[compID].width, pu.blocks[compID].height, 0, 0, numTemplate, refLeftTemplate, refAboveTemplate, recLeftTemplate, recAboveTemplate);

  int shift = 0, scale = 0, offset = 0;
  xGetLICParamGeneral(*pu.cu, compID, numTemplate, refLeftTemplate, refAboveTemplate, recLeftTemplate, recAboveTemplate, shift, scale, offset);

  const ClpRng& clpRng = pu.cu->cs->slice->clpRng(compID);
  dstBuf.linearTransform(scale, shift, offset, true, clpRng);
}

void InterPrediction::xGetSublkTemplate(const CodingUnit& cu,
                                        const ComponentID compID,
                                        const Picture&    refPic,
                                        const Mv&         mv,
                                        const int         sublkWidth,
                                        const int         sublkHeight,
                                        const int         posW,
                                        const int         posH,
                                        int*              numTemplate,
                                        Pel*              refLeftTemplate,
                                        Pel*              refAboveTemplate,
                                        Pel*              recLeftTemplate,
                                        Pel*              recAboveTemplate)
{
  const int       bitDepth = cu.cs->sps->getBitDepth(toChannelType(compID));
  const int       precShift = std::max(0, bitDepth - 12);

  const Picture&  currPic = *cu.cs->picture;
  const CodingUnit* const cuAbove = cu.cs->getCU(cu.blocks[compID].pos().offset(0, -1), toChannelType(compID));
  const CodingUnit* const cuLeft = cu.cs->getCU(cu.blocks[compID].pos().offset(-1, 0), toChannelType(compID));
  const CPelBuf recBuf = cuAbove || cuLeft ? currPic.getRecoBuf(cu.cs->picture->blocks[compID]) : CPelBuf();
  const CPelBuf refBuf = cuAbove || cuLeft ? refPic.getRecoBuf(refPic.blocks[compID]) : CPelBuf();

  std::vector<Pel>& invLUT = m_pcReshape->getInvLUT();

  // above
  if (cuAbove && posH == 0)
  {
    xGetPredBlkTpl<true>(cu, compID, refBuf, mv, posW, posH, sublkWidth, refAboveTemplate);
    const Pel*    rec = recBuf.bufAt(cu.blocks[compID].pos().offset(0, -1));

    for (int k = posW; k < posW + sublkWidth; k++)
    {
      int refVal = refAboveTemplate[k];
      int recVal = rec[k];

      if (isLuma(compID) && cu.cs->picHeader->getLmcsEnabledFlag() && m_pcReshape->getCTUFlag())
      {
        recVal = invLUT[recVal];
      }

      recVal >>= precShift;
      refVal >>= precShift;

      refAboveTemplate[k] = refVal;
      recAboveTemplate[k] = recVal;
      numTemplate[0]++;
    }
  }

  // left
  if (cuLeft && posW == 0)
  {
    xGetPredBlkTpl<false>(cu, compID, refBuf, mv, posW, posH, sublkHeight, refLeftTemplate);
    const Pel*    rec = recBuf.bufAt(cu.blocks[compID].pos().offset(-1, 0));

    for (int k = posH; k < posH + sublkHeight; k++)
    {
      int refVal = refLeftTemplate[k];
      int recVal = rec[recBuf.stride * k];

      if (isLuma(compID) && cu.cs->picHeader->getLmcsEnabledFlag() && m_pcReshape->getCTUFlag())
      {
        recVal = invLUT[recVal];
      }

      recVal >>= precShift;
      refVal >>= precShift;

      refLeftTemplate[k] = refVal;
      recLeftTemplate[k] = recVal;
      numTemplate[1]++;
    }
  }
}

void InterPrediction::xGetLICParamGeneral(const CodingUnit& cu,
                                          const ComponentID compID,
                                          int*              numTemplate,
                                          Pel*              refLeftTemplate,
                                          Pel*              refAboveTemplate,
                                          Pel*              recLeftTemplate,
                                          Pel*              recAboveTemplate,
                                          int&              shift,
                                          int&              scale,
                                          int&              offset
)
{
  const int       cuWidth = cu.blocks[compID].width;
  const int       cuHeight = cu.blocks[compID].height;

  const int       bitDepth = cu.cs->sps->getBitDepth(toChannelType(compID));
  const int       precShift = std::max(0, bitDepth - 12);
  const int       maxNumMinus1 = 30 - 2 * std::min(bitDepth, 12) - 1;
  const int       minDimBit = floorLog2(std::min(cuHeight, cuWidth));
  const int       minDim = 1 << minDimBit;
  int       minStepBit = minDim > 8 ? 1 : 0;
  while (minDimBit > minStepBit + maxNumMinus1) { minStepBit++; } //make sure log2(2*minDim/tmpStep) + 2*min(bitDepth,12) <= 30
  const int       numSteps = minDim >> minStepBit;
  const int       dimShift = minDimBit - minStepBit;

  //----- get correlation data -----
  int x = 0, y = 0, xx = 0, xy = 0, cntShift = 0;

  // above
  if (numTemplate[0] != 0)
  {
    for (int k = 0; k < numSteps; k++)
    {
      CHECK(((k * cuWidth) >> dimShift) >= cuWidth, "Out of range");

      int refVal = refAboveTemplate[((k * cuWidth) >> dimShift)];
      int recVal = recAboveTemplate[((k * cuWidth) >> dimShift)];

      x += refVal;
      y += recVal;
      xx += refVal * refVal;
      xy += refVal * recVal;
    }

    cntShift = dimShift;
  }

  // left
  if (numTemplate[1] != 0)
  {
    for (int k = 0; k < numSteps; k++)
    {
      CHECK(((k * cuHeight) >> dimShift) >= cuHeight, "Out of range");

      int refVal = refLeftTemplate[((k * cuHeight) >> dimShift)];
      int recVal = recLeftTemplate[((k * cuHeight) >> dimShift)];

      x += refVal;
      y += recVal;
      xx += refVal * refVal;
      xy += refVal * recVal;
    }

    cntShift += (cntShift ? 1 : dimShift);
  }

  //----- determine scale and offset -----
  shift = m_LICShift;
  if (cntShift == 0)
  {
    scale = (1 << shift);
    offset = 0;
    return;
  }

  const int cropShift = std::max(0, bitDepth - precShift + cntShift - 15);
  const int xzOffset = (xx >> m_LICRegShift);
  const int sumX = x << precShift;
  const int sumY = y << precShift;
  const int sumXX = ((xx + xzOffset) >> (cropShift << 1)) << cntShift;
  const int sumXY = ((xy + xzOffset) >> (cropShift << 1)) << cntShift;
  const int sumXsumX = (x >> cropShift) * (x >> cropShift);
  const int sumXsumY = (x >> cropShift) * (y >> cropShift);
  int a1 = sumXY - sumXsumY;
  int a2 = sumXX - sumXsumX;
  int scaleShiftA2 = getMSB(abs(a2)) - 6;
  int scaleShiftA1 = scaleShiftA2 - m_LICShiftDiff;
  scaleShiftA2 = std::max(0, scaleShiftA2);
  scaleShiftA1 = std::max(0, scaleShiftA1);
  const int scaleShiftA = scaleShiftA2 + 15 - shift - scaleShiftA1;
  a1 = a1 >> scaleShiftA1;
  a2 = Clip3(0, 63, a2 >> scaleShiftA2);
  scale = int((int64_t(a1) * int64_t(m_LICMultApprox[a2])) >> scaleShiftA);
  scale = Clip3(0, 1 << (shift + 2), scale);
  const int maxOffset = (1 << (bitDepth - 1)) - 1;
  const int minOffset = -1 - maxOffset;
  offset = (sumY - ((scale * sumX) >> shift) + ((1 << (cntShift)) >> 1)) >> cntShift;
  offset = Clip3(minOffset, maxOffset, offset);
}

template <bool TrueA_FalseL>
void InterPrediction::xGetPredBlkTpl(const CodingUnit& cu, const ComponentID compID, const CPelBuf& refBuf, const Mv& mv, const int posW, const int posH, const int tplSize, Pel* predBlkTpl
#if JVET_Y0067_ENHANCED_MMVD_MVD_SIGN_PRED
                      , bool AML
#endif
                                     )
{
  const int lumaShift = 2 + MV_FRACTIONAL_BITS_DIFF;
  const int horShift  = (lumaShift + ::getComponentScaleX(compID, cu.chromaFormat));
  const int verShift  = (lumaShift + ::getComponentScaleY(compID, cu.chromaFormat));

  const int xInt      = mv.getHor() >> horShift;
  const int yInt      = mv.getVer() >> verShift;
  const int xFrac     = mv.getHor() & ((1 << horShift) - 1);
  const int yFrac     = mv.getVer() & ((1 << verShift) - 1);

  const Pel* ref;
        Pel* dst;
        int refStride, dstStride, bw, bh;
  if( TrueA_FalseL )
  {
    ref       = refBuf.bufAt(cu.blocks[compID].pos().offset(xInt + posW, yInt + posH - 1));
    dst       = predBlkTpl + posW;
    refStride = refBuf.stride;
    dstStride = tplSize;
    bw        = tplSize;
    bh        = 1;
  }
  else
  {
    ref       = refBuf.bufAt(cu.blocks[compID].pos().offset(xInt + posW - 1, yInt + posH));
    dst       = predBlkTpl + posH;
    refStride = refBuf.stride;
    dstStride = 1;
    bw        = 1;
    bh        = tplSize;
  }

#if JVET_Y0067_ENHANCED_MMVD_MVD_SIGN_PRED
  int   nFilterIdx =  AML  ? 1 : 0;
#else
  const int  nFilterIdx   = 0;
#endif
  const bool useAltHpelIf = false;

  if ( yFrac == 0 )
  {
    m_if.filterHor( compID, (Pel*) ref, refStride, dst, dstStride, bw, bh, xFrac, true, cu.chromaFormat, cu.slice->clpRng(compID), nFilterIdx, false, useAltHpelIf);
  }
  else if ( xFrac == 0 )
  {
    m_if.filterVer( compID, (Pel*) ref, refStride, dst, dstStride, bw, bh, yFrac, true, true, cu.chromaFormat, cu.slice->clpRng(compID), nFilterIdx, false, useAltHpelIf);
  }
  else
  {
#if JVET_Y0067_ENHANCED_MMVD_MVD_SIGN_PRED
#if IF_12TAP
    int vFilterSize = isLuma(compID) ? NTAPS_LUMA(0) : NTAPS_CHROMA;
#else
    int vFilterSize = isLuma(compID) ? NTAPS_LUMA : NTAPS_CHROMA;
#endif
    if (isLuma(compID) && nFilterIdx == 1)
    {
      vFilterSize = NTAPS_BILINEAR;
    }
#else
#if IF_12TAP
    const int vFilterSize = isLuma(compID) ? NTAPS_LUMA(0) : NTAPS_CHROMA;
#else
    const int vFilterSize = isLuma(compID) ? NTAPS_LUMA : NTAPS_CHROMA;
#endif
#endif
    PelBuf tmpBuf = PelBuf(m_filteredBlockTmp[0][compID], Size(bw, bh+vFilterSize-1));

    m_if.filterHor( compID, (Pel*)ref - ((vFilterSize>>1) -1)*refStride, refStride, tmpBuf.buf, tmpBuf.stride, bw, bh+vFilterSize-1, xFrac, false, cu.chromaFormat, cu.slice->clpRng(compID), nFilterIdx, false, useAltHpelIf);
    JVET_J0090_SET_CACHE_ENABLE( false );
    m_if.filterVer( compID, tmpBuf.buf + ((vFilterSize>>1) -1)*tmpBuf.stride, tmpBuf.stride, dst, dstStride, bw, bh, yFrac, false, true, cu.chromaFormat, cu.slice->clpRng(compID), nFilterIdx, false, useAltHpelIf);
    JVET_J0090_SET_CACHE_ENABLE( true );
  }
}
#endif // INTER_LIC

#if TM_AMVP || TM_MRG
Distortion InterPrediction::deriveTMMv(const PredictionUnit& pu, bool fillCurTpl, Distortion curBestCost, RefPicList eRefList, int refIdx, int maxSearchRounds, Mv& mv, const MvField* otherMvf)
{
  CHECK(refIdx < 0, "Invalid reference index for TM");
  const CodingUnit& cu   = *pu.cu;
#if JVET_Y0128_NON_CTC
  if ( cu.slice->getRefPic(eRefList, refIdx)->isRefScaled( pu.cs->pps ) )
  {
    return std::numeric_limits<Distortion>::max();
  }
#endif
  const Picture& refPic  = *cu.slice->getRefPic(eRefList, refIdx)->unscaledPic;
  bool doSimilarityCheck = otherMvf == nullptr ? false : cu.slice->getRefPOC((RefPicList)eRefList, refIdx) == cu.slice->getRefPOC((RefPicList)(1 - eRefList), otherMvf->refIdx);

  InterPredResources interRes(m_pcReshape, m_pcRdCost, m_if, m_filteredBlockTmp[0][COMPONENT_Y]
                           ,  m_filteredBlock[3][1][0], m_filteredBlock[3][0][0]
  );
  TplMatchingCtrl tplCtrl(pu, interRes, refPic, fillCurTpl, COMPONENT_Y, true, maxSearchRounds, m_pcCurTplAbove, m_pcCurTplLeft, m_pcRefTplAbove, m_pcRefTplLeft, mv, (doSimilarityCheck ? &(otherMvf->mv) : nullptr), curBestCost);
  if (!tplCtrl.getTemplatePresentFlag())
  {
    return std::numeric_limits<Distortion>::max();
  }

  if (otherMvf == nullptr) // uni prediction
  {
    tplCtrl.deriveMvUni<TM_TPL_SIZE>();
    mv = tplCtrl.getFinalMv();
    return tplCtrl.getMinCost();
  }
  else // bi prediction
  {
#if JVET_Y0128_NON_CTC
    if ( cu.slice->getRefPic((RefPicList)(1 - eRefList), otherMvf->refIdx)->isRefScaled(pu.cs->pps) )
    {
      return std::numeric_limits<Distortion>::max();
    }
#endif
    const Picture& otherRefPic = *cu.slice->getRefPic((RefPicList)(1-eRefList), otherMvf->refIdx)->unscaledPic;
    tplCtrl.removeHighFreq<TM_TPL_SIZE>(otherRefPic, otherMvf->mv, getBcwWeight(cu.BcwIdx, eRefList));
    tplCtrl.deriveMvUni<TM_TPL_SIZE>();
    mv = tplCtrl.getFinalMv();

    int8_t intWeight = getBcwWeight(cu.BcwIdx, eRefList);
    return (tplCtrl.getMinCost() * intWeight + (g_BcwWeightBase >> 1)) >> g_BcwLog2WeightBase;
  }
}

#if TM_MRG
void InterPrediction::deriveTMMv(PredictionUnit& pu)
{
  if( !pu.tmMergeFlag )
  {
    return;
  }

  Distortion minCostUni[NUM_REF_PIC_LIST_01] = { std::numeric_limits<Distortion>::max(), std::numeric_limits<Distortion>::max() };

  for (int iRefList = 0; iRefList < ( pu.cu->slice->isInterB() ? NUM_REF_PIC_LIST_01 : 1 ) ; ++iRefList)
  {
    if (pu.interDir & (iRefList + 1))
    {
      minCostUni[iRefList] = deriveTMMv(pu, true, std::numeric_limits<Distortion>::max(), (RefPicList)iRefList, pu.refIdx[iRefList], TM_MAX_NUM_OF_ITERATIONS, pu.mv[iRefList]);
    }
  }

  if (pu.cu->slice->isInterB() && pu.interDir == 3
#if MULTI_PASS_DMVR
    && !PU::checkBDMVRCondition(pu)
#endif
    )
  {
    if (minCostUni[0] == std::numeric_limits<Distortion>::max() || minCostUni[1] == std::numeric_limits<Distortion>::max())
    {
      return;
    }

    RefPicList eTargetPicList = (minCostUni[0] <= minCostUni[1]) ? REF_PIC_LIST_1 : REF_PIC_LIST_0;
    MvField    mvfBetterUni(pu.mv[1 - eTargetPicList], pu.refIdx[1 - eTargetPicList]);
    Distortion minCostBi = deriveTMMv(pu, true, std::numeric_limits<Distortion>::max(), eTargetPicList, pu.refIdx[eTargetPicList], TM_MAX_NUM_OF_ITERATIONS, pu.mv[eTargetPicList], &mvfBetterUni);

    if (minCostBi > (minCostUni[1 - eTargetPicList] + (minCostUni[1 - eTargetPicList] >> 3)))
    {
      pu.interDir = 1 + (1 - eTargetPicList);
      pu.mv    [eTargetPicList] = Mv();
      pu.refIdx[eTargetPicList] = NOT_VALID;
    }
  }
}
#endif // TM_MRG
#endif // TM_AMVP || TM_MRG

#if TM_AMVP || TM_MRG
TplMatchingCtrl::TplMatchingCtrl( const PredictionUnit&     pu,
                                        InterPredResources& interRes,
                                  const Picture&            refPic,
                                  const bool                fillCurTpl,
                                  const ComponentID         compID,
                                  const bool                useWeight,
                                  const int                 maxSearchRounds,
                                        Pel*                curTplAbove,
                                        Pel*                curTplLeft,
                                        Pel*                refTplAbove,
                                        Pel*                refTplLeft,
                                  const Mv&                 mvStart,
                                  const Mv*                 otherRefListMv,
                                  const Distortion          curBestCost
)
: m_cu              (*pu.cu)
, m_pu              (pu)
, m_interRes        (interRes)
, m_refPic          (refPic)
, m_mvStart         (mvStart)
, m_mvFinal         (mvStart)
, m_otherRefListMv  (otherRefListMv)
, m_minCost         (curBestCost)
, m_useWeight       (useWeight)
, m_maxSearchRounds (maxSearchRounds)
, m_compID          (compID)
{
  // Initialization
  const bool tplAvalableAbove = xFillCurTemplate<TM_TPL_SIZE, true >((fillCurTpl ? curTplAbove : nullptr));
  const bool tplAvalableLeft  = xFillCurTemplate<TM_TPL_SIZE, false>((fillCurTpl ? curTplLeft  : nullptr));
  m_curTplAbove = tplAvalableAbove ? PelBuf(curTplAbove, pu.lwidth(),   TM_TPL_SIZE ) : PelBuf();
  m_curTplLeft  = tplAvalableLeft  ? PelBuf(curTplLeft , TM_TPL_SIZE,   pu.lheight()) : PelBuf();
  m_refTplAbove = tplAvalableAbove ? PelBuf(refTplAbove, m_curTplAbove              ) : PelBuf();
  m_refTplLeft  = tplAvalableLeft  ? PelBuf(refTplLeft , m_curTplLeft               ) : PelBuf();
#if JVET_X0056_DMVD_EARLY_TERMINATION
  m_earlyTerminateTh = TM_TPL_SIZE * ((tplAvalableAbove ? m_pu.lwidth() : 0) + (tplAvalableLeft ? m_pu.lheight() : 0));
#endif

  // Pre-interpolate samples on search area
  m_refSrAbove = tplAvalableAbove && maxSearchRounds > 0 ? PelBuf(interRes.m_preFillBufA, m_curTplAbove.width + 2 * TM_SEARCH_RANGE, m_curTplAbove.height + 2 * TM_SEARCH_RANGE) : PelBuf();
  if (m_refSrAbove.buf != nullptr)
  {
    m_refSrAbove = xGetRefTemplate<TM_TPL_SIZE, true, TM_SEARCH_RANGE>(m_pu, m_refPic, mvStart, m_refSrAbove);
    m_refSrAbove = m_refSrAbove.subBuf(Position(TM_SEARCH_RANGE, TM_SEARCH_RANGE), m_curTplAbove);
  }

  m_refSrLeft  = tplAvalableLeft  && maxSearchRounds > 0 ? PelBuf(interRes.m_preFillBufL, m_curTplLeft .width + 2 * TM_SEARCH_RANGE, m_curTplLeft .height + 2 * TM_SEARCH_RANGE) : PelBuf();
  if (m_refSrLeft.buf != nullptr)
  {
    m_refSrLeft = xGetRefTemplate<TM_TPL_SIZE, false, TM_SEARCH_RANGE>(m_pu, m_refPic, mvStart, m_refSrLeft);
    m_refSrLeft = m_refSrLeft.subBuf(Position(TM_SEARCH_RANGE, TM_SEARCH_RANGE), m_curTplLeft);
  }
}

int TplMatchingCtrl::getDeltaMean(const PelBuf& bufCur, const PelBuf& bufRef, const int rowSubShift, const int bd)
{
  int64_t deltaSum = g_pelBufOP.getSumOfDifference(bufCur.buf, bufCur.stride, bufRef.buf, bufRef.stride, bufCur.width, bufCur.height, rowSubShift, bd);
  return int(deltaSum / (int64_t)bufCur.area());
}

template <int tplSize>
void TplMatchingCtrl::deriveMvUni()
{
  if (m_minCost == std::numeric_limits<Distortion>::max())
  {
    m_minCost = xGetTempMatchError<tplSize>(m_mvStart);
  }

  if (m_maxSearchRounds <= 0)
  {
    return;
  }

  int searchStepShift = (m_cu.imv == IMV_4PEL ? MV_FRACTIONAL_BITS_INTERNAL + 2 : MV_FRACTIONAL_BITS_INTERNAL);
  xRefineMvSearch<tplSize, TplMatchingCtrl::TMSEARCH_DIAMOND>(m_maxSearchRounds, searchStepShift);
  xRefineMvSearch<tplSize, TplMatchingCtrl::TMSEARCH_CROSS  >(                1, searchStepShift);
  xRefineMvSearch<tplSize, TplMatchingCtrl::TMSEARCH_CROSS  >(                1, searchStepShift - 1);
#if MULTI_PASS_DMVR
  if (!m_pu.bdmvrRefine)
  {
#endif
  xRefineMvSearch<tplSize, TplMatchingCtrl::TMSEARCH_CROSS  >(                1, searchStepShift - 2);
  xRefineMvSearch<tplSize, TplMatchingCtrl::TMSEARCH_CROSS  >(                1, searchStepShift - 3);
#if MULTI_PASS_DMVR
  }
  else
  {
    xDeriveCostBasedMv<TplMatchingCtrl::TMSEARCH_CROSS>();
  }
#endif
}

template <int tplSize>
void TplMatchingCtrl::removeHighFreq(const Picture& otherRefPic, const Mv& otherRefMv, const uint8_t curRefBcwWeight)
{
  xRemoveHighFreq<tplSize,  true>(otherRefPic, otherRefMv, curRefBcwWeight);
  xRemoveHighFreq<tplSize, false>(otherRefPic, otherRefMv, curRefBcwWeight);
}

template <int tplSize, bool TrueA_FalseL>
bool TplMatchingCtrl::xFillCurTemplate(Pel* tpl)
{
  const Position          posOffset = TrueA_FalseL ? Position(0, -tplSize) : Position(-tplSize, 0);
  const CodingUnit* const cuNeigh   = m_cu.cs->getCU(m_pu.blocks[m_compID].pos().offset(posOffset), toChannelType(m_compID));

  if (cuNeigh == nullptr)
  {
    return false;
  }

  if (tpl == nullptr)
  {
    return true;
  }

  const Picture&          currPic = *m_cu.cs->picture;
  const CPelBuf           recBuf  = currPic.getRecoBuf(m_cu.cs->picture->blocks[m_compID]);
        std::vector<Pel>& invLUT  = m_interRes.m_pcReshape->getInvLUT();
  const bool              useLUT  = isLuma(m_compID) && m_cu.cs->picHeader->getLmcsEnabledFlag() && m_interRes.m_pcReshape->getCTUFlag();
#if JVET_W0097_GPM_MMVD_TM & TM_MRG
  if (m_cu.geoFlag)
  {
    CHECK(m_pu.geoTmType == GEO_TM_OFF, "invalid geo template type value");
    if (m_pu.geoTmType == GEO_TM_SHAPE_A)
    {
      if (TrueA_FalseL == 0)
      {
        return false;
      }
    }
    if (m_pu.geoTmType == GEO_TM_SHAPE_L)
    {
      if (TrueA_FalseL == 1)
      {
        return false;
      }
    }
  }
#endif
  const Size dstSize = (TrueA_FalseL ? Size(m_pu.lwidth(), tplSize) : Size(tplSize, m_pu.lheight()));
  for (int h = 0; h < (int)dstSize.height; h++)
  {
    const Position recPos = TrueA_FalseL ? Position(0, -tplSize + h) : Position(-tplSize, h);
    const Pel*     rec    = recBuf.bufAt(m_pu.blocks[m_compID].pos().offset(recPos));
          Pel*     dst    = tpl + h * dstSize.width;

    for (int w = 0; w < (int)dstSize.width; w++)
    {
      int recVal = rec[w];
      dst[w] = useLUT ? invLUT[recVal] : recVal;
    }
  }

  return true;
}

template <int tplSize, bool TrueA_FalseL, int sr>
PelBuf TplMatchingCtrl::xGetRefTemplate(const PredictionUnit& curPu, const Picture& refPic, const Mv& _mv, PelBuf& dstBuf)
{
  // read from pre-interpolated buffer
  PelBuf& refSrBuf = TrueA_FalseL ? m_refSrAbove : m_refSrLeft;
  if (sr == 0 && refPic.getPOC() == m_refPic.getPOC() && refSrBuf.buf != nullptr)
  {
    Mv mvDiff = _mv - m_mvStart;
    if ((mvDiff.getAbsHor() & ((1 << MV_FRACTIONAL_BITS_INTERNAL) - 1)) == 0 && (mvDiff.getAbsVer() & ((1 << MV_FRACTIONAL_BITS_INTERNAL) - 1)) == 0)
    {
      mvDiff >>= MV_FRACTIONAL_BITS_INTERNAL;
      if (mvDiff.getAbsHor() <= TM_SEARCH_RANGE && mvDiff.getAbsVer() <= TM_SEARCH_RANGE)
      {
        return refSrBuf.subBuf(Position(mvDiff.getHor(), mvDiff.getVer()), dstBuf);
      }
    }
  }

  // Do interpolation on the fly
  Position blkPos  = ( TrueA_FalseL ? Position(curPu.lx(), curPu.ly() - tplSize) : Position(curPu.lx() - tplSize, curPu.ly()) );
  Size     blkSize = Size(dstBuf.width, dstBuf.height);
  Mv       mv      = _mv - Mv(sr << MV_FRACTIONAL_BITS_INTERNAL, sr << MV_FRACTIONAL_BITS_INTERNAL);
  clipMv( mv, blkPos, blkSize, *m_cu.cs->sps, *m_cu.cs->pps );

  const int lumaShift = 2 + MV_FRACTIONAL_BITS_DIFF;
  const int horShift  = (lumaShift + ::getComponentScaleX(m_compID, m_cu.chromaFormat));
  const int verShift  = (lumaShift + ::getComponentScaleY(m_compID, m_cu.chromaFormat));

  const int xInt  = mv.getHor() >> horShift;
  const int yInt  = mv.getVer() >> verShift;
  const int xFrac = mv.getHor() & ((1 << horShift) - 1);
  const int yFrac = mv.getVer() & ((1 << verShift) - 1);

  const CPelBuf refBuf = refPic.getRecoBuf(refPic.blocks[m_compID]);
  const Pel* ref       = refBuf.bufAt(blkPos.offset(xInt, yInt));
        Pel* dst       = dstBuf.buf;
        int  refStride = refBuf.stride;
        int  dstStride = dstBuf.stride;
        int  bw        = (int)blkSize.width;
        int  bh        = (int)blkSize.height;

  const int  nFilterIdx   = 1;
  const bool useAltHpelIf = false;
  const bool biMCForDMVR  = false;

  if ( yFrac == 0 )
  {
    m_interRes.m_if.filterHor( m_compID, (Pel*) ref, refStride, dst, dstStride, bw, bh, xFrac, true, m_cu.chromaFormat, m_cu.slice->clpRng(m_compID), nFilterIdx, biMCForDMVR, useAltHpelIf );
  }
  else if ( xFrac == 0 )
  {
    m_interRes.m_if.filterVer( m_compID, (Pel*) ref, refStride, dst, dstStride, bw, bh, yFrac, true, true, m_cu.chromaFormat, m_cu.slice->clpRng(m_compID), nFilterIdx, biMCForDMVR, useAltHpelIf );
  }
  else
  {
    const int vFilterSize = isLuma(m_compID) ? NTAPS_BILINEAR : NTAPS_CHROMA;
    PelBuf tmpBuf = PelBuf(m_interRes.m_ifBuf, Size(bw, bh+vFilterSize-1));

    m_interRes.m_if.filterHor( m_compID, (Pel*)ref - ((vFilterSize>>1) -1)*refStride, refStride, tmpBuf.buf, tmpBuf.stride, bw, bh+vFilterSize-1, xFrac, false, m_cu.chromaFormat, m_cu.slice->clpRng(m_compID), nFilterIdx, biMCForDMVR, useAltHpelIf );
    JVET_J0090_SET_CACHE_ENABLE( false );
    m_interRes.m_if.filterVer( m_compID, tmpBuf.buf + ((vFilterSize>>1) -1)*tmpBuf.stride, tmpBuf.stride, dst, dstStride, bw, bh, yFrac, false, true, m_cu.chromaFormat, m_cu.slice->clpRng(m_compID), nFilterIdx, biMCForDMVR, useAltHpelIf );
    JVET_J0090_SET_CACHE_ENABLE( true );
  }

  return dstBuf;
}

template <int tplSize, bool TrueA_FalseL>
void TplMatchingCtrl::xRemoveHighFreq(const Picture& otherRefPic, const Mv& otherRefMv, const uint8_t curRefBcwWeight)
{
  PelBuf& curTplBuf = TrueA_FalseL ? m_curTplAbove : m_curTplLeft;
  PelBuf  refTplBuf = TrueA_FalseL ? m_refTplAbove : m_refTplLeft;

  if (curTplBuf.buf != nullptr)
  {
    refTplBuf = xGetRefTemplate<tplSize, TrueA_FalseL, 0>(m_pu, otherRefPic, otherRefMv, refTplBuf);
    if (curRefBcwWeight != g_BcwWeights[BCW_DEFAULT])
    {
      curTplBuf.removeWeightHighFreq(refTplBuf, false, m_cu.slice->clpRng(m_compID), curRefBcwWeight);
    }
    else
    {
      curTplBuf.removeHighFreq(refTplBuf, false, m_cu.slice->clpRng(m_compID));
    }
  }
}

template <int tplSize, int searchPattern>
void TplMatchingCtrl::xRefineMvSearch(int maxSearchRounds, int searchStepShift)
{
  static const int finestMvdPrec[NUM_IMV_MODES] = { MV_FRACTIONAL_BITS_INTERNAL - 2, MV_FRACTIONAL_BITS_INTERNAL, MV_FRACTIONAL_BITS_INTERNAL + 2, MV_FRACTIONAL_BITS_INTERNAL - 1 };
  if (searchStepShift < finestMvdPrec[m_cu.imv] && (!m_pu.mergeFlag || m_cu.imv == IMV_HPEL))
  {
    return;
  }

  // Search pattern configuration
  static const Mv patternCross  [4] = { Mv(0, 1), Mv(1, 0), Mv(0, -1), Mv(-1, 0) };
  static const Mv patternDiamond[8] = { Mv(0, 2), Mv(1, 1), Mv(2, 0), Mv(1, -1), Mv(0, -2), Mv(-1, -1), Mv(-2, 0), Mv(-1, 1) };

  int       directStart = 0, directEnd = 0, directRounding = 0, directMask = 0;
  const Mv *pSearchOffset = nullptr;
#if MULTI_PASS_DMVR
  Distortion *costArray   = nullptr; 
#endif
  if (searchPattern == TMSEARCH_CROSS)
  {
    directEnd      = 3;
    directRounding = 4;
    directMask     = 0x03;
    pSearchOffset  = patternCross;
#if MULTI_PASS_DMVR
    memset(m_tmCostArrayCross, -1, sizeof(m_tmCostArrayCross));
    costArray      = m_tmCostArrayCross;
    costArray[4]   = m_minCost;
#endif
  }
  else if (searchPattern == TMSEARCH_DIAMOND)
  {
    directEnd      = 7;
    directRounding = 8;
    directMask     = 0x07;
    pSearchOffset  = patternDiamond;
#if MULTI_PASS_DMVR
    memset(m_tmCostArrayDiamond, -1, sizeof(m_tmCostArrayDiamond));
    costArray      = m_tmCostArrayDiamond;
    costArray[8]   = m_minCost;
#endif
  }
  else
  {
    CHECK(true, "Unknown search method for TM");
  }

  // Iterative search
  for (int uiRound = 0; uiRound < maxSearchRounds; uiRound++)
  {
    int directBest = -1;
    Mv  mvCurCenter(m_mvFinal);
#if JVET_X0056_DMVD_EARLY_TERMINATION
    Distortion prevMinCost = m_minCost;
#endif
    for (int nIdx = directStart; nIdx <= directEnd; nIdx++)
    {
      int nDirect = (nIdx + directRounding) & directMask;

      Mv mvOffset = pSearchOffset[nDirect];
      mvOffset  <<= searchStepShift;
      Mv mvCand   = mvCurCenter + mvOffset;
      Distortion cost = InterPrediction::getDecoderSideDerivedMvCost(m_mvStart, mvCand, TM_SEARCH_RANGE, DECODER_SIDE_MV_WEIGHT); // MV cost is used just for skipping search
      if (cost >= m_minCost || (m_otherRefListMv != nullptr && *m_otherRefListMv == mvCand))
      {
        continue;
      }

      cost = xGetTempMatchError<tplSize>(mvCand);
#if MULTI_PASS_DMVR
      costArray[nDirect] = cost;
#endif

      if (cost < m_minCost)
      {
        m_minCost  = cost;
        m_mvFinal  = mvCand;
        directBest = nDirect;
      }
    }

    if (directBest == -1)
    {
      break;
    }
#if JVET_X0056_DMVD_EARLY_TERMINATION
    if (uiRound > 0 && prevMinCost < m_minCost + m_earlyTerminateTh)
    {
      break;
    }
#endif

    int nStep   = searchPattern == TMSEARCH_DIAMOND ? (2 - (directBest & 0x01)) : 1;
    directStart = directBest - nStep;
    directEnd   = directBest + nStep;

#if MULTI_PASS_DMVR
    if ((uiRound + 1) < maxSearchRounds)
    {
      xNextTmCostAarray<searchPattern>(directBest);
    }
#endif
  }
}

#if MULTI_PASS_DMVR
template <int searchPattern>
void TplMatchingCtrl::xNextTmCostAarray(int bestDirect)
{
  Distortion *costLog = searchPattern == TMSEARCH_CROSS ? m_tmCostArrayCross : (searchPattern == TMSEARCH_DIAMOND ? m_tmCostArrayDiamond : nullptr);

  if (searchPattern == TMSEARCH_CROSS)
  {
    CHECK(bestDirect < 0 || bestDirect > 3, "Error: Unknown bestDirect");

    int prevCenter = (bestDirect + 2) & 0x3;
    costLog[prevCenter] = costLog[4];
    costLog[4]          = costLog[bestDirect];

    for (int offset = 1; offset < 4; ++offset)
    {
      costLog[(prevCenter + offset + 4) & 0x3] = std::numeric_limits<Distortion>::max();
    }
  }
  else if (searchPattern == TMSEARCH_DIAMOND)
  {
  }
  else
  {
    CHECK(true, "Unknown search method for TM");
  }
}

template <int searchPattern>
void TplMatchingCtrl::xDeriveCostBasedMv()
{
  if (m_minCost == 0)
  {
    return;
  }

  if (searchPattern == TMSEARCH_CROSS)
  {
    xDeriveCostBasedOffset<true >(m_tmCostArrayCross[3], m_tmCostArrayCross[4], m_tmCostArrayCross[1], 0);
    xDeriveCostBasedOffset<false>(m_tmCostArrayCross[2], m_tmCostArrayCross[4], m_tmCostArrayCross[0], 0);
  }

  else
  {
    CHECK(true, "Unknown search method for TM");
  }
}

template <bool TrueX_FalseY>
void TplMatchingCtrl::xDeriveCostBasedOffset(Distortion costLorA, Distortion costCenter, Distortion costRorB, int log2StepSize)
{
  if (!m_pu.mergeFlag || m_cu.imv != IMV_OFF)
  {
    return;
  }
  if (costLorA == std::numeric_limits<Distortion>::max() || costRorB == std::numeric_limits<Distortion>::max() || (costCenter > costLorA || costCenter > costRorB))
  {
    return;
  }

  const int extraMvFracBit = MV_FRACTIONAL_BITS_INTERNAL - 1;

  int&    mvComp      = TrueX_FalseY ? m_mvFinal.hor : m_mvFinal.ver;
  int64_t numerator   = (int64_t)(costLorA - costRorB);
  int64_t denominator = (int64_t)((costLorA + costRorB - (costCenter << 1)) << 1);
  
  if (denominator != 0)
  {
    if (costCenter != costLorA && costCenter != costRorB)
    {
      if (extraMvFracBit > 1 || log2StepSize > 1)
      {
        mvComp += xBinaryDivision(numerator, denominator, extraMvFracBit + log2StepSize);
      }
    }
    else
    {
      const int off = 1 << (extraMvFracBit - 1);
      mvComp += ((costCenter == costLorA ? -off : off) << log2StepSize);
    }
  }
}

int TplMatchingCtrl::xBinaryDivision(int64_t numerator, int64_t denominator, int fracBits)
{
  if (fracBits < 2) // Because the result of division is assumed to be less than 0.5
  {
    return 0;
  }

  int sign = 0;
  if (numerator < 0)
  {
    sign = 1;
    numerator = -numerator;
  }

  numerator   <<=  fracBits;
  denominator <<= (fracBits - 2); // This "-2" is by the assumption that the result of division is always less than 0.5

  int quotient = 0;
  for (int binIdx = 0; binIdx < fracBits - 2; ++binIdx)
  {
    if (numerator >= denominator)
    {
      numerator -= denominator;
      ++quotient;
    }
    quotient    <<= 1;
    denominator >>= 1;
  }

  if (numerator >= denominator)
  {
    ++quotient;
  }

  return sign ? -quotient : quotient;
}
#endif

template <int tplSize>
Distortion TplMatchingCtrl::xGetTempMatchError(const Mv& mv)
{
  if (!getTemplatePresentFlag())
  {
    return std::numeric_limits<Distortion>::max();
  }

  Distortion sum = 0;
  sum += xGetTempMatchError<tplSize, true >(mv);
  sum += xGetTempMatchError<tplSize, false>(mv);
  return sum;
}

template <int tplSize, bool TrueA_FalseL>
Distortion TplMatchingCtrl::xGetTempMatchError(const Mv& mv)
{
  PelBuf& curTplBuf = TrueA_FalseL ? m_curTplAbove : m_curTplLeft;
  PelBuf  refTplBuf = TrueA_FalseL ? m_refTplAbove : m_refTplLeft;

  if (curTplBuf.buf == nullptr)
  {
    return 0;
  }

  const int rowSubShift   = 0;
  const int bitDepth      = m_cu.slice->clpRng(m_compID).bd;

  // fetch reference template block
  refTplBuf = xGetRefTemplate<tplSize, TrueA_FalseL, 0>(m_pu, m_refPic, mv, refTplBuf);

  // compute matching cost
  Distortion partSum = 0;
  if (m_useWeight)
  {
    DistParam cDistParam;
    cDistParam.applyWeight = false;
#if INTER_LIC
    cDistParam.useMR = m_cu.LICFlag;
#endif
    int tmWeightIdx  = (m_pu.lwidth() >= TM_MIN_CU_SIZE_FOR_ALT_WEIGHTED_COST && m_pu.lheight() >= TM_MIN_CU_SIZE_FOR_ALT_WEIGHTED_COST ? 1 : 0);
    m_interRes.m_pcRdCost->setDistParam( cDistParam, curTplBuf, refTplBuf, bitDepth, TrueA_FalseL, tmWeightIdx, rowSubShift, m_compID );
    CHECK(TM_TPL_SIZE != 4, "The distortion function of template matching is implemetned currently only for size=4.");
    partSum = cDistParam.distFunc( cDistParam );
  }
  else
  {
    DistParam cDistParam;
    cDistParam.applyWeight = false;
#if INTER_LIC
    cDistParam.useMR = m_cu.LICFlag;
#endif
    m_interRes.m_pcRdCost->setDistParam( cDistParam, curTplBuf, refTplBuf, bitDepth, m_compID, false );
    cDistParam.subShift = rowSubShift;

    partSum = cDistParam.distFunc( cDistParam );
#if FULL_NBIT
    partSum >>= (bitDepth > 8 ? bitDepth - 8 : 0);
#endif
  }

  return partSum;
}
#endif // TM_AMVP || TM_MRG

#if TM_AMVP || TM_MRG || MULTI_PASS_DMVR
Distortion InterPrediction::getDecoderSideDerivedMvCost(const Mv& mvStart, const Mv& mvCur, int searchRangeInFullPel, int weight)
{
  int searchRange = searchRangeInFullPel << MV_FRACTIONAL_BITS_INTERNAL;
  Mv mvDist = mvStart - mvCur;
  Distortion cost = std::numeric_limits<Distortion>::max();
  if (mvDist.getAbsHor() <= searchRange && mvDist.getAbsVer() <= searchRange)
  {
    cost = (mvDist.getAbsHor() + mvDist.getAbsVer()) * weight;
    cost >>= MV_FRACTIONAL_BITS_DIFF;
  }

  return cost;
}

#if MULTI_PASS_DMVR
void InterPrediction::xBDMVRUpdateSquareSearchCostLog( Distortion* costLog, int bestDirect )
{
  CHECK(bestDirect < 0 || bestDirect > 7, "Error: Unknown bestDirect");

  int prevCenter      = ( bestDirect + 4 ) & 0x7;
  costLog[prevCenter] = costLog[8];
  costLog[8]          = costLog[bestDirect];

  if( prevCenter & 0x1 )
  {
    costLog[( prevCenter - 1 + 8 ) & 0x7] = costLog[( prevCenter - 2 + 8 ) & 0x7];
    costLog[( prevCenter + 1 + 8 ) & 0x7] = costLog[( prevCenter + 2 + 8 ) & 0x7];
    costLog[( prevCenter - 2 + 8 ) & 0x7] = costLog[( prevCenter - 3 + 8 ) & 0x7];
    costLog[( prevCenter + 2 + 8 ) & 0x7] = costLog[( prevCenter + 3 + 8 ) & 0x7];
    for( int offset = 3 ; offset < 6 ; ++ offset )
    {
      costLog[( prevCenter + offset + 8 ) & 0x7] = std::numeric_limits<Distortion>::max();
    }
  }
  else
  {
    costLog[( prevCenter - 1 + 8 ) & 0x7] = costLog[( prevCenter - 3 + 8 ) & 0x7];
    costLog[( prevCenter + 1 + 8 ) & 0x7] = costLog[( prevCenter + 3 + 8 ) & 0x7];
    for( int offset = 2 ; offset < 7 ; ++ offset )
    {
      costLog[( prevCenter + offset + 8 ) & 0x7] = std::numeric_limits<Distortion>::max();
    }
  }
}
#endif
#endif

#if TM_AMVP
void InterPrediction::clearTplAmvpBuffer()
{
  for (int imv = 0; imv < NUM_IMV_MODES; ++imv)
  {
    for (int refIdx = 0; refIdx < MAX_NUM_REF; ++refIdx)
    {
      m_tplAmvpInfo   [imv][0][refIdx] = AMVPInfo();
      m_tplAmvpInfo   [imv][1][refIdx] = AMVPInfo();
#if INTER_LIC
      m_tplAmvpInfoLIC[imv][0][refIdx] = AMVPInfo();
      m_tplAmvpInfoLIC[imv][1][refIdx] = AMVPInfo();
#endif
    }
  }
}

void InterPrediction::writeTplAmvpBuffer(const AMVPInfo& src, const CodingUnit& cu, RefPicList eRefList, int refIdx)
{
#if INTER_LIC
  AMVPInfo& dst = (cu.LICFlag ? m_tplAmvpInfoLIC : m_tplAmvpInfo)[cu.imv][eRefList][refIdx];
#else
  AMVPInfo& dst = m_tplAmvpInfo[cu.imv][eRefList][refIdx];
#endif
  dst = src;
}

bool InterPrediction::readTplAmvpBuffer(AMVPInfo& dst, const CodingUnit& cu, RefPicList eRefList, int refIdx)
{
#if INTER_LIC
  AMVPInfo& src = (cu.LICFlag ? m_tplAmvpInfoLIC : m_tplAmvpInfo)[cu.imv][eRefList][refIdx];
#else
  AMVPInfo& src = m_tplAmvpInfo[cu.imv][eRefList][refIdx];
#endif
  if (src.numCand > 0)
  {
    dst = src;
    return true;
  }
  return false;
}
#endif

#if MULTI_PASS_DMVR
#if JVET_X0049_ADAPT_DMVR
bool InterPrediction::processBDMVRPU2Dir(PredictionUnit& pu, bool subPURefine[2], Mv(&finalMvDir)[2])
{
  const int lumaArea = pu.lumaSize().area();
  bool       bUseMR = lumaArea > 64;
#if JVET_Y0089_DMVR_BCW
  bUseMR |= (pu.cu->BcwIdx != BCW_DEFAULT);
#endif
  subPURefine[0] = subPURefine[1] = true;

  Distortion minCost = std::numeric_limits<Distortion>::max();
  Mv         mvInitial_PU[2] = { pu.mv[0], pu.mv[1] };
  Mv         mvFinal[2] = { pu.mv[0], pu.mv[1] };

  Distortion initCost = xBDMVRGetMatchingError(pu, mvInitial_PU, bUseMR, false);
  if (initCost < lumaArea)
  {
    subPURefine[0] = false;
    subPURefine[1] = false;
    finalMvDir[0] = mvFinal[0];
    finalMvDir[1] = mvFinal[1];
    return false;
  }

  minCost = xBDMVRMvOneTemplateHPelSquareSearch<1>(mvFinal, initCost, pu, mvInitial_PU, 2, MV_FRACTIONAL_BITS_INTERNAL - 1, bUseMR, false);
  subPURefine[0] = minCost >= lumaArea;
  finalMvDir[0] = mvFinal[0];

  mvFinal[0] = mvInitial_PU[0];
  mvFinal[1] = mvInitial_PU[1];
  minCost = xBDMVRMvOneTemplateHPelSquareSearch<2>(mvFinal, initCost, pu, mvInitial_PU, 2, MV_FRACTIONAL_BITS_INTERNAL - 1, bUseMR, false);
  subPURefine[1] = minCost >= lumaArea;
  finalMvDir[1] = mvFinal[1];

  return true;
}

void InterPrediction::processBDMVRSubPU(PredictionUnit& pu, bool subPURefine)
{
  if (!subPURefine)
  {
    // span motion to subPU
    const int dy = std::min<int>(pu.lumaSize().height, DMVR_SUBCU_HEIGHT);
    const int dx = std::min<int>(pu.lumaSize().width, DMVR_SUBCU_WIDTH);
    Position puPos = pu.lumaPos();

    int subPuIdx = 0;
    const int dmvrSubPuStrideIncr = DMVR_SUBPU_STRIDE - std::max(1, (int)(pu.lumaSize().width >> DMVR_SUBCU_WIDTH_LOG2));
    for (int y = puPos.y, yStart = 0; y < (puPos.y + pu.lumaSize().height); y = y + dy, yStart = yStart + dy)
    {
      for (int x = puPos.x, xStart = 0; x < (puPos.x + pu.lumaSize().width); x = x + dx, xStart = xStart + dx)
      {
        m_bdmvrSubPuMvBuf[REF_PIC_LIST_0][subPuIdx] = pu.mv[0];
        m_bdmvrSubPuMvBuf[REF_PIC_LIST_1][subPuIdx] = pu.mv[1];
        subPuIdx++;
      }
      subPuIdx += dmvrSubPuStrideIncr;
    }
    return;
  }

  const int dy = std::min<int>(pu.lumaSize().height, DMVR_SUBCU_HEIGHT);
  const int dx = std::min<int>(pu.lumaSize().width, DMVR_SUBCU_WIDTH);
  Position puPos = pu.lumaPos();
  PredictionUnit subPu = pu;

  int subPuIdx = 0;
  const int dmvrSubPuStrideIncr = DMVR_SUBPU_STRIDE - std::max(1, (int)(pu.lumaSize().width >> DMVR_SUBCU_WIDTH_LOG2));
  Distortion minCost = std::numeric_limits<Distortion>::max();
  const Mv   mvInitial[2] = { pu.mv[0], pu.mv[1] };
  Mv         mvFinal[2] = { pu.mv[0], pu.mv[1] };
  Mv         mvOffset;

  const Distortion earlyTerminateTh = dx * dy;
  const int adaptiveSearchRangeHor = (dx >> 1) < BDMVR_INTME_RANGE ? (dx >> 1) : BDMVR_INTME_RANGE;
  const int adaptiveSearchRangeVer = (dy >> 1) < BDMVR_INTME_RANGE ? (dy >> 1) : BDMVR_INTME_RANGE;
  const bool adaptRange = (adaptiveSearchRangeHor != BDMVR_INTME_RANGE || adaptiveSearchRangeVer != BDMVR_INTME_RANGE);
  const int maxSearchRound = pu.bmMergeFlag ? BM_MRG_SUB_PU_INT_MAX_SRCH_ROUND : BDMVR_INTME_FULL_SEARCH_MAX_NUM_ITERATIONS;

  // prepare cDistParam for cost calculation
  DistParam cDistParam;
  cDistParam.applyWeight = false;
  cDistParam.useMR = false;
#if JVET_Y0089_DMVR_BCW
  cDistParam.useMR |= (pu.cu->BcwIdx != BCW_DEFAULT);
#endif

  Pel* pelBuffer[2] = { nullptr, nullptr };
  pelBuffer[0] = m_filteredBlock[3][REF_PIC_LIST_0][0] + BDMVR_CENTER_POSITION;
  pelBuffer[1] = m_filteredBlock[3][REF_PIC_LIST_1][0] + BDMVR_CENTER_POSITION;

  PelUnitBuf predBuf[2] = { PelUnitBuf(pu.chromaFormat, PelBuf(pelBuffer[REF_PIC_LIST_0], BDMVR_BUF_STRIDE, dx, dy)),
    PelUnitBuf(pu.chromaFormat, PelBuf(pelBuffer[REF_PIC_LIST_1], BDMVR_BUF_STRIDE, dx, dy)) };

  bool useHadamard = true;          // STAD cost function
  m_pcRdCost->setDistParam(cDistParam, predBuf[0].Y(), predBuf[1].Y(), pu.cu->slice->clpRng(COMPONENT_Y).bd, COMPONENT_Y, useHadamard);

  // prepare buffer for pre-interpolaction 
  const Picture& refPic0 = *pu.cu->slice->getRefPic(REF_PIC_LIST_0, pu.refIdx[REF_PIC_LIST_0])->unscaledPic;
  const Picture& refPic1 = *pu.cu->slice->getRefPic(REF_PIC_LIST_1, pu.refIdx[REF_PIC_LIST_1])->unscaledPic;

  int iWidthExt = dx + (BDMVR_INTME_RANGE << 1);
  int iHeightExt = dy + (BDMVR_INTME_RANGE << 1);
  int iWidthOffset = BDMVR_SIMD_IF_FACTOR - (iWidthExt & (BDMVR_SIMD_IF_FACTOR - 1));
  iWidthOffset &= (BDMVR_SIMD_IF_FACTOR - 1);
  iWidthExt += iWidthOffset; // This ensures that iWidthExt is a factor-of-n number, assuming BDMVR_SIMD_IF_FACTOR is equal to n

  PelUnitBuf predBufExt[2] = { (PelUnitBuf(pu.chromaFormat, PelBuf(m_filteredBlock[3][REF_PIC_LIST_0][0], BDMVR_BUF_STRIDE, iWidthExt, iHeightExt))),
    (PelUnitBuf(pu.chromaFormat, PelBuf(m_filteredBlock[3][REF_PIC_LIST_1][0], BDMVR_BUF_STRIDE, iWidthExt, iHeightExt))) };

  Mv mvTopLeft[2] = { mvInitial[0] - Mv((BDMVR_INTME_RANGE << MV_FRACTIONAL_BITS_INTERNAL), (BDMVR_INTME_RANGE << MV_FRACTIONAL_BITS_INTERNAL)),
    mvInitial[1] - Mv((BDMVR_INTME_RANGE << MV_FRACTIONAL_BITS_INTERNAL), (BDMVR_INTME_RANGE << MV_FRACTIONAL_BITS_INTERNAL)) };
  for (int y = puPos.y, yStart = 0; y < (puPos.y + pu.lumaSize().height); y = y + dy, yStart = yStart + dy)
  {
    for (int x = puPos.x, xStart = 0; x < (puPos.x + pu.lumaSize().width); x = x + dx, xStart = xStart + dx)
    {
      subPu.UnitArea::operator=(UnitArea(pu.chromaFormat, Area(x, y, dx, dy)));

      minCost = std::numeric_limits<Distortion>::max();

      // Pre-interpolation
      xBDMVRFillBlkPredPelBuffer(subPu, refPic0, mvTopLeft[0], predBufExt[0], pu.cs->slice->clpRng(COMPONENT_Y));
      xBDMVRFillBlkPredPelBuffer(subPu, refPic1, mvTopLeft[1], predBufExt[1], pu.cs->slice->clpRng(COMPONENT_Y));

      if (adaptRange)
      {
        minCost = xBDMVRMvIntPelFullSearch<true, true>(mvOffset, minCost, mvInitial,
          maxSearchRound,
          adaptiveSearchRangeHor, adaptiveSearchRangeVer,
          pu.bmMergeFlag,
          earlyTerminateTh, cDistParam,
          pelBuffer, BDMVR_BUF_STRIDE);
      }
      else
      {
        minCost = xBDMVRMvIntPelFullSearch<false, true>(mvOffset, minCost, mvInitial,
          maxSearchRound,
          adaptiveSearchRangeHor, adaptiveSearchRangeVer,
          pu.bmMergeFlag,
          earlyTerminateTh, cDistParam,
          pelBuffer, BDMVR_BUF_STRIDE);
      }
      if (minCost >= earlyTerminateTh)
      {
        int bestOffsetIdx = (mvOffset.getVer() + BDMVR_INTME_RANGE) * BDMVR_INTME_STRIDE + (mvOffset.getHor() + BDMVR_INTME_RANGE);
        mvOffset <<= MV_FRACTIONAL_BITS_INTERNAL;

        mvFinal[0] = mvInitial[0] + mvOffset;
        mvFinal[1] = mvInitial[1] - mvOffset;
        minCost = m_sadEnlargeArrayBilMrg[bestOffsetIdx];
        Distortion tmpCost = getDecoderSideDerivedMvCost(mvInitial[0], mvFinal[0], BDMVR_INTME_RANGE + 1, DECODER_SIDE_MV_WEIGHT);
        if (minCost >= tmpCost)
        {
          minCost += tmpCost;
          minCost = xBDMVRMvSquareSearch<true>(mvFinal, minCost/*std::numeric_limits<Distortion>::max()*/, subPu, mvInitial, 2, MV_FRACTIONAL_BITS_INTERNAL - 1, false, true);
        }
      }
      else
      {
        mvOffset <<= MV_FRACTIONAL_BITS_INTERNAL;

        mvFinal[0] = mvInitial[0] + mvOffset;
        mvFinal[1] = mvInitial[1] - mvOffset;
      }

      m_bdmvrSubPuMvBuf[REF_PIC_LIST_0][subPuIdx] = mvFinal[0];
      m_bdmvrSubPuMvBuf[REF_PIC_LIST_1][subPuIdx] = mvFinal[1];
      subPuIdx++;
    }
    subPuIdx += dmvrSubPuStrideIncr;
  }
}

#endif

bool InterPrediction::processBDMVR(PredictionUnit& pu)
{
  if( !pu.cs->slice->getSPS()->getUseDMVDMode() || !pu.cs->slice->isInterB() )
  {
    return false;
  }
  CHECK( !pu.mergeFlag, "Merge mode must be used here" );
  CHECK( pu.refIdx[0] < 0 || pu.refIdx[1] < 0, "Bilateral DMVR is performed for bi-prediction" );

  const int lumaArea = pu.lumaSize().area();
  bool subPURefine = true;
  Mv puOrgMv[2] = { pu.mv[0], pu.mv[1] };
  {
    Distortion minCost = std::numeric_limits<Distortion>::max();
    bool       bUseMR = lumaArea > 64;
#if JVET_Y0089_DMVR_BCW
    bUseMR    |= (pu.cu->BcwIdx != BCW_DEFAULT);
#endif
    Mv         mvFinal_PU[2] = { pu.mv[0], pu.mv[1] };
    Mv         mvInitial_PU[2] = { pu.mv[0], pu.mv[1] };

#if JVET_X0049_BDMVR_SW_OPT
#if JVET_X0049_ADAPT_DMVR
    if (pu.bmDir == 1)
    {
      minCost = xBDMVRGetMatchingError(pu, mvInitial_PU, bUseMR, false);
      if (minCost >= lumaArea)
      {
        minCost = xBDMVRMvOneTemplateHPelSquareSearch<1>(mvFinal_PU, minCost, pu, mvInitial_PU, 2, MV_FRACTIONAL_BITS_INTERNAL - 1, bUseMR, false);
      }
    }
    else if (pu.bmDir == 2)
    {
      minCost = xBDMVRGetMatchingError(pu, mvInitial_PU, bUseMR, false);
      if (minCost >= lumaArea)
      {
        minCost = xBDMVRMvOneTemplateHPelSquareSearch<2>(mvFinal_PU, minCost, pu, mvInitial_PU, 2, MV_FRACTIONAL_BITS_INTERNAL - 1, bUseMR, false);
      }
    }
    else
#endif
    {
      minCost = xBDMVRMvSquareSearch<false>( mvFinal_PU, minCost, pu, mvInitial_PU, BDMVR_INTME_SQUARE_SEARCH_MAX_NUM_ITERATIONS, MV_FRACTIONAL_BITS_INTERNAL,     bUseMR, false );
      if (minCost > 0)
      {
        minCost = xBDMVRMvSquareSearch<true>(mvFinal_PU, minCost, pu, mvInitial_PU, 2, MV_FRACTIONAL_BITS_INTERNAL - 1, bUseMR, false);
      }
    }
#else
    minCost = xBDMVRMvSquareSearch( mvFinal_PU, minCost, pu, mvInitial_PU, BDMVR_INTME_SQUARE_SEARCH_MAX_NUM_ITERATIONS, MV_FRACTIONAL_BITS_INTERNAL,     bUseMR, false );
    minCost = xBDMVRMvSquareSearch( mvFinal_PU, minCost, pu, mvInitial_PU, 2, MV_FRACTIONAL_BITS_INTERNAL - 1,     bUseMR, false );
#endif

    subPURefine = minCost >= lumaArea;
    pu.mv[REF_PIC_LIST_0] = mvFinal_PU[0];
    pu.mv[REF_PIC_LIST_1] = mvFinal_PU[1];
  }

#if TM_MRG
  if (pu.tmMergeFlag)
  {
    deriveTMMv(pu);
    if (pu.interDir != 3)
    {
      return false;
    }
  }
#endif

  if (!subPURefine)
  {
    // span motion to subPU
    const int dy = std::min<int>(pu.lumaSize().height, DMVR_SUBCU_HEIGHT);
    const int dx = std::min<int>(pu.lumaSize().width, DMVR_SUBCU_WIDTH);
    Position puPos = pu.lumaPos();

    int subPuIdx = 0;
    const int dmvrSubPuStrideIncr = DMVR_SUBPU_STRIDE - std::max(1, (int)(pu.lumaSize().width >> DMVR_SUBCU_WIDTH_LOG2));
    for (int y = puPos.y, yStart = 0; y < (puPos.y + pu.lumaSize().height); y = y + dy, yStart = yStart + dy)
    {
      for (int x = puPos.x, xStart = 0; x < (puPos.x + pu.lumaSize().width); x = x + dx, xStart = xStart + dx)
      {
        m_bdmvrSubPuMvBuf[REF_PIC_LIST_0][subPuIdx] = pu.mv[0];
        m_bdmvrSubPuMvBuf[REF_PIC_LIST_1][subPuIdx] = pu.mv[1];
        subPuIdx++;
      }
      subPuIdx += dmvrSubPuStrideIncr;
    }
    pu.mv[0] = puOrgMv[0];
    pu.mv[1] = puOrgMv[1];
    return true;
  }

  const int dy = std::min<int>(pu.lumaSize().height, DMVR_SUBCU_HEIGHT);
  const int dx = std::min<int>(pu.lumaSize().width,  DMVR_SUBCU_WIDTH);
  Position puPos = pu.lumaPos();
  PredictionUnit subPu = pu;

  int subPuIdx = 0;
  const int dmvrSubPuStrideIncr = DMVR_SUBPU_STRIDE - std::max(1, (int)(pu.lumaSize().width >> DMVR_SUBCU_WIDTH_LOG2));
#if JVET_X0049_BDMVR_SW_OPT
  Distortion minCost = std::numeric_limits<Distortion>::max();
  const Mv   mvInitial[2] = { pu.mv[0], pu.mv[1] };
  Mv         mvFinal[2] = { pu.mv[0], pu.mv[1] };
  Mv         mvOffset;

  const Distortion earlyTerminateTh = dx * dy;
  const int adaptiveSearchRangeHor = (dx >> 1) < BDMVR_INTME_RANGE ? (dx >> 1) : BDMVR_INTME_RANGE;
  const int adaptiveSearchRangeVer = (dy >> 1) < BDMVR_INTME_RANGE ? (dy >> 1) : BDMVR_INTME_RANGE;
  const bool adaptRange = (adaptiveSearchRangeHor != BDMVR_INTME_RANGE || adaptiveSearchRangeVer != BDMVR_INTME_RANGE);
#if JVET_X0049_ADAPT_DMVR
  const int maxSearchRound = pu.bmMergeFlag ? BM_MRG_SUB_PU_INT_MAX_SRCH_ROUND : BDMVR_INTME_FULL_SEARCH_MAX_NUM_ITERATIONS;
#else
  const int maxSearchRound = BDMVR_INTME_FULL_SEARCH_MAX_NUM_ITERATIONS;
#endif

  // prepare cDistParam for cost calculation
  DistParam cDistParam;
  cDistParam.applyWeight = false;
  cDistParam.useMR = false;
#if JVET_Y0089_DMVR_BCW
  cDistParam.useMR |= (pu.cu->BcwIdx != BCW_DEFAULT);
#endif

  Pel* pelBuffer[2] = { nullptr, nullptr };
  pelBuffer[0] = m_filteredBlock[3][REF_PIC_LIST_0][0] + BDMVR_CENTER_POSITION;
  pelBuffer[1] = m_filteredBlock[3][REF_PIC_LIST_1][0] + BDMVR_CENTER_POSITION;

  PelUnitBuf predBuf[2] = { PelUnitBuf(pu.chromaFormat, PelBuf(pelBuffer[REF_PIC_LIST_0], BDMVR_BUF_STRIDE, dx, dy)),
    PelUnitBuf(pu.chromaFormat, PelBuf(pelBuffer[REF_PIC_LIST_1], BDMVR_BUF_STRIDE, dx, dy)) };

  bool useHadamard = true;          // STAD cost function
  m_pcRdCost->setDistParam(cDistParam, predBuf[0].Y(), predBuf[1].Y(), pu.cu->slice->clpRng(COMPONENT_Y).bd, COMPONENT_Y, useHadamard);

  // prepare buffer for pre-interpolaction 
  const Picture& refPic0 = *pu.cu->slice->getRefPic(REF_PIC_LIST_0, pu.refIdx[REF_PIC_LIST_0])->unscaledPic;
  const Picture& refPic1 = *pu.cu->slice->getRefPic(REF_PIC_LIST_1, pu.refIdx[REF_PIC_LIST_1])->unscaledPic;

  int iWidthExt = dx + (BDMVR_INTME_RANGE << 1);
  int iHeightExt = dy + (BDMVR_INTME_RANGE << 1);
  int iWidthOffset = BDMVR_SIMD_IF_FACTOR - (iWidthExt & (BDMVR_SIMD_IF_FACTOR - 1));
  iWidthOffset &= (BDMVR_SIMD_IF_FACTOR - 1);
  iWidthExt += iWidthOffset; // This ensures that iWidthExt is a factor-of-n number, assuming BDMVR_SIMD_IF_FACTOR is equal to n

  PelUnitBuf predBufExt[2] = { (PelUnitBuf(pu.chromaFormat, PelBuf(m_filteredBlock[3][REF_PIC_LIST_0][0], BDMVR_BUF_STRIDE, iWidthExt, iHeightExt))),
    (PelUnitBuf(pu.chromaFormat, PelBuf(m_filteredBlock[3][REF_PIC_LIST_1][0], BDMVR_BUF_STRIDE, iWidthExt, iHeightExt))) };

  Mv mvTopLeft[2] = { mvInitial[0] - Mv((BDMVR_INTME_RANGE << MV_FRACTIONAL_BITS_INTERNAL), (BDMVR_INTME_RANGE << MV_FRACTIONAL_BITS_INTERNAL)),
                      mvInitial[1] - Mv((BDMVR_INTME_RANGE << MV_FRACTIONAL_BITS_INTERNAL), (BDMVR_INTME_RANGE << MV_FRACTIONAL_BITS_INTERNAL)) };
#endif
  for (int y = puPos.y, yStart = 0; y < (puPos.y + pu.lumaSize().height); y = y + dy, yStart = yStart + dy)
  {
    for (int x = puPos.x, xStart = 0; x < (puPos.x + pu.lumaSize().width); x = x + dx, xStart = xStart + dx)
    {
#if JVET_X0049_BDMVR_SW_OPT
      subPu.UnitArea::operator=(UnitArea(pu.chromaFormat, Area(x, y, dx, dy)));

      minCost = std::numeric_limits<Distortion>::max();

      // Pre-interpolation
      xBDMVRFillBlkPredPelBuffer(subPu, refPic0, mvTopLeft[0], predBufExt[0], pu.cs->slice->clpRng(COMPONENT_Y));
      xBDMVRFillBlkPredPelBuffer(subPu, refPic1, mvTopLeft[1], predBufExt[1], pu.cs->slice->clpRng(COMPONENT_Y));
      
      if (adaptRange)
      {
        minCost = xBDMVRMvIntPelFullSearch<true, true>(mvOffset, minCost, mvInitial,
          maxSearchRound,
          adaptiveSearchRangeHor, adaptiveSearchRangeVer,
#if JVET_X0056_DMVD_EARLY_TERMINATION
          true,
#elif JVET_X0049_ADAPT_DMVR
          pu.bmMergeFlag,
#else
          false,
#endif
          earlyTerminateTh, cDistParam,
          pelBuffer, BDMVR_BUF_STRIDE);
      }
      else
      {
        minCost = xBDMVRMvIntPelFullSearch<false, true>(mvOffset, minCost, mvInitial,
          maxSearchRound,
          adaptiveSearchRangeHor, adaptiveSearchRangeVer,
#if JVET_X0056_DMVD_EARLY_TERMINATION
          true,
#elif JVET_X0049_ADAPT_DMVR
          pu.bmMergeFlag,
#else
          false,
#endif
          earlyTerminateTh, cDistParam,
          pelBuffer, BDMVR_BUF_STRIDE);
      }
      if (minCost >= earlyTerminateTh)
      {
        int bestOffsetIdx = (mvOffset.getVer() + BDMVR_INTME_RANGE) * BDMVR_INTME_STRIDE + (mvOffset.getHor() + BDMVR_INTME_RANGE);
        mvOffset <<= MV_FRACTIONAL_BITS_INTERNAL;

        mvFinal[0] = mvInitial[0] + mvOffset;
        mvFinal[1] = mvInitial[1] - mvOffset;
        minCost = m_sadEnlargeArrayBilMrg[bestOffsetIdx];
        Distortion tmpCost = getDecoderSideDerivedMvCost(mvInitial[0], mvFinal[0], BDMVR_INTME_RANGE + 1, DECODER_SIDE_MV_WEIGHT);
        if (minCost >= tmpCost)
        {
          minCost += tmpCost;
          minCost = xBDMVRMvSquareSearch<true>(mvFinal, minCost/*std::numeric_limits<Distortion>::max()*/, subPu, mvInitial, 2, MV_FRACTIONAL_BITS_INTERNAL - 1, false, true);
        }
      }
      else
      {
        mvOffset <<= MV_FRACTIONAL_BITS_INTERNAL;

        mvFinal[0] = mvInitial[0] + mvOffset;
        mvFinal[1] = mvInitial[1] - mvOffset;
      }
#else
      subPu.UnitArea::operator=(UnitArea(pu.chromaFormat, Area(x, y, dx, dy)));
      Distortion minCost = std::numeric_limits<Distortion>::max();
      bool       bUseMR = subPu.lumaSize().area() > 64;
      Mv         mvInitial[2] = { pu.mv[0], pu.mv[1] };
      Mv         mvFinal[2] = { pu.mv[0], pu.mv[1] };
      const int subPuBufOffset = 0;  // will do interpolation inside search

      minCost = xBDMVRMvIntPelFullSearch( mvFinal, minCost, subPu, mvInitial, BDMVR_INTME_SQUARE_SEARCH_MAX_NUM_ITERATIONS, MV_FRACTIONAL_BITS_INTERNAL, bUseMR, subPuBufOffset );
      minCost = (minCost < dx * dy) ? 0 : std::numeric_limits<Distortion>::max();
      minCost = xBDMVRMvSquareSearch( mvFinal, minCost, subPu, mvInitial, 2, MV_FRACTIONAL_BITS_INTERNAL - 1, false,  true);
#endif

      m_bdmvrSubPuMvBuf[REF_PIC_LIST_0][subPuIdx] = mvFinal[0];
      m_bdmvrSubPuMvBuf[REF_PIC_LIST_1][subPuIdx] = mvFinal[1];
      subPuIdx++;
    }
    subPuIdx += dmvrSubPuStrideIncr;
  }

  pu.mv[0] = puOrgMv[0];
  pu.mv[1] = puOrgMv[1];
  return true;
}

void InterPrediction::xBDMVRFillBlkPredPelBuffer(const PredictionUnit& pu, const Picture& refPic, const Mv &_mv, PelUnitBuf &dstBuf, const ClpRng& clpRng)
{
  const ComponentID compID = COMPONENT_Y;
  const CPelBuf     refBuf = refPic.getRecoBuf(refPic.blocks[compID]);

  const int lumaShift = 2 + MV_FRACTIONAL_BITS_DIFF;
  const int horShift = (lumaShift + ::getComponentScaleX(compID, pu.chromaFormat));
  const int verShift = (lumaShift + ::getComponentScaleY(compID, pu.chromaFormat));

  Mv mv(_mv);
  clipMv(mv, pu.lumaPos(), pu.lumaSize(), *pu.cu->cs->sps, *pu.cu->cs->pps);
  const int xInt  = mv.getHor() >> horShift;
  const int yInt  = mv.getVer() >> verShift;
  const int xFrac = mv.getHor() & ((1 << horShift) - 1);
  const int yFrac = mv.getVer() & ((1 << verShift) - 1);

  const Pel* ref       = refBuf.bufAt(pu.blocks[compID].pos().offset(xInt, yInt));
        Pel* dst       = dstBuf.bufs[compID].buf;
        int  refStride = refBuf.stride;
        int  dstStride = dstBuf.bufs[compID].stride;
        int  bw        = (int)dstBuf.bufs[compID].width;
        int  bh        = (int)dstBuf.bufs[compID].height;

  const int  nFilterIdx   = 0;
  const bool useAltHpelIf = pu.cu->imv == IMV_HPEL;
  const bool biMCForDMVR  = true;

  if ( yFrac == 0 )
  {
    m_if.filterHor( compID, (Pel*) ref, refStride, dst, dstStride, bw, bh, xFrac, false/*rndRes=!bi*/,
                    pu.chromaFormat, pu.cu->slice->clpRng(compID), biMCForDMVR, biMCForDMVR, useAltHpelIf );
  }
  else if ( xFrac == 0 )
  {
    m_if.filterVer( compID, (Pel*) ref, refStride, dst, dstStride, bw, bh, yFrac, true, false/*rndRes=!bi*/,
        pu.chromaFormat, pu.cu->slice->clpRng(compID), biMCForDMVR, biMCForDMVR, useAltHpelIf );
  }
  else
  {
#if IF_12TAP
    int vFilterSize = isLuma(compID) ? (nFilterIdx == 1 ? NTAPS_BILINEAR : NTAPS_LUMA(0)) : NTAPS_CHROMA;
#else
    int vFilterSize = isLuma(compID) ? (nFilterIdx == 1 ? NTAPS_BILINEAR : NTAPS_LUMA) : NTAPS_CHROMA;
#endif
    if (biMCForDMVR)
    {
      vFilterSize = NTAPS_BILINEAR;
    }
    PelBuf tmpBuf = PelBuf(m_filteredBlockTmp[0][compID], Size(bw + 2 * BDMVR_INTME_RANGE, bh + 2 * BDMVR_INTME_RANGE));

    m_if.filterHor( compID, (Pel*)ref - ((vFilterSize>>1) -1)*refStride, refStride, tmpBuf.buf, tmpBuf.stride, bw, bh+vFilterSize-1, xFrac,
                    false, pu.chromaFormat, pu.cu->slice->clpRng(compID), biMCForDMVR, biMCForDMVR, useAltHpelIf );
    JVET_J0090_SET_CACHE_ENABLE( false );
    m_if.filterVer( compID, tmpBuf.buf + ((vFilterSize>>1) -1)*tmpBuf.stride, tmpBuf.stride, dst, dstStride, bw, bh, yFrac,
                    false, false/*rndRes=!bi*/, pu.chromaFormat, pu.cu->slice->clpRng(compID), biMCForDMVR, biMCForDMVR, useAltHpelIf );
    JVET_J0090_SET_CACHE_ENABLE( true );
  }
}

#if JVET_X0049_ADAPT_DMVR
template <uint8_t dir>
#endif
void InterPrediction::xBDMVRPreInterpolation(const PredictionUnit& pu, const Mv (&mvCenter)[2], bool doPreInterpolationFP, bool doPreInterpolationHP)
{
  if (doPreInterpolationFP)
  {
    for (uint32_t refList = 0; refList < NUM_REF_PIC_LIST_01; refList++)
    {
#if JVET_X0049_ADAPT_DMVR
      if (!(dir & (1 << refList)))
      {
        continue;
      }
#endif
      const Picture& refPic  = *pu.cu->slice->getRefPic((RefPicList)refList, pu.refIdx[refList])->unscaledPic;

      int dstStride    = MAX_CU_SIZE + ( BDMVR_INTME_RANGE << 1 ) + ( BDMVR_SIMD_IF_FACTOR - 2 );
      int iWidthExt    = (int)pu.lwidth () + ( BDMVR_INTME_RANGE << 1 );
      int iHeightExt   = (int)pu.lheight() + ( BDMVR_INTME_RANGE << 1 );
      int iWidthOffset = BDMVR_SIMD_IF_FACTOR - ( iWidthExt & ( BDMVR_SIMD_IF_FACTOR - 1 ) );
      iWidthOffset    &= ( BDMVR_SIMD_IF_FACTOR - 1 );
      iWidthExt       += iWidthOffset; // This ensures that iWidthExt is a factor-of-n number, assuming BDMVR_SIMD_IF_FACTOR is equal to n

      Mv mv = mvCenter[refList] - Mv((BDMVR_INTME_RANGE << MV_FRACTIONAL_BITS_INTERNAL), (BDMVR_INTME_RANGE << MV_FRACTIONAL_BITS_INTERNAL));
      PelUnitBuf predBuf = ( PelUnitBuf(pu.chromaFormat, PelBuf(m_filteredBlock[3][refList][0], dstStride, iWidthExt, iHeightExt ) ) );
      xBDMVRFillBlkPredPelBuffer(pu, refPic, mv, predBuf, pu.cs->slice->clpRng(COMPONENT_Y));
    }
  }

  if (doPreInterpolationHP)
  {
    const int offset = 0 - ( 1 << ( MV_FRACTIONAL_BITS_INTERNAL - 1 ) );
    const Mv  cPhaseOffset[3] = { Mv( offset , 0 ), Mv( offset, offset ), Mv( 0 , offset ) };

    for (int refList = 0; refList < NUM_REF_PIC_LIST_01 ; refList++)
    {
#if JVET_X0049_ADAPT_DMVR
      if (!(dir & (1 << refList)))
      {
        continue;
      }
#endif
      const Picture& refPic = *pu.cu->slice->getRefPic((RefPicList)refList, pu.refIdx[refList])->unscaledPic;

      for (int phaseIdx = 0 ; phaseIdx < 3 ; phaseIdx++)
      {
        int iRefStride   = MAX_CU_SIZE  + ( BDMVR_INTME_RANGE << 1 ) + ( BDMVR_SIMD_IF_FACTOR - 2 );
        int iWidthExt    = (int)pu.lwidth () + 1 - (   phaseIdx  >> 1);
        int iHeightExt   = (int)pu.lheight() + 1 - ((2-phaseIdx) >> 1);
        int iWidthOffset = BDMVR_SIMD_IF_FACTOR - ( iWidthExt & ( BDMVR_SIMD_IF_FACTOR - 1 ) );
        iWidthOffset    &= ( BDMVR_SIMD_IF_FACTOR - 1 );
        iWidthExt       += iWidthOffset; // This ensures that iWidthExt is a factor-of-n number, assuming BDMVR_SIMD_IF_FACTOR is equal to n

        Mv mv = mvCenter[refList] + cPhaseOffset[phaseIdx];
        PelUnitBuf predBuf = PelUnitBuf( pu.chromaFormat, PelBuf( m_filteredBlock[phaseIdx][refList][0], iRefStride, iWidthExt, iHeightExt ) );
        xBDMVRFillBlkPredPelBuffer( pu, refPic, mv, predBuf, pu.cs->slice->clpRng(COMPONENT_Y) );
      }
    }
  }
}

#if JVET_X0049_BDMVR_SW_OPT
template <bool adaptRange, bool useHadamard>
Distortion InterPrediction::xBDMVRMvIntPelFullSearch(Mv&mvOffset, Distortion curBestCost, const Mv(&initialMv)[2], const int32_t maxSearchRounds, const int maxHorOffset, const int maxVerOffset, const bool earlySkip, const Distortion earlyTerminateTh, DistParam &cDistParam, Pel* pelBuffer[2], const int stride)
{
  // check initial cost  
  mvOffset.setZero();
  cDistParam.org.buf = pelBuffer[0];
  cDistParam.cur.buf = pelBuffer[1];

#if FULL_NBIT
  if (useHadamard)
  {
    curBestCost = cDistParam.distFunc(cDistParam) >> 1;  // magic shift, benefit for early terminate
  }
  else
  {
    int32_t precisionAdj = cDistParam.bitDepth > 8 ? cDistParam.bitDepth - 8 : 0;
    curBestCost = cDistParam.distFunc(cDistParam) >> precisionAdj;
  }
#else
  curBestCost = cDistParam.distFunc(cDistParam);
#endif

  m_sadEnlargeArrayBilMrg[BDMVR_INTME_CENTER] = curBestCost;
  curBestCost = curBestCost - (curBestCost >> 2);  // cost tuning

  if (curBestCost < earlyTerminateTh)
  {
    return curBestCost;
  }

  Distortion tmCost = MAX_UINT64;
  Distortion prevMinCost = MAX_UINT64;

  for (int searchPrio = 1; searchPrio < maxSearchRounds; searchPrio++)
  {
    prevMinCost = curBestCost;
    for (int currIdx = 0; currIdx < m_searchEnlargeOffsetNum[searchPrio]; currIdx++)
    {
      tmCost = 0;
      int horOffset = m_searchEnlargeOffsetBilMrg[searchPrio][currIdx].getHor();
      int verOffset = m_searchEnlargeOffsetBilMrg[searchPrio][currIdx].getVer();
      int searchOffsetIdx = m_searchEnlargeOffsetToIdx[searchPrio][currIdx];

      if (adaptRange)
      {
        if (abs(horOffset) > maxHorOffset || abs(verOffset) > maxVerOffset)
        {
          continue;
        }
      }
        
      int bufOffset = verOffset * stride + horOffset;
      cDistParam.org.buf = pelBuffer[0] + bufOffset;
      cDistParam.cur.buf = pelBuffer[1] - bufOffset;

#if FULL_NBIT
      if (useHadamard)
      {
        m_sadEnlargeArrayBilMrg[searchOffsetIdx] = cDistParam.distFunc(cDistParam) >> 1;  // magic shift, benefit for early terminate
      }
      else
      {
        int32_t precisionAdj = cDistParam.bitDepth > 8 ? cDistParam.bitDepth - 8 : 0;
        m_sadEnlargeArrayBilMrg[searchOffsetIdx] = cDistParam.distFunc(cDistParam) >> precisionAdj;
      }
#else
      m_sadEnlargeArrayBilMrg[searchOffsetIdx] = cDistParam.distFunc(cDistParam);
#endif

      tmCost += m_sadEnlargeArrayBilMrg[searchOffsetIdx];
      tmCost += (m_sadEnlargeArrayBilMrg[searchOffsetIdx] >> m_costShiftBilMrg1[searchOffsetIdx]);
      tmCost += (m_sadEnlargeArrayBilMrg[searchOffsetIdx] >> m_costShiftBilMrg2[searchOffsetIdx]);

      if (tmCost < curBestCost)
      {
        mvOffset = Mv(horOffset, verOffset);
        curBestCost = tmCost;
      }
    }

    if (curBestCost < earlyTerminateTh)
    {
      break;
    }
    if (earlySkip && searchPrio > 1 && prevMinCost - curBestCost < earlyTerminateTh)
    {
      break;
    }
  }
  return curBestCost;
}
#else
Distortion InterPrediction::xBDMVRMvIntPelFullSearch(Mv(&curBestMv)[2], Distortion curBestCost, PredictionUnit& pu, const Mv(&initialMv)[2], int32_t maxSearchRounds, int32_t searchStepShift, bool useMR, const int subPuBufOffset)
{
  bool doPreInterpolation = true;
  bool useHadamard = true;          // STAD cost function
  useMR = false;                    // STAD cost function
  const int adaptiveSearchRangeHor = (pu.lwidth() >> 1) < BDMVR_INTME_RANGE ? (pu.lwidth() >> 1) : BDMVR_INTME_RANGE;
  const int adaptiveSearchRangeVer = (pu.lheight() >> 1) < BDMVR_INTME_RANGE ? (pu.lheight() >> 1) : BDMVR_INTME_RANGE;

  // Calculate TM cost of initial MVs, if it is not set
  if (curBestCost == std::numeric_limits<Distortion>::max())
  {
    curBestCost = xBDMVRGetMatchingError( pu, curBestMv, subPuBufOffset, useHadamard, useMR,
                                          doPreInterpolation, searchStepShift, curBestMv, initialMv, -1 );
  }

  for (int i = 0; i < BDMVR_INTME_AREA; i++)
  {
    m_sadEnlargeArrayBilMrg[i] = MAX_UINT64;
  }
  m_sadEnlargeArrayBilMrg[BDMVR_INTME_CENTER] = curBestCost;
  curBestCost = curBestCost - (curBestCost >> 2);  // cost tuning
  const Distortion earlyTerminateTh = pu.lumaSize().area();
  Distortion tmCost = MAX_UINT64;
#if JVET_X0056_DMVD_EARLY_TERMINATION
  Distortion prevMinCost = MAX_UINT64;
#endif

  for( int searchPrio = 0 ; searchPrio < BDMVR_INTME_FULL_SEARCH_MAX_NUM_ITERATIONS; searchPrio++ )
  {
    if( curBestCost < earlyTerminateTh )
    {
      break;
    }

#if JVET_X0056_DMVD_EARLY_TERMINATION
    prevMinCost = curBestCost;
#endif
    for (int searchOffsetIdx = 0; searchOffsetIdx < BDMVR_INTME_AREA; searchOffsetIdx++)
    {
      tmCost = 0;
      if( m_searchPriorityBilMrg[searchOffsetIdx] != searchPrio )
      {
        continue;
      }
      // adaptive search area base on block dimension
      if( m_searchEnlargeOffsetBilMrg[searchOffsetIdx].getAbsVer() > adaptiveSearchRangeVer )
      {
        continue;
      }
      if( m_searchEnlargeOffsetBilMrg[searchOffsetIdx].getAbsHor() > adaptiveSearchRangeHor )
      {
        continue;
      }

      Mv mvOffset(m_searchEnlargeOffsetBilMrg[searchOffsetIdx].getHor() << searchStepShift, m_searchEnlargeOffsetBilMrg[searchOffsetIdx].getVer() << searchStepShift);
      Mv mvCand[2] = {initialMv[0] + mvOffset, initialMv[1] - mvOffset};

      if ( m_sadEnlargeArrayBilMrg[searchOffsetIdx] == MAX_UINT64 )
      {
        m_sadEnlargeArrayBilMrg[searchOffsetIdx] = xBDMVRGetMatchingError( pu, mvCand, subPuBufOffset, useHadamard, useMR,
                                                                             doPreInterpolation, searchStepShift, initialMv, initialMv, -1 );
      }
      tmCost += m_sadEnlargeArrayBilMrg[searchOffsetIdx];
      tmCost += (m_sadEnlargeArrayBilMrg[searchOffsetIdx] >> m_costShiftBilMrg1[searchOffsetIdx]);
      tmCost += (m_sadEnlargeArrayBilMrg[searchOffsetIdx] >> m_costShiftBilMrg2[searchOffsetIdx]);

      if( tmCost < curBestCost )
      {
        curBestCost  = tmCost;
        curBestMv[0] = mvCand[0];
        curBestMv[1] = mvCand[1];
      }
    }
#if JVET_X0056_DMVD_EARLY_TERMINATION
    if (searchPrio > 1 && prevMinCost - curBestCost < earlyTerminateTh)
    {
      break;
    }
#endif
  }

  return curBestCost;
}
#endif

#if JVET_X0049_BDMVR_SW_OPT
template<bool hPel>
#endif
Distortion InterPrediction::xBDMVRMvSquareSearch(Mv (&curBestMv)[2], Distortion curBestCost, PredictionUnit& pu, const Mv (&initialMv)[2], int32_t maxSearchRounds, int32_t searchStepShift, bool useMR, bool useHadmard)
{
#if !JVET_X0049_BDMVR_SW_OPT
  if (curBestCost == 0)
  {
    return 0;
  }
#endif
  static const Mv   cSearchOffset[8] = { Mv( -1 , 1 ) , Mv( 0 , 1 ) , Mv(  1 ,  1 ) , Mv(  1 ,  0 ) , Mv(  1 , -1 ) , Mv(  0 , -1 ) , Mv( -1 , -1 ) , Mv( -1 , 0 )  };
               int  nDirectStart     = 0;
               int  nDirectEnd       = 7;
         const int  nDirectRounding  = 8;
         const int  nDirectMask      = 0x07;
               bool doPreInterpolation = searchStepShift == MV_FRACTIONAL_BITS_INTERNAL;

  // Calculate TM cost of initial MVs, if it is not set
  if (curBestCost == std::numeric_limits<Distortion>::max())
  {
    CHECK(searchStepShift < MV_FRACTIONAL_BITS_INTERNAL - 1, "this is not possible");
#if JVET_X0049_BDMVR_SW_OPT
    if (hPel)
    {
      doPreInterpolation = true;
      Distortion tmCost = getDecoderSideDerivedMvCost(initialMv[0], curBestMv[0], BDMVR_INTME_RANGE + (MV_FRACTIONAL_BITS_INTERNAL - searchStepShift), DECODER_SIDE_MV_WEIGHT);
      curBestCost = xBDMVRGetMatchingError(pu, curBestMv, useMR, useHadmard);
#else
    if (searchStepShift == MV_FRACTIONAL_BITS_INTERNAL - 1)
    {
      doPreInterpolation = true;
      Distortion tmCost = getDecoderSideDerivedMvCost(initialMv[0], curBestMv[0], BDMVR_INTME_RANGE + (MV_FRACTIONAL_BITS_INTERNAL - searchStepShift), DECODER_SIDE_MV_WEIGHT);
      curBestCost = xBDMVRGetMatchingError( pu, curBestMv, 0/*subPuOffset*/, useHadmard, useMR,
                                            doPreInterpolation, MV_FRACTIONAL_BITS_INTERNAL, curBestMv, curBestMv, -1 );
#endif
      if( curBestCost < tmCost )
      {
        return curBestCost;
      }

      curBestCost += tmCost;
    }
    else
    {
#if JVET_X0049_ADAPT_DMVR
      curBestCost = xBDMVRGetMatchingError<3>(pu, curBestMv, 0/*subPuOffset*/, useHadmard, useMR,
        doPreInterpolation, searchStepShift, curBestMv, initialMv, -1);
#else
      curBestCost = xBDMVRGetMatchingError( pu, curBestMv, 0/*subPuOffset*/, useHadmard, useMR,
                                            doPreInterpolation, searchStepShift, curBestMv, initialMv, -1 );
#endif
    }
  }

  Distortion localCostArray[9] = { std::numeric_limits<Distortion>::max(), std::numeric_limits<Distortion>::max(), std::numeric_limits<Distortion>::max(),
                                   std::numeric_limits<Distortion>::max(), std::numeric_limits<Distortion>::max(), std::numeric_limits<Distortion>::max(),
                                   std::numeric_limits<Distortion>::max(), std::numeric_limits<Distortion>::max(), curBestCost };

  // Iterative search process
  for( uint32_t uiRound = 0 ; uiRound < maxSearchRounds ; uiRound++ )
  {
    int nBestDirect    = -1;
    Mv  mvCurCenter[2] = {curBestMv[0], curBestMv[1]};
    doPreInterpolation |= (searchStepShift == MV_FRACTIONAL_BITS_INTERNAL - 1);

    for( int nIdx = nDirectStart ; nIdx <= nDirectEnd ; nIdx++ )
    {
      int nDirect = ( nIdx + nDirectRounding ) & nDirectMask;

      Mv mvOffset(cSearchOffset[nDirect].getHor() << searchStepShift, cSearchOffset[nDirect].getVer() << searchStepShift);

#if JVET_X0049_BDMVR_SW_OPT
      if(hPel && uiRound > 0)
#else
      if (searchStepShift == MV_FRACTIONAL_BITS_INTERNAL - 1 && uiRound > 0)
#endif
      {
        if( ( nDirect % 2 ) == 0 )
        {
          continue;
        }
      }
      Mv mvCand[2] = {mvCurCenter[0] + mvOffset, mvCurCenter[1] - mvOffset};
#if JVET_X0049_BDMVR_SW_OPT
      if(!hPel)
#else
      if (searchStepShift == MV_FRACTIONAL_BITS_INTERNAL)
#endif
      {
        int currentIdx = BDMVR_INTME_CENTER + ((mvCand[0] -initialMv[0]).hor >> searchStepShift) +
                                              ((mvCand[0] -initialMv[0]).ver >> searchStepShift) * BDMVR_INTME_STRIDE;
        if( currentIdx < 0 || currentIdx >= BDMVR_INTME_AREA )
        {
          continue;
        }
      }

      Distortion tmCost = getDecoderSideDerivedMvCost(initialMv[0], mvCand[0], BDMVR_INTME_RANGE + (MV_FRACTIONAL_BITS_INTERNAL - searchStepShift), DECODER_SIDE_MV_WEIGHT);

      if (tmCost > curBestCost)
      {
        localCostArray[nDirect] = 2 * tmCost;
        continue;
      }

#if JVET_X0049_ADAPT_DMVR
      tmCost += xBDMVRGetMatchingError<3>(pu, mvCand, 0/*subPuOffset*/, useHadmard, useMR,
        doPreInterpolation, searchStepShift, mvCurCenter, initialMv, nDirect);
#else
      tmCost += xBDMVRGetMatchingError( pu, mvCand, 0/*subPuOffset*/, useHadmard, useMR,
                                        doPreInterpolation, searchStepShift, mvCurCenter, initialMv, nDirect );
#endif
      localCostArray[nDirect] = tmCost;
#if JVET_X0049_BDMVR_SW_OPT
      if(hPel && uiRound > 0)
#else
      if (searchStepShift == MV_FRACTIONAL_BITS_INTERNAL - 1 && uiRound > 0)
#endif
      {
        continue;
      }

      if( tmCost < curBestCost )
      {
        nBestDirect  = nDirect;
        curBestCost  = tmCost;
        curBestMv[0] = mvCand[0];
        curBestMv[1] = mvCand[1];
      }
    }

    if( nBestDirect == -1 )
    {
      break;
    }

    int nStep    = 2 - ( nBestDirect & 0x01 );
    nDirectStart = nBestDirect - nStep;
    nDirectEnd   = nBestDirect + nStep;

    if ((uiRound + 1) < maxSearchRounds)
    {
      xBDMVRUpdateSquareSearchCostLog(localCostArray, nBestDirect);
    }
  }

#if JVET_X0049_BDMVR_SW_OPT
  if(!hPel)
#else
  if (searchStepShift == MV_FRACTIONAL_BITS_INTERNAL)
#endif
  {
    return curBestCost;
  }

  // Model-based fractional MVD optimization
  Mv mvDiff = curBestMv[0] - initialMv[0];
  if (localCostArray[8] > 0 && localCostArray[8] == curBestCost && mvDiff.getAbsHor() != (BDMVR_INTME_RANGE << MV_FRACTIONAL_BITS_INTERNAL) && mvDiff.getAbsVer() != (BDMVR_INTME_RANGE << MV_FRACTIONAL_BITS_INTERNAL))
  {
    uint64_t sadbuffer[5];
    sadbuffer[0] = (uint64_t)localCostArray[8]; // center
    sadbuffer[1] = (uint64_t)localCostArray[7]; // left
    sadbuffer[2] = (uint64_t)localCostArray[5]; // above
    sadbuffer[3] = (uint64_t)localCostArray[3]; // right
    sadbuffer[4] = (uint64_t)localCostArray[1]; // bottom

    int32_t tempDeltaMv[2] = {0, 0};
    xSubPelErrorSrfc(sadbuffer, tempDeltaMv);

    curBestMv[0] += Mv(tempDeltaMv[0], tempDeltaMv[1]);
    curBestMv[1] -= Mv(tempDeltaMv[0], tempDeltaMv[1]);
  }

  return curBestCost;
}

#if JVET_X0049_ADAPT_DMVR
template <uint8_t dir>
Distortion InterPrediction::xBDMVRMvOneTemplateHPelSquareSearch(Mv(&curBestMv)[2], Distortion curBestCost, PredictionUnit& pu,
  const Mv(&initialMv)[2], int32_t maxSearchRounds, int32_t searchStepShift,
  bool useMR, bool useHadmard)
{
  if (curBestCost == 0)
  {
    return 0;
  }

  static const Mv   cSearchOffset[8] = { Mv(-1 , 1) , Mv(0 , 1) , Mv(1 ,  1) , Mv(1 ,  0) , Mv(1 , -1) , Mv(0 , -1) , Mv(-1 , -1) , Mv(-1 , 0) };
  int  nDirectStart = 0;
  int  nDirectEnd = 7;
  const int  nDirectRounding = 8;
  const int  nDirectMask = 0x07;
  bool doPreInterpolation = searchStepShift == MV_FRACTIONAL_BITS_INTERNAL;
  const int curRefList = (dir >> 1);
  const int templateRefList = 1 - curRefList;
  // Calculate TM cost of initial MVs, if it is not set
  if (curBestCost == std::numeric_limits<Distortion>::max())
  {
    CHECK(searchStepShift < MV_FRACTIONAL_BITS_INTERNAL - 1, "this is not possible");
    Distortion tmCost = getDecoderSideDerivedMvCost(initialMv[curRefList], curBestMv[curRefList], BDMVR_INTME_RANGE + (MV_FRACTIONAL_BITS_INTERNAL - searchStepShift), DECODER_SIDE_MV_WEIGHT);
    curBestCost = xBDMVRGetMatchingError(pu, curBestMv, useMR, useHadmard);
    if (curBestCost < tmCost)
    {
      return curBestCost;
    }

    curBestCost += tmCost;
  }

  Distortion localCostArray[9] = { std::numeric_limits<Distortion>::max(), std::numeric_limits<Distortion>::max(), std::numeric_limits<Distortion>::max(),
    std::numeric_limits<Distortion>::max(), std::numeric_limits<Distortion>::max(), std::numeric_limits<Distortion>::max(),
    std::numeric_limits<Distortion>::max(), std::numeric_limits<Distortion>::max(), curBestCost };

  // Iterative search process
  for (uint32_t uiRound = 0; uiRound < maxSearchRounds; uiRound++)
  {
    int nBestDirect = -1;
    Mv  mvCurCenter[2] = { curBestMv[0], curBestMv[1] };
    doPreInterpolation |= (searchStepShift == MV_FRACTIONAL_BITS_INTERNAL - 1);

    for (int nIdx = nDirectStart; nIdx <= nDirectEnd; nIdx++)
    {
      int nDirect = (nIdx + nDirectRounding) & nDirectMask;

      Mv mvOffset(cSearchOffset[nDirect].getHor() << searchStepShift, cSearchOffset[nDirect].getVer() << searchStepShift);

      if (uiRound > 0)
      {
        if ((nDirect % 2) == 0)
        {
          continue;
        }
      }
      Mv mvCand[2] = { mvCurCenter[0] + mvOffset, mvCurCenter[1] - mvOffset };
      mvCand[templateRefList] = initialMv[templateRefList];
      Distortion tmCost = getDecoderSideDerivedMvCost(initialMv[curRefList], mvCand[curRefList], BDMVR_INTME_RANGE + (MV_FRACTIONAL_BITS_INTERNAL - searchStepShift), DECODER_SIDE_MV_WEIGHT);

      if (tmCost > curBestCost)
      {
        localCostArray[nDirect] = 2 * tmCost;
        continue;
      }

      tmCost += xBDMVRGetMatchingError<dir>(pu, mvCand, 0/*subPuOffset*/, useHadmard, useMR,
        doPreInterpolation, searchStepShift, mvCurCenter, initialMv, nDirect);
      localCostArray[nDirect] = tmCost;
      if (uiRound > 0)
      {
        continue;
      }

      if (tmCost < curBestCost)
      {
        nBestDirect = nDirect;
        curBestCost = tmCost;
        curBestMv[0] = mvCand[0];
        curBestMv[1] = mvCand[1];
      }
    }

    if (nBestDirect == -1)
    {
      break;
    }

    int nStep = 2 - (nBestDirect & 0x01);
    nDirectStart = nBestDirect - nStep;
    nDirectEnd = nBestDirect + nStep;

    if ((uiRound + 1) < maxSearchRounds)
    {
      xBDMVRUpdateSquareSearchCostLog(localCostArray, nBestDirect);
    }
  }

  CHECK(curBestMv[templateRefList] != initialMv[templateRefList], "this is not possible");
  // Model-based fractional MVD optimization
  Mv mvDiff = curBestMv[curRefList] - initialMv[curRefList];
  if (localCostArray[8] > 0 && localCostArray[8] == curBestCost && mvDiff.getAbsHor() != (BDMVR_INTME_RANGE << MV_FRACTIONAL_BITS_INTERNAL) && mvDiff.getAbsVer() != (BDMVR_INTME_RANGE << MV_FRACTIONAL_BITS_INTERNAL))
  {
    uint64_t sadbuffer[5];
    sadbuffer[0] = (uint64_t)localCostArray[8]; // center
    sadbuffer[1] = (uint64_t)localCostArray[7]; // left
    sadbuffer[2] = (uint64_t)localCostArray[5]; // above
    sadbuffer[3] = (uint64_t)localCostArray[3]; // right
    sadbuffer[4] = (uint64_t)localCostArray[1]; // bottom

    int32_t tempDeltaMv[2] = { 0, 0 };
    xSubPelErrorSrfc(sadbuffer, tempDeltaMv);

    if (dir == 1)
      curBestMv[0] += Mv(tempDeltaMv[0], tempDeltaMv[1]);
    else
      curBestMv[1] -= Mv(tempDeltaMv[0], tempDeltaMv[1]);
  }

  return curBestCost;
}
#endif

#if JVET_X0049_BDMVR_SW_OPT
Distortion InterPrediction::xBDMVRGetMatchingError(const PredictionUnit& pu, const Mv(&mv)[2], bool useMR, bool useHadmard)
#else
Distortion InterPrediction::xBDMVRGetMatchingError(const PredictionUnit& pu, const Mv(&mv)[2], bool useMR)
#endif
{
  // Fill L0'a and L1's prediction blocks
#if JVET_X0049_BDMVR_SW_OPT
        Pel*     pelBuffer[2] = { m_filteredBlock[3][REF_PIC_LIST_0][0] + BDMVR_CENTER_POSITION, m_filteredBlock[3][REF_PIC_LIST_1][0] + BDMVR_CENTER_POSITION };
        const SizeType stride = BDMVR_BUF_STRIDE;
#else
        Pel*     pelBuffer[2] = { m_filteredBlock[3][REF_PIC_LIST_0][0], m_filteredBlock[3][REF_PIC_LIST_1][0] };
  const SizeType stride       = pu.lwidth();
#endif
  PelUnitBuf     predBuf[2]   = { PelUnitBuf(pu.chromaFormat, PelBuf(pelBuffer[REF_PIC_LIST_0], stride, pu.lwidth(), pu.lheight())),
                                  PelUnitBuf(pu.chromaFormat, PelBuf(pelBuffer[REF_PIC_LIST_1], stride, pu.lwidth(), pu.lheight())) };

  for (uint32_t refList = 0; refList < NUM_REF_PIC_LIST_01; refList++)
  {
#if JVET_X0083_BM_AMVP_MERGE_MODE
    if (pu.amvpMergeModeFlag[1 - refList])
    {
      continue;
    }
#endif
    const Picture&   refPic  = *pu.cu->slice->getRefPic((RefPicList)refList, pu.refIdx[refList])->unscaledPic;
    xBDMVRFillBlkPredPelBuffer( pu, refPic, mv[refList] , predBuf[refList], pu.cs->slice->clpRng(COMPONENT_Y) );
  }

  // Compute distortion between L0'a and L1's prediction blocks
  DistParam cDistParam;
  cDistParam.applyWeight = false;
  cDistParam.useMR       = useMR;

#if JVET_X0049_BDMVR_SW_OPT
  m_pcRdCost->setDistParam(cDistParam, predBuf[0].Y(), predBuf[1].Y(), pu.cu->slice->clpRng(COMPONENT_Y).bd, COMPONENT_Y, useHadmard);
#if FULL_NBIT
  if (useHadmard)
  {
    return cDistParam.distFunc(cDistParam) >> 1;  // magic shift, benefit for early terminate
  }
  else
  {
    int32_t precisionAdj = cDistParam.bitDepth > 8 ? cDistParam.bitDepth - 8 : 0;
    return cDistParam.distFunc(cDistParam) >> precisionAdj;
  }
#else
  return cDistParam.distFunc(cDistParam);
#endif
#else
  m_pcRdCost->setDistParam( cDistParam, predBuf[0].Y(), predBuf[1].Y(), pu.cu->slice->clpRng(COMPONENT_Y).bd, COMPONENT_Y, false );
#if FULL_NBIT
  int32_t precisionAdj = cDistParam.bitDepth > 8 ? cDistParam.bitDepth - 8 : 0;
  return cDistParam.distFunc( cDistParam ) >> precisionAdj;
#else
  return cDistParam.distFunc( cDistParam );
#endif
#endif
}

#if MULTI_PASS_DMVR
#if JVET_X0049_ADAPT_DMVR
template <uint8_t dir>
#endif
Distortion InterPrediction::xBDMVRGetMatchingError(const PredictionUnit& pu, const Mv (&mv)[2], const int subPuBufOffset, bool useHadmard, bool useMR
                                                 , bool& doPreInterpolation, int32_t searchStepShift, const Mv (&mvCenter)[2]
                                                 , const Mv (&mvInitial)[2]
                                                 , int nDirect
)
{
  // Pre-interpolation
  if (doPreInterpolation)
  {
#if JVET_X0049_ADAPT_DMVR
    xBDMVRPreInterpolation<dir>(pu, mvCenter, searchStepShift == MV_FRACTIONAL_BITS_INTERNAL, searchStepShift == MV_FRACTIONAL_BITS_INTERNAL - 1);
#else
    xBDMVRPreInterpolation( pu, mvCenter, searchStepShift == MV_FRACTIONAL_BITS_INTERNAL, searchStepShift == MV_FRACTIONAL_BITS_INTERNAL - 1 );
#endif
    doPreInterpolation = false;
  }

  // Locate L0'a and L1's prediction blocks in pre-interpolation buffer
#if JVET_X0049_BDMVR_SW_OPT
  const int32_t stride = BDMVR_BUF_STRIDE;
#else
  const int32_t stride = MAX_CU_SIZE + ( BDMVR_INTME_RANGE << 1 ) + ( BDMVR_SIMD_IF_FACTOR - 2 );
#endif
  Pel* pelBuffer[2] = { nullptr, nullptr };

  if (searchStepShift == MV_FRACTIONAL_BITS_INTERNAL)
  {
    Mv mvDiff[2] = { mv[0] - mvInitial[0], mv[1] - mvInitial[1] };
    mvDiff[0]  >>= MV_FRACTIONAL_BITS_INTERNAL;
    mvDiff[1]  >>= MV_FRACTIONAL_BITS_INTERNAL;

#if JVET_X0049_ADAPT_DMVR
    if (dir == 1)
    {
      // fix template at refList1 
      CHECK(subPuBufOffset != 0, "this is not possible");
      pelBuffer[0] = m_filteredBlock[3][REF_PIC_LIST_0][0] + subPuBufOffset + BDMVR_CENTER_POSITION + mvDiff[0].getVer() * stride + mvDiff[0].getHor();
      pelBuffer[1] = m_filteredBlock[3][REF_PIC_LIST_1][0] + BDMVR_CENTER_POSITION;
    }
    else if (dir == 2)
    {
      CHECK(subPuBufOffset != 0, "this is not possible");
      pelBuffer[0] = m_filteredBlock[3][REF_PIC_LIST_0][0] + BDMVR_CENTER_POSITION;
      pelBuffer[1] = m_filteredBlock[3][REF_PIC_LIST_1][0] + subPuBufOffset + BDMVR_CENTER_POSITION + mvDiff[1].getVer() * stride + mvDiff[1].getHor();
    }
    else
    {
      pelBuffer[0] = m_filteredBlock[3][REF_PIC_LIST_0][0] + subPuBufOffset + BDMVR_CENTER_POSITION + mvDiff[0].getVer() * stride + mvDiff[0].getHor();
      pelBuffer[1] = m_filteredBlock[3][REF_PIC_LIST_1][0] + subPuBufOffset + BDMVR_CENTER_POSITION + mvDiff[1].getVer() * stride + mvDiff[1].getHor();
    }
#else
    pelBuffer[0] = m_filteredBlock[3][REF_PIC_LIST_0][0] + subPuBufOffset + ( BDMVR_INTME_RANGE + mvDiff[0].getVer() ) * stride + BDMVR_INTME_RANGE + mvDiff[0].getHor();
    pelBuffer[1] = m_filteredBlock[3][REF_PIC_LIST_1][0] + subPuBufOffset + ( BDMVR_INTME_RANGE + mvDiff[1].getVer() ) * stride + BDMVR_INTME_RANGE + mvDiff[1].getHor();
#endif
  }
  else if (searchStepShift == MV_FRACTIONAL_BITS_INTERNAL - 1)
  {
           const int32_t  cFracBufOffset[8] = { stride, stride, stride + 1, 1, 1, 0, 0, 0 };
    static const uint32_t phaseIdxList[4]   = { 1, 2, 1, 0 };

    int phaseIdx = phaseIdxList[ nDirect & 0x3 ];
#if JVET_X0049_ADAPT_DMVR 
    if (dir == 3)
    {
      pelBuffer[0] = m_filteredBlock[phaseIdx][REF_PIC_LIST_0][0] + cFracBufOffset[nDirect];
      pelBuffer[1] = m_filteredBlock[phaseIdx][REF_PIC_LIST_1][0] + cFracBufOffset[(nDirect + 4) & 0x7];
    }
    else if (dir == 1)
    {
      pelBuffer[0] = m_filteredBlock[phaseIdx][REF_PIC_LIST_0][0] + cFracBufOffset[nDirect];
      pelBuffer[1] = m_filteredBlock[3][REF_PIC_LIST_1][0] + BDMVR_CENTER_POSITION;
    }
    else
    {
      pelBuffer[0] = m_filteredBlock[3][REF_PIC_LIST_0][0] + BDMVR_CENTER_POSITION;
      pelBuffer[1] = m_filteredBlock[phaseIdx][REF_PIC_LIST_1][0] + cFracBufOffset[(nDirect + 4) & 0x7];
    }
#else
    pelBuffer[0] = m_filteredBlock[phaseIdx][REF_PIC_LIST_0][0] + cFracBufOffset[  nDirect            ];
    pelBuffer[1] = m_filteredBlock[phaseIdx][REF_PIC_LIST_1][0] + cFracBufOffset[( nDirect + 4 ) & 0x7];
#endif
  }
  else
  {
    return xBDMVRGetMatchingError(pu, mv, useMR);
  }

  PelUnitBuf predBuf[2] = { PelUnitBuf(pu.chromaFormat, PelBuf(pelBuffer[REF_PIC_LIST_0], stride, pu.lwidth(), pu.lheight())),
                            PelUnitBuf(pu.chromaFormat, PelBuf(pelBuffer[REF_PIC_LIST_1], stride, pu.lwidth(), pu.lheight())) };

  // Compute distortion between L0'a and L1's prediction blocks
  DistParam cDistParam;
  cDistParam.applyWeight = false;
  cDistParam.useMR       = useMR;

  m_pcRdCost->setDistParam( cDistParam, predBuf[0].Y(), predBuf[1].Y(), pu.cu->slice->clpRng(COMPONENT_Y).bd, COMPONENT_Y, useHadmard );
#if FULL_NBIT
  if (useHadmard)
  {
    return cDistParam.distFunc( cDistParam ) >> 1;  // magic shift, benefit for early terminate
  }
  else
  {
    int32_t precisionAdj = cDistParam.bitDepth > 8 ? cDistParam.bitDepth - 8 : 0;
    return cDistParam.distFunc( cDistParam ) >> precisionAdj;
  }
#else
  return cDistParam.distFunc( cDistParam );
#endif
}
#endif
#endif

#if MULTI_HYP_PRED
void InterPrediction::xAddHypMC(PredictionUnit& pu, PelUnitBuf& predBuf, PelUnitBuf* predBufWOBIO, const bool lumaOnly)
{
  CHECK(pu.Y().area() <= MULTI_HYP_PRED_RESTRICT_BLOCK_SIZE, "Multi Hyp: Block too small!");
  CHECK(pu.cu->geoFlag, "multi-hyp does not work with geo");
  CHECK(pu.ciipFlag, "multi-hyp does not work with intra/inter");
  CHECK(!pu.mergeFlag && pu.interDir != 3, "multihyp selected for AMVP uni prediction");

  // get prediction for current additional hypothesis
  const UnitArea unitAreaFromPredBuf(predBuf.chromaFormat, Area(Position(0, 0), predBuf.Y()));
  PelUnitBuf tempBuf = m_additionalHypothesisStorage.getBuf(unitAreaFromPredBuf);
  const auto savedAffine = pu.cu->affine;
  const auto savedIMV = pu.cu->imv;
#if INTER_LIC
  auto savedLICFlag = pu.cu->LICFlag;
#endif
  MultiHypVec savedHypVec = pu.addHypData;
  pu.addHypData.clear();
  pu.mvRefine = true;
  motionCompensation(pu, predBuf, REF_PIC_LIST_X, true, !lumaOnly, predBufWOBIO);
  pu.mvRefine = false;
#if INTER_LIC
  m_storeBeforeLIC = false;
#endif
  PredictionUnit fakePredData = pu;
  fakePredData.cu->affine = false;
  fakePredData.mergeFlag = false;
  fakePredData.mergeType = MRG_TYPE_DEFAULT_N;
  fakePredData.mmvdMergeFlag = false;
  fakePredData.ciipFlag = false;
#if MULTI_PASS_DMVR
  fakePredData.bdmvrRefine = false;
#endif

  for (int i = 0; i < savedHypVec.size(); i++)
  {
    const MultiHypPredictionData mhData = savedHypVec[i];

    // get legacy ref list and ref idx
    const auto &MHRefPics = pu.cs->slice->getMultiHypRefPicList();
    CHECK(mhData.refIdx < 0, "Multi Hyp: mhData.refIdx < 0");
    const int iRefPicList = mhData.isMrg ? mhData.refList : MHRefPics[mhData.refIdx].refList;
    const int iRefIdx = mhData.isMrg ? mhData.refIdx : MHRefPics[mhData.refIdx].refIdx;

    // construct fake object using legacy indexing
    fakePredData.interDir = iRefPicList + 1;
    fakePredData.mv[iRefPicList] = mhData.mv;
    fakePredData.refIdx[iRefPicList] = iRefIdx;
    fakePredData.refIdx[1 - iRefPicList] = -1;
#if INTER_LIC
    fakePredData.cu->LICFlag = mhData.LICFlag;
#endif
    fakePredData.cu->imv = mhData.imv;
    fakePredData.mvRefine = true;
    motionCompensation(fakePredData, tempBuf, REF_PIC_LIST_X, true, !lumaOnly);
    fakePredData.mvRefine = false;

    CHECK(mhData.weightIdx < 0, "Multi Hyp: mhData.weightIdx < 0");
    CHECK(mhData.weightIdx >= MULTI_HYP_PRED_NUM_WEIGHTS, "Multi Hyp: mhData.weightIdx >= MULTI_HYP_PRED_NUM_WEIGHTS");
    predBuf.addHypothesisAndClip(tempBuf, g_addHypWeight[mhData.weightIdx], pu.cs->slice->clpRngs(), lumaOnly);
  }
#if INTER_LIC
  pu.cu->LICFlag = savedLICFlag;
#endif
  pu.cu->imv = savedIMV;
  pu.cu->affine = savedAffine;
  pu.addHypData = savedHypVec;
}
#endif

#if JVET_X0083_BM_AMVP_MERGE_MODE
void InterPrediction::getAmvpMergeModeMergeList(PredictionUnit& pu, MvField* mvFieldAmListCommon, const int decAmvpRefIdx)
{
  RefPicList refListMerge = pu.amvpMergeModeFlag[0] ? REF_PIC_LIST_0 : REF_PIC_LIST_1;
  RefPicList refListAmvp = RefPicList(1 - refListMerge);
#if JVET_Y0129_MVD_SIGNAL_AMVP_MERGE_MODE
  for (int idx = 0; idx < pu.cu->slice->getNumRefIdx(refListAmvp) * AMVP_MAX_NUM_CANDS_MEM; idx++)
#else
  for (int idx = 0; idx < pu.cu->slice->getNumRefIdx(refListAmvp) * AMVP_MAX_NUM_CANDS; idx++)
#endif
  {
    mvFieldAmListCommon[idx] = MvField();
    mvFieldAmListCommon[MAX_NUM_AMVP_CANDS_MAX_REF + idx] = MvField();
  }
  int amvpRefIdxStart = 0;
  int amvpRefIdxEnd = pu.cu->slice->getNumRefIdx(refListAmvp);
  int decAmvpMvpIdx = -1;
  if (decAmvpRefIdx >= 0)
  {
    amvpRefIdxStart = decAmvpRefIdx;
    amvpRefIdxEnd = decAmvpRefIdx + 1;
    decAmvpMvpIdx = pu.mvpIdx[refListAmvp];
  }
#if !JVET_Y0128_NON_CTC
  const int curPoc = pu.cu->slice->getPOC();
#endif
  const bool useMR = pu.lumaSize().area() > 64;

  for (int refIdxAmvp = amvpRefIdxStart; refIdxAmvp < amvpRefIdxEnd; refIdxAmvp++)
  {
#if JVET_Y0128_NON_CTC
    if (pu.cu->slice->getAmvpMergeModeValidRefIdx(refListAmvp, refIdxAmvp) == false)
    {
      continue;
    }
    CHECK(pu.cu->slice->getRefPic(refListAmvp, refIdxAmvp)->isRefScaled(pu.cu->cs->pps), "this is not possible");
#else
    const int amvpRefPoc = pu.cu->slice->getRefPOC(refListAmvp, refIdxAmvp);
    bool findValidMergeRefPic = false;
    for (int refIdxCandMerge = 0; refIdxCandMerge < pu.cu->slice->getNumRefIdx(refListMerge); refIdxCandMerge++)
    {
      const int candMergePoc = pu.cu->slice->getRefPOC(refListMerge, refIdxCandMerge);
      if ((amvpRefPoc - curPoc) * (candMergePoc - curPoc) < 0)
      {
        findValidMergeRefPic = true;
        break;
      }
    }
    if (findValidMergeRefPic == false)
    {
      continue;
    }
#endif
    pu.refIdx[refListAmvp] = refIdxAmvp;
    AMVPInfo amvpInfo;
    PU::fillMvpCand( pu, refListAmvp, refIdxAmvp, amvpInfo
#if TM_AMVP
                   , this
#endif
    );
    MergeCtx bmMergeCtx;
    PU::getInterMergeCandidates(pu, bmMergeCtx, 0, AMVP_MERGE_MODE_MERGE_LIST_MAX_CANDS - 1);

    int bestMvpIdxLoopStart = 0;
    int bestMvpIdxLoopEnd = amvpInfo.numCand;
    if (decAmvpRefIdx >= 0)
    {
      bestMvpIdxLoopStart = decAmvpMvpIdx;
      bestMvpIdxLoopEnd = bestMvpIdxLoopStart + 1;
    }
    for (int bestMvpIdxToTest = bestMvpIdxLoopStart; bestMvpIdxToTest < bestMvpIdxLoopEnd; bestMvpIdxToTest++)
    {
#if JVET_Y0129_MVD_SIGNAL_AMVP_MERGE_MODE
      const int mvFieldMergeIdx = refIdxAmvp * AMVP_MAX_NUM_CANDS_MEM + bestMvpIdxToTest;
#else
      const int mvFieldMergeIdx = refIdxAmvp * AMVP_MAX_NUM_CANDS + bestMvpIdxToTest;
#endif
      const int mvFieldAmvpIdx = MAX_NUM_AMVP_CANDS_MAX_REF + mvFieldMergeIdx;
      pu.mv[refListAmvp] = amvpInfo.mvCand[bestMvpIdxToTest];

      // BM select merge candidate
      struct bmCostSort
      {
        int mergeIdx;
        Distortion bmCost;
      };
      bmCostSort temp;
      const auto CostIncSort = [](const bmCostSort &x, const bmCostSort &y) { return x.bmCost < y.bmCost; };
      std::vector<bmCostSort> input;
      // here to select the merge cand which has minimum BM cost, at each cand, the cost is derived by  minBMcost(mvpIdx0, mvpIdx1)
      if (bmMergeCtx.numValidMergeCand > 1)
      {
        // pre Fill AMVP prediction blocks
#if JVET_X0049_BDMVR_SW_OPT
        Pel* pelBufferAmvp = m_filteredBlock[3][refListAmvp][0] + BDMVR_CENTER_POSITION;
        const SizeType stride = BDMVR_BUF_STRIDE;
#else
        Pel* pelBufferAmvp        = m_filteredBlock[3][refListAmvp][0];
        const SizeType stride     = pu.lwidth();
#endif
        PelUnitBuf predBufAmvp    = PelUnitBuf(pu.chromaFormat, PelBuf(pelBufferAmvp, stride, pu.lwidth(), pu.lheight()));
        const Picture& refPicAmvp = *pu.cu->slice->getRefPic((RefPicList)refListAmvp, pu.refIdx[refListAmvp])->unscaledPic;
        xBDMVRFillBlkPredPelBuffer( pu, refPicAmvp, pu.mv[refListAmvp] , predBufAmvp, pu.cs->slice->clpRng(COMPONENT_Y) );
        Mv mvAmBdmvr[2/*refListId*/];
        for (int mergeIdx = 0; mergeIdx < bmMergeCtx.numValidMergeCand; mergeIdx++)
        {
          pu.refIdx[refListMerge] = bmMergeCtx.mvFieldNeighbours[(mergeIdx << 1) + refListMerge].refIdx;
          mvAmBdmvr[refListAmvp] = amvpInfo.mvCand[bestMvpIdxToTest];
          mvAmBdmvr[refListMerge] = bmMergeCtx.mvFieldNeighbours[(mergeIdx << 1) + refListMerge].mv;
#if JVET_Y0128_NON_CTC
          CHECK(pu.cu->slice->getRefPic((RefPicList)refListMerge, pu.refIdx[refListMerge])->isRefScaled(pu.cs->pps), "this is not possible");
#endif
          Distortion tmpBmCost = xBDMVRGetMatchingError(pu, mvAmBdmvr, useMR);
          temp.mergeIdx = mergeIdx;
          temp.bmCost = tmpBmCost;
          input.push_back(temp);
        }
        stable_sort(input.begin(), input.end(), CostIncSort);
      }
#if JVET_Y0129_MVD_SIGNAL_AMVP_MERGE_MODE
      else
      {
        temp.mergeIdx = 0;
        temp.bmCost = 0;
        input.push_back(temp);
      }
#else
      if (bmMergeCtx.numValidMergeCand == 1)
      {
        pu.mv[refListMerge] = bmMergeCtx.mvFieldNeighbours[refListMerge].mv;
        pu.refIdx[refListMerge] = bmMergeCtx.mvFieldNeighbours[refListMerge].refIdx;
      }
      else
#endif
      {
        pu.mv[refListMerge] = bmMergeCtx.mvFieldNeighbours[(input[0].mergeIdx << 1) + refListMerge].mv;
        pu.refIdx[refListMerge] = bmMergeCtx.mvFieldNeighbours[(input[0].mergeIdx << 1) + refListMerge].refIdx;
      }
#if JVET_Y0129_MVD_SIGNAL_AMVP_MERGE_MODE
      if (bestMvpIdxToTest == 0 || bestMvpIdxToTest == 2)
      {
#endif
      amvpMergeModeMvRefinement(pu, mvFieldAmListCommon, mvFieldMergeIdx, mvFieldAmvpIdx);
#if JVET_Y0129_MVD_SIGNAL_AMVP_MERGE_MODE
      }
      else if (bmMergeCtx.numValidMergeCand == 1)
      {
        mvFieldAmListCommon[mvFieldMergeIdx].refIdx = bmMergeCtx.mvFieldNeighbours[(input[0].mergeIdx << 1) + refListMerge].refIdx;
        mvFieldAmListCommon[mvFieldMergeIdx].mv = bmMergeCtx.mvFieldNeighbours[(input[0].mergeIdx << 1) + refListMerge].mv;
        mvFieldAmListCommon[mvFieldAmvpIdx].refIdx = refIdxAmvp;
        mvFieldAmListCommon[mvFieldAmvpIdx].mv = amvpInfo.mvCand[bestMvpIdxToTest];
      }
      else
      {
        pu.mv[refListAmvp] = amvpInfo.mvCand[bestMvpIdxToTest];
        pu.refIdx[refListAmvp] = refIdxAmvp;
        pu.mv[refListMerge] = bmMergeCtx.mvFieldNeighbours[(input[1].mergeIdx << 1) + refListMerge].mv;
        pu.refIdx[refListMerge] = bmMergeCtx.mvFieldNeighbours[(input[1].mergeIdx << 1) + refListMerge].refIdx;
        amvpMergeModeMvRefinement(pu, mvFieldAmListCommon, mvFieldMergeIdx, mvFieldAmvpIdx);
      }
      if (bestMvpIdxToTest == 2)
      {
        mvFieldAmListCommon[mvFieldAmvpIdx].mv.roundTransPrecInternal2Amvr(pu.cu->imv);
      }
#endif
    } // bestMvpIdxLoop
  }  // refIdxAmvp loop
}
void InterPrediction::amvpMergeModeMvRefinement(PredictionUnit& pu, MvField* mvFieldAmListCommon, const int mvFieldMergeIdx, const int mvFieldAmvpIdx)
{
  const RefPicList refListMerge = pu.amvpMergeModeFlag[0] ? REF_PIC_LIST_0 : REF_PIC_LIST_1;
  const RefPicList refListAmvp = RefPicList(1 - refListMerge);
  const int curPoc = pu.cu->slice->getPOC();
  const int mergeRefPoc = pu.cu->slice->getRefPOC(refListMerge, pu.refIdx[refListMerge]);
  const bool useMR = pu.lumaSize().area() > 64;
  const int amvpRefPoc = pu.cu->slice->getRefPOC(refListAmvp, pu.refIdx[refListAmvp]);
#if JVET_Y0128_NON_CTC
  CHECK(pu.cu->slice->getRefPic(REF_PIC_LIST_0, pu.refIdx[0])->isRefScaled(pu.cs->pps), "this is not possible");
  CHECK(pu.cu->slice->getRefPic(REF_PIC_LIST_1, pu.refIdx[1])->isRefScaled(pu.cs->pps), "this is not possible");
#endif
  if ((mergeRefPoc - curPoc) == (curPoc - amvpRefPoc))
  {
    Mv         mvInitial[2];
    mvInitial[refListAmvp] = pu.mv[refListAmvp];;
    mvInitial[refListMerge] = pu.mv[refListMerge];
    Mv         mvFinal[2] = { mvInitial[0], mvInitial[1] };
    Distortion curBmCost = std::numeric_limits<Distortion>::max();
#if JVET_X0049_BDMVR_SW_OPT
    curBmCost = xBDMVRMvSquareSearch<false>(mvFinal, curBmCost, pu, mvInitial,
        AMVP_MERGE_MODE_REDUCED_MV_REFINE_SEARCH_ROUND, MV_FRACTIONAL_BITS_INTERNAL, useMR, false);
    curBmCost = xBDMVRMvSquareSearch<true>(mvFinal, curBmCost, pu, mvInitial,
      2, MV_FRACTIONAL_BITS_INTERNAL - 1, useMR, false);
#else
    curBmCost = xBDMVRMvSquareSearch( mvFinal, curBmCost, pu, mvInitial,
        AMVP_MERGE_MODE_REDUCED_MV_REFINE_SEARCH_ROUND, MV_FRACTIONAL_BITS_INTERNAL, useMR, false );
    curBmCost = xBDMVRMvSquareSearch( mvFinal, curBmCost, pu, mvInitial,
        2, MV_FRACTIONAL_BITS_INTERNAL - 1, useMR, false );
#endif
    pu.mv[refListMerge] = mvFinal[refListMerge];
    pu.mv[refListAmvp] = mvFinal[refListAmvp];
  }
#if TM_AMVP || TM_MRG
  else
  {
    Distortion tmCost[2];
    tmCost[refListMerge] = deriveTMMv(pu, true, std::numeric_limits<Distortion>::max(), refListMerge, pu.refIdx[refListMerge], 0, pu.mv[refListMerge]);
    tmCost[refListAmvp] = deriveTMMv(pu, true, std::numeric_limits<Distortion>::max(), refListAmvp, pu.refIdx[refListAmvp], 0, pu.mv[refListAmvp]);
    RefPicList refListToBeRefined = (tmCost[refListMerge] < tmCost[refListAmvp]) ? refListAmvp : refListMerge;
    MvField    mvfBetterUni(pu.mv[1 - refListToBeRefined], pu.refIdx[1 - refListToBeRefined]);
    deriveTMMv(pu, true, std::numeric_limits<Distortion>::max(), refListToBeRefined, pu.refIdx[refListToBeRefined],
        AMVP_MERGE_MODE_REDUCED_MV_REFINE_SEARCH_ROUND, pu.mv[refListToBeRefined], &mvfBetterUni);
  }
#endif
#if !JVET_Y0129_MVD_SIGNAL_AMVP_MERGE_MODE
  pu.mv[refListAmvp].roundTransPrecInternal2Amvr(pu.cu->imv);
#endif
  mvFieldAmListCommon[mvFieldMergeIdx].refIdx = pu.refIdx[refListMerge];
  mvFieldAmListCommon[mvFieldMergeIdx].mv = pu.mv[refListMerge];
  mvFieldAmListCommon[mvFieldAmvpIdx].refIdx = pu.refIdx[refListAmvp];
  mvFieldAmListCommon[mvFieldAmvpIdx].mv = pu.mv[refListAmvp];
}
#endif
#if JVET_Y0067_ENHANCED_MMVD_MVD_SIGN_PRED
  void InterPrediction::deriveMvdSign(const Mv& cMvPred, const Mv& cMvdKnownAtDecoder, PredictionUnit& pu, RefPicList eRefList, int refIdx, std::vector<Mv>& cMvdDerived)
  {
    const static int patternsX[4][2] =
    {
      { +1, +1 },
      { +1, -1 },
    };
    
    const static int patternsY[4][2] =
    {
      { +1, +1 },
      { -1, +1 },
    };
    
    const static int patternsXY[4][2] =
    {
      { +1, +1 },
      { +1, -1 },
      { -1, +1 },
      { -1, -1 },
    };
    
    typedef int Int2[2];
    const Int2* patterns = 0;
    uint16_t patternsNum = 0;
    if (cMvdKnownAtDecoder.getHor() == 0)
    {
      patterns = patternsX;
      patternsNum = 2;
    }
    else if (cMvdKnownAtDecoder.getVer() == 0)
    {
      patterns = patternsY;
      patternsNum = 2;
    }
    else
    {
      patterns = patternsXY;
      patternsNum = 4;
    }
    cMvdDerived.resize(patternsNum);
    for (int n = 0; n < patternsNum; ++n)
    {
      auto sign = patterns[n];
      auto cMv = Mv(sign[0] * cMvdKnownAtDecoder.getHor(), sign[1] * cMvdKnownAtDecoder.getVer());
      cMvdDerived[n] = cMv;
    }
    if (!pu.lumaPos().x && !pu.lumaPos().y)
    {
      return;
    }
    CHECK(refIdx < 0, "Invalid reference index for FRUC");
    
    const Picture& refPic = *pu.cu->slice->getRefPic(eRefList, refIdx)->unscaledPic;
    InterPredResources interRes(m_pcReshape, m_pcRdCost, m_if, m_filteredBlockTmp[0][COMPONENT_Y]
                                , m_filteredBlock[3][1][0], m_filteredBlock[3][0][0]
                                );
    
    TplMatchingCtrl tplCtrl(pu, interRes, refPic, true, COMPONENT_Y, true, 0, m_pcCurTplAbove, m_pcCurTplLeft, m_pcRefTplAbove, m_pcRefTplLeft, Mv(0, 0), nullptr, 0);
    
    std::vector<std::pair<Mv, Distortion>> aMvCostVec(patternsNum);
    
    for (int n = 0; n < patternsNum; ++n)
    {
      auto cMvdTest = cMvdDerived[n];
      Mv cMvTest = cMvPred + cMvdTest;
      Distortion uiCost = tplCtrl.xGetTempMatchError<TM_TPL_SIZE>(cMvTest);
      aMvCostVec[n] = { cMvdTest, uiCost };
    }
    
    std::stable_sort(aMvCostVec.begin(), aMvCostVec.end(), [](const std::pair<Mv, Distortion> & l, const std::pair<Mv, Distortion> & r) {return l.second < r.second; });
    
    for (int n = 0; n < patternsNum; ++n)
    {
      cMvdDerived[n] = aMvCostVec[n].first;
    }
  }
  void InterPrediction::deriveMvdSignSMVD(const Mv& cMvPred, const Mv& cMvPred2, const Mv& cMvdKnownAtDecoder, PredictionUnit& pu, std::vector<Mv>& cMvdDerived)
  {
    const static int patternsX[4][2] =
    {
      { +1, +1 },
      { +1, -1 },
    };
    
    const static int patternsY[4][2] =
    {
      { +1, +1 },
      { -1, +1 },
    };
    
    const static int patternsXY[4][2] =
    {
      { +1, +1 },
      { +1, -1 },
      { -1, +1 },
      { -1, -1 },
    };
    
    typedef int Int2[2];
    const Int2* patterns = 0;
    uint16_t patternsNum = 0;
    if (cMvdKnownAtDecoder.getHor() == 0)
    {
      patterns = patternsX;
      patternsNum = 2;
    }
    else if (cMvdKnownAtDecoder.getVer() == 0)
    {
      patterns = patternsY;
      patternsNum = 2;
    }
    else
    {
      patterns = patternsXY;
      patternsNum = 4;
    }
    
    cMvdDerived.resize(patternsNum);
    for (int n = 0; n < patternsNum; ++n)
    {
      auto sign = patterns[n];
      auto cMv = Mv(sign[0] * cMvdKnownAtDecoder.getHor(), sign[1] * cMvdKnownAtDecoder.getVer());
      cMvdDerived[n] = cMv;
    }
    
    if (!pu.lumaPos().x && !pu.lumaPos().y)
    {
      return;
    }
    std::vector<std::pair<Mv, Distortion>> aMvCostVec(patternsNum);
    InterPredResources interRes(m_pcReshape, m_pcRdCost, m_if, m_filteredBlockTmp[0][COMPONENT_Y]
                                , m_filteredBlock[3][1][0], m_filteredBlock[3][0][0]
                                );
    
    // For L0
    int refIdx = pu.cs->slice->getSymRefIdx(REF_PIC_LIST_0);
    CHECK(refIdx < 0, "Invalid reference index for SMVD L0");
    const Picture& refPic = *pu.cu->slice->getRefPic(REF_PIC_LIST_0, refIdx)->unscaledPic;
    
    TplMatchingCtrl tplCtrl(pu, interRes, refPic, true, COMPONENT_Y, true, 0, m_pcCurTplAbove, m_pcCurTplLeft, m_pcRefTplAbove, m_pcRefTplLeft, Mv(0, 0), nullptr, 0);
    
    for (int n = 0; n < patternsNum; ++n)
    {
      auto cMvdTest = cMvdDerived[n];
      Mv cMvTest = cMvPred + cMvdTest;
      Distortion uiCost = tplCtrl.xGetTempMatchError<TM_TPL_SIZE>(cMvTest);
      aMvCostVec[n] = { cMvdTest, uiCost };
    }
    
    std::stable_sort(aMvCostVec.begin(), aMvCostVec.end(), [](const std::pair<Mv, Distortion> & l, const std::pair<Mv, Distortion> & r) {return l.second < r.second; });
    
    for (int n = 0; n < patternsNum; ++n)
    {
      cMvdDerived[n] = aMvCostVec[n].first;
    }
  }
  void InterPrediction::deriveMvdSignAffine(const Mv& cMvPred, const Mv& cMvPred2, const Mv& cMvPred3, const Mv& cMvdKnownAtDecoder, const Mv& cMvdKnownAtDecoder2, const Mv& cMvdKnownAtDecoder3,
                                            PredictionUnit& pu, RefPicList eRefList, int refIdx, std::vector<Mv>& cMvdDerived, std::vector<Mv>& cMvdDerived2, std::vector<Mv>& cMvdDerived3)
  {
    int patterns2[2][1] =
    {
      { +1 },
      { -1 },
    };
    
    int patterns4[4][2] =
    {
      { +1, +1 },
      { +1, -1 },
      { -1, +1 },
      { -1, -1 },
    };
    
    int patterns8[8][3] =
    {
      {+1, +1, +1 },
      {+1, +1, -1 },
      {+1, -1, +1 },
      {+1, -1, -1 },
      {-1, +1, +1 },
      {-1, +1, -1 },
      {-1, -1, +1 },
      {-1, -1, -1 },
    };
    
    int patterns16[16][4] =
    {
      {+1, +1, +1, +1 },
      {+1, +1, +1, -1 },
      {+1, +1, -1, +1 },
      {+1, +1, -1, -1 },
      {+1, -1, +1, +1 },
      {+1, -1, +1, -1 },
      {+1, -1, -1, +1 },
      {+1, -1, -1, -1 },
      {-1, +1, +1, +1 },
      {-1, +1, +1, -1 },
      {-1, +1, -1, +1 },
      {-1, +1, -1, -1 },
      {-1, -1, +1, +1 },
      {-1, -1, +1, -1 },
      {-1, -1, -1, +1 },
      {-1, -1, -1, -1 },
    };
    
    int patterns32[32][5] =
    {
      {+1, +1, +1, +1, +1 },
      {+1, +1, +1, +1, -1 },
      {+1, +1, +1, -1, +1 },
      {+1, +1, +1, -1, -1 },
      {+1, +1, -1, +1, +1 },
      {+1, +1, -1, +1, -1 },
      {+1, +1, -1, -1, +1 },
      {+1, +1, -1, -1, -1 },
      {+1, -1, +1, +1, +1 },
      {+1, -1, +1, +1, -1 },
      {+1, -1, +1, -1, +1 },
      {+1, -1, +1, -1, -1 },
      {+1, -1, -1, +1, +1 },
      {+1, -1, -1, +1, -1 },
      {+1, -1, -1, -1, +1 },
      {+1, -1, -1, -1, -1 },
      {-1, +1, +1, +1, +1 },
      {-1, +1, +1, +1, -1 },
      {-1, +1, +1, -1, +1 },
      {-1, +1, +1, -1, -1 },
      {-1, +1, -1, +1, +1 },
      {-1, +1, -1, +1, -1 },
      {-1, +1, -1, -1, +1 },
      {-1, +1, -1, -1, -1 },
      {-1, -1, +1, +1, +1 },
      {-1, -1, +1, +1, -1 },
      {-1, -1, +1, -1, +1 },
      {-1, -1, +1, -1, -1 },
      {-1, -1, -1, +1, +1 },
      {-1, -1, -1, +1, -1 },
      {-1, -1, -1, -1, +1 },
      {-1, -1, -1, -1, -1 },
    };
    
    int patterns64[64][6] =
    {
      {+1, +1, +1, +1, +1, +1 },
      {+1, +1, +1, +1, +1, -1 },
      {+1, +1, +1, +1, -1, +1 },
      {+1, +1, +1, +1, -1, -1 },
      {+1, +1, +1, -1, +1, +1 },
      {+1, +1, +1, -1, +1, -1 },
      {+1, +1, +1, -1, -1, +1 },
      {+1, +1, +1, -1, -1, -1 },
      {+1, +1, -1, +1, +1, +1 },
      {+1, +1, -1, +1, +1, -1 },
      {+1, +1, -1, +1, -1, +1 },
      {+1, +1, -1, +1, -1, -1 },
      {+1, +1, -1, -1, +1, +1 },
      {+1, +1, -1, -1, +1, -1 },
      {+1, +1, -1, -1, -1, +1 },
      {+1, +1, -1, -1, -1, -1 },
      {+1, -1, +1, +1, +1, +1 },
      {+1, -1, +1, +1, +1, -1 },
      {+1, -1, +1, +1, -1, +1 },
      {+1, -1, +1, +1, -1, -1 },
      {+1, -1, +1, -1, +1, +1 },
      {+1, -1, +1, -1, +1, -1 },
      {+1, -1, +1, -1, -1, +1 },
      {+1, -1, +1, -1, -1, -1 },
      {+1, -1, -1, +1, +1, +1 },
      {+1, -1, -1, +1, +1, -1 },
      {+1, -1, -1, +1, -1, +1 },
      {+1, -1, -1, +1, -1, -1 },
      {+1, -1, -1, -1, +1, +1 },
      {+1, -1, -1, -1, +1, -1 },
      {+1, -1, -1, -1, -1, +1 },
      {+1, -1, -1, -1, -1, -1 },
      {-1, +1, +1, +1, +1, +1 },
      {-1, +1, +1, +1, +1, -1 },
      {-1, +1, +1, +1, -1, +1 },
      {-1, +1, +1, +1, -1, -1 },
      {-1, +1, +1, -1, +1, +1 },
      {-1, +1, +1, -1, +1, -1 },
      {-1, +1, +1, -1, -1, +1 },
      {-1, +1, +1, -1, -1, -1 },
      {-1, +1, -1, +1, +1, +1 },
      {-1, +1, -1, +1, +1, -1 },
      {-1, +1, -1, +1, -1, +1 },
      {-1, +1, -1, +1, -1, -1 },
      {-1, +1, -1, -1, +1, +1 },
      {-1, +1, -1, -1, +1, -1 },
      {-1, +1, -1, -1, -1, +1 },
      {-1, +1, -1, -1, -1, -1 },
      {-1, -1, +1, +1, +1, +1 },
      {-1, -1, +1, +1, +1, -1 },
      {-1, -1, +1, +1, -1, +1 },
      {-1, -1, +1, +1, -1, -1 },
      {-1, -1, +1, -1, +1, +1 },
      {-1, -1, +1, -1, +1, -1 },
      {-1, -1, +1, -1, -1, +1 },
      {-1, -1, +1, -1, -1, -1 },
      {-1, -1, -1, +1, +1, +1 },
      {-1, -1, -1, +1, +1, -1 },
      {-1, -1, -1, +1, -1, +1 },
      {-1, -1, -1, +1, -1, -1 },
      {-1, -1, -1, -1, +1, +1 },
      {-1, -1, -1, -1, +1, -1 },
      {-1, -1, -1, -1, -1, +1 },
      {-1, -1, -1, -1, -1, -1 },
    };
    std::vector<int> isZeroComp(6, 0);
    if (cMvdKnownAtDecoder.getHor() == 0)
    {
      isZeroComp[0] = 1;
    }
    if (cMvdKnownAtDecoder.getVer() == 0)
    {
      isZeroComp[1] = 1;
    }
    if (cMvdKnownAtDecoder2.getHor() == 0)
    {
      isZeroComp[2] = 1;
    }
    if (cMvdKnownAtDecoder2.getVer() == 0)
    {
      isZeroComp[3] = 1;
    }
    if (cMvdKnownAtDecoder3.getHor() == 0)
    {
      isZeroComp[4] = 1;
    }
    if (cMvdKnownAtDecoder3.getVer() == 0)
    {
      isZeroComp[5] = 1;
    }
    int nZeroComp = isZeroComp[0] + isZeroComp[1] + isZeroComp[2] + isZeroComp[3] + isZeroComp[4] + isZeroComp[5];
    CHECK(nZeroComp == 6, "nnZeroComp == 6");
    
    uint16_t patternsNum = 0;
    std::vector<Mv> MvdCand[3];
    if (nZeroComp == 0)
    {
      patternsNum = 64;
      MvdCand[0].resize(patternsNum);
      MvdCand[1].resize(patternsNum);
      MvdCand[2].resize(patternsNum);
      for (int n = 0; n < patternsNum; ++n)
      {
        auto sign = patterns64[n];
        MvdCand[0][n] = Mv(sign[0] * cMvdKnownAtDecoder.getHor(), sign[1] * cMvdKnownAtDecoder.getVer());
        MvdCand[1][n] = Mv(sign[2] * cMvdKnownAtDecoder2.getHor(), sign[3] * cMvdKnownAtDecoder2.getVer());
        MvdCand[2][n] = Mv(sign[4] * cMvdKnownAtDecoder3.getHor(), sign[5] * cMvdKnownAtDecoder3.getVer());
      }
    }
    else if (nZeroComp == 1)
    {
      patternsNum = 32;
      MvdCand[0].resize(patternsNum);
      MvdCand[1].resize(patternsNum);
      MvdCand[2].resize(patternsNum);
      for (int n = 0; n < patternsNum; ++n)
      {
        auto signR = patterns32[n];
        int sign[6];
        int k = 0;
        for (int i = 0; i < 6; i++)
        {
          if (isZeroComp[i])
          {
            sign[i] = +1;
          }
          else
          {
            sign[i] = signR[k];
            k++;
          }
        }
        MvdCand[0][n] = Mv(sign[0] * cMvdKnownAtDecoder.getHor(), sign[1] * cMvdKnownAtDecoder.getVer());
        MvdCand[1][n] = Mv(sign[2] * cMvdKnownAtDecoder2.getHor(), sign[3] * cMvdKnownAtDecoder2.getVer());
        MvdCand[2][n] = Mv(sign[4] * cMvdKnownAtDecoder3.getHor(), sign[5] * cMvdKnownAtDecoder3.getVer());
      }
    }
    else if (nZeroComp == 2)
    {
      patternsNum = 16;
      MvdCand[0].resize(patternsNum);
      MvdCand[1].resize(patternsNum);
      MvdCand[2].resize(patternsNum);
      for (int n = 0; n < patternsNum; ++n)
      {
        auto signR = patterns16[n];
        int sign[6];
        int k = 0;
        for (int i = 0; i < 6; i++)
        {
          if (isZeroComp[i])
          {
            sign[i] = +1;
          }
          else
          {
            sign[i] = signR[k];
            k++;
          }
        }
        MvdCand[0][n] = Mv(sign[0] * cMvdKnownAtDecoder.getHor(), sign[1] * cMvdKnownAtDecoder.getVer());
        MvdCand[1][n] = Mv(sign[2] * cMvdKnownAtDecoder2.getHor(), sign[3] * cMvdKnownAtDecoder2.getVer());
        MvdCand[2][n] = Mv(sign[4] * cMvdKnownAtDecoder3.getHor(), sign[5] * cMvdKnownAtDecoder3.getVer());
      }
    }
    else if (nZeroComp == 3)
    {
      patternsNum = 8;
      MvdCand[0].resize(patternsNum);
      MvdCand[1].resize(patternsNum);
      MvdCand[2].resize(patternsNum);
      for (int n = 0; n < patternsNum; ++n)
      {
        auto signR = patterns8[n];
        int sign[6];
        int k = 0;
        for (int i = 0; i < 6; i++)
        {
          if (isZeroComp[i])
          {
            sign[i] = +1;
          }
          else
          {
            sign[i] = signR[k];
            k++;
          }
        }
        MvdCand[0][n] = Mv(sign[0] * cMvdKnownAtDecoder.getHor(), sign[1] * cMvdKnownAtDecoder.getVer());
        MvdCand[1][n] = Mv(sign[2] * cMvdKnownAtDecoder2.getHor(), sign[3] * cMvdKnownAtDecoder2.getVer());
        MvdCand[2][n] = Mv(sign[4] * cMvdKnownAtDecoder3.getHor(), sign[5] * cMvdKnownAtDecoder3.getVer());
      }
    }
    else if (nZeroComp == 4)
    {
      patternsNum = 4;
      MvdCand[0].resize(patternsNum);
      MvdCand[1].resize(patternsNum);
      MvdCand[2].resize(patternsNum);
      for (int n = 0; n < patternsNum; ++n)
      {
        auto signR = patterns4[n];
        int sign[6];
        int k = 0;
        for (int i = 0; i < 6; i++)
        {
          if (isZeroComp[i])
          {
            sign[i] = +1;
          }
          else
          {
            sign[i] = signR[k];
            k++;
          }
        }
        MvdCand[0][n] = Mv(sign[0] * cMvdKnownAtDecoder.getHor(), sign[1] * cMvdKnownAtDecoder.getVer());
        MvdCand[1][n] = Mv(sign[2] * cMvdKnownAtDecoder2.getHor(), sign[3] * cMvdKnownAtDecoder2.getVer());
        MvdCand[2][n] = Mv(sign[4] * cMvdKnownAtDecoder3.getHor(), sign[5] * cMvdKnownAtDecoder3.getVer());
      }
    }
    else
    {
      patternsNum = 2;
      MvdCand[0].resize(patternsNum);
      MvdCand[1].resize(patternsNum);
      MvdCand[2].resize(patternsNum);
      for (int n = 0; n < patternsNum; ++n)
      {
        auto signR = patterns2[n];
        int sign[6];
        int k = 0;
        for (int i = 0; i < 6; i++)
        {
          if (isZeroComp[i])
          {
            sign[i] = +1;
          }
          else
          {
            sign[i] = signR[k];
            k++;
          }
        }
        MvdCand[0][n] = Mv(sign[0] * cMvdKnownAtDecoder.getHor(), sign[1] * cMvdKnownAtDecoder.getVer());
        MvdCand[1][n] = Mv(sign[2] * cMvdKnownAtDecoder2.getHor(), sign[3] * cMvdKnownAtDecoder2.getVer());
        MvdCand[2][n] = Mv(sign[4] * cMvdKnownAtDecoder3.getHor(), sign[5] * cMvdKnownAtDecoder3.getVer());
      }
    }
    
    cMvdDerived.resize(patternsNum);
    cMvdDerived2.resize(patternsNum);
    cMvdDerived3.resize(patternsNum);
    
    for (int n = 0; n < patternsNum; ++n)
    {
      cMvdDerived[n] = MvdCand[0][n];
      cMvdDerived2[n] = MvdCand[1][n];
      cMvdDerived3[n] = MvdCand[2][n];
    }
    
    if (!pu.lumaPos().x && !pu.lumaPos().y)
    {
      for (int n = 0; n < patternsNum; ++n)
      {
        cMvdDerived[n] = MvdCand[0][n];
        cMvdDerived2[n] = MvdCand[1][n];
        cMvdDerived3[n] = MvdCand[2][n];
      }
      return;
    }
    
    /////////////////////////////////////////////////////////////////
    Pel* refLeftTemplate = m_pcLICRefLeftTemplate;
    Pel* refAboveTemplate = m_pcLICRefAboveTemplate;
    Pel* recLeftTemplate = m_pcLICRecLeftTemplate;
    Pel* recAboveTemplate = m_pcLICRecAboveTemplate;
    int numTemplate[2] = { 0 , 0 }; // 0:Above, 1:Left
    
    const int width = pu.Y().width;
    const int height = pu.Y().height;
    int blockWidth = AFFINE_MIN_BLOCK_SIZE;
    int blockHeight = AFFINE_MIN_BLOCK_SIZE;
    
    const int iHalfBW = blockWidth >> 1;
    const int iHalfBH = blockHeight >> 1;
    
    const int iBit = MAX_CU_DEPTH;
    const int shift = iBit - 4 + MV_FRACTIONAL_BITS_INTERNAL;
    int iDMvHorX, iDMvHorY, iDMvVerX, iDMvVerY;
    
    CHECK(refIdx < 0, "Invalid reference index for FRUC");
    const Picture& refPic = *pu.cu->slice->getRefPic(eRefList, refIdx)->unscaledPic;
    std::vector<std::pair<int, Distortion>> aMvCostVec(patternsNum);
    Distortion uiCost = 0;
    
    for (int n = 0; n < patternsNum; ++n)
    {
      uiCost = 0;
      //--------------------- (derive CPMVs)----------------------------------------------//
      Mv mvLT = cMvPred + MvdCand[0][n];
      Mv mvRT = cMvPred2 + MvdCand[1][n];
      mvRT += MvdCand[0][n];
      
      Mv mvLB;
      if (pu.cu->affineType == AFFINEMODEL_6PARAM)
      {
        mvLB = cMvPred3 + MvdCand[2][n];
        mvLB += MvdCand[0][n];
      }
      //--------------- Calculate dMVs ------------------------------------------//
      iDMvHorX = (mvRT - mvLT).getHor() << (iBit - floorLog2(width));
      iDMvHorY = (mvRT - mvLT).getVer() << (iBit - floorLog2(width));
      if (pu.cu->affineType == AFFINEMODEL_6PARAM)
      {
        iDMvVerX = (mvLB - mvLT).getHor() << (iBit - floorLog2(height));
        iDMvVerY = (mvLB - mvLT).getVer() << (iBit - floorLog2(height));
      }
      else
      {
        iDMvVerX = -iDMvHorY;
        iDMvVerY = iDMvHorX;
      }
      
      int iMvScaleHor = mvLT.getHor() << iBit;
      int iMvScaleVer = mvLT.getVer() << iBit;
      
      int mvScaleHorLine = iMvScaleHor + iDMvHorX * iHalfBW + iDMvVerX * iHalfBH;
      int mvScaleVerLine = iMvScaleVer + iDMvHorY * iHalfBW + iDMvVerY * iHalfBH;
      
      int deltaMvHorXBlk = iDMvHorX * blockWidth;
      int deltaMvHorYBlk = iDMvHorY * blockWidth;
      
      // get prediction block by block
      
      for (int h = 0; h < height; h += blockHeight)
      {
        int mvScaleHorBlk = mvScaleHorLine;
        int mvScaleVerBlk = mvScaleVerLine;
        
        for (int w = 0; w < width; w += blockWidth)
        {
          if (w != 0 && h != 0) continue; //applies only on boundary subblocks.
          int iMvScaleTmpHor, iMvScaleTmpVer;
          
          iMvScaleTmpHor = mvScaleHorBlk;
          iMvScaleTmpVer = mvScaleVerBlk;
          
          mvScaleHorBlk += deltaMvHorXBlk;
          mvScaleVerBlk += deltaMvHorYBlk;
          
          roundAffineMv(iMvScaleTmpHor, iMvScaleTmpVer, shift);
          Mv tmpMv(iMvScaleTmpHor, iMvScaleTmpVer);
          tmpMv.clipToStorageBitDepth();
          //clip
          clipMv(tmpMv, pu.lumaPos(), pu.lumaSize(), *pu.cs->sps, *pu.cs->pps);
          iMvScaleTmpHor = tmpMv.getHor();
          iMvScaleTmpVer = tmpMv.getVer();
          uiCost += xGetSublkTemplateCost(*pu.cu, COMPONENT_Y, refPic, Mv(iMvScaleTmpHor, iMvScaleTmpVer), blockWidth, blockHeight, w, h, numTemplate, refLeftTemplate, refAboveTemplate, recLeftTemplate, recAboveTemplate);
        }
      }
      aMvCostVec[n] = { n, uiCost };
    }
    //--------------------------------------------------------------------------------//
    /////////////////////////////////////////////////////////////////
    std::stable_sort(aMvCostVec.begin(), aMvCostVec.end(), [](const std::pair<int, Distortion> & l, const std::pair<int, Distortion> & r) {return l.second < r.second; });
    for (int n = 0; n < patternsNum; ++n)
    {
      int index = aMvCostVec[n].first;
      cMvdDerived[n] = MvdCand[0][index];
      cMvdDerived2[n] = MvdCand[1][index];
      cMvdDerived3[n] = MvdCand[2][index];
    }
  }
  
  Distortion InterPrediction::xGetSublkTemplateCost(const CodingUnit& cu,
                                                    const ComponentID compID,
                                                    const Picture&    refPic,
                                                    const Mv&         mv,
                                                    const int         sublkWidth,
                                                    const int         sublkHeight,
                                                    const int         posW,
                                                    const int         posH,
                                                    int*              numTemplate,
                                                    Pel*              refLeftTemplate,
                                                    Pel*              refAboveTemplate,
                                                    Pel*              recLeftTemplate,
                                                    Pel*              recAboveTemplate)
  {
    const int       bitDepth = cu.cs->sps->getBitDepth(toChannelType(compID));
    const int       precShift = std::max(0, bitDepth - 12);
    Distortion cost = 0;
    
    const Picture&  currPic = *cu.cs->picture;
    const CodingUnit* const cuAbove = cu.cs->getCU(cu.blocks[compID].pos().offset(0, -1), toChannelType(compID));
    const CodingUnit* const cuLeft = cu.cs->getCU(cu.blocks[compID].pos().offset(-1, 0), toChannelType(compID));
    const CPelBuf recBuf = cuAbove || cuLeft ? currPic.getRecoBuf(cu.cs->picture->blocks[compID]) : CPelBuf();
    const CPelBuf refBuf = cuAbove || cuLeft ? refPic.getRecoBuf(refPic.blocks[compID]) : CPelBuf();
    
    std::vector<Pel>& invLUT = m_pcReshape->getInvLUT();
    
    // above
    if (cuAbove && posH == 0)
    {
      xGetPredBlkTpl<true>(cu, compID, refBuf, mv, posW, posH, sublkWidth, refAboveTemplate);
      const Pel*    rec = recBuf.bufAt(cu.blocks[compID].pos().offset(0, -1));
      for (int k = posW; k < posW + sublkWidth; k++)
      {
        int refVal = refAboveTemplate[k];
        int recVal = rec[k];
        
        if (isLuma(compID) && cu.cs->picHeader->getLmcsEnabledFlag() && m_pcReshape->getCTUFlag())
        {
          recVal = invLUT[recVal];
        }
        
        recVal >>= precShift;
        refVal >>= precShift;
        
        refAboveTemplate[k] = refVal;
        recAboveTemplate[k] = recVal;
        numTemplate[0]++;
        cost += (Distortion)(refVal - recVal) * (refVal - recVal);
      }
    }
    
    // left
    if (cuLeft && posW == 0)
    {
      xGetPredBlkTpl<false>(cu, compID, refBuf, mv, posW, posH, sublkHeight, refLeftTemplate);
      const Pel*    rec = recBuf.bufAt(cu.blocks[compID].pos().offset(-1, 0));
      for (int k = posH; k < posH + sublkHeight; k++)
      {
        int refVal = refLeftTemplate[k];
        int recVal = rec[recBuf.stride * k];
        
        if (isLuma(compID) && cu.cs->picHeader->getLmcsEnabledFlag() && m_pcReshape->getCTUFlag())
        {
          recVal = invLUT[recVal];
        }
        
        recVal >>= precShift;
        refVal >>= precShift;
        
        refLeftTemplate[k] = refVal;
        recLeftTemplate[k] = recVal;
        numTemplate[1]++;
        cost += (Distortion)(refVal - recVal) * (refVal - recVal);
      }
    }
    return cost;
  }
  int InterPrediction::deriveMVSDIdxFromMVDAffine(PredictionUnit& pu, RefPicList eRefList, std::vector<Mv>& cMvdDerived, std::vector<Mv>& cMvdDerived2, std::vector<Mv>& cMvdDerived3)
  {
    int mvsdIdx = 0;
    int shift = 0;
    int bin = 0;
    if (pu.mvdAffi[eRefList][0].getHor())
    {
      bin = (cMvdDerived[0].getHor() == pu.mvdAffi[eRefList][0].getHor()) ? 0 : 1;
      mvsdIdx += bin << shift;
      shift++;
    }
    if (pu.mvdAffi[eRefList][0].getVer())
    {
      for (int i = 0; i < (int)cMvdDerived.size(); i++)
      {
        if (cMvdDerived[i].getHor() == pu.mvdAffi[eRefList][0].getHor())
        {
          bin = (cMvdDerived[i].getVer() == pu.mvdAffi[eRefList][0].getVer()) ? 0 : 1;
          mvsdIdx += bin << shift;
          shift++;
          break;
        }
      }
    }
    if (pu.mvdAffi[eRefList][1].getHor())
    {
      for (int i = 0; i < (int)cMvdDerived.size(); i++)
      {
        if (cMvdDerived[i] == pu.mvdAffi[eRefList][0])
        {
          bin = (cMvdDerived2[i].getHor() == pu.mvdAffi[eRefList][1].getHor()) ? 0 : 1;
          mvsdIdx += bin << shift;
          shift++;
          break;
        }
      }
    }
    if (pu.mvdAffi[eRefList][1].getVer())
    {
      for (int i = 0; i < (int)cMvdDerived.size(); i++)
      {
        if (cMvdDerived[i] == pu.mvdAffi[eRefList][0] && cMvdDerived2[i].getHor() == pu.mvdAffi[eRefList][1].getHor())
        {
          bin = (cMvdDerived2[i].getVer() == pu.mvdAffi[eRefList][1].getVer()) ? 0 : 1;
          mvsdIdx += bin << shift;
          shift++;
          break;
        }
      }
    }
    if (pu.cu->affineType == AFFINEMODEL_6PARAM)
    {
      if (pu.mvdAffi[eRefList][2].getHor())
      {
        for (int i = 0; i < (int)cMvdDerived.size(); i++)
        {
          if (cMvdDerived[i] == pu.mvdAffi[eRefList][0] && cMvdDerived2[i] == pu.mvdAffi[eRefList][1])
          {
            bin = (cMvdDerived3[i].getHor() == pu.mvdAffi[eRefList][2].getHor()) ? 0 : 1;
            mvsdIdx += bin << shift;
            shift++;
            break;
          }
        }
      }
      if (pu.mvdAffi[eRefList][2].getVer())
      {
        for (int i = 0; i < (int)cMvdDerived.size(); i++)
        {
          if (cMvdDerived[i] == pu.mvdAffi[eRefList][0] && cMvdDerived2[i] == pu.mvdAffi[eRefList][1] && cMvdDerived3[i].getHor() == pu.mvdAffi[eRefList][2].getHor())
          {
            bin = (cMvdDerived3[i].getVer() == pu.mvdAffi[eRefList][2].getVer()) ? 0 : 1;
            mvsdIdx += bin << shift;
            shift++;
            break;
          }
        }
      }
    }
    return mvsdIdx;
  }
  
  void InterPrediction::deriveMVDFromMVSDIdxAffine(PredictionUnit& pu, RefPicList eRefList, std::vector<Mv>& cMvdDerived, std::vector<Mv>& cMvdDerived2, std::vector<Mv>& cMvdDerived3)
  {
    int mvsdIdx = pu.mvsdIdx[eRefList];
    int bin = 0;
    
    if (pu.mvdAffi[eRefList][0].getHor())
    {
      bin = mvsdIdx & 1;
      int val = bin ? -cMvdDerived[0].getHor() : cMvdDerived[0].getHor();
      pu.mvdAffi[eRefList][0].setHor(val);
      mvsdIdx >>= 1;
    }
    if (pu.mvdAffi[eRefList][0].getVer())
    {
      for (int i = 0; i < (int)cMvdDerived.size(); i++)
      {
        if (cMvdDerived[i].getHor() == pu.mvdAffi[eRefList][0].getHor())
        {
          bin = mvsdIdx & 1;
          int val = bin ? -cMvdDerived[i].getVer() : cMvdDerived[i].getVer();
          pu.mvdAffi[eRefList][0].setVer(val);
          mvsdIdx >>= 1;
          break;
        }
      }
    }
    if (pu.mvdAffi[eRefList][1].getHor())
    {
      for (int i = 0; i < (int)cMvdDerived.size(); i++)
      {
        if (cMvdDerived[i] == pu.mvdAffi[eRefList][0])
        {
          bin = mvsdIdx & 1;
          int val = bin ? -cMvdDerived2[i].getHor() : cMvdDerived2[i].getHor();
          pu.mvdAffi[eRefList][1].setHor(val);
          mvsdIdx >>= 1;
          break;
        }
      }
    }
    if (pu.mvdAffi[eRefList][1].getVer())
    {
      for (int i = 0; i < (int)cMvdDerived.size(); i++)
      {
        if (cMvdDerived[i] == pu.mvdAffi[eRefList][0] && cMvdDerived2[i].getHor() == pu.mvdAffi[eRefList][1].getHor())
        {
          bin = mvsdIdx & 1;
          int val = bin ? -cMvdDerived2[i].getVer() : cMvdDerived2[i].getVer();
          pu.mvdAffi[eRefList][1].setVer(val);
          mvsdIdx >>= 1;
          break;
        }
      }
    }
    if (pu.cu->affineType == AFFINEMODEL_6PARAM)
    {
      if (pu.mvdAffi[eRefList][2].getHor())
      {
        for (int i = 0; i < (int)cMvdDerived.size(); i++)
        {
          if (cMvdDerived[i] == pu.mvdAffi[eRefList][0] && cMvdDerived2[i] == pu.mvdAffi[eRefList][1])
          {
            bin = mvsdIdx & 1;
            int val = bin ? -cMvdDerived3[i].getHor() : cMvdDerived3[i].getHor();
            pu.mvdAffi[eRefList][2].setHor(val);
            mvsdIdx >>= 1;
            break;
          }
        }
      }
      if (pu.mvdAffi[eRefList][2].getVer())
      {
        for (int i = 0; i < (int)cMvdDerived.size(); i++)
        {
          if (cMvdDerived[i] == pu.mvdAffi[eRefList][0] && cMvdDerived2[i] == pu.mvdAffi[eRefList][1] && cMvdDerived3[i].getHor() == pu.mvdAffi[eRefList][2].getHor())
          {
            bin = mvsdIdx & 1;
            int val = bin ? -cMvdDerived3[i].getVer() : cMvdDerived3[i].getVer();
            pu.mvdAffi[eRefList][2].setVer(val);
            mvsdIdx >>= 1;
            break;
          }
        }
      }
    }
  }
  int InterPrediction::deriveMVSDIdxFromMVDTrans(Mv cMvd, std::vector<Mv>& cMvdDerived)
  {
    int mvsdIdx = 0;
    int shift = 0;
    int bin = 0;
    
    if (cMvd.getHor())
    {
      bin = (cMvdDerived[0].getHor() == cMvd.getHor()) ? 0 : 1;
      mvsdIdx += bin << shift;
      shift++;
    }
    if (cMvd.getVer())
    {
      for (int i = 0; i < (int)cMvdDerived.size(); i++)
      {
        if (cMvdDerived[i].getHor() == cMvd.getHor())
        {
          bin = (cMvdDerived[i].getVer() == cMvd.getVer()) ? 0 : 1;
          mvsdIdx += bin << shift;
          shift++;
          break;
        }
      }
    }
    return mvsdIdx;
  }
  Mv InterPrediction::deriveMVDFromMVSDIdxTrans(int mvsdIdx, std::vector<Mv>& cMvdDerived)
  {
    int bin = 0;
    Mv cMvd = Mv(0, 0);
    if (cMvdDerived[0].getHor())
    {
      bin = mvsdIdx & 1;
      int val = bin ? -cMvdDerived[0].getHor() : cMvdDerived[0].getHor();
      cMvd.setHor(val);
      mvsdIdx >>= 1;
    }
    if (cMvdDerived[0].getVer())
    {
      for (int i = 0; i < (int)cMvdDerived.size(); i++)
      {
        if (cMvdDerived[i].getHor() == cMvd.getHor())
        {
          bin = mvsdIdx & 1;
          int val = bin ? -cMvdDerived[i].getVer() : cMvdDerived[i].getVer();
          cMvd.setVer(val);
          mvsdIdx >>= 1;
          break;
        }
      }
    }
    return cMvd;
  }
#endif