/* The copyright in this software is being made available under the BSD
* License, included below. This software may be subject to other third party
* and contributor rights, including patent rights, and no such rights are
* granted under this license.
*
* Copyright (c) 2010-2023, ITU/ISO/IEC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
*  * Redistributions of source code must retain the above copyright notice,
*    this list of conditions and the following disclaimer.
*  * Redistributions in binary form must reproduce the above copyright notice,
*    this list of conditions and the following disclaimer in the documentation
*    and/or other materials provided with the distribution.
*  * Neither the name of the ITU/ISO/IEC nor the names of its contributors may
*    be used to endorse or promote products derived from this software without
*    specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/

/** \file     Buffer.cpp
 *  \brief    Low-overhead class describing 2D memory layout
 */

#define DONT_UNDEF_SIZE_AWARE_PER_EL_OP

// unit needs to come first due to a forward declaration
#include "Unit.h"
#include "Buffer.h"
#include "InterpolationFilter.h"

// Per-sample gradient-based correction of a prediction block.
//
// For every sample the correction dI = dMvX * gradX + dMvY * gradY is clipped to
// [-dILimit, dILimit - 1], with dILimit = 1 << max(clpRng.bd + 1, 13), and added to
// the source sample. When bi is false the result is additionally rounded with
// (value + offset) >> shiftNum and clipped with ClipPel; when bi is true the
// unrounded sum is stored as-is.
//
// dMvX / dMvY are read as a dense, row-major width-by-height table; dMvStride is
// not used by this implementation.
void applyPROFCore(Pel* dst, int dstStride, const Pel* src, int srcStride, int width, int height, const Pel* gradX, const Pel* gradY, int gradStride, const int* dMvX, const int* dMvY, int dMvStride, const bool& bi, int shiftNum, Pel offset, const ClpRng& clpRng)
{
  const int dILimit = 1 << std::max<int>(clpRng.bd + 1, 13);

  for (int row = 0; row < height; ++row)
  {
    const int mvRowBase = row * width;
    for (int col = 0; col < width; ++col)
    {
      const int     mvIdx = mvRowBase + col;
      const int32_t rawDI = dMvX[mvIdx] * gradX[col] + dMvY[mvIdx] * gradY[col];
      const int32_t dI    = Clip3(-dILimit, dILimit - 1, rawDI);

      // The corrected value is narrowed to Pel before the optional rounding step.
      Pel sample = src[col] + dI;
      if (!bi)
      {
        sample = (sample + offset) >> shiftNum;
        sample = ClipPel(sample, clpRng);
      }
      dst[col] = sample;
    }
    gradX += gradStride;
    gradY += gradStride;
    dst   += dstStride;
    src   += srcStride;
  }
}

#if TM_AMVP || TM_MRG || JVET_Z0084_IBC_TM
// Returns the signed sum of ( src0 - src1 ) over the block, accumulated in 64 bits.
// The differences are summed as-is (no absolute value is taken).
// Rows are subsampled by rowSubShift: height is shifted right and both strides are
// shifted left by rowSubShift before the traversal. bitDepth is not used here.
// NOTE(review): the traversal itself comes from SIZE_AWARE_PER_EL_OP, which is
// defined outside this view - presumably it applies the OP macro to every sample
// of a width x height block and the INC macro once per row; confirm there.
int64_t getSumOfDifferenceCore(const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, int width, int height, int rowSubShift, int bitDepth)
{
  height     >>= rowSubShift;
  src0Stride <<= rowSubShift;
  src1Stride <<= rowSubShift;

  int64_t sum = 0;
  // Per-sample operation and per-row pointer advance handed to the traversal macro.
#define GET_SUM_DIFF_CORE_OP( ADDR ) sum += ( src0[ADDR] - src1[ADDR] )
#define GET_SUM_DIFF_CORE_INC    \
  src0 += src0Stride;            \
  src1 += src1Stride;            \

  SIZE_AWARE_PER_EL_OP(GET_SUM_DIFF_CORE_OP, GET_SUM_DIFF_CORE_INC);

#undef GET_SUM_DIFF_CORE_OP
#undef GET_SUM_DIFF_CORE_INC

  return sum;
}
#endif

#if JVET_Z0056_GPM_SPLIT_MODE_REORDERING
// Writes dst = | src0 - src1 | for each sample of the block.
// Each of the three buffers is advanced by its own stride once per row.
// NOTE(review): the width x height traversal is provided by SIZE_AWARE_PER_EL_OP,
// which is defined outside this view; confirm its iteration pattern there.
void getAbsoluteDifferencePerSampleCore(Pel* dst, int dstStride, const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, int width, int height)
{
  // Per-sample operation and per-row pointer advance handed to the traversal macro.
#define GET_ABS_DIFF_PER_SAMPLE_CORE_OP( ADDR ) dst[ADDR] = std::abs( src0[ADDR] - src1[ADDR] )
#define GET_ABS_DIFF_PER_SAMPLE_CORE_INC    \
  src0 += src0Stride;                       \
  src1 += src1Stride;                       \
  dst  += dstStride;                        \

  SIZE_AWARE_PER_EL_OP(GET_ABS_DIFF_PER_SAMPLE_CORE_OP, GET_ABS_DIFF_PER_SAMPLE_CORE_INC);

#undef GET_ABS_DIFF_PER_SAMPLE_CORE_OP
#undef GET_ABS_DIFF_PER_SAMPLE_CORE_INC
}

// Sums the samples of a block, optionally gated or weighted by a mask.
//
// maskType selects the behaviour at compile time:
//   1     : every sample is multiplied by the current mask value;
//   2     : the mask holds only 0's and 1's and the sample is AND-ed with -mask;
//   3     : as 2, but the mask is inverted first (the sample is AND-ed with mask - 1);
//   other : the mask is ignored and the plain sample sum is returned.
//
// The mask pointer moves by maskStepX after each sample and by an additional
// ( maskStride + maskStride2 ) after each row. bitDepth is not used.
template <uint8_t maskType>
int64_t getMaskedSampleSumCore(Pel* src, int srcStride, int width, int height, int bitDepth, short* weightMask, int maskStepX, int maskStride, int maskStride2)
{
  const Pel* maskPos        = weightMask;
  const int  maskRowAdvance = maskStride + maskStride2;
  int64_t    total          = 0;

  if (maskType == 1)
  {
    // Weighted sum.
    for (int rowsLeft = height; rowsLeft != 0; --rowsLeft)
    {
      for (int col = 0; col < width; ++col)
      {
        total   += (src[col]) * (*maskPos);
        maskPos += maskStepX;
      }
      src     += srcStride;
      maskPos += maskRowAdvance;
    }
    return total;
  }

  if (maskType == 2 || maskType == 3)
  {
    // Binary mask: turn the 0/1 value into an all-zeros / all-ones bit pattern.
    for (int rowsLeft = height; rowsLeft != 0; --rowsLeft)
    {
      for (int col = 0; col < width; ++col)
      {
        const int keepBits = (maskType == 3) ? ((*maskPos) - 1) : (-(*maskPos));
        total   += (src[col]) & keepBits;
        maskPos += maskStepX;
      }
      src     += srcStride;
      maskPos += maskRowAdvance;
    }
    return total;
  }

  // No mask.
  for (int rowsLeft = height; rowsLeft != 0; --rowsLeft)
  {
    for (int col = 0; col < width; ++col)
    {
      total += src[col];
    }
    src += srcStride;
  }
  return total;
}
#endif

#if JVET_W0097_GPM_MMVD_TM
// Rounds each sample of srcp and writes the clipped result to dest:
//   dest = ClipPel( rightShift( srcp + offsetDefault, shiftDefault ), clpRng )
// shiftDefault is derived from clpRng.bd (via IF_INTERNAL_FRAC_BITS, or
// max( 2, IF_INTERNAL_PREC - bd ) when high-bit-depth support is off), and
// offsetDefault is the rounding term ( 1 << ( shiftDefault - 1 ) ) plus IF_INTERNAL_OFFS.
// Blocks of width 1 are rejected with THROW.
// NOTE(review): the traversal comes from SIZE_AWARE_PER_EL_OP, defined outside this view.
void roundBDCore(const Pel* srcp, const int srcStride, Pel* dest, const int destStride, int width, int height, const ClpRng& clpRng)
{
  const int32_t clipbd = clpRng.bd;
#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT
  const int32_t shiftDefault = IF_INTERNAL_FRAC_BITS(clipbd);
#else
  const int32_t shiftDefault = std::max<int>(2, (IF_INTERNAL_PREC - clipbd));
#endif
  const int32_t offsetDefault = (1 << (shiftDefault - 1)) + IF_INTERNAL_OFFS;

  if (width == 1)
  {
    THROW("Blocks of width = 1 not supported");
  }
  else
  {
    // Per-sample operation and per-row pointer advance handed to the traversal macro.
#define RND_OP( ADDR ) dest[ADDR] = ClipPel( rightShift( srcp[ADDR] + offsetDefault, shiftDefault), clpRng )
#define RND_INC        \
    srcp += srcStride;  \
    dest += destStride; \

    SIZE_AWARE_PER_EL_OP(RND_OP, RND_INC);

#undef RND_OP
#undef RND_INC
  }
}

// Weighted combination of two sample blocks:
//   dest = ClipPel( rightShift( src0 * w0 + src1 * w1 + offset, shiftNum ), clpRng )
// shiftNum is the bit-depth dependent interpolation shift plus g_bcwLog2WeightBase;
// offset is the rounding term ( 1 << ( shiftNum - 1 ) ) plus IF_INTERNAL_OFFS scaled
// by the weight base.
// NOTE(review): the traversal comes from SIZE_AWARE_PER_EL_OP, defined outside this view.
void weightedAvgCore(const Pel* src0, const unsigned src0Stride, const Pel* src1, const unsigned src1Stride, Pel* dest, const unsigned destStride, const int8_t w0, const int8_t w1, int width, int height, const ClpRng& clpRng)
{
  const int8_t log2WeightBase = g_bcwLog2WeightBase;
  const int clipbd = clpRng.bd;
#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT
  const int shiftNum = IF_INTERNAL_FRAC_BITS(clipbd) + log2WeightBase;
#else
  const int shiftNum = std::max<int>(2, (IF_INTERNAL_PREC - clipbd)) + log2WeightBase;
#endif
  const int offset = (1 << (shiftNum - 1)) + (IF_INTERNAL_OFFS << log2WeightBase);

  // Per-sample operation and per-row pointer advance handed to the traversal macro.
#define ADD_AVG_OP( ADDR ) dest[ADDR] = ClipPel( rightShift( ( src0[ADDR]*w0 + src1[ADDR]*w1 + offset ), shiftNum ), clpRng )
#define ADD_AVG_INC     \
    src0 += src0Stride; \
    src1 += src1Stride; \
    dest += destStride; \

  SIZE_AWARE_PER_EL_OP(ADD_AVG_OP, ADD_AVG_INC);

#undef ADD_AVG_OP
#undef ADD_AVG_INC
}

// Copies srcp to dest, clipping every sample with ClipPel( ., clpRng ).
// NOTE(review): the traversal comes from SIZE_AWARE_PER_EL_OP, defined outside this view.
void copyClipCore(const Pel* srcp, const unsigned srcStride, Pel* dest, const unsigned destStride, int width, int height, const ClpRng& clpRng)
{
  // Per-sample operation and per-row pointer advance handed to the traversal macro.
#define RECO_OP( ADDR ) dest[ADDR] = ClipPel( srcp[ADDR], clpRng )
#define RECO_INC        \
  srcp += srcStride;  \
  dest += destStride; \

  SIZE_AWARE_PER_EL_OP(RECO_OP, RECO_INC);

#undef RECO_OP
#undef RECO_INC
}
#endif
// Rounded sum of two sample blocks:
//   dest = ClipPel( rightShift( src1 + src2 + offset, rshift ), clpRng )
// With JVET_Z0136_OOB the signature also carries mcMask, mcStride and isOOB;
// this implementation does not read them.
// NOTE(review): the traversal comes from SIZE_AWARE_PER_EL_OP, defined outside this view.
template< typename T >
#if JVET_Z0136_OOB
void addAvgCore( const T* src1, int src1Stride, const T* src2, int src2Stride, T* dest, int dstStride, int width, int height, int rshift, int offset, const ClpRng& clpRng, bool *mcMask[2], int mcStride, bool *isOOB)
#else
void addAvgCore( const T* src1, int src1Stride, const T* src2, int src2Stride, T* dest, int dstStride, int width, int height, int rshift, int offset, const ClpRng& clpRng )
#endif
{
  // Per-sample operation and per-row pointer advance handed to the traversal macro.
#define ADD_AVG_CORE_OP( ADDR ) dest[ADDR] = ClipPel( rightShift( ( src1[ADDR] + src2[ADDR] + offset ), rshift ), clpRng )
#define ADD_AVG_CORE_INC    \
  src1 += src1Stride;       \
  src2 += src2Stride;       \
  dest +=  dstStride;       \

  SIZE_AWARE_PER_EL_OP( ADD_AVG_CORE_OP, ADD_AVG_CORE_INC );

#undef ADD_AVG_CORE_OP
#undef ADD_AVG_CORE_INC
}

#if JVET_AD0213_LIC_IMP
// In-place rounding of a block: every sample becomes
//   ClipPel( rightShift( sample + offset, shiftNum ), clpRng ).
// src is traversed row by row, srcStride samples apart.
template< typename T >
void toLastCore(T* src, int srcStride, int width, int height, int shiftNum, int offset, const ClpRng& clpRng)
{
  T* row = src;
  for (int rowIdx = 0; rowIdx < height; ++rowIdx, row += srcStride)
  {
    for (int col = 0; col < width; ++col)
    {
      row[col] = ClipPel(rightShift((row[col] + offset), shiftNum), clpRng);
    }
  }
}

// Computes, for each of the length samples,
//   dst = ClipPel( T( ( src0 * w0 - src1 * w1 + offset ) >> 16 ), clpRng ).
// The products are formed in int before the 16-bit right shift.
template< typename T >
void licRemoveWeightHighFreqCore(T* src0, T* src1, T* dst, int length, int w0, int w1, int offset, const ClpRng& clpRng)
{
  for (int pos = 0; pos < length; ++pos)
  {
    const int weighted = int(src0[pos]) * w0 - int(src1[pos]) * w1 + offset;
    dst[pos] = ClipPel(T(weighted >> 16), clpRng);
  }
}
#endif

// Bi-prediction average with a gradient-based correction term, using a single
// ( tmpx, tmpy ) pair for the whole block:
//   b   = tmpx * ( gradX0 - gradX1 ) + tmpy * ( gradY0 - gradY1 )
//   dst = ClipPel( rightShift( src0 + src1 + b + offset, shift ), clpRng )
// Columns are processed in groups of four, so each row always touches a multiple
// of four samples (rounded up from width).
void addBIOAvgCore(const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, const Pel *gradX0, const Pel *gradX1, const Pel *gradY0, const Pel*gradY1, int gradStride, int width, int height, int tmpx, int tmpy, int shift, int offset, const ClpRng& clpRng)
{
  for (int row = 0; row < height; row++)
  {
    for (int groupStart = 0; groupStart < width; groupStart += 4)
    {
      // One group of four neighbouring samples, written left to right.
      for (int col = groupStart; col < groupStart + 4; col++)
      {
        const int b = tmpx * (gradX0[col] - gradX1[col]) + tmpy * (gradY0[col] - gradY1[col]);
#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT
        dst[col] = ClipPel(rightShift((src0[col] + src1[col] + b + offset), shift), clpRng);
#else
        dst[col] = ClipPel((int16_t)rightShift((src0[col] + src1[col] + b + offset), shift), clpRng);
#endif
      }
    }
    dst    += dstStride;
    src0   += src0Stride;
    src1   += src1Stride;
    gradX0 += gradStride;
    gradX1 += gradStride;
    gradY0 += gradStride;
    gradY1 += gradStride;
  }
}

#if JVET_AD0195_HIGH_PRECISION_BDOF_CORE
void calcBIOParameterCoreHighPrecision(const Pel* srcY0Tmp, const Pel* srcY1Tmp, Pel* gradX0, Pel* gradX1, Pel* gradY0, Pel* gradY1, int width, int height, const int src0Stride, const int src1Stride, const int widthG, const int bitDepth, int32_t* s1, int32_t* s2, int32_t* s3, int32_t* s5, int32_t* s6, Pel* dI)
{
  width -= 2;
  height -= 2;
  const int bioParamOffset = widthG + 1;
  srcY0Tmp += src0Stride + 1;
  srcY1Tmp += src1Stride + 1;
  gradX0 += bioParamOffset;  gradX1 += bioParamOffset;
  gradY0 += bioParamOffset;  gradY1 += bioParamOffset;
  s1  += bioParamOffset;  s2  += bioParamOffset;
  s3    += bioParamOffset;  s5    += bioParamOffset;
  s6 += bioParamOffset;
  int shift4 = 4;
  dI += bioParamOffset;
  int32_t  temp=0, tempGX=0, tempGY=0;
  for (int y = 0; y < height; y++)
  {
    for (int x = 0; x < width; x++)
    {
      temp = (int32_t) ((srcY1Tmp[x] >> shift4) - (srcY0Tmp[x] >> shift4)) ;
      tempGX = (int32_t) (gradX0[x] + gradX1[x]);
      tempGY = (int32_t) (gradY0[x] + gradY1[x]);
      dI[x] = (Pel) temp;
      s1[x] =  tempGX * tempGX;
      s2[x] =  tempGX * tempGY;
      s5[x] =  tempGY * tempGY;
      s3[x] = tempGX * temp;
      s6[x] = tempGY * temp;
      
    }
    srcY0Tmp += src0Stride;
    srcY1Tmp += src1Stride;
    gradX0 += widthG;
    gradX1 += widthG;
    gradY0 += widthG;
    gradY1 += widthG;
    s1 += widthG;
    s2 += widthG;
    s3 += widthG;
    s5 += widthG;
    s6 += widthG;
    dI += widthG;
  }
  
  return;
}

// Adds a position-weighted sum of each parameter plane to the matching accumulator.
//
// The weight of sample ( x, y ) is wx * wy, where wx = x + 1 in the left half of
// the window ( x < width / 2 ) and width - x otherwise, and wy is defined the same
// way from y and height. Input planes use row pitch widthG. The accumulators are
// added to, not reset.
void calcBIOParamSum4CoreHighPrecision(int32_t* s1, int32_t* s2, int32_t* s3, int32_t* s5, int32_t* s6, int width, int height, const int widthG, int32_t* sumS1, int32_t* sumS2, int32_t* sumS3, int32_t* sumS5, int32_t* sumS6)
{
  const int halfWidth  = width / 2;
  const int halfHeight = height / 2;

  int rowBase = 0;
  for (int row = 0; row < height; ++row, rowBase += widthG)
  {
    const int rowWeight = (row >= halfHeight) ? (height - row) : (row + 1);
    for (int col = 0; col < width; ++col)
    {
      const int colWeight = (col >= halfWidth) ? (width - col) : (col + 1);
      const int weight    = colWeight * rowWeight;
      const int idx       = rowBase + col;

      *sumS1 += weight * s1[idx];
      *sumS2 += weight * s2[idx];
      *sumS3 += weight * s3[idx];
      *sumS5 += weight * s5[idx];
      *sumS6 += weight * s6[idx];
    }
  }
}
#endif

#if MULTI_PASS_DMVR || SAMPLE_BASED_BDOF
void calcBIOParameterCore(const Pel* srcY0Tmp, const Pel* srcY1Tmp, Pel* gradX0, Pel* gradX1, Pel* gradY0, Pel* gradY1, int width, int height, const int src0Stride, const int src1Stride, const int widthG, const int bitDepth, Pel* absGX, Pel* absGY, Pel* dIX, Pel* dIY, Pel* signGyGx, Pel* dI)
{
  width -= 2;
  height -= 2;
  const int bioParamOffset = widthG + 1;
  srcY0Tmp += src0Stride + 1;
  srcY1Tmp += src1Stride + 1;
  gradX0 += bioParamOffset;  gradX1 += bioParamOffset;
  gradY0 += bioParamOffset;  gradY1 += bioParamOffset;
  absGX  += bioParamOffset;  absGY  += bioParamOffset;
  dIX    += bioParamOffset;  dIY    += bioParamOffset;
  signGyGx += bioParamOffset;
  int shift4 = 4;
  int shift5 = 1;
  if (dI)
  {
    dI += bioParamOffset;
    for (int y = 0; y < height; y++)
    {
      for (int x = 0; x < width; x++)
      {
        int tmpGX = (gradX0[x] + gradX1[x]) >> shift5;
        int tmpGY = (gradY0[x] + gradY1[x]) >> shift5;
        int tmpDI = (int)((srcY1Tmp[x] >> shift4) - (srcY0Tmp[x] >> shift4));
        dI[x] = tmpDI;
        absGX[x] = (tmpGX < 0 ? -tmpGX : tmpGX);
        absGY[x] = (tmpGY < 0 ? -tmpGY : tmpGY);
        dIX[x] = (tmpGX < 0 ? -tmpDI : (tmpGX == 0 ? 0 : tmpDI));
        dIY[x] = (tmpGY < 0 ? -tmpDI : (tmpGY == 0 ? 0 : tmpDI));
        signGyGx[x] = (tmpGY < 0 ? -tmpGX : (tmpGY == 0 ? 0 : tmpGX));
      }
      srcY0Tmp += src0Stride;
      srcY1Tmp += src1Stride;
      gradX0 += widthG;
      gradX1 += widthG;
      gradY0 += widthG;
      gradY1 += widthG;
      absGX += widthG;
      absGY += widthG;
      dI += widthG;
      dIX += widthG;
      dIY += widthG;
      signGyGx += widthG;
    }

    return;
  }

  for (int y = 0; y < height; y++)
  {
    for (int x = 0; x < width; x++)
    {
      int tmpGX = (gradX0[x] + gradX1[x]) >> shift5;
      int tmpGY = (gradY0[x] + gradY1[x]) >> shift5;
      int tmpDI = (int)((srcY1Tmp[x] >> shift4) - (srcY0Tmp[x] >> shift4));
      absGX[x] = (tmpGX < 0 ? -tmpGX : tmpGX);
      absGY[x] = (tmpGY < 0 ? -tmpGY : tmpGY);
      dIX[x] = (tmpGX < 0 ? -tmpDI : (tmpGX == 0 ? 0 : tmpDI));
      dIY[x] = (tmpGY < 0 ? -tmpDI : (tmpGY == 0 ? 0 : tmpDI));
      signGyGx[x] = (tmpGY < 0 ? -tmpGX : (tmpGY == 0 ? 0 : tmpGX));
    }
    srcY0Tmp += src0Stride;
    srcY1Tmp += src1Stride;
    gradX0 += widthG;
    gradX1 += widthG;
    gradY0 += widthG;
    gradY1 += widthG;
    absGX += widthG;
    absGY += widthG;
    dIX += widthG;
    dIY += widthG;
    signGyGx += widthG;
  }
}

// For every output sample ( x, y ) of a width x height block, sums a 5x5 window of
// each input plane and stores the result at index y * width + x of the matching
// output array (the outputs are dense, the inputs use row pitch widthG).
//
// The window for output ( x, y ) starts at the current input pointer position, so
// the caller's pointers are expected to address the top-left corner of the first
// window. With JVET_AD0195_HIGH_PRECISION_BDOF_CORE the window samples are weighted
// by the separable pattern 1, 2, 3, 2, 1 in each direction; otherwise all weights
// are 1. After the window sum, sumDIX and sumDIY are multiplied by 4, and with
// JVET_AE0091_ITERATIVE_BDOF the constant 1 << 8 is added to sumAbsGX and sumAbsGY.
void calcBIOParamSum5Core(Pel* absGX, Pel* absGY, Pel* dIX, Pel* dIY, Pel* signGyGx, const int widthG, const int width, const int height, int* sumAbsGX, int* sumAbsGY, int* sumDIX, int* sumDIY, int* sumSignGyGx)
{
  for (int y = 0; y < height; y++)
  {
    for (int x = 0; x < width; x++)
    {
      const int sampleIdx = y * width + x;
      sumAbsGX[sampleIdx] = 0;
      sumAbsGY[sampleIdx] = 0;
      sumDIX[sampleIdx] = 0;
      sumDIY[sampleIdx] = 0;
      sumSignGyGx[sampleIdx] = 0;
      // Accumulate the 5x5 window; the input pointers walk down one row per yy.
      for (int yy = 0; yy < 5; yy++)
      {
        for (int xx = 0; xx < 5; xx++)
        {
#if JVET_AD0195_HIGH_PRECISION_BDOF_CORE
          int w = 1;
          // Weight is largest at the window centre: ( 1, 2, 3, 2, 1 ) x ( 1, 2, 3, 2, 1 ).
          w = (xx >= 2 ? 5 - xx : xx + 1) * (yy >= 2 ? 5 - yy : yy + 1);
          sumAbsGX[sampleIdx] += w * absGX[xx];
          sumAbsGY[sampleIdx] += w * absGY[xx];
          sumDIX[sampleIdx] += w * dIX[xx];
          sumDIY[sampleIdx] += w * dIY[xx];
          sumSignGyGx[sampleIdx] += w * signGyGx[xx];
#else
          sumAbsGX[sampleIdx] += absGX[xx];
          sumAbsGY[sampleIdx] += absGY[xx];
          sumDIX[sampleIdx] += dIX[xx];
          sumDIY[sampleIdx] += dIY[xx];
          sumSignGyGx[sampleIdx] += signGyGx[xx];
#endif
        }
        absGX += widthG;
        absGY += widthG;
        dIX += widthG;
        dIY += widthG;
        signGyGx += widthG;
      }
      sumDIX[sampleIdx] <<= 2;
      sumDIY[sampleIdx] <<= 2;
#if JVET_AE0091_ITERATIVE_BDOF
      int regVxVy = (1 << 8);
      sumAbsGX[sampleIdx] += regVxVy;
      sumAbsGY[sampleIdx] += regVxVy;
#endif
      // Rewind the five rows just consumed and step one column to the right.
      absGX += (1 - 5 * widthG);
      absGY += (1 - 5 * widthG);
      dIX += (1 - 5 * widthG);
      dIY += (1 - 5 * widthG);
      signGyGx += (1 - 5 * widthG);
    }
    // Move from the end of this output row to the start of the next input row.
    absGX += (widthG - width);
    absGY += (widthG - width);
    dIX += (widthG - width);
    dIY += (widthG - width);
    signGyGx += (widthG - width);
  }
}

// Adds the unweighted sum of a width x height window of each input plane to the
// matching accumulator. Input planes use row pitch widthG. The accumulators are
// added to, not reset.
void calcBIOParamSum4Core(Pel* absGX, Pel* absGY, Pel* dIX, Pel* dIY, Pel* signGyGx, int width, int height, const int widthG, int* sumAbsGX, int* sumAbsGY, int* sumDIX, int* sumDIY, int* sumSignGyGx)
{
  int rowBase = 0;
  for (int row = 0; row < height; ++row, rowBase += widthG)
  {
    for (int col = 0; col < width; ++col)
    {
      const int idx = rowBase + col;
      *sumAbsGX    += absGX[idx];
      *sumAbsGY    += absGY[idx];
      *sumDIX      += dIX[idx];
      *sumDIY      += dIY[idx];
      *sumSignGyGx += signGyGx[idx];
    }
  }
}

// Derives the clipped ( tmpx, tmpy ) pair for each of bioSubblockSize entries:
//   tmpx = Clip3( -limit, limit, sumDIX >> sumAbsGx )
//   tmpy = Clip3( -limit, limit, ( sumDIY - ( ( sumSignGyGx * tmpx ) >> 1 ) ) >> sumAbsGy )
// Note that sumAbsGx / sumAbsGy are used directly as right-shift amounts.
void calcBIOClippedVxVyCore(int* sumDIXSample32bit, int* sumAbsGxSample32bit, int* sumDIYSample32bit, int* sumAbsGySample32bit, int* sumSignGyGxSample32bit, const int limit, const int bioSubblockSize, int* tmpxSample32bit, int* tmpySample32bit)
{
  for (int i = 0; i < bioSubblockSize; ++i)
  {
    const int vx = Clip3(-limit, limit, sumDIXSample32bit[i] >> sumAbsGxSample32bit[i]);
    tmpxSample32bit[i] = vx;

    // The vertical component is corrected by the cross term before normalisation.
    const int crossTerm = (sumSignGyGxSample32bit[i] * vx) >> 1;
    tmpySample32bit[i] = Clip3(-limit, limit, ((sumDIYSample32bit[i] - crossTerm) >> sumAbsGySample32bit[i]));
  }
}
// Bi-prediction average with a per-sample gradient correction:
//   b   = tmpx[x] * ( gradX0 - gradX1 ) + tmpy[x] * ( gradY0 - gradY1 )
//   dst = ClipPel( rightShift( src0 + src1 + b + offset, shift ), clpRng )
// tmpx / tmpy are dense arrays advanced by width per row.
//
// With JVET_Z0136_OOB, and when isOOB[0] or isOOB[1] is set, the two mask planes
// mcMask[0] / mcMask[1] (row pitch mcStride) are consulted per sample: if exactly
// one of the two flags is set, the output is taken from the other source alone,
// rounded with offset >> 1 and shift - 1; otherwise the regular formula applies.
// NOTE(review): inside the isOOB branch the result of rightShift is not cast to
// int16_t when JVET_R0351_HIGH_BIT_DEPTH_SUPPORT is off, unlike the other loops in
// this function - confirm whether that difference is intended.
#if JVET_Z0136_OOB
void addBIOAvgNCore(const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, const Pel *gradX0, const Pel *gradX1, const Pel *gradY0, const Pel *gradY1, int gradStride, int width, int height, int* tmpx, int* tmpy, int shift, int offset, const ClpRng& clpRng, bool *mcMask[2], int mcStride, bool *isOOB)
#else
void addBIOAvgNCore(const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, const Pel *gradX0, const Pel *gradX1, const Pel *gradY0, const Pel *gradY1, int gradStride, int width, int height, int* tmpx, int* tmpy, int shift, int offset, const ClpRng& clpRng)
#endif
{
  int b = 0;
#if JVET_Z0136_OOB
  // Rounding parameters for the single-source fallback (half the two-source scale).
  int offset2 = offset >> 1;
  int shift2 = shift - 1;
  bool *pMcMask0 = mcMask[0];
  bool *pMcMask1 = mcMask[1];
  if (isOOB[0] || isOOB[1])
  {
    for (int y = 0; y < height; y++)
    {
      for (int x = 0; x < width; x++)
      {
        b = (int)tmpx[x] * (gradX0[x] - gradX1[x]) + (int)tmpy[x] * (gradY0[x] - gradY1[x]);
        bool oob0 = pMcMask0[x];
        bool oob1 = pMcMask1[x];
        if (oob0 && !oob1)
        {
          // Only the first mask is set: use the second source alone.
          dst[x] = ClipPel(rightShift(src1[x] + offset2, shift2), clpRng);
        }
        else if (!oob0 && oob1)
        {
          // Only the second mask is set: use the first source alone.
          dst[x] = ClipPel(rightShift(src0[x] + offset2, shift2), clpRng);
        }
        else
        {
          dst[x] = ClipPel(rightShift((src0[x] + src1[x] + b + offset), shift), clpRng);
        }
      }
      pMcMask0 += mcStride;
      pMcMask1 += mcStride;
      tmpx += width;
      tmpy += width;
      dst += dstStride;
      src0 += src0Stride;
      src1 += src1Stride;
      gradX0 += gradStride;
      gradX1 += gradStride;
      gradY0 += gradStride;
      gradY1 += gradStride;
    }
  }
  else
  {
    // Neither isOOB flag is set: the masks are not read.
    for (int y = 0; y < height; y++)
    {
      for (int x = 0; x < width; x++)
      {
        b = (int)tmpx[x] * (gradX0[x] - gradX1[x]) + (int)tmpy[x] * (gradY0[x] - gradY1[x]);
#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT
        dst[x] = ClipPel(rightShift((src0[x] + src1[x] + b + offset), shift), clpRng);
#else
        dst[x] = ClipPel((int16_t)rightShift((src0[x] + src1[x] + b + offset), shift), clpRng);
#endif
      }
      tmpx += width;
      tmpy += width;
      dst += dstStride;
      src0 += src0Stride;
      src1 += src1Stride;
      gradX0 += gradStride;
      gradX1 += gradStride;
      gradY0 += gradStride;
      gradY1 += gradStride;
    }
  }
#else
  for (int y = 0; y < height; y++)
  {
    for (int x = 0; x < width; x++)
    {
      b = (int)tmpx[x] * (gradX0[x] - gradX1[x]) + (int)tmpy[x] * (gradY0[x] - gradY1[x]);
#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT
      dst[x] = ClipPel(rightShift((src0[x] + src1[x] + b + offset), shift), clpRng);
#else
      dst[x] = ClipPel((int16_t)rightShift((src0[x] + src1[x] + b + offset), shift), clpRng);
#endif
    }
    tmpx += width;    tmpy += width;
    dst += dstStride;       src0 += src0Stride;     src1 += src1Stride;
    gradX0 += gradStride; gradX1 += gradStride; gradY0 += gradStride; gradY1 += gradStride;
  }
#endif
  return;
}

void calAbsSumCore(const Pel* diff, int stride, int width, int height, int* absSum)
{
  *absSum = 0;
  for (int y = 0; y < height; y++)
  {
    for (int x = 0; x < width; x++)
    {
      *absSum += ::abs(diff[x]);
    }
    diff += stride;
  }
}
#endif

// Computes horizontal and vertical gradients of pSrc as differences of the two
// neighbouring samples, each shifted right by 6 first:
//   gradX = ( right >> 6 ) - ( left >> 6 ),  gradY = ( below >> 6 ) - ( above >> 6 )
// Output starts one row and one column into gradX / gradY (row pitch gradStride);
// the source is read starting one row and one column into pSrc (row pitch srcStride).
//
// With MULTI_PASS_DMVR or SAMPLE_BASED_BDOF the interior is ( width - 2 ) x
// ( height - 2 ) and no padding is done. Otherwise the interior is reduced by
// 2 * BIO_EXTEND_SIZE in each dimension and, when the template argument pad is
// true, the gradient borders are filled by replicating the nearest interior
// column / row. bitDepth is not used.
template<bool pad = true>
void gradFilterCore(Pel* pSrc, int srcStride, int width, int height, int gradStride, Pel* gradX, Pel* gradY, const int bitDepth)
{
  Pel* srcTmp = pSrc + srcStride + 1;
  Pel* gradXTmp = gradX + gradStride + 1;
  Pel* gradYTmp = gradY + gradStride + 1;
  int  shift1 = 6;

  // The loop bounds are selected by the preprocessor; the loop body is shared.
#if MULTI_PASS_DMVR || SAMPLE_BASED_BDOF
  for (int y = 0; y < (height - 2); y++)
  {
    for (int x = 0; x < (width - 2); x++)
#else
  for (int y = 0; y < (height - 2 * BIO_EXTEND_SIZE); y++)
  {
    for (int x = 0; x < (width - 2 * BIO_EXTEND_SIZE); x++)
#endif
    {
      gradYTmp[x] = ( srcTmp[x + srcStride] >> shift1 ) - ( srcTmp[x - srcStride] >> shift1 );
      gradXTmp[x] = ( srcTmp[x + 1] >> shift1 ) - ( srcTmp[x - 1] >> shift1 );
    }
    gradXTmp += gradStride;
    gradYTmp += gradStride;
    srcTmp += srcStride;
  }

#if !MULTI_PASS_DMVR && !SAMPLE_BASED_BDOF
  if (pad)
  {
  // Replicate the first and last interior column into the left / right border.
  gradXTmp = gradX + gradStride + 1;
  gradYTmp = gradY + gradStride + 1;
  for (int y = 0; y < (height - 2 * BIO_EXTEND_SIZE); y++)
  {
    gradXTmp[-1] = gradXTmp[0];
    gradXTmp[width - 2 * BIO_EXTEND_SIZE] = gradXTmp[width - 2 * BIO_EXTEND_SIZE - 1];
    gradXTmp += gradStride;

    gradYTmp[-1] = gradYTmp[0];
    gradYTmp[width - 2 * BIO_EXTEND_SIZE] = gradYTmp[width - 2 * BIO_EXTEND_SIZE - 1];
    gradYTmp += gradStride;
  }

  // Replicate the first and last interior row (width samples each) into the
  // top / bottom border.
  gradXTmp = gradX + gradStride;
  gradYTmp = gradY + gradStride;
  ::memcpy(gradXTmp - gradStride, gradXTmp, sizeof(Pel)*(width));
  ::memcpy(gradXTmp + (height - 2 * BIO_EXTEND_SIZE)*gradStride, gradXTmp + (height - 2 * BIO_EXTEND_SIZE - 1)*gradStride, sizeof(Pel)*(width));
  ::memcpy(gradYTmp - gradStride, gradYTmp, sizeof(Pel)*(width));
  ::memcpy(gradYTmp + (height - 2 * BIO_EXTEND_SIZE)*gradStride, gradYTmp + (height - 2 * BIO_EXTEND_SIZE - 1)*gradStride, sizeof(Pel)*(width));
  }
#endif
}

// Accumulates the five parameter sums over a fixed 6x6 window that starts at the
// given pointers.
//
// Per sample, with gx = ( gradX0 + gradX1 ) >> 1, gy = ( gradY0 + gradY1 ) >> 1 and
// d = ( srcY1 >> 4 ) - ( srcY0 >> 4 ):
//   *sumAbsGX    += | gx |            *sumAbsGY += | gy |
//   *sumDIX      += sign( gx ) * d    *sumDIY   += sign( gy ) * d
//   *sumSignGyGx += sign( gy ) * gx
// The accumulators are added to, not reset. xu, yu and bitDepth are not used.
void calcBIOSumsCore(const Pel* srcY0Tmp, const Pel* srcY1Tmp, Pel* gradX0, Pel* gradX1, Pel* gradY0, Pel* gradY1, int xu, int yu, const int src0Stride, const int src1Stride, const int widthG, const int bitDepth, int* sumAbsGX, int* sumAbsGY, int* sumDIX, int* sumDIY, int* sumSignGyGx)
{
  const int windowSize = 6;
  const int diffShift  = 4;
  const int gradShift  = 1;

  int gradBase = 0;
  for (int row = 0; row < windowSize; ++row, srcY1Tmp += src1Stride, srcY0Tmp += src0Stride, gradBase += widthG)
  {
    for (int col = 0; col < windowSize; ++col)
    {
      const int k  = gradBase + col;
      const int gx = (gradX0[k] + gradX1[k]) >> gradShift;
      const int gy = (gradY0[k] + gradY1[k]) >> gradShift;
      const int di = (int)((srcY1Tmp[col] >> diffShift) - (srcY0Tmp[col] >> diffShift));

      *sumAbsGX    += (gx < 0) ? -gx : gx;
      *sumAbsGY    += (gy < 0) ? -gy : gy;
      *sumDIX      += (gx > 0) ? di : ((gx < 0) ? -di : 0);
      *sumDIY      += (gy > 0) ? di : ((gy < 0) ? -di : 0);
      *sumSignGyGx += (gy > 0) ? gx : ((gy < 0) ? -gx : 0);
    }
  }
}


// Adds, to each of the five running sums, the values of the matching array over a
// square window of ( unitSize + 2 * BIO_EXTEND_SIZE ) samples per side. The window
// begins BIO_EXTEND_SIZE rows above and BIO_EXTEND_SIZE columns left of the given
// array pointers; the arrays use row pitch width.
// The sums are added to, not reset. sx, sy and height are not used.
void calcBlkGradientCore(int sx, int sy, int     *arraysGx2, int     *arraysGxGy, int     *arraysGxdI, int     *arraysGy2, int     *arraysGydI, int     &sGx2, int     &sGy2, int     &sGxGy, int     &sGxdI, int     &sGydI, int width, int height, int unitSize)
{
  const int windowEnd = unitSize + BIO_EXTEND_SIZE;

  for (int row = -BIO_EXTEND_SIZE; row < windowEnd; row++)
  {
    // Negative rows address the extension area above the unit.
    const int rowBase = row * width;
    for (int col = -BIO_EXTEND_SIZE; col < windowEnd; col++)
    {
      const int idx = rowBase + col;
      sGx2  += arraysGx2[idx];
      sGy2  += arraysGy2[idx];
      sGxGy += arraysGxGy[idx];
      sGxdI += arraysGxdI[idx];
      sGydI += arraysGydI[idx];
    }
  }
}

#if ENABLE_SIMD_OPT_BCW && defined(TARGET_SIMD_X86)
// In-place weighted update of dst from src:
//   dst = ( dst * weight0 - src * weight1 + ( 1 << 15 ) ) >> 16
// normalizer is ( 1 << 16 ) divided by bcwWeight after adding a half-weight
// rounding term, weight0 is normalizer << g_bcwLog2WeightBase, and weight1 is
// ( g_bcwWeightBase - bcwWeight ) * normalizer.
// bcwWeight is used as a divisor and therefore must be non-zero. shift is not used.
// NOTE(review): the traversal comes from SIZE_AWARE_PER_EL_OP, defined outside
// this view. REM_HF_OP_CLIP is #undef'd below but never defined in this function.
void removeWeightHighFreq(int16_t* dst, int dstStride, const int16_t* src, int srcStride, int width, int height, int shift, int bcwWeight)
{
  int normalizer = ((1 << 16) + (bcwWeight > 0 ? (bcwWeight >> 1) : -(bcwWeight >> 1))) / bcwWeight;
  int weight0 = normalizer << g_bcwLog2WeightBase;
  int weight1 = (g_bcwWeightBase - bcwWeight)*normalizer;
#define REM_HF_INC  \
  src += srcStride; \
  dst += dstStride; \

#define REM_HF_OP( ADDR )      dst[ADDR] =             (dst[ADDR]*weight0 - src[ADDR]*weight1 + (1<<15))>>16

  SIZE_AWARE_PER_EL_OP(REM_HF_OP, REM_HF_INC);

#undef REM_HF_INC
#undef REM_HF_OP
#undef REM_HF_OP_CLIP
}

// In-place update of dst from src: dst = 2 * dst - src, for each sample.
// NOTE(review): the traversal comes from SIZE_AWARE_PER_EL_OP, defined outside
// this view. REM_HF_OP_CLIP is #undef'd below but never defined in this function.
void removeHighFreq(int16_t* dst, int dstStride, const int16_t* src, int srcStride, int width, int height)
{
#define REM_HF_INC  \
  src += srcStride; \
  dst += dstStride; \

#define REM_HF_OP( ADDR )      dst[ADDR] =             2 * dst[ADDR] - src[ADDR]

  SIZE_AWARE_PER_EL_OP(REM_HF_OP, REM_HF_INC);

#undef REM_HF_INC
#undef REM_HF_OP
#undef REM_HF_OP_CLIP
}
#endif

// Sample-wise clipped sum of two blocks: dest = ClipPel( src1 + src2, clpRng ).
// Each buffer is advanced by its own stride once per row.
// NOTE(review): the traversal comes from SIZE_AWARE_PER_EL_OP, defined outside this view.
template<typename T>
void reconstructCore( const T* src1, int src1Stride, const T* src2, int src2Stride, T* dest, int dstStride, int width, int height, const ClpRng& clpRng )
{
  // Per-sample operation and per-row pointer advance handed to the traversal macro.
#define RECO_CORE_OP( ADDR ) dest[ADDR] = ClipPel( src1[ADDR] + src2[ADDR], clpRng )
#define RECO_CORE_INC     \
  src1 += src1Stride;     \
  src2 += src2Stride;     \
  dest +=  dstStride;     \

  SIZE_AWARE_PER_EL_OP( RECO_CORE_OP, RECO_CORE_INC );

#undef RECO_CORE_OP
#undef RECO_CORE_INC
}


// Linear transform of a block: dst = rightShift( scale * src, shift ) + offset,
// additionally clipped with ClipPel( ., clpRng ) when bClip is true.
// src and dst are advanced by their own strides once per row.
// NOTE(review): the traversal comes from SIZE_AWARE_PER_EL_OP, defined outside this view.
template<typename T>
void linTfCore( const T* src, int srcStride, Pel *dst, int dstStride, int width, int height, int scale, int shift, int offset, const ClpRng& clpRng, bool bClip )
{
  // The Pel cast has to wrap the whole conditional expression. Written as
  // "( Pel ) bClip ? a : b" it binds to bClip only, because a cast has higher
  // precedence than ?:, leaving the narrowing to the implicit conversion on store.
#define LINTF_CORE_OP( ADDR ) dst[ADDR] = ( Pel ) ( bClip ? ClipPel( rightShift( scale * src[ADDR], shift ) + offset, clpRng ) : ( rightShift( scale * src[ADDR], shift ) + offset ) )
#define LINTF_CORE_INC  \
  src += srcStride;     \
  dst += dstStride;     \

  SIZE_AWARE_PER_EL_OP( LINTF_CORE_OP, LINTF_CORE_INC );

#undef LINTF_CORE_OP
#undef LINTF_CORE_INC
}

#if JVET_Z0136_OOB
// Tests whether the block at pos with dimensions size, displaced by rcMv, reaches
// outside the picture, and fills the per-sample masks accordingly.
//
// Positions are compared in MV_FRACTIONAL_BITS_INTERNAL precision against the
// range ( -half, ( picture dimension - 1 ) + half ), where half is half of one
// full-sample step; a position at or beyond either bound counts as outside.
//
// Returns true when any block corner test fails. In that case mcMask (dense,
// size.width per row) gets one flag per sample, and unless lumaOnly is set
// mcMaskChroma receives a subsampled copy. When the block is fully inside, both
// masks are cleared - mcMaskChroma is cleared regardless of lumaOnly, so it must
// be a valid buffer in that case. sps is not used.
// NOTE(review): the chroma mask uses the horizontal scale from getComponentScaleX
// for both directions, and the memset calls assume one byte per bool - confirm
// both against the callers / supported chroma formats.
bool isMvOOBCore(const Mv& rcMv, const struct Position pos, const struct Size size, const SPS* sps, const PPS* pps, bool *mcMask, bool *mcMaskChroma, bool lumaOnly, ChromaFormat componentID)
{
  const int chromaShift = getComponentScaleX(COMPONENT_Cb, componentID);
  const int mvStep      = 1 << MV_FRACTIONAL_BITS_INTERNAL;
  const int mvHalfStep  = mvStep >> 1;

  const int horMin = -mvHalfStep;
  const int verMin = -mvHalfStep;
  const int horMax = (((int)pps->getPicWidthInLumaSamples() - 1) << MV_FRACTIONAL_BITS_INTERNAL) + mvHalfStep;
  const int verMax = (((int)pps->getPicHeightInLumaSamples() - 1) << MV_FRACTIONAL_BITS_INTERNAL) + mvHalfStep;

  const int startX = (pos.x << MV_FRACTIONAL_BITS_INTERNAL) + rcMv.getHor();
  const int startY = (pos.y << MV_FRACTIONAL_BITS_INTERNAL) + rcMv.getVer();

  const bool isOOB = (startX <= horMin)
    || ((startX + ((size.width - 1) << MV_FRACTIONAL_BITS_INTERNAL) ) >= horMax)
    || (startY <= verMin)
    || ((startY + ((size.height - 1) << MV_FRACTIONAL_BITS_INTERNAL)) >= verMax);

  const int widthChroma  = (size.width) >> chromaShift;
  const int heightChroma = (size.height) >> chromaShift;

  if (!isOOB)
  {
    // Fully inside: no sample is flagged in either mask.
    memset(mcMask, false, size.width * size.height);
    memset(mcMaskChroma, false, widthChroma * heightChroma);
    return false;
  }

  // Flag every luma sample whose displaced position lies outside the range.
  bool *lumaRow = mcMask;
  int   curY    = startY;
  for (int y = 0; y < size.height; y++, curY += mvStep)
  {
    const bool rowOutside = (curY <= verMin) || (curY >= verMax);
    int curX = startX;
    for (int x = 0; x < size.width; x++, curX += mvStep)
    {
      lumaRow[x] = (curX <= horMin) || (curX >= horMax) || rowOutside;
    }
    lumaRow += size.width;
  }

  if (!lumaOnly)
  {
    // Subsample the luma mask: every ( 1 << chromaShift )-th sample and row.
    const int lumaRowSkip = size.width << chromaShift;
    bool *lumaSrc   = mcMask;
    bool *chromaRow = mcMaskChroma;
    for (int y = 0; y < heightChroma; y++)
    {
      for (int x = 0; x < widthChroma; x++)
      {
        chromaRow[x] = lumaSrc[x << chromaShift];
      }
      chromaRow += widthChroma;
      lumaSrc   += lumaRowSkip;
    }
  }
  return true;
}

// Strided variant of the out-of-boundary test: tests whether the block at pos
// with dimensions size, displaced by rcMv, reaches outside the picture, and fills
// the per-sample masks accordingly.
//
// Positions are compared in MV_FRACTIONAL_BITS_INTERNAL precision against the
// range ( -half, ( picture dimension - 1 ) + half ), where half is half of one
// full-sample step; a position at or beyond either bound counts as outside.
//
// Returns true when any block corner test fails. In that case mcMask (row pitch
// mcStride) gets one flag per sample, and unless lumaOnly is set mcMaskChroma
// (row pitch mcCStride) receives a subsampled copy. When the block is fully
// inside, both masks are cleared row by row - mcMaskChroma is cleared regardless
// of lumaOnly, so it must be a valid buffer in that case. sps is not used.
// NOTE(review): the chroma mask uses the horizontal scale from getComponentScaleX
// for both directions, and the memset calls assume one byte per bool - confirm
// both against the callers / supported chroma formats.
bool isMvOOBSubBlkCore(const Mv& rcMv, const struct Position pos, const struct Size size, const SPS* sps, const PPS* pps, bool *mcMask, int mcStride, bool *mcMaskChroma, int mcCStride, bool lumaOnly, ChromaFormat componentID)
{
  const int chromaShift = getComponentScaleX(COMPONENT_Cb, componentID);
  const int mvStep      = 1 << MV_FRACTIONAL_BITS_INTERNAL;
  const int mvHalfStep  = mvStep >> 1;

  const int horMin = -mvHalfStep;
  const int verMin = -mvHalfStep;
  const int horMax = (((int)pps->getPicWidthInLumaSamples() - 1) << MV_FRACTIONAL_BITS_INTERNAL) + mvHalfStep;
  const int verMax = (((int)pps->getPicHeightInLumaSamples() - 1) << MV_FRACTIONAL_BITS_INTERNAL) + mvHalfStep;

  const int startX = (pos.x << MV_FRACTIONAL_BITS_INTERNAL) + rcMv.getHor();
  const int startY = (pos.y << MV_FRACTIONAL_BITS_INTERNAL) + rcMv.getVer();

  const bool isOOB = (startX <= horMin)
    || ((startX + ((size.width - 1) << MV_FRACTIONAL_BITS_INTERNAL) ) >= horMax)
    || (startY <= verMin)
    || ((startY + ((size.height - 1) << MV_FRACTIONAL_BITS_INTERNAL)) >= verMax);

  const int widthChroma  = (size.width) >> chromaShift;
  const int heightChroma = (size.height) >> chromaShift;

  if (!isOOB)
  {
    // Fully inside: clear both masks one row at a time, honouring their strides.
    bool *lumaRow = mcMask;
    for (int y = 0; y < size.height; y++)
    {
      memset(lumaRow, false, size.width);
      lumaRow += mcStride;
    }

    bool *chromaRow = mcMaskChroma;
    for (int y = 0; y < heightChroma; y++)
    {
      memset(chromaRow, false, widthChroma);
      chromaRow += mcCStride;
    }
    return false;
  }

  // Flag every luma sample whose displaced position lies outside the range.
  bool *lumaRow = mcMask;
  int   curY    = startY;
  for (int y = 0; y < size.height; y++, curY += mvStep)
  {
    const bool rowOutside = (curY <= verMin) || (curY >= verMax);
    int curX = startX;
    for (int x = 0; x < size.width; x++, curX += mvStep)
    {
      lumaRow[x] = (curX <= horMin) || (curX >= horMax) || rowOutside;
    }
    lumaRow += mcStride;
  }

  if (!lumaOnly)
  {
    // Subsample the luma mask: every ( 1 << chromaShift )-th sample and row.
    const int lumaRowSkip = mcStride << chromaShift;
    bool *lumaSrc   = mcMask;
    bool *chromaRow = mcMaskChroma;
    for (int y = 0; y < heightChroma; y++)
    {
      for (int x = 0; x < widthChroma; x++)
      {
        chromaRow[x] = lumaSrc[x << chromaShift];
      }
      chromaRow += mcCStride;
      lumaSrc   += lumaRowSkip;
    }
  }
  return true;
}
#endif
#if JVET_AA0107_RMVF_AFFINE_MERGE_DERIVATION
void computeDeltaAndShiftCore(const Position posLT, Mv firstMv, std::vector<RMVFInfo> &mvpInfoVecOri)
{
  for (int i = 0; i < int(mvpInfoVecOri.size()); i++)
  {
#if !JVET_AB0189_RMVF_BITLENGTH_CONTROL
    mvpInfoVecOri[i].mvp.hor = mvpInfoVecOri[i].mvp.hor >= 0 ? mvpInfoVecOri[i].mvp.hor << 2 : -(-mvpInfoVecOri[i].mvp.hor << 2);
    mvpInfoVecOri[i].mvp.ver = mvpInfoVecOri[i].mvp.ver >= 0 ? mvpInfoVecOri[i].mvp.ver << 2 : -(-mvpInfoVecOri[i].mvp.ver << 2);
#endif
    mvpInfoVecOri[i].pos.x = mvpInfoVecOri[i].pos.x - posLT.x;
    mvpInfoVecOri[i].pos.y = mvpInfoVecOri[i].pos.y - posLT.y;
    mvpInfoVecOri[i].mvp.set(mvpInfoVecOri[i].mvp.getHor() - firstMv.getHor(), mvpInfoVecOri[i].mvp.getVer() - firstMv.getVer());
#if JVET_AB0189_RMVF_BITLENGTH_CONTROL
    mvpInfoVecOri[i].mvp.set(Clip3(-RMVF_MV_RANGE, RMVF_MV_RANGE - 1, mvpInfoVecOri[i].mvp.hor), Clip3(-RMVF_MV_RANGE, RMVF_MV_RANGE - 1, mvpInfoVecOri[i].mvp.ver));
#endif
  }
}
void computeDeltaAndShiftCoreAddi(const Position posLT, Mv firstMv, std::vector<RMVFInfo> &mvpInfoVecOri, std::vector<RMVFInfo> &mvpInfoVecRes)
{
  int offset = (int)mvpInfoVecRes.size();
  for (int i = 0; i < int(mvpInfoVecOri.size()); i++)
  {
    mvpInfoVecRes.push_back(RMVFInfo());
#if !JVET_AB0189_RMVF_BITLENGTH_CONTROL
    mvpInfoVecOri[i].mvp.hor = mvpInfoVecOri[i].mvp.hor >= 0 ? mvpInfoVecOri[i].mvp.hor << 2 : -(-mvpInfoVecOri[i].mvp.hor << 2);
    mvpInfoVecOri[i].mvp.ver = mvpInfoVecOri[i].mvp.ver >= 0 ? mvpInfoVecOri[i].mvp.ver << 2 : -(-mvpInfoVecOri[i].mvp.ver << 2);
#endif
    mvpInfoVecRes[offset + i].pos.x = mvpInfoVecOri[i].pos.x - posLT.x;
    mvpInfoVecRes[offset + i].pos.y = mvpInfoVecOri[i].pos.y - posLT.y;
    mvpInfoVecRes[offset + i].mvp.set(mvpInfoVecOri[i].mvp.getHor() - firstMv.getHor(), mvpInfoVecOri[i].mvp.getVer() - firstMv.getVer());
#if JVET_AB0189_RMVF_BITLENGTH_CONTROL
    mvpInfoVecRes[offset + i].mvp.set(Clip3(-RMVF_MV_RANGE, RMVF_MV_RANGE - 1, mvpInfoVecRes[offset + i].mvp.hor), Clip3(-RMVF_MV_RANGE, RMVF_MV_RANGE - 1, mvpInfoVecRes[offset + i].mvp.ver));
#endif
  }
}
// Accumulates the regression sums for the entries of mvpInfoVecOri.
//
// For each processed entry, with b = (pos.x, pos.y, 1) and e = (mvHor, mvVer):
//   sumeb[c][d]     += e[c]  * b[d]
//   sumbb[c][d1][d] += b[d1] * b[d]   (the same contribution for c = 0 and c = 1)
//
// The function only adds to sumbb/sumeb; it never clears them.
// addedSize selects the range: 0 processes every entry, otherwise only the last
// addedSize entries are processed.
// NOTE(review): addedSize > mvpInfoVecOri.size() would yield a negative start
// index - presumably callers guarantee this cannot happen; confirm.
void buildRegressionMatrixCore(std::vector<RMVFInfo> &mvpInfoVecOri, 
#if JVET_AA0107_RMVF_AFFINE_OVERFLOW_FIX || JVET_AB0189_RMVF_BITLENGTH_CONTROL
  int64_t sumbb[2][3][3], int64_t sumeb[2][3],
#else
  int sumbb[2][3][3], int sumeb[2][3],
#endif
  uint16_t addedSize)
{
  // Multiply in the accumulator's own width: with 64-bit sums a plain int*int
  // product could overflow before it is widened.
#if JVET_AA0107_RMVF_AFFINE_OVERFLOW_FIX || JVET_AB0189_RMVF_BITLENGTH_CONTROL
  typedef int64_t SumType;
#else
  typedef int SumType;
#endif

  const int iNum = (int)mvpInfoVecOri.size();
  int b[3];
  int e[2];
  for (int ni = addedSize ? iNum - addedSize : 0; ni < iNum; ni++)//for all neighbor PUs
  {
    // To avoid big values in the matrix, positions are expected to be deltas,
    // i.e. x,y with respect to the top-left corner of the current PU.
    b[0] = mvpInfoVecOri[ni].pos.x;
    b[1] = mvpInfoVecOri[ni].pos.y;
    b[2] = 1;

    e[0] = mvpInfoVecOri[ni].mvp.getHor();
    e[1] = mvpInfoVecOri[ni].mvp.getVer();

    for (int c = 0; c < 2; c++)
    {
      for (int d = 0; d < 3; d++)
      {
        sumeb[c][d] += static_cast<SumType>(e[c]) * b[d];
      }
      for (int d1 = 0; d1 < 3; d1++)
      {
        for (int d = 0; d < 3; d++)
        {
          sumbb[c][d1][d] += static_cast<SumType>(b[d1]) * b[d];
        }
      }
    }
  }
}
#endif
// Fills the PelBufferOps dispatch table with the "Core" implementations defined
// in this file. Several width-specific slots (e.g. addAvg4/addAvg8, reco4/reco8,
// linTf4/linTf8) are bound to the same generic routine here.
// NOTE(review): presumably an architecture-specific init elsewhere overrides
// these entries with SIMD versions - confirm. `avg`, which AreaBuf<Pel>::avg
// calls in SIMD builds, is not assigned in this constructor.
PelBufferOps::PelBufferOps()
{
#if JVET_W0097_GPM_MMVD_TM
  roundBD = roundBDCore;
  weightedAvg = weightedAvgCore;
  copyClip = copyClipCore;
#endif
  addAvg4 = addAvgCore<Pel>;
  addAvg8 = addAvgCore<Pel>;
#if JVET_AD0213_LIC_IMP
  toLast2 = toLastCore<Pel>;
  toLast4 = toLastCore<Pel>;
  licRemoveWeightHighFreq2 = licRemoveWeightHighFreqCore<Pel>;
  licRemoveWeightHighFreq4 = licRemoveWeightHighFreqCore<Pel>;
#endif
  reco4 = reconstructCore<Pel>;
  reco8 = reconstructCore<Pel>;

  linTf4 = linTfCore<Pel>;
  linTf8 = linTfCore<Pel>;

  addBIOAvg4      = addBIOAvgCore;
#if JVET_AD0195_HIGH_PRECISION_BDOF_CORE
  calcBIOParameterHighPrecision   = calcBIOParameterCoreHighPrecision;
  calcBIOParamSum4HighPrecision   = calcBIOParamSum4CoreHighPrecision;
#endif
#if MULTI_PASS_DMVR || SAMPLE_BASED_BDOF
  calcBIOParameter   = calcBIOParameterCore;
  calcBIOParamSum5   = calcBIOParamSum5Core;
  calcBIOParamSum4   = calcBIOParamSum4Core;
  calcBIOClippedVxVy = calcBIOClippedVxVyCore;
  addBIOAvgN         = addBIOAvgNCore;
  calAbsSum          = calAbsSumCore;
  bioGradFilter      = gradFilterCore <false>;
#else
  bioGradFilter   = gradFilterCore;
#endif
  calcBIOSums = calcBIOSumsCore;

  copyBuffer = copyBufferCore;
  padding = paddingCore;
  // The high-frequency removal slots are only populated in x86 SIMD BCW builds.
#if ENABLE_SIMD_OPT_BCW && defined(TARGET_SIMD_X86)
  removeWeightHighFreq8 = removeWeightHighFreq;
  removeWeightHighFreq4 = removeWeightHighFreq;
  removeHighFreq8 = removeHighFreq;
  removeHighFreq4 = removeHighFreq;
#if JVET_AA0093_REFINED_MOTION_FOR_ARMC
  removeWeightHighFreq1 = removeWeightHighFreq;
  removeHighFreq1 = removeHighFreq;
#endif
#endif

  profGradFilter = gradFilterCore <false>;
  applyPROF      = applyPROFCore;
  // No default implementation is installed for roundIntVector.
  roundIntVector = nullptr;
#if TM_AMVP || TM_MRG || JVET_Z0084_IBC_TM
  getSumOfDifference = getSumOfDifferenceCore;
#endif
#if JVET_Z0056_GPM_SPLIT_MODE_REORDERING
  getAbsoluteDifferencePerSample = getAbsoluteDifferencePerSampleCore;
  getSampleSumFunc[0] = getMaskedSampleSumCore<0>;
  getSampleSumFunc[1] = getMaskedSampleSumCore<1>;
  getSampleSumFunc[2] = getMaskedSampleSumCore<2>;
  getSampleSumFunc[3] = getMaskedSampleSumCore<3>;
#endif
#if JVET_Z0136_OOB
  isMvOOB = isMvOOBCore;
  isMvOOBSubBlk = isMvOOBSubBlkCore;
#endif
#if JVET_AA0107_RMVF_AFFINE_MERGE_DERIVATION
  computeDeltaAndShift = computeDeltaAndShiftCore;
  computeDeltaAndShiftAddi = computeDeltaAndShiftCoreAddi;
  buildRegressionMatrix = buildRegressionMatrixCore;
#endif
}

// Global dispatch table used by the AreaBuf<Pel> operations in this file;
// constructed with the default "Core" implementations.
PelBufferOps g_pelBufOP = PelBufferOps();

// Copies a width x height block of samples from src to dst, one row at a time.
// srcStride and dstStride are the row pitches (in samples) of the two buffers.
void copyBufferCore(Pel *src, int srcStride, Pel *dst, int dstStride, int width, int height)
{
  const int rowBytes = width * sizeof(Pel);
  for (int row = 0; row < height; ++row)
  {
    const Pel *srcRow = src + row * srcStride;
    Pel       *dstRow = dst + row * dstStride;
    memcpy(dstRow, srcRow, rowBytes);
  }
}

// Extends a width x height block by padSize samples on every side by
// replicating its border samples. ptr addresses the block's top-left sample;
// the surrounding memory must already be part of the same buffer.
void paddingCore(Pel *ptr, int stride, int width, int height, int padSize)
{
  // Horizontal pass: repeat the first/last sample of every row outwards.
  for (int row = 0; row < height; row++)
  {
    Pel *first = ptr + stride * row;
    Pel *last  = first + (width - 1);
    for (int dist = 1; dist <= padSize; dist++)
    {
      first[-dist] = *first;
      last[dist]   = *last;
    }
  }

  // Vertical pass: duplicate the (already widened) top and bottom rows.
  const int paddedRowBytes = (width + padSize + padSize) * sizeof(Pel);
  Pel *topRow    = ptr - padSize;
  Pel *bottomRow = ptr + (stride * (height - 1)) - padSize;
  for (int dist = 1; dist <= padSize; dist++)
  {
    memcpy(topRow - (dist * stride), topRow, paddedRowBytes);
    memcpy(bottomRow + (dist * stride), bottomRow, paddedRowBytes);
  }
}

#if MULTI_HYP_PRED
// Blends this buffer in place with an additional hypothesis:
//   cur = ClipPel( (ownWeight * cur + weight * hyp + rounding) >> MULTI_HYP_PRED_WEIGHT_BITS )
// where ownWeight = (1 << MULTI_HYP_PRED_WEIGHT_BITS) - weight.
// Both buffers must have the same width and height.
template<>
void AreaBuf<Pel>::addHypothesisAndClip(const AreaBuf<const Pel> &other, const int weight, const ClpRng& clpRng)
{
  CHECK(width != other.width, "Incompatible size");
  CHECK(height != other.height, "Incompatible size");

  const int rounding  = 1 << (MULTI_HYP_PRED_WEIGHT_BITS - 1);
  const int ownWeight = (1 << MULTI_HYP_PRED_WEIGHT_BITS) - weight;
  const Pel* hyp = other.buf;
  Pel*       cur = buf;

#define ADD_HYP_OP( ADDR ) cur[ADDR] = ClipPel( ( ownWeight*cur[ADDR] + weight*hyp[ADDR] + rounding ) >> MULTI_HYP_PRED_WEIGHT_BITS, clpRng )
#define ADD_HYP_INC        \
    hyp += other.stride;   \
    cur += stride;

  SIZE_AWARE_PER_EL_OP(ADD_HYP_OP, ADD_HYP_INC);

#undef ADD_HYP_OP
#undef ADD_HYP_INC
}
#endif

// Writes the BCW-weighted bi-prediction of other1 and other2 into this buffer.
// The weights w0/w1 are looked up from bcwIdx via getBcwWeight().
//
// Three build configurations exist:
//  * JVET_W0097_GPM_MMVD_TM && JVET_Z0136_OOB: if neither isOOB flag is set the
//    work is delegated to g_pelBufOP.weightedAvg; otherwise each sample is
//    handled individually using the per-list masks in mcMask.
//  * JVET_W0097_GPM_MMVD_TM only: always delegates to g_pelBufOP.weightedAvg.
//  * otherwise: open-coded weighted average via SIZE_AWARE_PER_EL_OP.
// NOTE(review): unlike addAvg, the OOB variant dereferences isOOB and mcMask
// without a null check - presumably callers always supply them; confirm.
template<>
#if JVET_Z0136_OOB
void AreaBuf<Pel>::addWeightedAvg(const AreaBuf<const Pel> &other1, const AreaBuf<const Pel> &other2, const ClpRng& clpRng, const int8_t bcwIdx, bool *mcMask[2], int mcStride, bool* isOOB)
#else
void AreaBuf<Pel>::addWeightedAvg(const AreaBuf<const Pel> &other1, const AreaBuf<const Pel> &other2, const ClpRng& clpRng, const int8_t bcwIdx)
#endif
{
#if JVET_W0097_GPM_MMVD_TM
#if JVET_Z0136_OOB
  int8_t w0 = getBcwWeight(bcwIdx, REF_PIC_LIST_0);
  int8_t w1 = getBcwWeight(bcwIdx, REF_PIC_LIST_1);

  const int8_t log2WeightBase = g_bcwLog2WeightBase;
  const Pel* src1 = other1.buf;
  const Pel* src2 = other2.buf;
  Pel* dest = buf;

  const unsigned src1Stride = other1.stride;
  const unsigned src2Stride = other2.stride;
  const unsigned destStride = stride;
  const int clipbd = clpRng.bd;
  // shiftNum/offset are used for the two-source weighted sum below.
#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT
  const int shiftNum = IF_INTERNAL_FRAC_BITS(clipbd) + log2WeightBase;
#else
  const int shiftNum = std::max<int>(2, (IF_INTERNAL_PREC - clipbd)) + log2WeightBase;
#endif
  const int offset = (1 << (shiftNum - 1)) + (IF_INTERNAL_OFFS << log2WeightBase);
  if (!isOOB[0] && !isOOB[1])
  {
    // Neither prediction is flagged: use the block-level routine.
    g_pelBufOP.weightedAvg(src1, src1Stride, src2, src2Stride, dest, destStride, w0, w1, width, height, clpRng);
  }
  else
  {
    // shiftNum2/offset2 are used when only one source contributes to a sample.
    int shiftNum2 = IF_INTERNAL_FRAC_BITS(clipbd);
    const int offset2 = (1 << (shiftNum2 - 1)) + IF_INTERNAL_OFFS;
    bool *pMcMask0 = mcMask[0];
    bool *pMcMask1 = mcMask[1];

    for (int y = 0; y < height; y++)
    {
      for (int x = 0; x < width; x++)
      {
        bool oob0 = pMcMask0[x];
        bool oob1 = pMcMask1[x];
        if (oob0 && !oob1)
        {
          // Only the first source is masked: take the second source alone.
          dest[x] = ClipPel(rightShift(src2[x] + offset2, shiftNum2), clpRng);
        }
        else if (!oob0 && oob1)
        {
          // Only the second source is masked: take the first source alone.
          dest[x] = ClipPel(rightShift(src1[x] + offset2, shiftNum2), clpRng);
        }
        else
        {
          // Both or neither masked: regular weighted average.
          dest[x + 0] = ClipPel(rightShift((src1[x] * w0 + src2[x + 0] * w1 + offset), shiftNum), clpRng);
        }
      }
      pMcMask0 += mcStride;
      pMcMask1 += mcStride;
      src1 += src1Stride;
      src2 += src2Stride;
      dest += destStride;
    }
  }
#else
  const int8_t w0 = getBcwWeight(bcwIdx, REF_PIC_LIST_0);
  const int8_t w1 = getBcwWeight(bcwIdx, REF_PIC_LIST_1);

  const Pel*            src0 = other1.buf;
  const Pel*            src1 = other2.buf;
  Pel*                 dest = buf;
  const unsigned src0Stride = other1.stride;
  const unsigned src1Stride = other2.stride;
  const unsigned destStride = stride;

  g_pelBufOP.weightedAvg(src0, src0Stride, src1, src1Stride, dest, destStride, w0, w1, width, height, clpRng);
#endif
#else
  const int8_t w0 = getBcwWeight(bcwIdx, REF_PIC_LIST_0);
  const int8_t w1 = getBcwWeight(bcwIdx, REF_PIC_LIST_1);
  const int8_t log2WeightBase = g_bcwLog2WeightBase;

  // Note the naming: src0 walks other1 with src1Stride, src2 walks other2 with src2Stride.
  const Pel* src0 = other1.buf;
  const Pel* src2 = other2.buf;
  Pel* dest = buf;

  const unsigned src1Stride = other1.stride;
  const unsigned src2Stride = other2.stride;
  const unsigned destStride = stride;
  const int clipbd = clpRng.bd;
#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT
  const int shiftNum = IF_INTERNAL_FRAC_BITS(clipbd) + log2WeightBase;
#else
  const int shiftNum = std::max<int>(2, (IF_INTERNAL_PREC - clipbd)) + log2WeightBase;
#endif
  const int offset = (1 << (shiftNum - 1)) + (IF_INTERNAL_OFFS << log2WeightBase);

#define ADD_AVG_OP( ADDR ) dest[ADDR] = ClipPel( rightShift( ( src0[ADDR]*w0 + src2[ADDR]*w1 + offset ), shiftNum ), clpRng )
#define ADD_AVG_INC     \
    src0 += src1Stride; \
    src2 += src2Stride; \
    dest += destStride; \

  SIZE_AWARE_PER_EL_OP(ADD_AVG_OP, ADD_AVG_INC);

#undef ADD_AVG_OP
#undef ADD_AVG_INC
#endif
}

// Maps every sample of this buffer through the lookup table pLUT, in place.
template<>
void AreaBuf<Pel>::rspSignal(std::vector<Pel>& pLUT)
{
  Pel* row = buf;
  for (unsigned rowIdx = 0; rowIdx < height; ++rowIdx, row += stride)
  {
    for (unsigned col = 0; col < width; ++col)
    {
      row[col] = pLUT[row[col]];
    }
  }
}

// Fills this buffer with the samples of `other` mapped through the lookup
// table pLUT. Both buffers must have identical dimensions.
template<>
void AreaBuf<Pel>::rspSignal(const AreaBuf<const Pel>& other, std::vector<Pel>& pLUT)
{
  CHECK( width != other.width, "Incompatible size" );
  CHECK( height != other.height, "Incompatible size" );

  const Pel* inRow  = other.buf;
  Pel*       outRow = buf;
  for (unsigned rowIdx = 0; rowIdx < height; ++rowIdx, inRow += other.stride, outRow += stride)
  {
    for (unsigned col = 0; col < width; ++col)
    {
      outRow[col] = pLUT[inRow[col]];
    }
  }
}

// Fills this buffer with the samples of toReshape mapped through the lookup
// table pLUT. Both buffers must have identical dimensions.
template<>
void AreaBuf<Pel>::rspSignal( const AreaBuf<Pel> &toReshape, std::vector<Pel>& pLUT )
{
  CHECK( width != toReshape.width, "Incompatible size" );
  CHECK( height != toReshape.height, "Incompatible size" );

  const int inStride = toReshape.stride;
  Pel* inRow  = toReshape.buf;
  Pel* outRow = buf;

  for( unsigned rowIdx = 0; rowIdx < height; ++rowIdx, inRow += inStride, outRow += stride )
  {
    for( unsigned col = 0; col < width; ++col )
    {
      outRow[col] = pLUT[inRow[col]];
    }
  }
}

#if JVET_AA0070_RRIBC
// Mirrors the block in place.
//   isFlipHor == true : every row is reversed (left <-> right).
//   isFlipHor == false: the row order is reversed (top <-> bottom).
// The flip is performed by swapping sample pairs directly inside the buffer,
// so no temporary copy of the block is allocated.
template<>
void AreaBuf<Pel>::flipSignal(bool isFlipHor)
{
  if (isFlipHor)
  {
    const unsigned halfWidth = width / 2;
    Pel *row = buf;
    for (unsigned y = 0; y < height; y++)
    {
      for (unsigned x = 0; x < halfWidth; x++)
      {
        const Pel tmp      = row[x];
        row[x]             = row[width - 1 - x];
        row[width - 1 - x] = tmp;
      }
      row += stride;
    }
  }
  else
  {
    // Only the upper half needs visiting; a middle row (odd height) stays put.
    const unsigned halfHeight = height / 2;
    for (unsigned y = 0; y < halfHeight; y++)
    {
      Pel *upper = buf + y * stride;
      Pel *lower = buf + (height - 1 - y) * stride;
      for (unsigned x = 0; x < width; x++)
      {
        const Pel tmp = upper[x];
        upper[x]      = lower[x];
        lower[x]      = tmp;
      }
    }
  }
}
#endif

// Stores, per sample, pLUT[buffer1] - pLUT[buffer2] into this buffer, i.e. both
// operands are mapped through the lookup table before the subtraction.
// All three buffers must have identical dimensions.
template<>
void AreaBuf<Pel>::rspSignalAllAndSubtract( const AreaBuf<Pel> &buffer1, const AreaBuf<Pel> &buffer2, std::vector<Pel>& pLUT )
{
  CHECK( width != buffer1.width, "Incompatible size in buffer1" );
  CHECK( height != buffer1.height, "Incompatible size in buffer1" );
  CHECK( width != buffer2.width, "Incompatible size in buffer2" );
  CHECK( height != buffer2.height, "Incompatible size in buffer2" );

  const Pel* minuend    = buffer1.buf;
  const Pel* subtrahend = buffer2.buf;
  Pel*       out        = buf;

#define SUBS_OP( ADDR ) out[ADDR] = pLUT[minuend[ADDR]] - pLUT[subtrahend[ADDR]]

#define SUBS_INC                   \
  minuend    += buffer1.stride;    \
  subtrahend += buffer2.stride;    \
  out        += stride;

  SIZE_AWARE_PER_EL_OP( SUBS_OP, SUBS_INC );

#undef SUBS_OP
#undef SUBS_INC
}

// Stores, per sample, pLUT[buffer1] - buffer2 into this buffer, i.e. only the
// first operand is mapped through the lookup table.
// All three buffers must have identical dimensions.
template<>
void AreaBuf<Pel>::rspSignalAndSubtract( const AreaBuf<Pel> &buffer1, const AreaBuf<Pel> &buffer2, std::vector<Pel>& pLUT )
{
  CHECK( width != buffer1.width, "Incompatible size in buffer1" );
  CHECK( height != buffer1.height, "Incompatible size in buffer1" );
  CHECK( width != buffer2.width, "Incompatible size in buffer2" );
  CHECK( height != buffer2.height, "Incompatible size in buffer2" );

  const Pel* minuend    = buffer1.buf;
  const Pel* subtrahend = buffer2.buf;
  Pel*       out        = buf;

#define SUBS_OP( ADDR ) out[ADDR] = pLUT[minuend[ADDR]] - subtrahend[ADDR]

#define SUBS_INC                   \
  minuend    += buffer1.stride;    \
  subtrahend += buffer2.stride;    \
  out        += stride;

  SIZE_AWARE_PER_EL_OP( SUBS_OP, SUBS_INC );

#undef SUBS_OP
#undef SUBS_INC
}

// Scales every sample of this buffer in place by a fixed-point factor.
//   dir == true  (forward): sample = clip( sign * ((|s| << CSCALE_FP_PREC) + scale/2) / scale )
//   dir == false (inverse): sample is first clipped to the bit-depth range, then
//                           sample = sign * ((|s| * scale + rounding) >> CSCALE_FP_PREC)
// Forward scaling of blocks with width 1 is rejected.
template<>
void AreaBuf<Pel>::scaleSignal(const int scale, const bool dir, const ClpRng& clpRng)
{
  const int maxAbsclipBD = (1<<clpRng.bd) - 1;
  Pel* row = buf;

  if (dir) // forward
  {
    if (width == 1)
    {
      THROW("Blocks of width = 1 not supported");
    }

    for (unsigned y = 0; y < height; y++, row += stride)
    {
      for (unsigned x = 0; x < width; x++)
      {
        const int sign   = row[x] >= 0 ? 1 : -1;
        const int absval = sign * row[x];
        row[x] = (Pel)Clip3(-maxAbsclipBD, maxAbsclipBD, sign * (((absval << CSCALE_FP_PREC) + (scale >> 1)) / scale));
      }
    }
    return;
  }

  // inverse
  for (unsigned y = 0; y < height; y++, row += stride)
  {
    for (unsigned x = 0; x < width; x++)
    {
      const Pel clipped = (Pel)Clip3((Pel)(-maxAbsclipBD - 1), (Pel)maxAbsclipBD, row[x]);
      const int sign    = clipped >= 0 ? 1 : -1;
      const int absval  = sign * clipped;
      int scaled = sign * ((absval * scale + (1 << (CSCALE_FP_PREC - 1))) >> CSCALE_FP_PREC);
      if (sizeof(Pel) == 2) // avoid overflow when storing data
      {
        scaled = Clip3<int>(-32768, 32767, scaled);
      }
      row[x] = (Pel)scaled;
    }
  }
}

// Writes the rounded, clipped average of other1 and other2 into this buffer:
//   dest = ClipPel( rightShift( src0 + src2 + offset, shiftNum ) )
//
// With JVET_Z0136_OOB, per-sample masks are honoured when mcMask is non-null
// and at least one isOOB flag is set: a sample flagged in exactly one mask is
// taken from the other source alone (using shiftNum2/offset2); all other
// samples use the regular average. In every other case the block-level path is
// used, which dispatches to g_pelBufOP.addAvg8/addAvg4 in x86 SIMD builds for
// widths that are multiples of 8/4 and otherwise to SIZE_AWARE_PER_EL_OP.
template<>
#if JVET_Z0136_OOB
void AreaBuf<Pel>::addAvg(const AreaBuf<const Pel> &other1, const AreaBuf<const Pel> &other2, const ClpRng& clpRng, bool *mcMask[2], int mcStride, bool* isOOB)
#else
void AreaBuf<Pel>::addAvg( const AreaBuf<const Pel> &other1, const AreaBuf<const Pel> &other2, const ClpRng& clpRng)
#endif
{
  // Note the naming: src0 walks other1 with src1Stride, src2 walks other2 with src2Stride.
  const Pel* src0 = other1.buf;
  const Pel* src2 = other2.buf;
        Pel* dest =        buf;

  const unsigned src1Stride = other1.stride;
  const unsigned src2Stride = other2.stride;
  const unsigned destStride =        stride;
  const int     clipbd      = clpRng.bd;
#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT
  const int shiftNum = IF_INTERNAL_FRAC_BITS(clipbd) + 1;
#else
  const int     shiftNum    = std::max<int>(2, (IF_INTERNAL_PREC - clipbd)) + 1;
#endif
  const int     offset      = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;

#if JVET_Z0136_OOB
  // isOOB is only inspected when mcMask is non-null (short-circuit evaluation).
  if (mcMask == NULL || (!isOOB[0] && !isOOB[1]))
  {
#if ENABLE_SIMD_OPT_BUFFER && defined(TARGET_SIMD_X86)
    if ((width & 7) == 0)
    {
      g_pelBufOP.addAvg8(src0, src1Stride, src2, src2Stride, dest, destStride, width, height, shiftNum, offset, clpRng, mcMask, mcStride, isOOB);
    }
    else if ((width & 3) == 0)
    {
      g_pelBufOP.addAvg4(src0, src1Stride, src2, src2Stride, dest, destStride, width, height, shiftNum, offset, clpRng, mcMask, mcStride, isOOB);
    }
    else
#endif
    {
#define ADD_AVG_OP( ADDR ) dest[ADDR] = ClipPel( rightShift( ( src0[ADDR] + src2[ADDR] + offset ), shiftNum ), clpRng )
#define ADD_AVG_INC     \
    src0 += src1Stride; \
    src2 += src2Stride; \
    dest += destStride; \

    SIZE_AWARE_PER_EL_OP(ADD_AVG_OP, ADD_AVG_INC);

#undef ADD_AVG_OP
#undef ADD_AVG_INC
    }
  }
  else
  {
    // Per-sample path: shiftNum2/offset2 apply when only one source is used.
    int shiftNum2 = IF_INTERNAL_FRAC_BITS(clipbd);
    const int offset2 = (1 << (shiftNum2 - 1)) + IF_INTERNAL_OFFS;
    bool *pMcMask0 = mcMask[0];
    bool *pMcMask1 = mcMask[1];
    for (int y = 0; y < height; y++)
    {
      for (int x = 0; x < width; x++)
      {
        bool oob0 = pMcMask0[x];
        bool oob1 = pMcMask1[x];
        if (oob0 && !oob1)
        {
          // Only the first source is masked: take the second source alone.
          dest[x] = ClipPel(rightShift(src2[x] + offset2, shiftNum2), clpRng);
        }
        else if (!oob0 && oob1)
        {
          // Only the second source is masked: take the first source alone.
          dest[x] = ClipPel(rightShift(src0[x] + offset2, shiftNum2), clpRng);
        }
        else
        {
          // Both or neither masked: regular average.
          dest[x] = ClipPel(rightShift((src0[x] + src2[x] + offset), shiftNum), clpRng);
        }
      }
      pMcMask0 += mcStride;
      pMcMask1 += mcStride;
      src0 += src1Stride;
      src2 += src2Stride;
      dest += destStride;
    }
  }
#else
#if ENABLE_SIMD_OPT_BUFFER && defined(TARGET_SIMD_X86)
  if( ( width & 7 ) == 0 )
  {
    g_pelBufOP.addAvg8( src0, src1Stride, src2, src2Stride, dest, destStride, width, height, shiftNum, offset, clpRng );
  }
  else if( ( width & 3 ) == 0 )
  {
    g_pelBufOP.addAvg4( src0, src1Stride, src2, src2Stride, dest, destStride, width, height, shiftNum, offset, clpRng );
  }
  else
#endif
  {
#define ADD_AVG_OP( ADDR ) dest[ADDR] = ClipPel( rightShift( ( src0[ADDR] + src2[ADDR] + offset ), shiftNum ), clpRng )
#define ADD_AVG_INC     \
    src0 += src1Stride; \
    src2 += src2Stride; \
    dest += destStride; \

    SIZE_AWARE_PER_EL_OP( ADD_AVG_OP, ADD_AVG_INC );

#undef ADD_AVG_OP
#undef ADD_AVG_INC
  }
#endif
}

#if JVET_AE0169_BIPREDICTIVE_IBC
// Writes the rounded mean of other1 and other2 into this buffer:
//   out = rightShift( a + b + 1, 1 )
// In x86 SIMD builds, widths that are a multiple of 4 go through g_pelBufOP.avg.
template<>
void AreaBuf<Pel>::avg(const AreaBuf<const Pel> &other1, const AreaBuf<const Pel> &other2)
{
  const unsigned strideA   = other1.stride;
  const unsigned strideB   = other2.stride;
  const unsigned strideOut = stride;

  const Pel* rowA   = other1.buf;
  const Pel* rowB   = other2.buf;
  Pel*       rowOut = buf;

#if ENABLE_SIMD_OPT_BUFFER && defined(TARGET_SIMD_X86)
  if ((width & 3) == 0)
  {
    g_pelBufOP.avg(rowA, strideA, rowB, strideB, rowOut, strideOut, width, height);
  }
  else
#endif
  {
#define ADD_AVG_OP( ADDR ) rowOut[ADDR] = rightShift( ( rowA[ADDR] + rowB[ADDR] + 1 ), 1 )
#define ADD_AVG_INC        \
    rowA   += strideA;     \
    rowB   += strideB;     \
    rowOut += strideOut;

    SIZE_AWARE_PER_EL_OP( ADD_AVG_OP, ADD_AVG_INC );

#undef ADD_AVG_OP
#undef ADD_AVG_INC
  }
}
#endif

// Converts this buffer in place: every sample becomes
//   ClipPel( rightShift( sample + offset, shiftNum ) )
// with shiftNum derived from the clipping bit depth and offset combining the
// rounding term with IF_INTERNAL_OFFS.
//
// With JVET_AD0213_LIC_IMP: width 1 is handled by a scalar loop, widths that
// are multiples of 4 or 2 go through g_pelBufOP.toLast4/toLast2, and any other
// width throws. Without it: width 1 throws and the remaining widths are
// processed 2 or 4 samples per iteration depending on bit 1 of the width.
template<>
void AreaBuf<Pel>::toLast( const ClpRng& clpRng )
{
        Pel* src       = buf;
  const uint32_t srcStride = stride;

  const int  clipbd    = clpRng.bd;
#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT
  const int shiftNum = IF_INTERNAL_FRAC_BITS(clipbd);
#else
  const int  shiftNum  = std::max<int>(2, (IF_INTERNAL_PREC - clipbd));
#endif
  const int  offset    = ( 1 << ( shiftNum - 1 ) ) + IF_INTERNAL_OFFS;

  if (width == 1)
  {
#if JVET_AD0213_LIC_IMP
    for (int y = 0; y < height; y++)
    {
      for (int x = 0; x < width; x++)
      {
        src[x] = ClipPel(rightShift((src[x] + offset), shiftNum), clpRng);
      }
      src += srcStride;
    }
#else
    THROW( "Blocks of width = 1 not supported" );
#endif
  }
#if JVET_AD0213_LIC_IMP
  else if ((width & 3) == 0)
  {
    g_pelBufOP.toLast4(src, srcStride, width, height, shiftNum, offset, clpRng);
  }
  else if ((width & 1) == 0)
  {
    g_pelBufOP.toLast2(src, srcStride, width, height, shiftNum, offset, clpRng);
  }
  else
  {
    THROW("Unsupported size!");
  }
#else
  else if (width&2)
  {
    // Two samples per iteration.
    for ( int y = 0; y < height; y++ )
    {
      for (int x=0 ; x < width; x+=2 )
      {
        src[x + 0] = ClipPel( rightShift( ( src[x + 0] + offset ), shiftNum ), clpRng );
        src[x + 1] = ClipPel( rightShift( ( src[x + 1] + offset ), shiftNum ), clpRng );
      }
      src += srcStride;
    }
  }
  else
  {
    // Four samples per iteration.
    for ( int y = 0; y < height; y++ )
    {
      for (int x=0 ; x < width; x+=4 )
      {
        src[x + 0] = ClipPel( rightShift( ( src[x + 0] + offset ), shiftNum ), clpRng );
        src[x + 1] = ClipPel( rightShift( ( src[x + 1] + offset ), shiftNum ), clpRng );
        src[x + 2] = ClipPel( rightShift( ( src[x + 2] + offset ), shiftNum ), clpRng );
        src[x + 3] = ClipPel( rightShift( ( src[x + 3] + offset ), shiftNum ), clpRng );

      }
      src += srcStride;
    }
  }
#endif
}


// Copies src into this buffer, clipping every sample with ClipPel(…, clpRng).
// Unless JVET_W0090_ARMC_TM or JVET_Z0056_GPM_SPLIT_MODE_REORDERING is enabled,
// blocks of width 1 are rejected.
template<>
void AreaBuf<Pel>::copyClip( const AreaBuf<const Pel> &src, const ClpRng& clpRng )
{
  const unsigned inStride  = src.stride;
  const unsigned outStride = stride;

  const Pel* inRow  = src.buf;
  Pel*       outRow = buf;

#if !JVET_W0090_ARMC_TM && !JVET_Z0056_GPM_SPLIT_MODE_REORDERING
  if( width == 1 )
  {
    THROW( "Blocks of width = 1 not supported" );
  }
  else
#endif
  {
#if JVET_W0097_GPM_MMVD_TM
    g_pelBufOP.copyClip(inRow, inStride, outRow, outStride, width, height, clpRng);
#else
#define RECO_OP( ADDR ) outRow[ADDR] = ClipPel( inRow[ADDR], clpRng )
#define RECO_INC         \
    inRow  += inStride;  \
    outRow += outStride;

    SIZE_AWARE_PER_EL_OP( RECO_OP, RECO_INC );

#undef RECO_OP
#undef RECO_INC
#endif
  }
}

// Fills this buffer with src rounded and clipped for output. With
// JVET_W0097_GPM_MMVD_TM the work is delegated to g_pelBufOP.roundBD; otherwise
// each sample becomes ClipPel( rightShift( sample + offsetDefault, shiftDefault ) ).
// Blocks of width 1 are rejected.
template<>
void AreaBuf<Pel>::roundToOutputBitdepth( const AreaBuf<const Pel> &src, const ClpRng& clpRng )
{
  const unsigned inStride  = src.stride;
  const unsigned outStride = stride;
  const Pel* inRow  = src.buf;
  Pel*       outRow = buf;
#if !JVET_W0097_GPM_MMVD_TM
  const int32_t clipbd            = clpRng.bd;
#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT
  const int32_t shiftDefault      = IF_INTERNAL_FRAC_BITS(clipbd);
#else
  const int32_t shiftDefault      = std::max<int>(2, (IF_INTERNAL_PREC - clipbd));
#endif
  const int32_t offsetDefault     = (1<<(shiftDefault-1)) + IF_INTERNAL_OFFS;
#endif
  if( width == 1 )
  {
    THROW( "Blocks of width = 1 not supported" );
  }
  else
  {
#if JVET_W0097_GPM_MMVD_TM
    g_pelBufOP.roundBD(inRow, inStride, outRow, outStride, width, height, clpRng);
#else
#define RND_OP( ADDR ) outRow[ADDR] = ClipPel( rightShift( inRow[ADDR] + offsetDefault, shiftDefault), clpRng )
#define RND_INC          \
    inRow  += inStride;  \
    outRow += outStride;

    SIZE_AWARE_PER_EL_OP( RND_OP, RND_INC );

#undef RND_OP
#undef RND_INC
#endif
  }
}


// Writes ClipPel( pred + resi, clpRng ) into this buffer for every sample.
// In x86 SIMD builds, widths that are multiples of 8 or 4 are dispatched to
// g_pelBufOP.reco8 / g_pelBufOP.reco4.
template<>
void AreaBuf<Pel>::reconstruct( const AreaBuf<const Pel> &pred, const AreaBuf<const Pel> &resi, const ClpRng& clpRng )
{
  const unsigned predStride = pred.stride;
  const unsigned resiStride = resi.stride;
  const unsigned recoStride = stride;

  const Pel* predRow = pred.buf;
  const Pel* resiRow = resi.buf;
  Pel*       recoRow = buf;

#if ENABLE_SIMD_OPT_BUFFER && defined(TARGET_SIMD_X86)
  if( ( width & 7 ) == 0 )
  {
    g_pelBufOP.reco8( predRow, predStride, resiRow, resiStride, recoRow, recoStride, width, height, clpRng );
  }
  else if( ( width & 3 ) == 0 )
  {
    g_pelBufOP.reco4( predRow, predStride, resiRow, resiStride, recoRow, recoStride, width, height, clpRng );
  }
  else
#endif
  {
#define RECO_OP( ADDR ) recoRow[ADDR] = ClipPel( predRow[ADDR] + resiRow[ADDR], clpRng )
#define RECO_INC            \
    predRow += predStride;  \
    resiRow += resiStride;  \
    recoRow += recoStride;

    SIZE_AWARE_PER_EL_OP( RECO_OP, RECO_INC );

#undef RECO_OP
#undef RECO_INC
  }
}

// Applies sample = rightShift( scale * sample, shift ) + offset to every sample
// of this buffer in place; when bClip is set the result is additionally clipped
// with ClipPel(…, clpRng).
//
// Width 0 (ARMC/GPM-reordering builds) or width 1 (other builds) is rejected.
// In x86 SIMD builds, widths that are multiples of 8 or 4 are dispatched to
// g_pelBufOP.linTf8 / g_pelBufOP.linTf4.
template<>
void AreaBuf<Pel>::linearTransform( const int scale, const int shift, const int offset, bool bClip, const ClpRng& clpRng )
{
  const Pel* src = buf;
        Pel* dst = buf;

#if JVET_W0090_ARMC_TM || JVET_Z0056_GPM_SPLIT_MODE_REORDERING
  if (width == 0)
  {
    THROW("Blocks of width = 0 not supported");
  }
#else
  if( width == 1 )
  {
    THROW( "Blocks of width = 1 not supported" );
  }
#endif
#if ENABLE_SIMD_OPT_BUFFER && defined(TARGET_SIMD_X86)
  else if( ( width & 7 ) == 0 )
  {
    g_pelBufOP.linTf8( src, stride, dst, stride, width, height, scale, shift, offset, clpRng, bClip );
  }
  else if( ( width & 3 ) == 0 )
  {
    g_pelBufOP.linTf4( src, stride, dst, stride, width, height, scale, shift, offset, clpRng, bClip );
  }
#endif
  else
  {
    // The whole conditional is parenthesised so that the Pel cast applies to
    // the computed sample rather than to the bClip condition.
#define LINTF_OP( ADDR ) dst[ADDR] = ( Pel ) ( bClip ? ClipPel( rightShift( scale * src[ADDR], shift ) + offset, clpRng ) : ( rightShift( scale * src[ADDR], shift ) + offset ) )
#define LINTF_INC        \
    src += stride;       \
    dst += stride;

    SIZE_AWARE_PER_EL_OP( LINTF_OP, LINTF_INC );

#undef LINTF_OP
#undef LINTF_INC
  }
}

#if ENABLE_SIMD_OPT_BUFFER && defined(TARGET_SIMD_X86)
// Subtracts the constant val from every sample of this buffer, implemented as
// linearTransform( scale = 1, shift = 0, offset = -val ) without clipping.
template<>
void AreaBuf<Pel>::subtract( const Pel val )
{
  // Clipping is disabled, but the range object is still handed to the
  // transform routine, so value-initialize it instead of passing
  // indeterminate contents.
  ClpRng clpRngDummy = ClpRng();
  linearTransform( 1, 0, -val, false, clpRngDummy );
}
#endif


// Starts out owning no memory: every per-component origin pointer is null.
PelStorage::PelStorage()
{
  for( uint32_t comp = 0; comp != MAX_NUM_COMPONENT; ++comp )
  {
    m_origin[comp] = nullptr;
  }
}

// Releases any component memory still owned by this object.
PelStorage::~PelStorage()
{
  this->destroy();
}

// Convenience overload: allocates storage matching the chroma format and the
// first block of the given unit area.
void PelStorage::create( const UnitArea &_UnitArea )
{
  const auto &firstBlock = _UnitArea.blocks[0];
  create( _UnitArea.chromaFormat, firstBlock );
}

void PelStorage::create( const ChromaFormat &_chromaFormat, const Area& _area, const unsigned _maxCUSize, const unsigned _margin, const unsigned _alignment, const bool _scaleChromaMargin )
{
  CHECK( !bufs.empty(), "Trying to re-create an already initialized buffer" );

  chromaFormat = _chromaFormat;

  const uint32_t numCh = getNumberValidComponents( _chromaFormat );

  unsigned extHeight = _area.height;
  unsigned extWidth  = _area.width;

  if( _maxCUSize )
  {
    extHeight = ( ( _area.height + _maxCUSize - 1 ) / _maxCUSize ) * _maxCUSize;
    extWidth  = ( ( _area.width  + _maxCUSize - 1 ) / _maxCUSize ) * _maxCUSize;
  }

  for( uint32_t i = 0; i < numCh; i++ )
  {
    const ComponentID compID = ComponentID( i );
    const unsigned scaleX = ::getComponentScaleX( compID, _chromaFormat );
    const unsigned scaleY = ::getComponentScaleY( compID, _chromaFormat );

    unsigned scaledHeight = extHeight >> scaleY;
    unsigned scaledWidth  = extWidth  >> scaleX;
    unsigned ymargin      = _margin >> (_scaleChromaMargin?scaleY:0);
    unsigned xmargin      = _margin >> (_scaleChromaMargin?scaleX:0);
    unsigned totalWidth   = scaledWidth + 2*xmargin;
    unsigned totalHeight  = scaledHeight +2*ymargin;

    if( _alignment )
    {
      // make sure buffer lines are align
      CHECK( _alignment != MEMORY_ALIGN_DEF_SIZE, "Unsupported alignment" );
      totalWidth = ( ( totalWidth + _alignment - 1 ) / _alignment ) * _alignment;
    }
    uint32_t area = totalWidth * totalHeight;
    CHECK( !area, "Trying to create a buffer with zero area" );

    m_origin[i] = ( Pel* ) xMalloc( Pel, area );
    Pel* topLeft = m_origin[i] + totalWidth * ymargin + xmargin;
    bufs.push_back( PelBuf( topLeft, totalWidth, _area.width >> scaleX, _area.height >> scaleY ) );
  }
}

// Makes this storage a set of views onto the planes of an existing unit buffer.
// No memory is allocated here and m_origin is left untouched.
void PelStorage::createFromBuf( PelUnitBuf buf )
{
  chromaFormat = buf.chromaFormat;

  const uint32_t numComponents = ::getNumberValidComponents( chromaFormat );
  bufs.resize( numComponents );

  for( uint32_t compIdx = 0; compIdx < numComponents; compIdx++ )
  {
    PelBuf plane = buf.get( ComponentID( compIdx ) );
    bufs[compIdx] = PelBuf( plane.bufAt( 0, 0 ), plane.stride, plane.width, plane.height );
  }
}

// Exchanges the sample memory of this storage with `other`, component by
// component (view pointer, stride and owning origin pointer). Both storages
// must have the same chroma format, plane sizes and strides.
void PelStorage::swap( PelStorage& other )
{
  const uint32_t numComponents = ::getNumberValidComponents( chromaFormat );

  for( uint32_t compIdx = 0; compIdx < numComponents; compIdx++ )
  {
    const ComponentID compID = ComponentID( compIdx );

    // guard against mismatching layouts, which would otherwise corrupt both sides
    CHECK( chromaFormat         != other.chromaFormat        , "Incompatible formats" );
    CHECK( get( compID )        != other.get( compID )       , "Incompatible formats" );
    CHECK( get( compID ).stride != other.get( compID ).stride, "Incompatible formats" );

    PelBuf &mine   = bufs[compIdx];
    PelBuf &theirs = other.bufs[compIdx];

    std::swap( mine.buf,          theirs.buf );
    std::swap( mine.stride,       theirs.stride );
    std::swap( m_origin[compIdx], other.m_origin[compIdx] );
  }
}

// Frees every owned component plane, drops all views and marks the chroma
// format as invalid. Safe to call repeatedly.
void PelStorage::destroy()
{
  chromaFormat = NUM_CHROMA_FORMAT;

  for( uint32_t compIdx = 0; compIdx < MAX_NUM_COMPONENT; compIdx++ )
  {
    if( m_origin[compIdx] == nullptr )
    {
      continue;
    }
    xFree( m_origin[compIdx] );
    m_origin[compIdx] = nullptr;
  }

  bufs.clear();
}

// Returns the stored view of the requested component.
PelBuf PelStorage::getBuf( const ComponentID CompID )
{
  PelBuf& component = bufs[CompID];
  return component;
}

// Returns a read-only view of the requested component.
const CPelBuf PelStorage::getBuf( const ComponentID CompID ) const
{
  const PelBuf& component = bufs[CompID];
  return component;
}

// Returns a view of the sub-area blk within the plane of blk.compID. The view
// shares the plane's stride. A debug check verifies that the bottom-right
// corner of blk lies inside the plane.
PelBuf PelStorage::getBuf( const CompArea &blk )
{
  const PelBuf& plane = bufs[blk.compID];

  CHECKD( rsAddr( blk.bottomRight(), plane.stride ) >= ( ( plane.height - 1 ) * plane.stride + plane.width ), "Trying to access a buf outside of bound!" );

  Pel* origin = plane.buf + rsAddr( blk, plane.stride );
  return PelBuf( origin, plane.stride, blk );
}

// Returns a read-only view of the sub-area blk within the plane of blk.compID.
// The view shares the plane's stride; no bounds check is performed here.
const CPelBuf PelStorage::getBuf( const CompArea &blk ) const
{
  const PelBuf& plane = bufs[blk.compID];
  const Pel* origin = plane.buf + rsAddr( blk, plane.stride );
  return CPelBuf( origin, plane.stride, blk );
}

// Returns views of all components covered by `unit`: luma only for
// CHROMA_400, otherwise luma plus both chroma planes.
PelUnitBuf PelStorage::getBuf( const UnitArea &unit )
{
  if( chromaFormat == CHROMA_400 )
  {
    return PelUnitBuf( chromaFormat, getBuf( unit.Y() ) );
  }
  return PelUnitBuf( chromaFormat, getBuf( unit.Y() ), getBuf( unit.Cb() ), getBuf( unit.Cr() ) );
}

// Returns read-only views of all components covered by `unit`: luma only for
// CHROMA_400, otherwise luma plus both chroma planes.
const CPelUnitBuf PelStorage::getBuf( const UnitArea &unit ) const
{
  if( chromaFormat == CHROMA_400 )
  {
    return CPelUnitBuf( chromaFormat, getBuf( unit.Y() ) );
  }
  return CPelUnitBuf( chromaFormat, getBuf( unit.Y() ), getBuf( unit.Cb() ), getBuf( unit.Cr() ) );
}

/** Colour transform between the three component planes of a block.
 *
 *  Samples are READ from this buffer and the result is WRITTEN into the sample
 *  memory referenced by \p other. The const reference does not prevent this,
 *  because the stores go through the buf pointers held by \p other.
 *
 *  \param other    destination; must have the same luma width/height as this
 *                  buffer and equal strides for all three of its components
 *  \param forward  true:  components Y/Cb/Cr of this buffer are interpreted as
 *                         g/b/r and converted to y/cg/co
 *                  false: components Y/Cb/Cr of this buffer are interpreted as
 *                         y/cg/co, clipped to
 *                         [-(1 << (clpRng.bd + 1)), (1 << (clpRng.bd + 1)) - 1]
 *                         and converted back with the inverse lifting steps
 *  \param clpRng   only clpRng.bd is used, and only by the inverse direction
 *
 *  The three strides of this buffer must be equal as well. All CHECKs fire
 *  before any sample is touched.
 */
template<>
void UnitBuf<Pel>::colorSpaceConvert(const UnitBuf<Pel> &other, const bool forward, const ClpRng& clpRng)
{
  CHECK(bufs[COMPONENT_Y].stride != bufs[COMPONENT_Cb].stride || bufs[COMPONENT_Y].stride != bufs[COMPONENT_Cr].stride, "unequal stride for 444 content");
  CHECK(other.bufs[COMPONENT_Y].stride != other.bufs[COMPONENT_Cb].stride || other.bufs[COMPONENT_Y].stride != other.bufs[COMPONENT_Cr].stride, "unequal stride for 444 content");
  CHECK(bufs[COMPONENT_Y].width != other.bufs[COMPONENT_Y].width || bufs[COMPONENT_Y].height != other.bufs[COMPONENT_Y].height, "unequal block size");

  // source planes: a single stride is sufficient, equality was checked above
  const Pel* pOrg0     = bufs[COMPONENT_Y].buf;
  const Pel* pOrg1     = bufs[COMPONENT_Cb].buf;
  const Pel* pOrg2     = bufs[COMPONENT_Cr].buf;
  const int  strideOrg = bufs[COMPONENT_Y].stride;

  // destination planes
  Pel*      pDst0     = other.bufs[COMPONENT_Y].buf;
  Pel*      pDst1     = other.bufs[COMPONENT_Cb].buf;
  Pel*      pDst2     = other.bufs[COMPONENT_Cr].buf;
  const int strideDst = other.bufs[COMPONENT_Y].stride;

  const int width  = bufs[COMPONENT_Y].width;
  const int height = bufs[COMPONENT_Y].height;

  // In both directions the three source samples of a position are loaded before
  // any destination sample of that position is stored, so the conversion also
  // works when other refers to the same sample memory as this buffer.
  if (forward)
  {
    for (int y = 0; y < height; y++)
    {
      for (int x = 0; x < width; x++)
      {
        const int r = pOrg2[x];
        const int g = pOrg0[x];
        const int b = pOrg1[x];

        const int co = r - b;
        const int t  = b + (co >> 1);
        const int cg = g - t;

        pDst0[x] = t + (cg >> 1);
        pDst1[x] = cg;
        pDst2[x] = co;
      }

      pOrg0 += strideOrg;
      pOrg1 += strideOrg;
      pOrg2 += strideOrg;
      pDst0 += strideDst;
      pDst1 += strideDst;
      pDst2 += strideDst;
    }
  }
  else
  {
    // clipping range applied to the inputs of the inverse transform
    const int maxAbsclipBD = (1 << (clpRng.bd + 1)) - 1;
    const int minAbsclipBD = -maxAbsclipBD - 1;

    for (int y = 0; y < height; y++)
    {
      for (int x = 0; x < width; x++)
      {
        const int y0 = Clip3(minAbsclipBD, maxAbsclipBD, static_cast<int>(pOrg0[x]));
        const int cg = Clip3(minAbsclipBD, maxAbsclipBD, static_cast<int>(pOrg1[x]));
        const int co = Clip3(minAbsclipBD, maxAbsclipBD, static_cast<int>(pOrg2[x]));

        const int t = y0 - (cg >> 1);

        pDst0[x] = cg + t;
        pDst1[x] = t - (co >> 1);
        // deliberately re-read from the destination: the third output is derived
        // from the second one as it was actually stored (i.e. after conversion to Pel)
        pDst2[x] = co + pDst1[x];
      }

      pOrg0 += strideOrg;
      pOrg1 += strideOrg;
      pOrg2 += strideOrg;
      pDst0 += strideDst;
      pDst1 += strideDst;
      pDst2 += strideDst;
    }
  }
}