diff --git a/source/Lib/CommonLib/Buffer.cpp b/source/Lib/CommonLib/Buffer.cpp
index 7c1bb9f1d7e080fe27d9de64c9f4e60af4f49830..586f8728f27d08de6ab6402c1bfc3ab34668b687 100644
--- a/source/Lib/CommonLib/Buffer.cpp
+++ b/source/Lib/CommonLib/Buffer.cpp
@@ -42,7 +42,11 @@
 #include "Buffer.h"
 #include "InterpolationFilter.h"
 
+#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING
+void applyPROFCore(Pel* dst, int dstStride, const Pel* src, int srcStride, int width, int height, const Pel* gradX, const Pel* gradY, int gradStride, const int* dMvX, const int* dMvY, int dMvStride, const bool& bi, int shiftNum, Pel offset, const ClpRng& clpRng)
+#else
 void applyPROFCore(Pel* dst, int dstStride, const Pel* src, int srcStride, int width, int height, const Pel* gradX, const Pel* gradY, int gradStride, const int* dMvX, const int* dMvY, int dMvStride, int shiftNum, Pel offset, const ClpRng& clpRng)
+#endif
 {
   int idx = 0;
 #if !JVET_P0057_BDOF_PROF_HARMONIZATION 
@@ -63,10 +67,16 @@ void applyPROFCore(Pel* dst, int dstStride, const Pel* src, int srcStride, int w
 #endif
 #if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING
       dI = Clip3(-dILimit, dILimit - 1, dI);
-#endif
-
+      dst[w] = src[w] + dI;
+      if (!bi)
+      {
+        dst[w] = (dst[w] + offset) >> shiftNum;
+        dst[w] = ClipPel(dst[w], clpRng);
+      }
+#else
       dI = (src[w] + dI + offset) >> shiftNum;
       dst[w] = (Pel)ClipPel(dI, clpRng);
+#endif
 
       idx++;
     }
@@ -77,6 +87,7 @@ void applyPROFCore(Pel* dst, int dstStride, const Pel* src, int srcStride, int w
   }
 }
 
+#if !JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING
 template<bool l1PROFEnabled = true>
 void applyBiPROFCore (Pel* dst, int dstStride, const Pel* src0, const Pel* src1, int srcStride, int width, int height, const Pel* gradX0, const Pel* gradY0, const Pel* gradX1, const Pel* gradY1, int gradStride, const int* dMvX0, const int* dMvY0, const int* dMvX1, const int* dMvY1, int dMvStride, const int8_t w0, const ClpRng& clpRng)
 {
@@ -142,6 +153,7 @@ void applyBiPROFCore (Pel* dst, int dstStride, const Pel* src0, const Pel* src1,
     src1 += srcStride;
   }
 }
+#endif
 
 template< typename T >
 void addAvgCore( const T* src1, int src1Stride, const T* src2, int src2Stride, T* dest, int dstStride, int width, int height, int rshift, int offset, const ClpRng& clpRng )
@@ -433,8 +445,10 @@ PelBufferOps::PelBufferOps()
 
   profGradFilter = gradFilterCore <false>;
   applyPROF      = applyPROFCore;
+#if !JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING
   applyBiPROF[1] = applyBiPROFCore;
   applyBiPROF[0] = applyBiPROFCore <false>;
+#endif
   roundIntVector = nullptr;
 }
 
diff --git a/source/Lib/CommonLib/Buffer.h b/source/Lib/CommonLib/Buffer.h
index f4ea3ad6c5d7af238669cd71de3e037635788db0..4f3b95c4c2e5a575bf1e498cd74dbda2286189a8 100644
--- a/source/Lib/CommonLib/Buffer.h
+++ b/source/Lib/CommonLib/Buffer.h
@@ -81,8 +81,12 @@ struct PelBufferOps
   void ( *removeHighFreq4)        ( Pel* src0, int src0Stride, const Pel* src1, int src1Stride, int width, int height);
 #endif
   void (*profGradFilter) (Pel* pSrc, int srcStride, int width, int height, int gradStride, Pel* gradX, Pel* gradY, const int bitDepth);
+#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING
+  void (*applyPROF)      (Pel* dst, int dstStride, const Pel* src, int srcStride, int width, int height, const Pel* gradX, const Pel* gradY, int gradStride, const int* dMvX, const int* dMvY, int dMvStride, const bool& bi, int shiftNum, Pel offset, const ClpRng& clpRng);
+#else
   void (*applyPROF)      (Pel* dst, int dstStride, const Pel* src, int srcStride, int width, int height, const Pel* gradX, const Pel* gradY, int gradStride, const int* dMvX, const int* dMvY, int dMvStride, int shiftNum, Pel offset, const ClpRng& clpRng);
   void (*applyBiPROF[2]) (Pel* dst, int dstStride, const Pel* src0, const Pel* src1, int srcStride, int width, int height, const Pel* gradX0, const Pel* gradY0, const Pel* gradX1, const Pel* gradY1, int gradStride, const int* dMvX0, const int* dMvY0, const int* dMvX1, const int* dMvY1, int dMvStride, const int8_t gbiWeightL0, const ClpRng& clpRng);
+#endif
   void (*roundIntVector) (int* v, int size, unsigned int nShift, const int dmvLimit);
 };
 
diff --git a/source/Lib/CommonLib/InterPrediction.cpp b/source/Lib/CommonLib/InterPrediction.cpp
index d5a970c43a9b80c2de5bfdad4293c4d1c18da0d3..f686ac6822a717a354e7e1be30d316b3a6aed584 100644
--- a/source/Lib/CommonLib/InterPrediction.cpp
+++ b/source/Lib/CommonLib/InterPrediction.cpp
@@ -862,19 +862,28 @@ void InterPrediction::xPredAffineBlk( const ComponentID& compID, const Predictio
   enablePROF &= !m_encOnly || pu.cu->slice->getCheckLDC() || iDMvHorX > profThres || iDMvHorY > profThres || iDMvVerX > profThres || iDMvVerY > profThres || iDMvHorX < -profThres || iDMvHorY < -profThres || iDMvVerX < -profThres || iDMvVerY < -profThres;
   enablePROF &= pu.cs->pps->getPicWidthInLumaSamples() == refPic->getPicWidthInLumaSamples() && pu.cs->pps->getPicHeightInLumaSamples() == refPic->getPicHeightInLumaSamples();
 
+#if !JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING
   if (compID == COMPONENT_Y)
   {
     m_applyPROF[m_iRefListIdx] = enablePROF;
   }
+#endif
 
   bool isLast = enablePROF ? false : !bi;
 
+#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING
+  const int cuExtW = AFFINE_MIN_BLOCK_SIZE + PROF_BORDER_EXT_W * 2;
+  const int cuExtH = AFFINE_MIN_BLOCK_SIZE + PROF_BORDER_EXT_H * 2;
+
+  PelBuf gradXExt(m_gradBuf[0], cuExtW, cuExtH);
+  PelBuf gradYExt(m_gradBuf[1], cuExtW, cuExtH);
+#else
   const int cuExtW = pu.blocks[compID].width + PROF_BORDER_EXT_W * 2;
   const int cuExtH = pu.blocks[compID].height + PROF_BORDER_EXT_H * 2;
 
   PelBuf gradXExt(m_gradBuf[m_iRefListIdx][0], cuExtW, cuExtH);
   PelBuf gradYExt(m_gradBuf[m_iRefListIdx][1], cuExtW, cuExtH);
-
+#endif
   const int MAX_FILTER_SIZE = std::max<int>(NTAPS_LUMA, NTAPS_CHROMA);
   const int dstExtW = ((blockWidth + PROF_BORDER_EXT_W * 2 + 7) >> 3) << 3;
   const int dstExtH = blockHeight + PROF_BORDER_EXT_H * 2;
@@ -888,7 +897,11 @@ void InterPrediction::xPredAffineBlk( const ComponentID& compID, const Predictio
   int *dMvScaleHor = m_dMvBuf[m_iRefListIdx];
   int *dMvScaleVer = m_dMvBuf[m_iRefListIdx] + 16;
 
+#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING
+  if (enablePROF)
+#else
   if (enablePROF && !bi)
+#endif
   {
     int* dMvH = dMvScaleHor;
     int* dMvV = dMvScaleVer;
@@ -1120,8 +1133,13 @@ void InterPrediction::xPredAffineBlk( const ComponentID& compID, const Predictio
           dstPel[blockWidth] = leftShift_round(refPel[blockWidth], shift) - (Pel)IF_INTERNAL_OFFS;
         }
 
+#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING
+        PelBuf gradXBuf = gradXExt.subBuf(0, 0, blockWidth + 2, blockHeight + 2);
+        PelBuf gradYBuf = gradYExt.subBuf(0, 0, blockWidth + 2, blockHeight + 2);
+#else
         PelBuf gradXBuf = gradXExt.subBuf(w, h, blockWidth + 2, blockHeight + 2);
         PelBuf gradYBuf = gradYExt.subBuf(w, h, blockWidth + 2, blockHeight + 2);
+#endif
         g_pelBufOP.profGradFilter(dstExtBuf.buf, dstExtBuf.stride, blockWidth + 2, blockHeight + 2, gradXBuf.stride, gradXBuf.buf, gradYBuf.buf, clpRng.bd);
 
         const int shiftNum = std::max<int>(2, (IF_INTERNAL_PREC - clpRng.bd));
@@ -1132,6 +1150,9 @@ void InterPrediction::xPredAffineBlk( const ComponentID& compID, const Predictio
 
         Pel * dstY = dstBuf.bufAt(w, h);
 
+#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING
+        g_pelBufOP.applyPROF(dstY, dstBuf.stride, src, dstExtBuf.stride, blockWidth, blockHeight, gX, gY, gradXBuf.stride, dMvScaleHor, dMvScaleVer, blockWidth, bi, shiftNum, offset, clpRng);
+#else
         if (!bi)
         {
           g_pelBufOP.applyPROF(dstY, dstBuf.stride, src, dstExtBuf.stride, blockWidth, blockHeight, gX, gY, gradXBuf.stride, dMvScaleHor, dMvScaleVer, blockWidth, shiftNum, offset, clpRng);
@@ -1142,6 +1163,7 @@ void InterPrediction::xPredAffineBlk( const ComponentID& compID, const Predictio
           PelBuf destBuf(dstY, dstBuf.stride, Size(blockWidth, blockHeight));
           destBuf.copyFrom(srcExtBuf);
         }
+#endif
       }
       }
     }
@@ -1300,6 +1322,7 @@ void InterPrediction::xWeightedAverage(const PredictionUnit& pu, const CPelUnitB
 
   if( iRefIdx0 >= 0 && iRefIdx1 >= 0 )
   {
+#if !JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING
     if (pu.cu->affine && (m_applyPROF[0] || m_applyPROF[1]))
     {
       xApplyBiPROF(pu, pcYuvSrc0.bufs[COMPONENT_Y], pcYuvSrc1.bufs[COMPONENT_Y], pcYuvDst.bufs[COMPONENT_Y], clpRngs.comp[COMPONENT_Y]);
@@ -1307,6 +1330,7 @@ void InterPrediction::xWeightedAverage(const PredictionUnit& pu, const CPelUnitB
       CHECK(yuvDstTmp, "yuvDstTmp is disallowed with PROF");
       return;
     }
+#endif
     if( pu.cu->GBiIdx != GBI_DEFAULT && (yuvDstTmp || !pu.mhIntraFlag) )
     {
       CHECK(bioApplied, "GBi is disallowed with BIO");
@@ -1389,6 +1413,7 @@ void InterPrediction::xWeightedAverage(const PredictionUnit& pu, const CPelUnitB
   }
 }
 
+#if !JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING
 void InterPrediction::xApplyBiPROF(const PredictionUnit &pu, const CPelBuf& pcYuvSrc0, const CPelBuf& pcYuvSrc1, PelBuf& pcYuvDst, const ClpRng& clpRng)
 {
   int blockWidth = AFFINE_MIN_BLOCK_SIZE;
@@ -1532,6 +1557,7 @@ void InterPrediction::xApplyBiPROF(const PredictionUnit &pu, const CPelBuf& pcYu
   else
     g_pelBufOP.applyBiPROF[0](dstY, pcYuvDst.stride, srcY1, srcY0, pcYuvSrc0.stride, width, height, gX1, gY1, gX0, gY0, gradXExt0.stride, dMvX1, dMvY1, dMvX0, dMvY0, blockWidth, getGbiWeight(pu.cu->GBiIdx, REF_PIC_LIST_1), clpRng);
 }
+#endif
 
 void InterPrediction::motionCompensation( PredictionUnit &pu, PelUnitBuf &predBuf, const RefPicList &eRefPicList
   , const bool luma, const bool chroma
diff --git a/source/Lib/CommonLib/InterPrediction.h b/source/Lib/CommonLib/InterPrediction.h
index b28b49dd3c93969837a8ef870ff9575f4ba593e8..d9e2b2599840be92d7787290243ef13a49af8089 100644
--- a/source/Lib/CommonLib/InterPrediction.h
+++ b/source/Lib/CommonLib/InterPrediction.h
@@ -102,9 +102,15 @@ protected:
                              Mv(-2, 2), Mv(-1, 2), Mv(0, 2), Mv(1, 2), Mv(2, 2) };
   uint64_t m_SADsArray[((2 * DMVR_NUM_ITERATION) + 1) * ((2 * DMVR_NUM_ITERATION) + 1)];
 
+#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING
+  Pel                  m_gradBuf[2][(AFFINE_MIN_BLOCK_SIZE + 2) * (AFFINE_MIN_BLOCK_SIZE + 2)];
+#else
   Pel                  m_gradBuf[2][2][(MAX_CU_SIZE + 2) * (MAX_CU_SIZE + 2)];
+#endif
   int                  m_dMvBuf[2][16 * 2];
+#if !JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING
   bool                 m_applyPROF[2];
+#endif
   bool                 m_skipPROF;
   bool                 m_encOnly;
   bool                 m_isBi;
@@ -141,7 +147,9 @@ protected:
   void xCalcBIOPar              (const Pel* srcY0Temp, const Pel* srcY1Temp, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0, const Pel* gradY1, int* dotProductTemp1, int* dotProductTemp2, int* dotProductTemp3, int* dotProductTemp5, int* dotProductTemp6, const int src0Stride, const int src1Stride, const int gradStride, const int widthG, const int heightG, int bitDepth);
   void xCalcBlkGradient         (int sx, int sy, int    *arraysGx2, int     *arraysGxGy, int     *arraysGxdI, int     *arraysGy2, int     *arraysGydI, int     &sGx2, int     &sGy2, int     &sGxGy, int     &sGxdI, int     &sGydI, int width, int height, int unitSize);
   void xWeightedAverage         ( const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths, const ClpRngs& clpRngs, const bool& bioApplied, PelUnitBuf* yuvDstTmp = NULL );
+#if !JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING
   void xApplyBiPROF             (const PredictionUnit& pu, const CPelBuf& pcYuvSrc0, const CPelBuf& pcYuvSrc1, PelBuf& pcYuvDst, const ClpRng& clpRng);
+#endif
   void xPredAffineBlk           ( const ComponentID& compID, const PredictionUnit& pu, const Picture* refPic, const Mv* _mv, PelUnitBuf& dstPic, const bool& bi, const ClpRng& clpRng, const std::pair<int, int> scalingRatio = SCALE_1X );
 
   void xWeightedTriangleBlk     ( const PredictionUnit &pu, const uint32_t width, const uint32_t height, const ComponentID compIdx, const bool splitDir, PelUnitBuf& predDst, PelUnitBuf& predSrc0, PelUnitBuf& predSrc1 );
diff --git a/source/Lib/CommonLib/x86/BufferX86.h b/source/Lib/CommonLib/x86/BufferX86.h
index 3275097e88c78882ffcd4fda69fce38dfb80da3f..ae44540199073b88010b619e5d22261730164e0e 100644
--- a/source/Lib/CommonLib/x86/BufferX86.h
+++ b/source/Lib/CommonLib/x86/BufferX86.h
@@ -351,10 +351,45 @@ void calcBIOSums_SSE(const Pel* srcY0Tmp, const Pel* srcY1Tmp, Pel* gradX0, Pel*
 }
 
 template< X86_VEXT vext >
+#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING
+void applyPROF_SSE(Pel* dstPel, int dstStride, const Pel* srcPel, int srcStride, int width, int height, const Pel* gradX, const Pel* gradY, int gradStride, const int* dMvX, const int* dMvY, int dMvStride, const bool& bi, int shiftNum, Pel offset, const ClpRng& clpRng)
+#else
 void applyPROF_SSE(Pel* dstPel, int dstStride, const Pel* srcPel, int srcStride, int width, int height, const Pel* gradX, const Pel* gradY, int gradStride, const int* dMvX, const int* dMvY, int dMvStride, int shiftNum, Pel offset, const ClpRng& clpRng)
+#endif
 {
   CHECKD((width & 3), "block width error!");
 
+#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING
+  const int dILimit = 1 << std::max<int>(clpRng.bd + 1, 13);
+
+#ifdef USE_AVX2
+  __m256i mm_dmvx, mm_dmvy, mm_gradx, mm_grady, mm_dI, mm_dI0, mm_src;
+#if !JVET_P0057_BDOF_PROF_HARMONIZATION
+  __m256i mm_dIoffset = _mm256_set1_epi32(1);
+#endif
+  __m256i mm_offset = _mm256_set1_epi16(offset);
+  __m256i vibdimin = _mm256_set1_epi16(clpRng.min);
+  __m256i vibdimax = _mm256_set1_epi16(clpRng.max);
+  __m256i mm_dimin = _mm256_set1_epi32(-dILimit);
+  __m256i mm_dimax = _mm256_set1_epi32(dILimit - 1);
+#else
+  __m128i mm_dmvx, mm_dmvy, mm_gradx, mm_grady, mm_dI, mm_dI0;
+#if !JVET_P0057_BDOF_PROF_HARMONIZATION
+  __m128i mm_dIoffset = _mm_set1_epi32(1);
+#endif
+  __m128i mm_offset = _mm_set1_epi16(offset);
+  __m128i vibdimin = _mm_set1_epi16(clpRng.min);
+  __m128i vibdimax = _mm_set1_epi16(clpRng.max);
+  __m128i mm_dimin = _mm_set1_epi32(-dILimit);
+  __m128i mm_dimax = _mm_set1_epi32(dILimit - 1);
+#endif
+
+#if USE_AVX2
+  for (int h = 0; h < height; h += 4)
+#else
+  for (int h = 0; h < height; h += 2)
+#endif
+#else
   __m128i mm_dmvx, mm_dmvy, mm_gradx, mm_grady, mm_dI, mm_src;
 #if !JVET_P0057_BDOF_PROF_HARMONIZATION
   __m128i mm_dIoffset = _mm_set1_epi32(1);
@@ -364,13 +399,8 @@ void applyPROF_SSE(Pel* dstPel, int dstStride, const Pel* srcPel, int srcStride,
   __m128i vibdimax  = _mm_set1_epi32(clpRng.max);
   __m128i vzero     = _mm_setzero_si128();
 
-#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING 
-  const int dILimit = 1 << std::max<int>(clpRng.bd + 1, 13);
-  __m128i vdImin = _mm_set1_epi32(-dILimit);
-  __m128i vdImax = _mm_set1_epi32(dILimit - 1);
-#endif
-
   for (int h = 0; h < height; h++)
+#endif
   {
     const int* vX = dMvX;
     const int* vY = dMvY;
@@ -381,6 +411,100 @@ void applyPROF_SSE(Pel* dstPel, int dstStride, const Pel* srcPel, int srcStride,
 
     for (int w = 0; w < width; w += 4)
     {
+#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING
+#if USE_AVX2
+      const int *vX0 = vX, *vY0 = vY;
+      const Pel *gX0 = gX, *gY0 = gY;
+
+      // first two rows
+      mm_dmvx = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)vX0)), _mm_loadu_si128((const __m128i *)(vX0 + dMvStride)), 1);
+      mm_dmvy = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)vY0)), _mm_loadu_si128((const __m128i *)(vY0 + dMvStride)), 1);
+      mm_gradx = _mm256_inserti128_si256(
+        _mm256_castsi128_si256(_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)gX0))),
+        _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(gX0 + gradStride))), 1);
+      mm_grady = _mm256_inserti128_si256(
+        _mm256_castsi128_si256(_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)gY0))),
+        _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(gY0 + gradStride))), 1);
+      mm_dI0 = _mm256_add_epi32(_mm256_mullo_epi32(mm_dmvx, mm_gradx), _mm256_mullo_epi32(mm_dmvy, mm_grady));
+#if !JVET_P0057_BDOF_PROF_HARMONIZATION
+      mm_dI0 = _mm256_srai_epi32(_mm256_add_epi32(mm_dI0, mm_dIoffset), 1);
+#endif
+      mm_dI0 = _mm256_min_epi32(mm_dimax, _mm256_max_epi32(mm_dimin, mm_dI0));
+
+      // next two rows
+      vX0 += (dMvStride << 1); vY0 += (dMvStride << 1); gX0 += (gradStride << 1); gY0 += (gradStride << 1);
+      mm_dmvx = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)vX0)), _mm_loadu_si128((const __m128i *)(vX0 + dMvStride)), 1);
+      mm_dmvy = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)vY0)), _mm_loadu_si128((const __m128i *)(vY0 + dMvStride)), 1);
+      mm_gradx = _mm256_inserti128_si256(
+        _mm256_castsi128_si256(_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)gX0))),
+        _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(gX0 + gradStride))), 1);
+      mm_grady = _mm256_inserti128_si256(
+        _mm256_castsi128_si256(_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)gY0))),
+        _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(gY0 + gradStride))), 1);
+      mm_dI = _mm256_add_epi32(_mm256_mullo_epi32(mm_dmvx, mm_gradx), _mm256_mullo_epi32(mm_dmvy, mm_grady));
+#if !JVET_P0057_BDOF_PROF_HARMONIZATION
+      mm_dI = _mm256_srai_epi32(_mm256_add_epi32(mm_dI, mm_dIoffset), 1);
+#endif
+      mm_dI = _mm256_min_epi32(mm_dimax, _mm256_max_epi32(mm_dimin, mm_dI));
+
+      // combine four rows
+      mm_dI = _mm256_packs_epi32(mm_dI0, mm_dI);
+      const Pel* src0 = src + srcStride;
+      mm_src = _mm256_inserti128_si256(
+        _mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)src), _mm_loadl_epi64((const __m128i *)(src + (srcStride << 1))))),
+        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)src0), _mm_loadl_epi64((const __m128i *)(src0 + (srcStride << 1)))),
+        1
+      );
+      mm_dI = _mm256_add_epi16(mm_dI, mm_src);
+      if (!bi)
+      {
+        mm_dI = _mm256_srai_epi16(_mm256_add_epi16(mm_dI, mm_offset), shiftNum);
+        mm_dI = _mm256_min_epi16(vibdimax, _mm256_max_epi16(vibdimin, mm_dI));
+      }
+
+      // store final results
+      __m128i dITmp = _mm256_extractf128_si256(mm_dI, 1);
+      Pel* dst0 = dst;
+      _mm_storel_epi64((__m128i *)dst0, _mm256_castsi256_si128(mm_dI));
+      dst0 += dstStride; _mm_storel_epi64((__m128i *)dst0, dITmp);
+      dst0 += dstStride; _mm_storel_epi64((__m128i *)dst0, _mm_unpackhi_epi64(_mm256_castsi256_si128(mm_dI), _mm256_castsi256_si128(mm_dI)));
+      dst0 += dstStride; _mm_storel_epi64((__m128i *)dst0, _mm_unpackhi_epi64(dITmp, dITmp));
+#else
+      // first row
+      mm_dmvx = _mm_loadu_si128((const __m128i *)vX);
+      mm_dmvy = _mm_loadu_si128((const __m128i *)vY);
+      mm_gradx = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)gX));
+      mm_grady = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)gY));
+      mm_dI0 = _mm_add_epi32(_mm_mullo_epi32(mm_dmvx, mm_gradx), _mm_mullo_epi32(mm_dmvy, mm_grady));
+#if !JVET_P0057_BDOF_PROF_HARMONIZATION
+      mm_dI0 = _mm_srai_epi32(_mm_add_epi32(mm_dI0, mm_dIoffset), 1);
+#endif
+      mm_dI0 = _mm_min_epi32(mm_dimax, _mm_max_epi32(mm_dimin, mm_dI0));
+
+      // second row
+      mm_dmvx = _mm_loadu_si128((const __m128i *)(vX + dMvStride));
+      mm_dmvy = _mm_loadu_si128((const __m128i *)(vY + dMvStride));
+      mm_gradx = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(gX + gradStride)));
+      mm_grady = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(gY + gradStride)));
+      mm_dI = _mm_add_epi32(_mm_mullo_epi32(mm_dmvx, mm_gradx), _mm_mullo_epi32(mm_dmvy, mm_grady));
+#if !JVET_P0057_BDOF_PROF_HARMONIZATION
+      mm_dI = _mm_srai_epi32(_mm_add_epi32(mm_dI, mm_dIoffset), 1);
+#endif
+      mm_dI = _mm_min_epi32(mm_dimax, _mm_max_epi32(mm_dimin, mm_dI));
+
+      // combine both rows
+      mm_dI = _mm_packs_epi32(mm_dI0, mm_dI);
+      mm_dI = _mm_add_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)src), _mm_loadl_epi64((const __m128i *)(src + srcStride))), mm_dI);
+      if (!bi)
+      {
+        mm_dI = _mm_srai_epi16(_mm_add_epi16(mm_dI, mm_offset), shiftNum);
+        mm_dI = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, mm_dI));
+      }
+
+      _mm_storel_epi64((__m128i *)dst, mm_dI);
+      _mm_storel_epi64((__m128i *)(dst + dstStride), _mm_unpackhi_epi64(mm_dI, mm_dI));
+#endif
+#else
       mm_dmvx = _mm_loadu_si128((const __m128i *)vX);
       mm_dmvy = _mm_loadu_si128((const __m128i *)vY);
       mm_gradx = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)gX));
@@ -391,24 +515,43 @@ void applyPROF_SSE(Pel* dstPel, int dstStride, const Pel* srcPel, int srcStride,
 #if !JVET_P0057_BDOF_PROF_HARMONIZATION 
       mm_dI = _mm_srai_epi32(_mm_add_epi32(mm_dI, mm_dIoffset), 1);
 #endif
-#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING
-      mm_dI = _mm_min_epi32(vdImax, _mm_max_epi32(vdImin, mm_dI));
-#endif
+
       mm_dI = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(mm_dI, mm_src), mm_offset), shiftNum);
       mm_dI = _mm_packs_epi32(_mm_min_epi32(vibdimax, _mm_max_epi32(vibdimin, mm_dI)), vzero);
       _mm_storel_epi64((__m128i *)dst, mm_dI);
 
+#endif
       vX += 4; vY += 4; gX += 4; gY += 4; src += 4; dst += 4;
     }
+
+#if JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING
+#if USE_AVX2
+    dMvX += (dMvStride << 2);
+    dMvY += (dMvStride << 2);
+    gradX += (gradStride << 2);
+    gradY += (gradStride << 2);
+    srcPel += (srcStride << 2);
+    dstPel += (dstStride << 2);
+#else
+    dMvX += (dMvStride << 1);
+    dMvY += (dMvStride << 1);
+    gradX += (gradStride << 1);
+    gradY += (gradStride << 1);
+    srcPel += (srcStride << 1);
+    dstPel += (dstStride << 1);
+#endif
+#else
     dMvX += dMvStride;
     dMvY += dMvStride;
     gradX += gradStride;
     gradY += gradStride;
     srcPel += srcStride;
     dstPel += dstStride;
+#endif
   }
 }
 
+#if !JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING
 template< X86_VEXT vext, bool l1PROFEnabled = true>
 void applyBiPROF_SSE(Pel* dst, int dstStride, const Pel* src0, const Pel* src1, int srcStride, int width, int height, const Pel* gradX0, const Pel* gradY0, const Pel* gradX1, const Pel* gradY1, int gradStride, const int* dMvX0, const int* dMvY0, const int* dMvX1, const int* dMvY1, int dMvStride, const int8_t w0, const ClpRng& clpRng)
 {
@@ -526,6 +669,7 @@ void applyBiPROF_SSE(Pel* dst, int dstStride, const Pel* src0, const Pel* src1,
     dst += dstStride;
   }
 }
+#endif
 
 template< X86_VEXT vext >
 void roundIntVector_SIMD(int* v, int size, unsigned int nShift, const int dmvLimit)
@@ -1295,8 +1439,10 @@ void PelBufferOps::_initPelBufOpsX86()
 #endif
   profGradFilter = gradFilter_SSE<vext, false>;
   applyPROF      = applyPROF_SSE<vext>;
+#if !JVET_P0154_PROF_SAMPLE_OFFSET_CLIPPING
   applyBiPROF[1] = applyBiPROF_SSE<vext>;
   applyBiPROF[0] = applyBiPROF_SSE<vext, false>;
+#endif
   roundIntVector = roundIntVector_SIMD<vext>;
 }