diff --git a/cfg/encoder_randomaccess_vtm.cfg b/cfg/encoder_randomaccess_vtm.cfg
index 63ff58df9358f54e7567d150a6e8993a48ca04ea..29f7902779a5d2f22a7d36e60ec9171ae6bb1e1e 100644
--- a/cfg/encoder_randomaccess_vtm.cfg
+++ b/cfg/encoder_randomaccess_vtm.cfg
@@ -142,7 +142,8 @@ DepQuant                     : 1
 IMV                          : 2
 ALF                          : 1
 GBi                          : 1 
-GBiFast                      : 1 
+GBiFast                      : 1
+BIO                          : 1 
 
 # Fast tools
 PBIntraFast                  : 1
diff --git a/source/App/EncoderApp/EncApp.cpp b/source/App/EncoderApp/EncApp.cpp
index ab22f12e29a6ea1870bb89851025095380eb075e..ab7fcde8ec6df67dad3d9bf65d03174e30a64e04 100644
--- a/source/App/EncoderApp/EncApp.cpp
+++ b/source/App/EncoderApp/EncApp.cpp
@@ -230,6 +230,9 @@ void EncApp::xInitLibCfg()
   m_cEncLib.setAffineType                                        ( m_AffineType );
 #if !REMOVE_MV_ADAPT_PREC
   m_cEncLib.setHighPrecisionMv                                   (m_highPrecisionMv);
+#endif
+#if JVET_L0256_BIO
+  m_cEncLib.setBIO                                               (m_BIO);
 #endif
   m_cEncLib.setDisableMotionCompression                          ( m_DisableMotionCompression );
   m_cEncLib.setMTTMode                                           ( m_MTT );
diff --git a/source/App/EncoderApp/EncAppCfg.cpp b/source/App/EncoderApp/EncAppCfg.cpp
index e6723233eeb7f91df4cf1170470b85452e25187e..fc12314b2aeedd2f4357acd99e0cba4fe8f0f06d 100644
--- a/source/App/EncoderApp/EncAppCfg.cpp
+++ b/source/App/EncoderApp/EncAppCfg.cpp
@@ -827,8 +827,11 @@ bool EncAppCfg::parseCfg( int argc, char* argv[] )
 #if !REMOVE_MV_ADAPT_PREC 
   ("HighPrecMv",                                     m_highPrecisionMv,                                false, "High precision motion vectors for temporal merging (0:off, 1:on)  [default: off]")
 #endif
-  ("Affine",                                          m_Affine,                                        false, "Enable affine prediction (0:off, 1:on)  [default: off]")
-  ( "AffineType",                                     m_AffineType,                                     true,  "Enable affine type prediction (0:off, 1:on)  [default: on]" )
+  ("Affine",                                         m_Affine,                                         false, "Enable affine prediction (0:off, 1:on)  [default: off]")
+  ("AffineType",                                     m_AffineType,                                     true,  "Enable affine type prediction (0:off, 1:on)  [default: on]" )
+#if JVET_L0256_BIO
+  ("BIO",                                            m_BIO,                                             false, "Enable bi-directional optical flow")
+#endif    
   ("DisableMotCompression",                           m_DisableMotionCompression,                       false, "Disable motion data compression for all modes")
   ("IMV",                                             m_ImvMode,                                            2, "Adaptive MV precision Mode (IMV)\n"
                                                                                                                "\t0: disabled IMV\n"
@@ -1943,6 +1946,9 @@ bool EncAppCfg::xCheckParameter()
 #if !REMOVE_MV_ADAPT_PREC
     xConfirmPara( m_highPrecisionMv, "High precision MV for temporal merging can only be used with NEXT profile" );
     xConfirmPara( m_Affine, "Affine is only allowed with NEXT profile" );
+#endif
+#if JVET_L0256_BIO
+    xConfirmPara( m_BIO, "BIO only allowed with NEXT profile" );
 #endif
     xConfirmPara( m_DisableMotionCompression, "Disable motion data compression only allowed with NEXT profile" );
     xConfirmPara( m_MTT, "Multi type tree is only allowed with NEXT profile" );
@@ -3143,6 +3149,9 @@ void EncAppCfg::xPrintParameter()
     if( !m_QTBT ) msg( VERBOSE, "IMVMaxCand:%d ", m_ImvMaxCand );
 #if !REMOVE_MV_ADAPT_PREC 
     msg(VERBOSE, "HighPrecMv:%d ", m_highPrecisionMv);
+#endif
+#if JVET_L0256_BIO
+    msg( VERBOSE, "BIO:%d ", m_BIO );
 #endif
     msg( VERBOSE, "DisMDC:%d ", m_DisableMotionCompression );
     msg( VERBOSE, "MTT:%d ", m_MTT );
diff --git a/source/App/EncoderApp/EncAppCfg.h b/source/App/EncoderApp/EncAppCfg.h
index cf5095f67d48a66364d4b438d0a1447d78fca63d..712a0e1276075c1389a703d31eba94034b292c90 100644
--- a/source/App/EncoderApp/EncAppCfg.h
+++ b/source/App/EncoderApp/EncAppCfg.h
@@ -213,6 +213,9 @@ protected:
   bool      m_AffineType;
 #if !REMOVE_MV_ADAPT_PREC
   bool      m_highPrecisionMv;
+#endif
+#if JVET_L0256_BIO
+  bool      m_BIO;
 #endif
   bool      m_DisableMotionCompression;
   unsigned  m_MTT;
diff --git a/source/Lib/CommonLib/Buffer.cpp b/source/Lib/CommonLib/Buffer.cpp
index f31a22044ec3f8d59704b0ccf47387f1f513f70d..2454c501ae976e3885395f6efcc3169531db8c9b 100644
--- a/source/Lib/CommonLib/Buffer.cpp
+++ b/source/Lib/CommonLib/Buffer.cpp
@@ -62,6 +62,138 @@ void addAvgCore( const T* src1, int src1Stride, const T* src2, int src2Stride, T
 #undef ADD_AVG_CORE_INC
 }
 
+#if JVET_L0256_BIO
+void addBIOAvgCore(const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, const Pel *gradX0, const Pel *gradX1, const Pel *gradY0, const Pel*gradY1, int gradStride, int width, int height, int tmpx, int tmpy, int shift, int offset, const ClpRng& clpRng)
+{ // Writes to dst the sum of the two predictions plus a gradient-based correction term, offset, shifted and clipped to clpRng.
+  int b = 0; // per-sample correction term
+
+  for (int y = 0; y < height; y++)
+  {
+    for (int x = 0; x < width; x += 4) // manually unrolled: each iteration writes samples x..x+3, so width is expected to be a multiple of 4
+    {
+      b = tmpx * (gradX0[x] - gradX1[x]) + tmpy * (gradY0[x] - gradY1[x]); // tmpx/tmpy weight the horizontal/vertical gradient differences between the two lists
+      b = ((b + 1) >> 1); // add 1, then shift right by one
+      dst[x] = ClipPel((int16_t)rightShift((src0[x] + src1[x] + b + offset), shift), clpRng); // result of rightShift() is truncated to int16_t before clipping
+
+      b = tmpx * (gradX0[x + 1] - gradX1[x + 1]) + tmpy * (gradY0[x + 1] - gradY1[x + 1]); // same computation for sample x + 1
+      b = ((b + 1) >> 1);
+      dst[x + 1] = ClipPel((int16_t)rightShift((src0[x + 1] + src1[x + 1] + b + offset), shift), clpRng);
+
+      b = tmpx * (gradX0[x + 2] - gradX1[x + 2]) + tmpy * (gradY0[x + 2] - gradY1[x + 2]); // same computation for sample x + 2
+      b = ((b + 1) >> 1);
+      dst[x + 2] = ClipPel((int16_t)rightShift((src0[x + 2] + src1[x + 2] + b + offset), shift), clpRng);
+
+      b = tmpx * (gradX0[x + 3] - gradX1[x + 3]) + tmpy * (gradY0[x + 3] - gradY1[x + 3]); // same computation for sample x + 3
+      b = ((b + 1) >> 1);
+      dst[x + 3] = ClipPel((int16_t)rightShift((src0[x + 3] + src1[x + 3] + b + offset), shift), clpRng);
+    }
+    dst += dstStride;       src0 += src0Stride;     src1 += src1Stride; // advance all row pointers by their own strides
+    gradX0 += gradStride; gradX1 += gradStride; gradY0 += gradStride; gradY1 += gradStride; // the four gradient planes share one stride
+  }
+}
+
+void gradFilterCore(Pel* pSrc, int srcStride, int width, int height, int gradStride, Pel* gradX, Pel* gradY)
+{ // Fills gradX/gradY (width x height) with central differences of pSrc for the interior and replicates the interior edges into the outer border.
+  Pel* srcTmp = pSrc + srcStride + 1; // interior starts one row down and one column in
+  Pel* gradXTmp = gradX + gradStride + 1;
+  Pel* gradYTmp = gradY + gradStride + 1;
+
+  for (int y = 0; y < (height - 2 * BIO_EXTEND_SIZE); y++) // pass 1: gradients for the interior region only
+  {
+    for (int x = 0; x < (width - 2 * BIO_EXTEND_SIZE); x++)
+    {
+      gradYTmp[x] = (srcTmp[x + srcStride] - srcTmp[x - srcStride]) >> 4; // vertical: sample below minus sample above, scaled down by 4 bits
+      gradXTmp[x] = (srcTmp[x + 1] - srcTmp[x - 1]) >> 4; // horizontal: right neighbour minus left neighbour, scaled down by 4 bits
+    }
+    gradXTmp += gradStride;
+    gradYTmp += gradStride;
+    srcTmp += srcStride;
+  }
+
+  gradXTmp = gradX + gradStride + 1; // rewind to the first interior sample
+  gradYTmp = gradY + gradStride + 1;
+  for (int y = 0; y < (height - 2 * BIO_EXTEND_SIZE); y++) // pass 2: copy first/last interior column into the left/right border column
+  {
+    gradXTmp[-1] = gradXTmp[0]; // writes exactly one border sample per side
+    gradXTmp[width - 2 * BIO_EXTEND_SIZE] = gradXTmp[width - 2 * BIO_EXTEND_SIZE - 1];
+    gradXTmp += gradStride;
+
+    gradYTmp[-1] = gradYTmp[0];
+    gradYTmp[width - 2 * BIO_EXTEND_SIZE] = gradYTmp[width - 2 * BIO_EXTEND_SIZE - 1];
+    gradYTmp += gradStride;
+  }
+
+  gradXTmp = gradX + gradStride; // pass 3: start of the first interior row, including its left border sample
+  gradYTmp = gradY + gradStride;
+  ::memcpy(gradXTmp - gradStride, gradXTmp, sizeof(Pel)*(width)); // top border row <- first interior row (full width, so corners are filled too)
+  ::memcpy(gradXTmp + (height - 2 * BIO_EXTEND_SIZE)*gradStride, gradXTmp + (height - 2 * BIO_EXTEND_SIZE - 1)*gradStride, sizeof(Pel)*(width)); // bottom border row <- last interior row
+  ::memcpy(gradYTmp - gradStride, gradYTmp, sizeof(Pel)*(width)); // same two row copies for the vertical gradient plane
+  ::memcpy(gradYTmp + (height - 2 * BIO_EXTEND_SIZE)*gradStride, gradYTmp + (height - 2 * BIO_EXTEND_SIZE - 1)*gradStride, sizeof(Pel)*(width));
+}
+
+void calcBIOParCore(const Pel* srcY0Temp, const Pel* srcY1Temp, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0, const Pel* gradY1, int* dotProductTemp1, int* dotProductTemp2, int* dotProductTemp3, int* dotProductTemp5, int* dotProductTemp6, const int src0Stride, const int src1Stride, const int gradStride, const int widthG, const int heightG)
+{ // For every sample of a widthG x heightG grid, writes five products of the summed gradients and the prediction difference into the dotProductTemp arrays.
+  for (int y = 0; y < heightG; y++)
+  {
+    for (int x = 0; x < widthG; x++)
+    {
+      int temp = (srcY0Temp[x] >> 6) - (srcY1Temp[x] >> 6); // difference of the two predictions, each reduced by 6 bits first
+      int tempX = (gradX0[x] + gradX1[x]) >> 3; // summed horizontal gradients of both lists, reduced by 3 bits
+      int tempY = (gradY0[x] + gradY1[x]) >> 3; // summed vertical gradients of both lists, reduced by 3 bits
+      dotProductTemp1[x] = tempX * tempX; // Gx * Gx
+      dotProductTemp2[x] = tempX * tempY; // Gx * Gy
+      dotProductTemp3[x] = -tempX * temp; // -Gx * dI
+      dotProductTemp5[x] = tempY * tempY; // Gy * Gy
+      dotProductTemp6[x] = -tempY * temp; // -Gy * dI
+    }
+    srcY0Temp += src0Stride; // inputs advance by their own strides
+    srcY1Temp += src1Stride;
+    gradX0 += gradStride;
+    gradX1 += gradStride;
+    gradY0 += gradStride;
+    gradY1 += gradStride;
+    dotProductTemp1 += widthG; // outputs are stored densely with a row pitch of widthG
+    dotProductTemp2 += widthG;
+    dotProductTemp3 += widthG;
+    dotProductTemp5 += widthG;
+    dotProductTemp6 += widthG;
+  }
+}
+
+void calcBlkGradientCore(int sx, int sy, int     *arraysGx2, int     *arraysGxGy, int     *arraysGxdI, int     *arraysGy2, int     *arraysGydI, int     &sGx2, int     &sGy2, int     &sGxGy, int     &sGxdI, int     &sGydI, int width, int height, int unitSize)
+{ // Adds the five per-sample product arrays over a (unitSize + 2*BIO_EXTEND_SIZE)^2 window into the reference outputs; sx, sy and height are not read here.
+  int     *Gx2 = arraysGx2; // each pointer addresses the top-left sample of the unit inside an array with row pitch `width`
+  int     *Gy2 = arraysGy2;
+  int     *GxGy = arraysGxGy;
+  int     *GxdI = arraysGxdI;
+  int     *GydI = arraysGydI;
+
+  // step back BIO_EXTEND_SIZE rows so the window starts on the extended top border; the x loop below covers the left/right extension
+  Gx2 -= (BIO_EXTEND_SIZE*width);
+  Gy2 -= (BIO_EXTEND_SIZE*width);
+  GxGy -= (BIO_EXTEND_SIZE*width);
+  GxdI -= (BIO_EXTEND_SIZE*width);
+  GydI -= (BIO_EXTEND_SIZE*width);
+
+  for (int y = -BIO_EXTEND_SIZE; y < unitSize + BIO_EXTEND_SIZE; y++)
+  {
+    for (int x = -BIO_EXTEND_SIZE; x < unitSize + BIO_EXTEND_SIZE; x++) // negative x reads samples left of the unit
+    {
+      sGx2 += Gx2[x]; // outputs are accumulated, not reset: the caller must initialise them
+      sGy2 += Gy2[x];
+      sGxGy += GxGy[x];
+      sGxdI += GxdI[x];
+      sGydI += GydI[x];
+    }
+    Gx2 += width; // next row of the window
+    Gy2 += width;
+    GxGy += width;
+    GxdI += width;
+    GydI += width;
+  }
+}
+#endif
+
 #if ENABLE_SIMD_OPT_GBI && JVET_L0646_GBI
 void removeWeightHighFreq(int16_t* dst, int dstStride, const int16_t* src, int srcStride, int width, int height, int shift, int gbiWeight)
 {
@@ -138,6 +270,13 @@ PelBufferOps::PelBufferOps()
   linTf4 = linTfCore<Pel>;
   linTf8 = linTfCore<Pel>;
 
+#if JVET_L0256_BIO
+  addBIOAvg4      = addBIOAvgCore;
+  bioGradFilter   = gradFilterCore;
+  calcBIOPar      = calcBIOParCore;
+  calcBlkGradient = calcBlkGradientCore;
+#endif
+
 #if ENABLE_SIMD_OPT_GBI
   removeWeightHighFreq8 = removeWeightHighFreq;
   removeWeightHighFreq4 = removeWeightHighFreq;
diff --git a/source/Lib/CommonLib/Buffer.h b/source/Lib/CommonLib/Buffer.h
index fdf3b962f774c18efc1c9433d496415ffcc74cf2..a0142743f3efd1b287efa3c039b80e038aeb6db6 100644
--- a/source/Lib/CommonLib/Buffer.h
+++ b/source/Lib/CommonLib/Buffer.h
@@ -68,6 +68,12 @@ struct PelBufferOps
   void ( *reco8 )         ( const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, int width, int height,                                   const ClpRng& clpRng );
   void ( *linTf4 )        ( const Pel* src0, int src0Stride,                                  Pel *dst, int dstStride, int width, int height, int scale, int shift, int offset, const ClpRng& clpRng, bool bClip );
   void ( *linTf8 )        ( const Pel* src0, int src0Stride,                                  Pel *dst, int dstStride, int width, int height, int scale, int shift, int offset, const ClpRng& clpRng, bool bClip );
+#if JVET_L0256_BIO
+  void(*addBIOAvg4)    (const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, const Pel *gradX0, const Pel *gradX1, const Pel *gradY0, const Pel*gradY1, int gradStride, int width, int height, int tmpx, int tmpy, int shift, int offset, const ClpRng& clpRng);
+  void(*bioGradFilter) (Pel* pSrc, int srcStride, int width, int height, int gradStride, Pel* gradX, Pel* gradY);
+  void(*calcBIOPar)    (const Pel* srcY0Temp, const Pel* srcY1Temp, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0, const Pel* gradY1, int* dotProductTemp1, int* dotProductTemp2, int* dotProductTemp3, int* dotProductTemp5, int* dotProductTemp6, const int src0Stride, const int src1Stride, const int gradStride, const int widthG, const int heightG);
+  void(*calcBlkGradient)(int sx, int sy, int    *arraysGx2, int     *arraysGxGy, int     *arraysGxdI, int     *arraysGy2, int     *arraysGydI, int     &sGx2, int     &sGy2, int     &sGxGy, int     &sGxdI, int     &sGydI, int width, int height, int unitSize);
+#endif
 #if ENABLE_SIMD_OPT_GBI
   void ( *removeWeightHighFreq8)  ( Pel* src0, int src0Stride, const Pel* src1, int src1Stride, int width, int height, int shift, int gbiWeight);
   void ( *removeWeightHighFreq4)  ( Pel* src0, int src0Stride, const Pel* src1, int src1Stride, int width, int height, int shift, int gbiWeight);
diff --git a/source/Lib/CommonLib/CommonDef.h b/source/Lib/CommonLib/CommonDef.h
index 19f2cd861515f2c9d76a6f26f59a10f8c93140d6..02bbfa14e463147ba051ba1c709dec33cc5ef0bb 100644
--- a/source/Lib/CommonLib/CommonDef.h
+++ b/source/Lib/CommonLib/CommonDef.h
@@ -318,6 +318,12 @@ static const int MAX_NUM_GT2_BINS_4x4SUBBLOCK =                     4; ///< max
 static const int MAX_NUM_REG_BINS_2x2SUBBLOCK =                     8; ///< max number of context-coded bins (incl. gt2 bins) per 2x2 subblock (chroma)
 static const int MAX_NUM_GT2_BINS_2x2SUBBLOCK =                     2; ///< max number of gt2 bins per 2x2 subblock (chroma)
 #endif
+
+#if JVET_L0256_BIO
+static const int BIO_EXTEND_SIZE              =                     1; ///< number of samples by which the BIO gradient and correlation windows extend beyond each block side
+static const int BIO_TEMP_BUFFER_SIZE         =                     (MAX_CU_SIZE + 2 * BIO_EXTEND_SIZE) * (MAX_CU_SIZE + 2 * BIO_EXTEND_SIZE); ///< samples per BIO temporary buffer: a maximum-size CU plus the extension on every side
+#endif
+
 #if JVET_L0646_GBI
 static const int GBI_NUM =                                          5; ///< the number of weight options
 static const int GBI_DEFAULT =                                      ((uint8_t)(GBI_NUM >> 1)); ///< Default weighting index representing for w=0.5
@@ -397,6 +403,10 @@ static const int NTAPS_CHROMA             =                         4; ///< Numb
 static const int MAX_LADF_INTERVALS       =                         5; /// max number of luma adaptive deblocking filter qp offset intervals
 #endif
 
+#if JVET_L0256_BIO
+static const int NTAPS_BILINEAR           =                         2; ///< Number of taps for bilinear filter
+#endif
+
 // ====================================================================================================================
 // Macro functions
 // ====================================================================================================================
diff --git a/source/Lib/CommonLib/InterPrediction.cpp b/source/Lib/CommonLib/InterPrediction.cpp
index c6ea5e914dc34da5d0886b1afb10108142b98bf4..d980b24cc05c874241e46be5acb47342d03830ad 100644
--- a/source/Lib/CommonLib/InterPrediction.cpp
+++ b/source/Lib/CommonLib/InterPrediction.cpp
@@ -55,6 +55,13 @@ InterPrediction::InterPrediction()
   m_currChromaFormat( NUM_CHROMA_FORMAT )
 , m_maxCompIDToPred ( MAX_NUM_COMPONENT )
 , m_pcRdCost        ( nullptr )
+#if JVET_L0256_BIO
+, m_gradX0(nullptr)
+, m_gradY0(nullptr)
+, m_gradX1(nullptr)
+, m_gradY1(nullptr)
+, m_subPuMC(false)
+#endif
 {
   for( uint32_t ch = 0; ch < MAX_NUM_COMPONENT; ch++ )
   {
@@ -109,6 +116,13 @@ void InterPrediction::destroy()
       m_filteredBlockTmp[i][c] = nullptr;
     }
   }
+
+#if JVET_L0256_BIO
+  xFree(m_gradX0);   m_gradX0 = nullptr;
+  xFree(m_gradY0);   m_gradY0 = nullptr;
+  xFree(m_gradX1);   m_gradX1 = nullptr;
+  xFree(m_gradY1);   m_gradY1 = nullptr;
+#endif
 }
 
 void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chromaFormatIDC )
@@ -127,8 +141,13 @@ void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chromaFormatIDC )
   {
     for( uint32_t c = 0; c < MAX_NUM_COMPONENT; c++ )
     {
+#if JVET_L0256_BIO
+      int extWidth = MAX_CU_SIZE + (2 * BIO_EXTEND_SIZE + 2) + 16;
+      int extHeight = MAX_CU_SIZE + (2 * BIO_EXTEND_SIZE + 2) + 1;
+#else
       int extWidth  = MAX_CU_SIZE + 16;
       int extHeight = MAX_CU_SIZE + 1;
+#endif
       for( uint32_t i = 0; i < LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS; i++ )
       {
         m_filteredBlockTmp[i][c] = ( Pel* ) xMalloc( Pel, ( extWidth + 4 ) * ( extHeight + 7 + 4 ) );
@@ -148,7 +167,13 @@ void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chromaFormatIDC )
 
 
     m_iRefListIdx = -1;
-    
+  
+#if JVET_L0256_BIO
+    m_gradX0 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE);
+    m_gradY0 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE);
+    m_gradX1 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE);
+    m_gradY1 = (Pel*)xMalloc(Pel, BIO_TEMP_BUFFER_SIZE);
+#endif
   }
 
 #if !JVET_J0090_MEMORY_BANDWITH_MEASURE
@@ -264,6 +289,10 @@ void InterPrediction::xSubPuMC( PredictionUnit& pu, PelUnitBuf& predBuf, const R
   int  fstStep = (!verMC ? puHeight : puWidth);
   int  secStep = (!verMC ? puWidth : puHeight);
 
+#if JVET_L0256_BIO
+  m_subPuMC = true;
+#endif
+
   for (int fstDim = fstStart; fstDim < fstEnd; fstDim += fstStep)
   {
     for (int secDim = secStart; secDim < secEnd; secDim += secStep)
@@ -299,10 +328,16 @@ void InterPrediction::xSubPuMC( PredictionUnit& pu, PelUnitBuf& predBuf, const R
       secDim = later - secStep;
     }
   }
+#if JVET_L0256_BIO
+  m_subPuMC = false;
+#endif
 }
 
 
 void InterPrediction::xPredInterUni(const PredictionUnit& pu, const RefPicList& eRefPicList, PelUnitBuf& pcYuvPred, const bool& bi 
+#if JVET_L0256_BIO
+                                   ,const bool& bioApplied /*=false*/
+#endif
 )
 {
   const SPS &sps = *pu.cs->sps;
@@ -332,12 +367,18 @@ void InterPrediction::xPredInterUni(const PredictionUnit& pu, const RefPicList&
     const ComponentID compID = ComponentID( comp );
     if ( pu.cu->affine )
     {
+#if JVET_L0256_BIO
+      CHECK( bioApplied, "BIO is not allowed with affine" );
+#endif
       xPredAffineBlk( compID, pu, pu.cu->slice->getRefPic( eRefPicList, iRefIdx ), mv, pcYuvPred, bi, pu.cu->slice->clpRng( compID ) );
     }
     else
     {
       xPredInterBlk( compID, pu, pu.cu->slice->getRefPic( eRefPicList, iRefIdx ), mv[0], pcYuvPred, bi, pu.cu->slice->clpRng( compID )
-                    );
+#if JVET_L0256_BIO
+                    ,bioApplied
+#endif
+      );
     }
   }
 }
@@ -347,6 +388,36 @@ void InterPrediction::xPredInterBi(PredictionUnit& pu, PelUnitBuf &pcYuvPred)
   const PPS   &pps   = *pu.cs->pps;
   const Slice &slice = *pu.cs->slice;
 
+#if JVET_L0256_BIO
+  bool bioApplied = false;
+  if (pu.cs->sps->getSpsNext().getUseBIO())
+  {
+    if (pu.cu->affine || m_subPuMC)
+    {
+      bioApplied = false;
+    }
+    else
+    {
+      const bool biocheck0 = !(pps.getWPBiPred() && slice.getSliceType() == B_SLICE);
+      const bool biocheck1 = !(pps.getUseWP() && slice.getSliceType() == P_SLICE);
+      if (biocheck0
+        && biocheck1
+        && PU::isBiPredFromDifferentDir(pu)
+        && !(pu.Y().height == 4 || (pu.Y().width == 4 && pu.Y().height == 8))
+       )
+      {
+        bioApplied = true;
+      }
+    }
+
+#if JVET_L0646_GBI
+    if (pu.cu->cs->sps->getSpsNext().getUseGBi() && bioApplied && pu.cu->GBiIdx != GBI_DEFAULT)
+    {
+      bioApplied = false;
+    }
+#endif
+  }
+#endif
 
   for (uint32_t refList = 0; refList < NUM_REF_PIC_LIST_01; refList++)
   {
@@ -367,6 +438,9 @@ void InterPrediction::xPredInterBi(PredictionUnit& pu, PelUnitBuf &pcYuvPred)
     if (pu.refIdx[0] >= 0 && pu.refIdx[1] >= 0)
     {
       xPredInterUni ( pu, eRefPicList, pcMbBuf, true
+#if JVET_L0256_BIO
+                     ,bioApplied 
+#endif
                      );
     }
     else
@@ -399,13 +473,19 @@ void InterPrediction::xPredInterBi(PredictionUnit& pu, PelUnitBuf &pcYuvPred)
   }
   else
   {
+#if JVET_L0256_BIO
+    xWeightedAverage( pu, srcPred0, srcPred1, pcYuvPred, slice.getSPS()->getBitDepths(), slice.clpRngs(), bioApplied );
+#else
     xWeightedAverage( pu, srcPred0, srcPred1, pcYuvPred, slice.getSPS()->getBitDepths(), slice.clpRngs() );
+#endif
   }
 }
 
-
 void InterPrediction::xPredInterBlk ( const ComponentID& compID, const PredictionUnit& pu, const Picture* refPic, const Mv& _mv, PelUnitBuf& dstPic, const bool& bi, const ClpRng& clpRng
-                                    )
+#if JVET_L0256_BIO
+                                     ,const bool& bioApplied /*=false*/
+#endif
+)
 {
   JVET_J0090_SET_REF_PICTURE( refPic, compID );
   const ChromaFormat  chFmt = pu.chromaFormat;
@@ -446,24 +526,75 @@ void InterPrediction::xPredInterBlk ( const ComponentID& compID, const Predictio
     refBuf = refPic->getRecoBuf( CompArea( compID, chFmt, offset, pu.blocks[compID].size() ) );
   }
 
+#if JVET_L0256_BIO
+  // backup data
+  int backupWidth = width;
+  int backupHeight = height;
+  Pel *backupDstBufPtr = dstBuf.buf;
+  int backupDstBufStride = dstBuf.stride;
+
+  if (bioApplied && compID == COMPONENT_Y)
+  {
+    width = width + 2 * BIO_EXTEND_SIZE + 2;
+    height = height + 2 * BIO_EXTEND_SIZE + 2;
+
+    // change MC output
+    dstBuf.stride = width;
+    dstBuf.buf = m_filteredBlockTmp[2 + m_iRefListIdx][compID] + 2 * dstBuf.stride + 2;
+  }
+#endif
+
   if( yFrac == 0 )
   {
+#if JVET_L0256_BIO
+    m_if.filterHor(compID, (Pel*)refBuf.buf, refBuf.stride, dstBuf.buf, dstBuf.stride, backupWidth, backupHeight, xFrac, rndRes, chFmt, clpRng);
+#else
     m_if.filterHor(compID, (Pel*) refBuf.buf, refBuf.stride, dstBuf.buf, dstBuf.stride, width, height, xFrac, rndRes, chFmt, clpRng);
+#endif
   }
   else if( xFrac == 0 )
   {
+#if JVET_L0256_BIO
+    m_if.filterVer(compID, (Pel*)refBuf.buf, refBuf.stride, dstBuf.buf, dstBuf.stride, backupWidth, backupHeight, yFrac, true, rndRes, chFmt, clpRng);
+#else
     m_if.filterVer(compID, (Pel*) refBuf.buf, refBuf.stride, dstBuf.buf, dstBuf.stride, width, height, yFrac, true, rndRes, chFmt, clpRng);
+#endif
   }
   else
   {
-      PelBuf tmpBuf = PelBuf(m_filteredBlockTmp[0][compID], pu.blocks[compID]);
+    PelBuf tmpBuf = PelBuf(m_filteredBlockTmp[0][compID], pu.blocks[compID]);
+#if JVET_L0256_BIO
+    tmpBuf.stride = dstBuf.stride;
+#endif
 
     int vFilterSize = isLuma(compID) ? NTAPS_LUMA : NTAPS_CHROMA;
+#if JVET_L0256_BIO
+    m_if.filterHor(compID, (Pel*)refBuf.buf - ((vFilterSize >> 1) - 1) * refBuf.stride, refBuf.stride, tmpBuf.buf, tmpBuf.stride, backupWidth, backupHeight + vFilterSize - 1, xFrac, false, chFmt, clpRng);
+#else
     m_if.filterHor(compID, (Pel*) refBuf.buf - ((vFilterSize >> 1) - 1) * refBuf.stride, refBuf.stride, tmpBuf.buf, tmpBuf.stride, width, height + vFilterSize - 1, xFrac, false,         chFmt, clpRng);
+#endif
     JVET_J0090_SET_CACHE_ENABLE( false );
+#if JVET_L0256_BIO
+    m_if.filterVer(compID, (Pel*)tmpBuf.buf + ((vFilterSize >> 1) - 1) * tmpBuf.stride, tmpBuf.stride, dstBuf.buf, dstBuf.stride, backupWidth, backupHeight, yFrac, false, rndRes, chFmt, clpRng);
+#else
     m_if.filterVer(compID, (Pel*) tmpBuf.buf + ((vFilterSize >> 1) - 1) * tmpBuf.stride, tmpBuf.stride, dstBuf.buf, dstBuf.stride, width, height,                   yFrac, false, rndRes, chFmt, clpRng);
+#endif
     JVET_J0090_SET_CACHE_ENABLE( true );
   }
+#if JVET_L0256_BIO
+  if (bioApplied && compID == COMPONENT_Y)
+  {
+    refBuf.buf = refBuf.buf - refBuf.stride - 1;
+    dstBuf.buf = m_filteredBlockTmp[2 + m_iRefListIdx][compID] + dstBuf.stride + 1;
+    bioSampleExtendBilinearFilter(refBuf.buf, refBuf.stride, dstBuf.buf, dstBuf.stride, width - 2, height - 2, 1, xFrac, yFrac, rndRes, chFmt, clpRng);
+
+    // restore data 
+    width = backupWidth;
+    height = backupHeight;
+    dstBuf.buf = backupDstBufPtr;
+    dstBuf.stride = backupDstBufStride;
+  }
+#endif
 }
 
 void InterPrediction::xPredAffineBlk( const ComponentID& compID, const PredictionUnit& pu, const Picture* refPic, const Mv* _mv, PelUnitBuf& dstPic, const bool& bi, const ClpRng& clpRng )
@@ -612,8 +743,224 @@ int getMSB( unsigned x )
   return msb;
 }
 
+#if JVET_L0256_BIO
+void InterPrediction::applyBiOptFlow(const PredictionUnit &pu, const CPelUnitBuf &yuvSrc0, const CPelUnitBuf &yuvSrc1, const int &refIdx0, const int &refIdx1, PelUnitBuf &yuvDst, const BitDepths &clipBitDepths)
+{ // Writes the BIO-refined luma bi-prediction into yuvDst.Y(); inputs are the extended predictions in m_filteredBlockTmp[2]/[3] (yuvSrc0/1 and refIdx0/1 are not read here).
+  const int     height = yuvDst.Y().height;
+  const int     width = yuvDst.Y().width;
+  int           heightG = height + 2 * BIO_EXTEND_SIZE; // gradient / product grid: block plus the extension on every side
+  int           widthG = width + 2 * BIO_EXTEND_SIZE;
+  int           offsetPos = widthG*BIO_EXTEND_SIZE + BIO_EXTEND_SIZE; // index of the first block sample inside a widthG-pitched grid
+
+  Pel*          gradX0 = m_gradX0;
+  Pel*          gradX1 = m_gradX1;
+  Pel*          gradY0 = m_gradY0;
+  Pel*          gradY1 = m_gradY1;
+
+  int           stridePredMC = widthG + 2; // stride of the extended prediction buffers (block width + 2 * BIO_EXTEND_SIZE + 2)
+  const Pel*    srcY0 = m_filteredBlockTmp[2][COMPONENT_Y] + stridePredMC + 1; // position (1,1) of the list-0 buffer = origin of the widthG x heightG grid
+  const Pel*    srcY1 = m_filteredBlockTmp[3][COMPONENT_Y] + stridePredMC + 1; // same position in the list-1 buffer
+  const int     src0Stride = stridePredMC;
+  const int     src1Stride = stridePredMC;
+
+  Pel*          dstY = yuvDst.Y().buf;
+  const int     dstStride = yuvDst.Y().stride;
+  const Pel*    srcY0Temp = srcY0;
+  const Pel*    srcY1Temp = srcY1;
+
+  for (int refList = 0; refList < NUM_REF_PIC_LIST_01; refList++) // per list: compute gradients, then pad the prediction borders
+  {
+    Pel* dstTempPtr = m_filteredBlockTmp[2 + refList][COMPONENT_Y] + stridePredMC + 1;
+    Pel* gradY = (refList == 0) ? m_gradY0 : m_gradY1;
+    Pel* gradX = (refList == 0) ? m_gradX0 : m_gradX1;
+
+    g_pelBufOP.bioGradFilter(dstTempPtr, stridePredMC, widthG, heightG, widthG, gradX, gradY); // gradients are stored with row pitch widthG
+    Pel* padStr = m_filteredBlockTmp[2 + refList][COMPONENT_Y] + 2 * stridePredMC + 2; // block origin at (2,2)
+    for (int y = 0; y< height; y++) // overwrite the column left/right of the block with the block's edge samples
+    {
+      padStr[-1] = padStr[0];
+      padStr[width] = padStr[width - 1];
+      padStr += stridePredMC;
+    }
+
+    padStr = m_filteredBlockTmp[2 + refList][COMPONENT_Y] + 2 * stridePredMC + 1; // first block row, starting at its left padding sample
+    ::memcpy(padStr - stridePredMC, padStr, sizeof(Pel)*(widthG)); // row above the block <- first block row
+    ::memcpy(padStr + height*stridePredMC, padStr + (height - 1)*stridePredMC, sizeof(Pel)*(widthG)); // row below the block <- last block row
+  }
+
+  const ClpRng& clpRng = pu.cu->cs->slice->clpRng(COMPONENT_Y);
+  const int   bitDepth = clipBitDepths.recon[toChannelType(COMPONENT_Y)];
+  const int   shiftNum = IF_INTERNAL_PREC + 1 - bitDepth; // final normalisation shift applied to the sum of the two predictions
+  const int   offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS; // rounding term plus twice IF_INTERNAL_OFFS
+  const int   limit = ((int)1 << (4 + IF_INTERNAL_PREC - bitDepth - 5)); // clipping bound for the refinement values tmpx/tmpy
+
+  int*     dotProductTemp1 = m_dotProduct1;
+  int*     dotProductTemp2 = m_dotProduct2;
+  int*     dotProductTemp3 = m_dotProduct3;
+  int*     dotProductTemp5 = m_dotProduct5;
+  int*     dotProductTemp6 = m_dotProduct6;
+
+  g_pelBufOP.calcBIOPar(srcY0Temp, srcY1Temp, gradX0, gradX1, gradY0, gradY1, dotProductTemp1, dotProductTemp2, dotProductTemp3, dotProductTemp5, dotProductTemp6, src0Stride, src1Stride, widthG, widthG, heightG); // per-sample products over the whole extended grid
+
+  int xUnit = (width >> 2); // number of 4x4 sub-blocks per row / column
+  int yUnit = (height >> 2);
+
+  Pel *dstY0 = dstY;
+  gradX0 = m_gradX0; gradX1 = m_gradX1;
+  gradY0 = m_gradY0; gradY1 = m_gradY1;
+
+  for (int yu = 0; yu < yUnit; yu++)
+  {
+    for (int xu = 0; xu < xUnit; xu++)
+    {
+      if (m_bioPredSubBlkDist[yu*xUnit + xu] < m_bioSubBlkDistThres) // low distortion between the two predictions: plain average, no refinement
+      {
+        srcY0Temp = srcY0 + (stridePredMC + 1) + ((yu*src0Stride + xu) << 2); // (stridePredMC + 1) moves from the grid origin to the block origin
+        srcY1Temp = srcY1 + (stridePredMC + 1) + ((yu*src1Stride + xu) << 2);
+        dstY0 = dstY + ((yu*dstStride + xu) << 2);
+        g_pelBufOP.addAvg4(srcY0Temp, src0Stride, srcY1Temp, src1Stride, dstY0, dstStride, (1 << 2), (1 << 2), shiftNum, offset, clpRng);
+        continue;
+      }
+
+      int     sGxdI = 0, sGydI = 0, sGxGy = 0, sGx2 = 0, sGy2 = 0; // window sums; calcBlkGradient accumulates into them
+      int     tmpx = 0, tmpy = 0; // refinement values; stay 0 when the matching denominator is not positive
 
+      dotProductTemp1 = m_dotProduct1 + offsetPos + ((yu*widthG + xu) << 2); // top-left sample of this 4x4 unit inside the product grids
+      dotProductTemp2 = m_dotProduct2 + offsetPos + ((yu*widthG + xu) << 2);
+      dotProductTemp3 = m_dotProduct3 + offsetPos + ((yu*widthG + xu) << 2);
+      dotProductTemp5 = m_dotProduct5 + offsetPos + ((yu*widthG + xu) << 2);
+      dotProductTemp6 = m_dotProduct6 + offsetPos + ((yu*widthG + xu) << 2);
+
+      g_pelBufOP.calcBlkGradient(xu << 2, yu << 2, dotProductTemp1, dotProductTemp2, dotProductTemp3, dotProductTemp5, dotProductTemp6, sGx2, sGy2, sGxGy, sGxdI, sGydI, widthG, heightG, (1 << 2));
+
+      if (sGx2 > 0)
+      {
+        tmpx = rightShiftMSB(sGxdI << 3, sGx2); // shift-based approximation of (sGxdI * 8) / sGx2
+        tmpx = Clip3(-limit, limit, tmpx);
+      }
+      if (sGy2 > 0)
+      {
+        int     mainsGxGy = sGxGy >> 12; // sGxGy is split into its upper part and its low 12 bits before multiplying by tmpx
+        int     secsGxGy = sGxGy & ((1 << 12) - 1);
+        int     tmpData = tmpx * mainsGxGy;
+        tmpData = ((tmpData << 12) + tmpx*secsGxGy) >> 1; // recombines to (tmpx * sGxGy) / 2
+        tmpy = rightShiftMSB(((sGydI << 3) - tmpData), sGy2);
+        tmpy = Clip3(-limit, limit, tmpy);
+      }
+
+      srcY0Temp = srcY0 + (stridePredMC + 1) + ((yu*src0Stride + xu) << 2);
+      srcY1Temp = srcY1 + (stridePredMC + 1) + ((yu*src1Stride + xu) << 2); // list 1 uses its own stride, matching the early-out branch above
+      gradX0 = m_gradX0 + offsetPos + ((yu*widthG + xu) << 2);
+      gradX1 = m_gradX1 + offsetPos + ((yu*widthG + xu) << 2);
+      gradY0 = m_gradY0 + offsetPos + ((yu*widthG + xu) << 2);
+      gradY1 = m_gradY1 + offsetPos + ((yu*widthG + xu) << 2);
+
+      dstY0 = dstY + ((yu*dstStride + xu) << 2);
+      g_pelBufOP.addBIOAvg4(srcY0Temp, src0Stride, srcY1Temp, src1Stride, dstY0, dstStride, gradX0, gradX1, gradY0, gradY1, widthG, (1 << 2), (1 << 2), (int)tmpx, (int)tmpy, shiftNum, offset, clpRng); // refined average for this 4x4 unit
+    }  // xu
+  }  // yu
+}
+
+void InterPrediction::bioSampleExtendBilinearFilter(Pel const* src, int srcStride, Pel *dst, int dstStride, int width, int height, int dim, int fracX, int fracY, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng)
+{ // Interpolates the four border strips (top, left, bottom, right; thickness dim) of a width x height region from src into dst.
+  Pel const* pSrc = nullptr; // nullptr matches the rest of this file
+  Pel*       pDst = nullptr;
+
+  int vFilterSize = NTAPS_LUMA; // row margin used for the intermediate buffer in the 2-D case; NOTE(review): NTAPS_BILINEAR exists - confirm NTAPS_LUMA is intended
+  int widthTmp = 0;
+  int heightTmp = 0;
+
+  for (int cand = 0; cand < 4; cand++)  // top, left, bottom and right
+  {
+
+    if (cand == 0)  // top
+    {
+      pSrc = src;
+      pDst = dst;
+      widthTmp = width; // full-width strip, includes both top corners
+      heightTmp = dim;
+    }
+    else if (cand == 1)  // left
+    {
+      pSrc = src + dim*srcStride;
+      pDst = dst + dim*dstStride;
+      widthTmp = dim;
+      heightTmp = height - 2 * dim; // corner rows are already covered by the top/bottom strips
+    }
+    else if (cand == 2)  // bottom
+    {
+      pSrc = src + (height - dim)*srcStride;
+      pDst = dst + (height - dim)*dstStride;
+      widthTmp = width; // full-width strip, includes both bottom corners
+      heightTmp = dim;
+    }
+    else if (cand == 3)  // right
+    {
+      pSrc = src + dim*srcStride + width - dim;
+      pDst = dst + dim*dstStride + width - dim;
+      widthTmp = dim;
+      heightTmp = height - 2 * dim;
+    }
+
+    if (fracY == 0) // horizontal-only fractional position
+    {
+      m_if.filterHor(COMPONENT_Y, pSrc, srcStride, pDst, dstStride, widthTmp, heightTmp, fracX, isLast, fmt, clpRng, 1); // trailing 1 presumably selects the bilinear filter - confirm against InterpolationFilter
+    }
+    else if (fracX == 0) // vertical-only fractional position
+    {
+      m_if.filterVer(COMPONENT_Y, pSrc, srcStride, pDst, dstStride, widthTmp, heightTmp, fracY, true, isLast, fmt, clpRng, 1);
+    }
+    else // both fractional: horizontal pass into a temporary buffer, then vertical pass into dst
+    {
+      PelBuf tmpBuf = PelBuf(m_filteredBlockTmp[0][COMPONENT_Y], Size(width, height));
+      tmpBuf.stride = width;
+
+      m_if.filterHor(COMPONENT_Y, pSrc - ((vFilterSize >> 1) - 1) * srcStride, srcStride, tmpBuf.buf, tmpBuf.stride, widthTmp, heightTmp + vFilterSize - 1, fracX, false, fmt, clpRng, 1); // extra rows above/below feed the vertical pass
+      m_if.filterVer(COMPONENT_Y, tmpBuf.buf + ((vFilterSize >> 1) - 1) * tmpBuf.stride, tmpBuf.stride, pDst, dstStride, widthTmp, heightTmp, fracY, false, isLast, fmt, clpRng, 1);
+    }
+  }
+}
+
+bool InterPrediction::xCalcBiPredSubBlkDist(const PredictionUnit &pu, const Pel* pYuvSrc0, const int src0Stride, const Pel* pYuvSrc1, const int src1Stride, const BitDepths &clipBitDepths)
+{ // Stores the distortion between the two luma predictions for every 4x4 sub-block and returns true when the total reaches m_bioDistThres.
+  const int     width = pu.lwidth();
+  const int     height = pu.lheight();
+  const int     clipbd = clipBitDepths.recon[toChannelType(COMPONENT_Y)];
+  const uint32_t distortionShift = DISTORTION_PRECISION_ADJUSTMENT(clipbd);
+  const int     shift = std::max<int>(2, (IF_INTERNAL_PREC - clipbd)); // never less than 2
+  const int     xUnit = (width >> 2); // number of 4x4 sub-blocks per row / column
+  const int     yUnit = (height >> 2);
+
+  m_bioDistThres = (shift <= 5) ? (((32 << (clipbd - 8))*width*height) >> (5 - shift)) : (((32 << (clipbd - 8))*width*height) << (shift - 5)); // whole-block threshold, scales with width * height
+  m_bioSubBlkDistThres = (shift <= 5) ? (((64 << (clipbd - 8)) << 4) >> (5 - shift)) : (((64 << (clipbd - 8)) << 4) << (shift - 5)); // per-sub-block threshold; the << 4 presumably accounts for the 16 samples of a 4x4 unit
+
+  m_bioDistThres >>= distortionShift; // both thresholds are reduced by the distortion precision adjustment
+  m_bioSubBlkDistThres >>= distortionShift;
+
+  DistParam cDistParam;
+  Distortion dist = 0; // running total over all sub-blocks
+  for (int yu = 0, blkIdx = 0; yu < yUnit; yu++) // blkIdx runs in raster order over the sub-blocks
+  {
+    for (int xu = 0; xu < xUnit; xu++, blkIdx++)
+    {
+      const Pel* pPred0 = pYuvSrc0 + ((yu*src0Stride + xu) << 2); // top-left sample of sub-block (xu, yu) in each prediction
+      const Pel* pPred1 = pYuvSrc1 + ((yu*src1Stride + xu) << 2);
+
+      m_pcRdCost->setDistParam(cDistParam, pPred0, pPred1, src0Stride, src1Stride, clipbd, COMPONENT_Y, (1 << 2), (1 << 2), 0, 1, false, true); // configures a 4x4 comparison; the distortion function is chosen by setDistParam
+      m_bioPredSubBlkDist[blkIdx] = cDistParam.distFunc(cDistParam); // kept for the per-sub-block decision made later
+      dist += m_bioPredSubBlkDist[blkIdx];
+    }
+  }
+
+  return (dist >= m_bioDistThres);
+}
+#endif
+
+#if JVET_L0256_BIO
+void InterPrediction::xWeightedAverage(const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths, const ClpRngs& clpRngs, const bool& bioApplied )
+#else
 void InterPrediction::xWeightedAverage( const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths, const ClpRngs& clpRngs )
+#endif
 {
   const int iRefIdx0 = pu.refIdx[0];
   const int iRefIdx1 = pu.refIdx[1];
@@ -623,11 +970,35 @@ void InterPrediction::xWeightedAverage( const PredictionUnit& pu, const CPelUnit
 #if JVET_L0646_GBI
     if( pu.cu->GBiIdx != GBI_DEFAULT )
     {
+#if JVET_L0256_BIO
+      CHECK(bioApplied, "GBi is disallowed with BIO");
+#endif
       pcYuvDst.addWeightedAvg(pcYuvSrc0, pcYuvSrc1, clpRngs, pu.cu->GBiIdx);
       return;
     }
 #endif
+#if JVET_L0256_BIO
+    if (bioApplied)
+    {
+      const int  src0Stride = pu.lwidth() + 2 * BIO_EXTEND_SIZE + 2;
+      const int  src1Stride = pu.lwidth() + 2 * BIO_EXTEND_SIZE + 2;
+      const Pel* pSrcY0 = m_filteredBlockTmp[2][COMPONENT_Y] + 2 * src0Stride + 2;
+      const Pel* pSrcY1 = m_filteredBlockTmp[3][COMPONENT_Y] + 2 * src1Stride + 2;
+
+      bool bioEnabled = xCalcBiPredSubBlkDist(pu, pSrcY0, src0Stride, pSrcY1, src1Stride, clipBitDepths);
+      if (bioEnabled)
+      {
+        applyBiOptFlow(pu, pcYuvSrc0, pcYuvSrc1, iRefIdx0, iRefIdx1, pcYuvDst, clipBitDepths);
+      }
+      else
+      {
+        pcYuvDst.bufs[0].addAvg(CPelBuf(pSrcY0, src0Stride, pu.lumaSize()), CPelBuf(pSrcY1, src1Stride, pu.lumaSize()), clpRngs.comp[0]);
+      }
+    }
+    pcYuvDst.addAvg(pcYuvSrc0, pcYuvSrc1, clpRngs, bioApplied);
+#else
     pcYuvDst.addAvg( pcYuvSrc0, pcYuvSrc1, clpRngs );
+#endif
   }
   else if( iRefIdx0 >= 0 && iRefIdx1 < 0 )
   {
@@ -694,8 +1065,25 @@ void InterPrediction::motionCompensation( PredictionUnit &pu, const RefPicList &
   );
 }
 
+#if JVET_L0256_BIO
+int InterPrediction::rightShiftMSB(int numer, int denom)
+{ // Returns numer >> floor(log2(denom)) for denom >= 1; for denom <= 0 numer is returned unshifted.
+  int     d;
+  int msbIdx = 0;
+  for (msbIdx = 0; msbIdx<31; msbIdx++)  // stop before bit 31: (int)1 << 31 is not a positive int, which made denom >= 2^30 shift by 31
+  {
+    if (denom < ((int)1 << msbIdx))
+    {
+      break;
+    }
+  }
 
+  int shiftIdx = (msbIdx > 0) ? (msbIdx - 1) : 0;  // denom <= 0 leaves msbIdx at 0; never shift by a negative count
+  d = (numer >> shiftIdx);
 
+  return d;
+}
+#endif
 
 #if JVET_J0090_MEMORY_BANDWITH_MEASURE
 void InterPrediction::cacheAssign( CacheModel *cache )
diff --git a/source/Lib/CommonLib/InterPrediction.h b/source/Lib/CommonLib/InterPrediction.h
index c58fed66412d52f3f234057fbfa29c7900936eac..0bff85b70a5f525b26f8734c6e3b2c682cf48846 100644
--- a/source/Lib/CommonLib/InterPrediction.h
+++ b/source/Lib/CommonLib/InterPrediction.h
@@ -64,6 +64,17 @@ class InterPrediction : public WeightPrediction
 {
 private:
 
+#if JVET_L0256_BIO
+  Distortion  m_bioDistThres;
+  Distortion  m_bioSubBlkDistThres;
+  Distortion  m_bioPredSubBlkDist[MAX_NUM_PARTS_IN_CTU];
+
+  int m_dotProduct1[BIO_TEMP_BUFFER_SIZE];
+  int m_dotProduct2[BIO_TEMP_BUFFER_SIZE];
+  int m_dotProduct3[BIO_TEMP_BUFFER_SIZE];
+  int m_dotProduct5[BIO_TEMP_BUFFER_SIZE];
+  int m_dotProduct6[BIO_TEMP_BUFFER_SIZE];
+#endif
 
 protected:
   InterpolationFilter  m_if;
@@ -80,15 +91,37 @@ protected:
   RdCost*              m_pcRdCost;
 
   int                  m_iRefListIdx;
-  
+ 
+#if JVET_L0256_BIO
+  Pel*                 m_gradX0;
+  Pel*                 m_gradY0;
+  Pel*                 m_gradX1;
+  Pel*                 m_gradY1;
+  bool                 m_subPuMC;
+
+  int             rightShiftMSB(int numer, int    denom);
+  void            applyBiOptFlow(const PredictionUnit &pu, const CPelUnitBuf &yuvSrc0, const CPelUnitBuf &yuvSrc1, const int &refIdx0, const int &refIdx1, PelUnitBuf &yuvDst, const BitDepths &clipBitDepths);
+  bool            xCalcBiPredSubBlkDist(const PredictionUnit &pu, const Pel* yuvSrc0, const int src0Stride, const Pel* yuvSrc1, const int src1Stride, const BitDepths &clipBitDepths);
+  void            bioSampleExtendBilinearFilter(Pel const* src, int srcStride, Pel *dst, int dstStride, int width, int height, int dim, int fracX, int fracY, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng);
+#endif
 
   void xPredInterUni            ( const PredictionUnit& pu, const RefPicList& eRefPicList, PelUnitBuf& pcYuvPred, const bool& bi 
+#if JVET_L0256_BIO
+                                  ,const bool& bioApplied = false
+#endif
   );
   void xPredInterBi             ( PredictionUnit& pu, PelUnitBuf &pcYuvPred );
   void xPredInterBlk            ( const ComponentID& compID, const PredictionUnit& pu, const Picture* refPic, const Mv& _mv, PelUnitBuf& dstPic, const bool& bi, const ClpRng& clpRng
+#if JVET_L0256_BIO
+                                  ,const bool& bioApplied = false
+#endif
                                  );
-  
+
+#if JVET_L0256_BIO
+  void xWeightedAverage         ( const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths, const ClpRngs& clpRngs, const bool& bioApplied );
+#else
   void xWeightedAverage         ( const PredictionUnit& pu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const BitDepths& clipBitDepths, const ClpRngs& clpRngs );
+#endif
   void xPredAffineBlk( const ComponentID& compID, const PredictionUnit& pu, const Picture* refPic, const Mv* _mv, PelUnitBuf& dstPic, const bool& bi, const ClpRng& clpRng );
 
   static bool xCheckIdenticalMotion( const PredictionUnit& pu );
diff --git a/source/Lib/CommonLib/InterpolationFilter.cpp b/source/Lib/CommonLib/InterpolationFilter.cpp
index 32e4d9d755c7a110f8d5abefa3a9d38f5ba2ff39..abcef170f7a7675d4930ceaf77218519540b3607 100644
--- a/source/Lib/CommonLib/InterpolationFilter.cpp
+++ b/source/Lib/CommonLib/InterpolationFilter.cpp
@@ -111,6 +111,28 @@ const TFilterCoeff InterpolationFilter::m_chromaFilter[CHROMA_INTERPOLATION_FILT
   {  0,  2, 63, -1 },
 };
 
+#if JVET_L0256_BIO
+const TFilterCoeff InterpolationFilter::m_bilinearFilter[LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS << VCEG_AZ07_MV_ADD_PRECISION_BIT_FOR_STORE][NTAPS_BILINEAR] =
+{ // Two-tap weights indexed by fractional position; row i is { 64 - 4*i, 4*i }, so every pair sums to 64.
+  { 64,  0, },
+  { 60,  4, },
+  { 56,  8, },
+  { 52, 12, },
+  { 48, 16, },
+  { 44, 20, },
+  { 40, 24, },
+  { 36, 28, },
+  { 32, 32, },  // half-way position: equal weights
+  { 28, 36, },
+  { 24, 40, },
+  { 20, 44, },
+  { 16, 48, },
+  { 12, 52, },
+  { 8, 56, },
+  { 4, 60, },
+};
+#endif
+
 // ====================================================================================================================
 // Private member functions
 // ====================================================================================================================
@@ -443,7 +465,11 @@ void InterpolationFilter::filterVer(const ClpRng& clpRng, Pel const *src, int sr
  * \param  fmt        Chroma format
  * \param  bitDepth   Bit depth
  */
+#if JVET_L0256_BIO
+void InterpolationFilter::filterHor( const ComponentID compID, Pel const *src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng, int nFilterIdx )
+#else
 void InterpolationFilter::filterHor( const ComponentID compID, Pel const *src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng )
+#endif
 {
   if( frac == 0 )
   {
@@ -452,6 +478,13 @@ void InterpolationFilter::filterHor( const ComponentID compID, Pel const *src, i
   else if( isLuma( compID ) )
   {
     CHECK( frac < 0 || frac >= ( LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS << VCEG_AZ07_MV_ADD_PRECISION_BIT_FOR_STORE ), "Invalid fraction" );
+#if JVET_L0256_BIO
+    if( nFilterIdx == 1 )
+    {
+      filterHor<NTAPS_BILINEAR>(clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_bilinearFilter[frac]);
+    }
+    else
+#endif
     {
       filterHor<NTAPS_LUMA>( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_lumaFilter[frac] );
     }
@@ -481,7 +514,11 @@ void InterpolationFilter::filterHor( const ComponentID compID, Pel const *src, i
  * \param  fmt        Chroma format
  * \param  bitDepth   Bit depth
  */
+#if JVET_L0256_BIO
+void InterpolationFilter::filterVer( const ComponentID compID, Pel const *src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac, bool isFirst, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng, int nFilterIdx)
+#else
 void InterpolationFilter::filterVer( const ComponentID compID, Pel const *src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac, bool isFirst, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng )
+#endif
 {
   if( frac == 0 )
   {
@@ -490,6 +527,13 @@ void InterpolationFilter::filterVer( const ComponentID compID, Pel const *src, i
   else if( isLuma( compID ) )
   {
     CHECK( frac < 0 || frac >= ( LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS << VCEG_AZ07_MV_ADD_PRECISION_BIT_FOR_STORE ), "Invalid fraction" );
+#if JVET_L0256_BIO
+    if (nFilterIdx == 1)
+    {
+      filterVer<NTAPS_BILINEAR>(clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_bilinearFilter[frac]);
+    }
+    else
+#endif
     {
       filterVer<NTAPS_LUMA>( clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_lumaFilter[frac] );
     }
diff --git a/source/Lib/CommonLib/InterpolationFilter.h b/source/Lib/CommonLib/InterpolationFilter.h
index 4535b6bc56ba52fd6130e4f7e8a5443fac1a7770..4f246d9bed3ff90076ed125212a79a9da8761112 100644
--- a/source/Lib/CommonLib/InterpolationFilter.h
+++ b/source/Lib/CommonLib/InterpolationFilter.h
@@ -56,6 +56,9 @@ class InterpolationFilter
 {
   static const TFilterCoeff m_lumaFilter[LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS << VCEG_AZ07_MV_ADD_PRECISION_BIT_FOR_STORE][NTAPS_LUMA]; ///< Luma filter taps
   static const TFilterCoeff m_chromaFilter[CHROMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS << VCEG_AZ07_MV_ADD_PRECISION_BIT_FOR_STORE][NTAPS_CHROMA]; ///< Chroma filter taps
+#if JVET_L0256_BIO
+  static const TFilterCoeff m_bilinearFilter[LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS << VCEG_AZ07_MV_ADD_PRECISION_BIT_FOR_STORE][NTAPS_BILINEAR]; ///< bilinear filter taps
+#endif
 public:
   template<bool isFirst, bool isLast>
   static void filterCopy( const ClpRng& clpRng, const Pel *src, int srcStride, Pel *dst, int dstStride, int width, int height );
@@ -87,8 +90,13 @@ public:
   void _initInterpolationFilterX86();
 #endif
 
+#if JVET_L0256_BIO
+  void filterHor(const ComponentID compID, Pel const* src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac,               bool isLast, const ChromaFormat fmt, const ClpRng& clpRng, int nFilterIdx = 0);
+  void filterVer(const ComponentID compID, Pel const* src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac, bool isFirst, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng, int nFilterIdx = 0);
+#else
   void filterHor(const ComponentID compID, Pel const* src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac,               bool isLast, const ChromaFormat fmt, const ClpRng& clpRng );
   void filterVer(const ComponentID compID, Pel const* src, int srcStride, Pel *dst, int dstStride, int width, int height, int frac, bool isFirst, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng );
+#endif
 #if JVET_J0090_MEMORY_BANDWITH_MEASURE
   void cacheAssign( CacheModel *cache ) { m_cacheModel = cache; }
 #endif
diff --git a/source/Lib/CommonLib/RdCost.cpp b/source/Lib/CommonLib/RdCost.cpp
index 20119d9a8feaeaef1bf16ee72a81714e60521571..572eacb8ba30219900a0137a0d55f05157511430 100644
--- a/source/Lib/CommonLib/RdCost.cpp
+++ b/source/Lib/CommonLib/RdCost.cpp
@@ -164,6 +164,10 @@ void RdCost::init()
   m_afpDistortFunc[DF_SSE16N_WTD] = RdCost::xGetSSE16N_WTD;
 #endif
 
+#if JVET_L0256_BIO
+  m_afpDistortFunc[DF_SAD_INTERMEDIATE_BITDEPTH] = RdCost::xGetSAD;
+#endif
+
 #if ENABLE_SIMD_OPT_DIST
 #ifdef TARGET_SIMD_X86
   initRdCostX86();
@@ -318,7 +322,11 @@ void RdCost::setDistParam( DistParam &rcDP, const CPelBuf &org, const CPelBuf &c
   rcDP.maximumDistortionForEarlyExit = std::numeric_limits<Distortion>::max();
 }
 
+#if JVET_L0256_BIO
+void RdCost::setDistParam( DistParam &rcDP, const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShiftMode, int step, bool useHadamard, bool bioApplied )
+#else
 void RdCost::setDistParam( DistParam &rcDP, const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShiftMode, int step, bool useHadamard )
+#endif
 {
   rcDP.bitDepth   = bitDepth;
   rcDP.compID     = compID;
@@ -339,6 +347,14 @@ void RdCost::setDistParam( DistParam &rcDP, const Pel* pOrg, const Pel* piRefY,
 
   CHECK( useHadamard || rcDP.useMR || subShiftMode > 0, "only used in xDirectMCCost with these default parameters (so far...)" );
 
+#if JVET_L0256_BIO
+  if ( bioApplied )
+  {
+    rcDP.distFunc = m_afpDistortFunc[ DF_SAD_INTERMEDIATE_BITDEPTH ];
+    return;
+  }
+#endif
+
   if( width == 12 )
   {
     rcDP.distFunc = m_afpDistortFunc[ DF_SAD12 ];
diff --git a/source/Lib/CommonLib/RdCost.h b/source/Lib/CommonLib/RdCost.h
index 4e79040c21fb739fc75a13b0643d4798989099d7..3c95f90adf54158c5cfce66572b7dedc82613f7d 100644
--- a/source/Lib/CommonLib/RdCost.h
+++ b/source/Lib/CommonLib/RdCost.h
@@ -154,7 +154,11 @@ public:
 
   void           setDistParam( DistParam &rcDP, const CPelBuf &org, const Pel* piRefY , int iRefStride, int bitDepth, ComponentID compID, int subShiftMode = 0, int step = 1, bool useHadamard = false );
   void           setDistParam( DistParam &rcDP, const CPelBuf &org, const CPelBuf &cur, int bitDepth, ComponentID compID, bool useHadamard = false );
+#if JVET_L0256_BIO
+  void           setDistParam( DistParam &rcDP, const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShiftMode = 0, int step = 1, bool useHadamard = false, bool bioApplied = false );
+#else
   void           setDistParam( DistParam &rcDP, const Pel* pOrg, const Pel* piRefY, int iOrgStride, int iRefStride, int bitDepth, ComponentID compID, int width, int height, int subShiftMode = 0, int step = 1, bool useHadamard = false );
+#endif
 
   double         getMotionLambda          ( bool bIsTransquantBypass ) { return m_dLambdaMotionSAD[(bIsTransquantBypass && m_costMode==COST_MIXED_LOSSLESS_LOSSY_CODING)?1:0]; }
   void           selectMotionLambda       ( bool bIsTransquantBypass ) { m_motionLambda = getMotionLambda( bIsTransquantBypass ); }
@@ -266,6 +270,10 @@ private:
   static Distortion xGetSAD_SIMD    ( const DistParam& pcDtParam );
   template< int iWidth, X86_VEXT vext >
   static Distortion xGetSAD_NxN_SIMD( const DistParam& pcDtParam );
+#if ENABLE_SIMD_OPT_BIO
+  template< X86_VEXT vext >
+  static Distortion xGetSAD_IBD_SIMD(const DistParam& pcDtParam);
+#endif
 
   template< typename Torg, typename Tcur, X86_VEXT vext >
   static Distortion xGetHADs_SIMD   ( const DistParam& pcDtParam );
diff --git a/source/Lib/CommonLib/Slice.cpp b/source/Lib/CommonLib/Slice.cpp
index 356ec1c2618aea010a1cbb289fcea99c2f71a929..61e3c8fd94c8b40ed4dd23ce7710a32ca06e2c92 100644
--- a/source/Lib/CommonLib/Slice.cpp
+++ b/source/Lib/CommonLib/Slice.cpp
@@ -1734,6 +1734,9 @@ SPSNext::SPSNext( SPS& sps )
   , m_IMV                       ( false )
 #if !REMOVE_MV_ADAPT_PREC
   , m_highPrecMv                ( false )
+#endif
+#if JVET_L0256_BIO
+  , m_BIO                       ( false )
 #endif
   , m_DisableMotionCompression  ( false )
   , m_LMChroma                  ( false )
diff --git a/source/Lib/CommonLib/Slice.h b/source/Lib/CommonLib/Slice.h
index 7dbbc4611e5ff80854c32ebfc7e1ad16db98c1bc..8cbcfee8795e65767309db01ed782f10b68b45ef 100644
--- a/source/Lib/CommonLib/Slice.h
+++ b/source/Lib/CommonLib/Slice.h
@@ -806,6 +806,9 @@ private:
   bool              m_IMV;                        // 9
 #if !REMOVE_MV_ADAPT_PREC
   bool              m_highPrecMv;
+#endif
+#if JVET_L0256_BIO
+  bool              m_BIO;
 #endif
   bool              m_DisableMotionCompression;   // 13
   bool              m_LMChroma;                   // 17
@@ -880,6 +883,10 @@ public:
 #if !REMOVE_MV_ADAPT_PREC
   void      setUseHighPrecMv(bool b) { m_highPrecMv = b; }
   bool      getUseHighPrecMv()                                      const { return m_highPrecMv; }
+#endif
+#if JVET_L0256_BIO
+  void      setUseBIO(bool b)                                                       { m_BIO = b; }
+  bool      getUseBIO()                                                   const     { return m_BIO; }
 #endif
   void      setDisableMotCompress ( bool b )                                        { m_DisableMotionCompression = b; }
   bool      getDisableMotCompress ()                                      const     { return m_DisableMotionCompression; }
diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h
index 347ae35904ff9d6e1b86fafb1aabdd833015081a..472b3428168d2b7d107d8418d1b0042f6f45c2c0 100644
--- a/source/Lib/CommonLib/TypeDef.h
+++ b/source/Lib/CommonLib/TypeDef.h
@@ -85,6 +85,8 @@
 
 #define L0074_SUBBLOCK_DEBLOCKING                         1
 
+#define JVET_L0256_BIO                                    1
+
 #define JVET_L0646_GBI                                    1 // Generalized bi-prediction (GBi)
 
 #define JVET_L0628_4TAP_INTRA                             1 // 4-tap intra-interpolation filtering with switching between Gaussian and DCT-IF filters for luma component
@@ -264,6 +266,7 @@
 #if ENABLE_SIMD_OPT_BUFFER && JVET_L0646_GBI
 #define ENABLE_SIMD_OPT_GBI                               1                                                 ///< SIMD optimization for GBi   
 #endif
+#define ENABLE_SIMD_OPT_BIO                             ( JVET_L0256_BIO && ENABLE_SIMD_OPT )               ///< SIMD optimization for BIO
 
 // End of SIMD optimizations
 
@@ -558,7 +561,13 @@ enum DFunc
   DF_DEFAULT_ORI      = DF_SSE_WTD+8,
 #endif
 
+#if JVET_L0256_BIO
+  DF_SAD_INTERMEDIATE_BITDEPTH = 63,
+
+  DF_TOTAL_FUNCTIONS = 64
+#else
   DF_TOTAL_FUNCTIONS = 63
+#endif
 };
 
 /// motion vector predictor direction used in AMVP
diff --git a/source/Lib/CommonLib/x86/BufferX86.h b/source/Lib/CommonLib/x86/BufferX86.h
index 34d2cb7b417ef2278b0c9fe55277eeec2dd068b1..92f52655a47940016d708dadc68a152a00a1315b 100644
--- a/source/Lib/CommonLib/x86/BufferX86.h
+++ b/source/Lib/CommonLib/x86/BufferX86.h
@@ -128,6 +128,308 @@ void addAvg_SSE( const int16_t* src0, int src0Stride, const int16_t* src1, int s
   }
 }
 
+#if ENABLE_SIMD_OPT_BIO
+template< X86_VEXT vext >
+void addBIOAvg4_SSE(const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, const Pel *gradX0, const Pel *gradX1, const Pel *gradY0, const Pel*gradY1, int gradStride, int width, int height, int tmpx, int tmpy, int shift, int offset, const ClpRng& clpRng)
+{ // Per sample: dst = clip((src0 + src1 + ((tmpx*(gradX0-gradX1) + tmpy*(gradY0-gradY1) + 1) >> 1) + offset) >> shift); four samples per step.
+  __m128i mm_tmpx = _mm_unpacklo_epi64(_mm_set1_epi16(tmpx), _mm_set1_epi16(tmpy));  // low four 16-bit lanes = tmpx, high four = tmpy
+  __m128i mm_boffset = _mm_set1_epi32(1);  // rounding term for the >> 1 on the gradient sum
+  __m128i mm_offset = _mm_set1_epi32(offset);
+  __m128i vibdimin = _mm_set1_epi16(clpRng.min);
+  __m128i vibdimax = _mm_set1_epi16(clpRng.max);
+
+  for (int y = 0; y < height; y++)
+  {
+    for (int x = 0; x < width; x += 4)
+    {
+      __m128i mm_a = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(gradX0 + x)), _mm_loadl_epi64((const __m128i *)(gradY0 + x)));  // [gradX0 x4 | gradY0 x4]
+      __m128i mm_b = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(gradX1 + x)), _mm_loadl_epi64((const __m128i *)(gradY1 + x)));  // [gradX1 x4 | gradY1 x4]
+      mm_a = _mm_sub_epi16(mm_a, mm_b);        // gradient differences in 16-bit lanes
+      mm_b = _mm_mulhi_epi16(mm_a, mm_tmpx);   // high 16 bits of each signed product
+      mm_a = _mm_mullo_epi16(mm_a, mm_tmpx);   // low 16 bits of each product
+      // Interleaving low/high halves rebuilds 32-bit products: unpacklo yields the four x terms, unpackhi the four y terms.
+      __m128i mm_sum = _mm_add_epi32(_mm_unpacklo_epi16(mm_a, mm_b), _mm_unpackhi_epi16(mm_a, mm_b));
+      mm_sum = _mm_srai_epi32(_mm_add_epi32(mm_sum, mm_boffset), 1);
+      mm_a = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)(src0 + x)));
+      mm_b = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)(src1 + x)));
+      mm_sum = _mm_add_epi32(_mm_add_epi32(mm_sum, mm_a), _mm_add_epi32(mm_b, mm_offset));
+      mm_sum = _mm_packs_epi32(_mm_srai_epi32(mm_sum, shift), mm_a);  // second operand only fills the upper half, which is never stored
+      mm_sum = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, mm_sum));  // clamp to [clpRng.min, clpRng.max]
+      _mm_storel_epi64((__m128i *)(dst + x), mm_sum);  // write the four low 16-bit results
+    }
+    dst += dstStride;       src0 += src0Stride;     src1 += src1Stride;
+    gradX0 += gradStride; gradX1 += gradStride; gradY0 += gradStride; gradY1 += gradStride;
+  }
+}
+
+template< X86_VEXT vext >
+void gradFilter_SSE(Pel* src, int srcStride, int width, int height, int gradStride, Pel* gradX, Pel* gradY)
+{ // Writes horizontal/vertical sample differences (>> 4) for the interior of a width x height block, then replicates them into the one-sample border.
+  __m128i vzero = _mm_setzero_si128();
+  Pel* srcTmp = src + srcStride + 1;         // first interior sample: one row down, one column right
+  Pel* gradXTmp = gradX + gradStride + 1;
+  Pel* gradYTmp = gradY + gradStride + 1;
+  // NOTE(review): the +1 start offsets above look like they assume BIO_EXTEND_SIZE == 1 - confirm.
+  int widthInside = width - 2 * BIO_EXTEND_SIZE;
+  int heightInside = height - 2 * BIO_EXTEND_SIZE;
+
+  assert((widthInside & 3) == 0);  // the loop below has no tail handling; it consumes four samples per step
+
+  for (int y = 0; y < heightInside; y++)
+  {
+    int x = 0;
+    for (; x < widthInside; x += 4)
+    {
+      __m128i mmPixTop = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x - srcStride)));
+      __m128i mmPixBottom = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x + srcStride)));
+      __m128i mmPixLeft = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x - 1)));
+      __m128i mmPixRight = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(srcTmp + x + 1)));
+      // Differences are taken in 32-bit lanes, shifted right by 4, then packed back to 16 bits with saturation.
+      __m128i mmGradVer = _mm_srai_epi32(_mm_sub_epi32(mmPixBottom, mmPixTop), 4);
+      __m128i mmGradHor = _mm_srai_epi32(_mm_sub_epi32(mmPixRight, mmPixLeft), 4);
+      mmGradVer = _mm_packs_epi32(mmGradVer, vzero);
+      mmGradHor = _mm_packs_epi32(mmGradHor, vzero);
+
+      _mm_storel_epi64((__m128i *)(gradYTmp + x), mmGradVer);
+      _mm_storel_epi64((__m128i *)(gradXTmp + x), mmGradHor);
+    }
+
+    gradXTmp += gradStride;
+    gradYTmp += gradStride;
+    srcTmp += srcStride;
+  }
+  // Left/right padding: copy the first and last interior gradient of every interior row outward by one column.
+  gradXTmp = gradX + gradStride + 1;
+  gradYTmp = gradY + gradStride + 1;
+  for (int y = 0; y < heightInside; y++)
+  {
+    gradXTmp[-1] = gradXTmp[0];
+    gradXTmp[widthInside] = gradXTmp[widthInside - 1];
+    gradXTmp += gradStride;
+
+    gradYTmp[-1] = gradYTmp[0];
+    gradYTmp[widthInside] = gradYTmp[widthInside - 1];
+    gradYTmp += gradStride;
+  }
+  // Top/bottom padding: duplicate the first and last interior rows (full 'width', so the side padding is copied too).
+  gradXTmp = gradX + gradStride;
+  gradYTmp = gradY + gradStride;
+  ::memcpy(gradXTmp - gradStride, gradXTmp, sizeof(Pel)*(width));
+  ::memcpy(gradXTmp + heightInside*gradStride, gradXTmp + (heightInside - 1)*gradStride, sizeof(Pel)*(width));
+  ::memcpy(gradYTmp - gradStride, gradYTmp, sizeof(Pel)*(width));
+  ::memcpy(gradYTmp + heightInside*gradStride, gradYTmp + (heightInside - 1)*gradStride, sizeof(Pel)*(width));
+}
+
+template< X86_VEXT vext >
+void calcBIOPar_SSE(const Pel* srcY0Temp, const Pel* srcY1Temp, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0, const Pel* gradY1, int* dotProductTemp1, int* dotProductTemp2, int* dotProductTemp3, int* dotProductTemp5, int* dotProductTemp6, const int src0Stride, const int src1Stride, const int gradStride, const int widthG, const int heightG)
+{ // For every sample of a widthG x heightG area, writes five 32-bit products of tempX, tempY and the sample difference; output rows are packed with stride widthG.
+  for (int y = 0; y < heightG; y++)
+  {
+    int x = 0;
+    for (; x < ((widthG >> 3) << 3); x += 8)  // main path: eight samples per iteration
+    {
+      __m128i mmSrcY0Temp = _mm_srai_epi16(_mm_loadu_si128((__m128i*)(srcY0Temp + x)), 6);
+      __m128i mmSrcY1Temp = _mm_srai_epi16(_mm_loadu_si128((__m128i*)(srcY1Temp + x)), 6);
+      __m128i mmGradX0 = _mm_loadu_si128((__m128i*)(gradX0 + x));
+      __m128i mmGradX1 = _mm_loadu_si128((__m128i*)(gradX1 + x));
+      __m128i mmGradY0 = _mm_loadu_si128((__m128i*)(gradY0 + x));
+      __m128i mmGradY1 = _mm_loadu_si128((__m128i*)(gradY1 + x));
+
+      __m128i mmTemp1 = _mm_sub_epi16(mmSrcY1Temp, mmSrcY0Temp);                 // (src1 >> 6) - (src0 >> 6)
+      __m128i mmTempX = _mm_srai_epi16(_mm_add_epi16(mmGradX0, mmGradX1), 3);    // (gradX0 + gradX1) >> 3, 16-bit lanes
+      __m128i mmTempY = _mm_srai_epi16(_mm_add_epi16(mmGradY0, mmGradY1), 3);    // (gradY0 + gradY1) >> 3, 16-bit lanes
+
+      // dotProductTemp1 = tempX * tempX; mullo/mulhi halves are interleaved back into 32-bit products
+      __m128i mm_b = _mm_mulhi_epi16(mmTempX, mmTempX);
+      __m128i mm_a = _mm_mullo_epi16(mmTempX, mmTempX);
+
+      __m128i mm_l = _mm_unpacklo_epi16(mm_a, mm_b);
+      __m128i mm_h = _mm_unpackhi_epi16(mm_a, mm_b);
+
+      _mm_storeu_si128((__m128i *)(dotProductTemp1 + x), mm_l);
+      _mm_storeu_si128((__m128i *)(dotProductTemp1 + x + 4), mm_h);
+
+      // dotProductTemp2 = tempX * tempY
+      mm_b = _mm_mulhi_epi16(mmTempX, mmTempY);
+      mm_a = _mm_mullo_epi16(mmTempX, mmTempY);
+
+      mm_l = _mm_unpacklo_epi16(mm_a, mm_b);
+      mm_h = _mm_unpackhi_epi16(mm_a, mm_b);
+
+      _mm_storeu_si128((__m128i *)(dotProductTemp2 + x), mm_l);
+      _mm_storeu_si128((__m128i *)(dotProductTemp2 + x + 4), mm_h);
+
+      // dotProductTemp3 = tempX * (src1 - src0 difference)
+      mm_b = _mm_mulhi_epi16(mmTempX, mmTemp1);
+      mm_a = _mm_mullo_epi16(mmTempX, mmTemp1);
+
+      mm_l = _mm_unpacklo_epi16(mm_a, mm_b);
+      mm_h = _mm_unpackhi_epi16(mm_a, mm_b);
+
+      _mm_storeu_si128((__m128i *)(dotProductTemp3 + x), mm_l);
+      _mm_storeu_si128((__m128i *)(dotProductTemp3 + x + 4), mm_h);
+
+      // dotProductTemp5 = tempY * tempY
+      mm_b = _mm_mulhi_epi16(mmTempY, mmTempY);
+      mm_a = _mm_mullo_epi16(mmTempY, mmTempY);
+
+      mm_l = _mm_unpacklo_epi16(mm_a, mm_b);
+      mm_h = _mm_unpackhi_epi16(mm_a, mm_b);
+
+      _mm_storeu_si128((__m128i *)(dotProductTemp5 + x), mm_l);
+      _mm_storeu_si128((__m128i *)(dotProductTemp5 + x + 4), mm_h);
+
+      // dotProductTemp6 = tempY * (src1 - src0 difference)
+      mm_b = _mm_mulhi_epi16(mmTempY, mmTemp1);
+      mm_a = _mm_mullo_epi16(mmTempY, mmTemp1);
+
+      mm_l = _mm_unpacklo_epi16(mm_a, mm_b);
+      mm_h = _mm_unpackhi_epi16(mm_a, mm_b);
+
+      _mm_storeu_si128((__m128i *)(dotProductTemp6 + x), mm_l);
+      _mm_storeu_si128((__m128i *)(dotProductTemp6 + x + 4), mm_h);
+    }
+
+    for (; x < ((widthG >> 2) << 2); x += 4)  // remaining group of four: 64-bit loads, only the low unpack is stored
+    {
+      __m128i mmSrcY0Temp = _mm_srai_epi16(_mm_loadl_epi64((__m128i*)(srcY0Temp + x)), 6);
+      __m128i mmSrcY1Temp = _mm_srai_epi16(_mm_loadl_epi64((__m128i*)(srcY1Temp + x)), 6);
+      __m128i mmGradX0 = _mm_loadl_epi64((__m128i*)(gradX0 + x));
+      __m128i mmGradX1 = _mm_loadl_epi64((__m128i*)(gradX1 + x));
+      __m128i mmGradY0 = _mm_loadl_epi64((__m128i*)(gradY0 + x));
+      __m128i mmGradY1 = _mm_loadl_epi64((__m128i*)(gradY1 + x));
+
+      __m128i mmTemp1 = _mm_sub_epi16(mmSrcY1Temp, mmSrcY0Temp);
+      __m128i mmTempX = _mm_srai_epi16(_mm_add_epi16(mmGradX0, mmGradX1), 3);
+      __m128i mmTempY = _mm_srai_epi16(_mm_add_epi16(mmGradY0, mmGradY1), 3);
+
+      // dotProductTemp1 = tempX * tempX
+      __m128i mm_b = _mm_mulhi_epi16(mmTempX, mmTempX);
+      __m128i mm_a = _mm_mullo_epi16(mmTempX, mmTempX);
+      __m128i mm_l = _mm_unpacklo_epi16(mm_a, mm_b);
+
+      _mm_storeu_si128((__m128i *)(dotProductTemp1 + x), mm_l);
+
+      // dotProductTemp2 = tempX * tempY
+      mm_b = _mm_mulhi_epi16(mmTempX, mmTempY);
+      mm_a = _mm_mullo_epi16(mmTempX, mmTempY);
+      mm_l = _mm_unpacklo_epi16(mm_a, mm_b);
+
+      _mm_storeu_si128((__m128i *)(dotProductTemp2 + x), mm_l);
+
+      // dotProductTemp3 = tempX * (src1 - src0 difference)
+      mm_b = _mm_mulhi_epi16(mmTempX, mmTemp1);
+      mm_a = _mm_mullo_epi16(mmTempX, mmTemp1);
+      mm_l = _mm_unpacklo_epi16(mm_a, mm_b);
+
+      _mm_storeu_si128((__m128i *)(dotProductTemp3 + x), mm_l);
+
+      // dotProductTemp5 = tempY * tempY
+      mm_b = _mm_mulhi_epi16(mmTempY, mmTempY);
+      mm_a = _mm_mullo_epi16(mmTempY, mmTempY);
+      mm_l = _mm_unpacklo_epi16(mm_a, mm_b);
+
+      _mm_storeu_si128((__m128i *)(dotProductTemp5 + x), mm_l);
+
+      // dotProductTemp6 = tempY * (src1 - src0 difference)
+      mm_b = _mm_mulhi_epi16(mmTempY, mmTemp1);
+      mm_a = _mm_mullo_epi16(mmTempY, mmTemp1);
+      mm_l = _mm_unpacklo_epi16(mm_a, mm_b);
+
+      _mm_storeu_si128((__m128i *)(dotProductTemp6 + x), mm_l);
+    }
+
+    for (; x < widthG; x++)  // scalar tail for the last widthG % 4 samples
+    {
+      int temp = (srcY0Temp[x] >> 6) - (srcY1Temp[x] >> 6);  // opposite sign to mmTemp1 above, hence the negations below
+      int tempX = (gradX0[x] + gradX1[x]) >> 3;
+      int tempY = (gradY0[x] + gradY1[x]) >> 3;
+      dotProductTemp1[x] = tempX * tempX;
+      dotProductTemp2[x] = tempX * tempY;
+      dotProductTemp3[x] = -tempX * temp;
+      dotProductTemp5[x] = tempY * tempY;
+      dotProductTemp6[x] = -tempY * temp;
+    }
+
+    srcY0Temp += src0Stride;
+    srcY1Temp += src1Stride;
+    gradX0 += gradStride;
+    gradX1 += gradStride;
+    gradY0 += gradStride;
+    gradY1 += gradStride;
+    dotProductTemp1 += widthG;  // output buffers advance by widthG per row, not by an input stride
+    dotProductTemp2 += widthG;
+    dotProductTemp3 += widthG;
+    dotProductTemp5 += widthG;
+    dotProductTemp6 += widthG;
+  }
+}
+
+template< X86_VEXT vext >
+void calcBlkGradient_SSE(int sx, int sy, int     *arraysGx2, int     *arraysGxGy, int     *arraysGxdI, int     *arraysGy2, int     *arraysGydI, int     &sGx2, int     &sGy2, int     &sGxGy, int     &sGxdI, int     &sGydI, int width, int height, int unitSize)
+{ // Sums each of the five input arrays over a window of 6 columns by (unitSize + 2*BIO_EXTEND_SIZE) rows and returns the totals via the s* references; sx, sy and height are not read.
+  int     *Gx2 = arraysGx2;
+  int     *Gy2 = arraysGy2;
+  int     *GxGy = arraysGxGy;
+  int     *GxdI = arraysGxdI;
+  int     *GydI = arraysGydI;
+
+  // step back BIO_EXTEND_SIZE rows ('width' is the row stride) so the window starts above the unit
+  Gx2 -= (BIO_EXTEND_SIZE*width);
+  Gy2 -= (BIO_EXTEND_SIZE*width);
+  GxGy -= (BIO_EXTEND_SIZE*width);
+  GxdI -= (BIO_EXTEND_SIZE*width);
+  GydI -= (BIO_EXTEND_SIZE*width);
+
+  __m128i vzero = _mm_setzero_si128();
+  __m128i mmGx2Total = _mm_setzero_si128();
+  __m128i mmGy2Total = _mm_setzero_si128();
+  __m128i mmGxGyTotal = _mm_setzero_si128();
+  __m128i mmGxdITotal = _mm_setzero_si128();
+  __m128i mmGydITotal = _mm_setzero_si128();
+
+  for (int y = -BIO_EXTEND_SIZE; y < unitSize + BIO_EXTEND_SIZE; y++)
+  { // each row reads six consecutive ints p[-1..4]: four via the 128-bit load, two via the 64-bit load (upper lanes zero)
+    __m128i mmsGx2 = _mm_loadu_si128((__m128i*)(Gx2 - 1));   __m128i mmsGx2Sec = _mm_loadl_epi64((__m128i*)(Gx2 + 3));
+    __m128i mmsGy2 = _mm_loadu_si128((__m128i*)(Gy2 - 1));   __m128i mmsGy2Sec = _mm_loadl_epi64((__m128i*)(Gy2 + 3));
+    __m128i mmsGxGy = _mm_loadu_si128((__m128i*)(GxGy - 1));  __m128i mmsGxGySec = _mm_loadl_epi64((__m128i*)(GxGy + 3));
+    __m128i mmsGxdI = _mm_loadu_si128((__m128i*)(GxdI - 1));  __m128i mmsGxdISec = _mm_loadl_epi64((__m128i*)(GxdI + 3));
+    __m128i mmsGydI = _mm_loadu_si128((__m128i*)(GydI - 1));  __m128i mmsGydISec = _mm_loadl_epi64((__m128i*)(GydI + 3));
+    // NOTE(review): the fixed 6-wide read looks like it assumes unitSize == 4 and BIO_EXTEND_SIZE == 1 - confirm.
+    mmsGx2 = _mm_add_epi32(mmsGx2, mmsGx2Sec);
+    mmsGy2 = _mm_add_epi32(mmsGy2, mmsGy2Sec);
+    mmsGxGy = _mm_add_epi32(mmsGxGy, mmsGxGySec);
+    mmsGxdI = _mm_add_epi32(mmsGxdI, mmsGxdISec);
+    mmsGydI = _mm_add_epi32(mmsGydI, mmsGydISec);
+
+
+    mmGx2Total = _mm_add_epi32(mmGx2Total, mmsGx2);
+    mmGy2Total = _mm_add_epi32(mmGy2Total, mmsGy2);
+    mmGxGyTotal = _mm_add_epi32(mmGxGyTotal, mmsGxGy);
+    mmGxdITotal = _mm_add_epi32(mmGxdITotal, mmsGxdI);
+    mmGydITotal = _mm_add_epi32(mmGydITotal, mmsGydI);
+
+    Gx2 += width;
+    Gy2 += width;
+    GxGy += width;
+    GxdI += width;
+    GydI += width;
+  }
+  // two horizontal adds against zero fold the four 32-bit lanes into lane 0
+  mmGx2Total = _mm_hadd_epi32(_mm_hadd_epi32(mmGx2Total, vzero), vzero);
+  mmGy2Total = _mm_hadd_epi32(_mm_hadd_epi32(mmGy2Total, vzero), vzero);
+  mmGxGyTotal = _mm_hadd_epi32(_mm_hadd_epi32(mmGxGyTotal, vzero), vzero);
+  mmGxdITotal = _mm_hadd_epi32(_mm_hadd_epi32(mmGxdITotal, vzero), vzero);
+  mmGydITotal = _mm_hadd_epi32(_mm_hadd_epi32(mmGydITotal, vzero), vzero);
+
+  sGx2 = _mm_cvtsi128_si32(mmGx2Total);
+  sGy2 = _mm_cvtsi128_si32(mmGy2Total);
+  sGxGy = _mm_cvtsi128_si32(mmGxGyTotal);
+  sGxdI = _mm_cvtsi128_si32(mmGxdITotal);
+  sGydI = _mm_cvtsi128_si32(mmGydITotal);
+}
+#endif
+
 template< X86_VEXT vext, int W >
 void reco_SSE( const int16_t* src0, int src0Stride, const int16_t* src1, int src1Stride, int16_t *dst, int dstStride, int width, int height, const ClpRng& clpRng )
 {
@@ -496,6 +798,13 @@ void PelBufferOps::_initPelBufOpsX86()
   addAvg8 = addAvg_SSE<vext, 8>;
   addAvg4 = addAvg_SSE<vext, 4>;
 
+#if ENABLE_SIMD_OPT_BIO
+  addBIOAvg4      = addBIOAvg4_SSE<vext>;
+  bioGradFilter   = gradFilter_SSE<vext>;
+  calcBIOPar      = calcBIOPar_SSE<vext>;
+  calcBlkGradient = calcBlkGradient_SSE<vext>;
+#endif
+
   reco8 = reco_SSE<vext, 8>;
   reco4 = reco_SSE<vext, 4>;
 
diff --git a/source/Lib/CommonLib/x86/RdCostX86.h b/source/Lib/CommonLib/x86/RdCostX86.h
index 95383045fe72d78572e9ab132f91fb13588e7b8d..4d87189ca052e0d8ab14a17980b92c2e1406c13e 100644
--- a/source/Lib/CommonLib/x86/RdCostX86.h
+++ b/source/Lib/CommonLib/x86/RdCostX86.h
@@ -297,6 +297,45 @@ Distortion RdCost::xGetSAD_SIMD( const DistParam &rcDtParam )
   return uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
 }
 
+#if ENABLE_SIMD_OPT_BIO
+// SSE SAD of two 16-bit intermediate-bit-depth blocks (org vs. cur); defers to xGetSAD() when the vector loop does not fit.
+template< X86_VEXT vext >
+Distortion RdCost::xGetSAD_IBD_SIMD(const DistParam &rcDtParam)
+{
+  // The inner loop consumes exactly four samples per step, so the width must be a non-zero multiple of 4.
+  if (rcDtParam.org.width < 4 || (rcDtParam.org.width & 3) || rcDtParam.bitDepth > 10 || rcDtParam.applyWeight)
+    return RdCost::xGetSAD(rcDtParam);
+
+  const short* src0 = (const short*)rcDtParam.org.buf;
+  const short* src1 = (const short*)rcDtParam.cur.buf;
+  const int  width = rcDtParam.org.width;    // samples per row (columns walked by the inner loop)
+  const int  height = rcDtParam.org.height;  // number of rows (walked by the outer loop)
+  const int  subShift = rcDtParam.subShift;
+  const int  subStep = (1 << subShift);      // only every subStep-th row is visited
+  const int src0Stride = rcDtParam.org.stride * subStep;
+  const int src1Stride = rcDtParam.cur.stride * subStep;
+
+  __m128i vtotalsum32 = _mm_setzero_si128(); // four 32-bit partial sums
+  const __m128i vzero = _mm_setzero_si128();
+  for (int y = 0; y < height; y += subStep)
+  {
+    for (int x = 0; x < width; x += 4)
+    {
+      // Load four 16-bit samples from each block and sign-extend them to 32 bits before subtracting.
+      const __m128i vsrc1 = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)(src0 + x)));
+      const __m128i vsrc2 = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)(src1 + x)));
+      vtotalsum32 = _mm_add_epi32(vtotalsum32, _mm_abs_epi32(_mm_sub_epi32(vsrc1, vsrc2)));
+    }
+    src0 += src0Stride;
+    src1 += src1Stride;
+  }
+  // Two horizontal adds against zero fold the four lanes into lane 0.
+  vtotalsum32 = _mm_hadd_epi32(_mm_hadd_epi32(vtotalsum32, vzero), vzero);
+  Distortion uiSum = _mm_cvtsi128_si32(vtotalsum32);
+  uiSum <<= subShift;  // scale the sum back up for the rows skipped by subStep
+  return uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
+}
+#endif
 
 template< int iWidth, X86_VEXT vext >
 Distortion RdCost::xGetSAD_NxN_SIMD( const DistParam &rcDtParam )
@@ -2422,6 +2461,10 @@ void RdCost::_initRdCostX86()
   m_afpDistortFunc[DF_HAD32]   = RdCost::xGetHADs_SIMD<Pel, Pel, vext>;
   m_afpDistortFunc[DF_HAD64]   = RdCost::xGetHADs_SIMD<Pel, Pel, vext>;
   m_afpDistortFunc[DF_HAD16N]  = RdCost::xGetHADs_SIMD<Pel, Pel, vext>;
+
+#if ENABLE_SIMD_OPT_BIO
+  m_afpDistortFunc[DF_SAD_INTERMEDIATE_BITDEPTH] = RdCost::xGetSAD_IBD_SIMD<vext>;
+#endif
 }
 
 template void RdCost::_initRdCostX86<SIMDX86>();
diff --git a/source/Lib/DecoderLib/VLCReader.cpp b/source/Lib/DecoderLib/VLCReader.cpp
index 5afb188dc802e0b87573f4b2d8c3f13c421b1b89..908bd0020d4c585116b5414f000b47b2aff9f038 100644
--- a/source/Lib/DecoderLib/VLCReader.cpp
+++ b/source/Lib/DecoderLib/VLCReader.cpp
@@ -794,6 +794,9 @@ void HLSyntaxReader::parseSPSNext( SPSNext& spsNext, const bool usePCM )
   READ_FLAG( symbol,    "imv_enable_flag" );                        spsNext.setUseIMV                 ( symbol != 0 );
 #if !REMOVE_MV_ADAPT_PREC
   READ_FLAG( symbol, "high_precision_motion_vectors" );             spsNext.setUseHighPrecMv(symbol != 0);
+#endif
+#if JVET_L0256_BIO
+  READ_FLAG( symbol, "bio_enable_flag" );                           spsNext.setUseBIO                 ( symbol != 0 );
 #endif
   READ_FLAG( symbol,    "disable_motion_compression_flag" );        spsNext.setDisableMotCompress     ( symbol != 0 );
   READ_FLAG( symbol,    "lm_chroma_enabled_flag" );                 spsNext.setUseLMChroma            ( symbol != 0 );
diff --git a/source/Lib/EncoderLib/EncCfg.h b/source/Lib/EncoderLib/EncCfg.h
index 6ccf7acdff8a3f0792770fd94edd9196d187a1ad..00826ad0231d92fbcea10c1c35c75a093149368e 100644
--- a/source/Lib/EncoderLib/EncCfg.h
+++ b/source/Lib/EncoderLib/EncCfg.h
@@ -199,6 +199,9 @@ protected:
   bool      m_AffineType;
 #if !REMOVE_MV_ADAPT_PREC
   bool      m_highPrecMv;
+#endif
+#if JVET_L0256_BIO
+  bool      m_BIO;
 #endif
   bool      m_DisableMotionCompression;
   unsigned  m_MTTMode;
@@ -632,6 +635,10 @@ public:
 #if !REMOVE_MV_ADAPT_PREC
   void      setHighPrecisionMv              ( bool b )       { m_highPrecMv = b; }
   bool      getHighPrecisionMv              ()               { return m_highPrecMv; }
+#endif
+#if JVET_L0256_BIO
+  void      setBIO                          ( bool b )       { m_BIO = b; }   // store the BIO on/off switch
+  bool      getBIO                          ()         const { return m_BIO; } // read back the BIO on/off switch
 #endif
   void      setDisableMotionCompression     ( bool b )       { m_DisableMotionCompression = b; }
   bool      getDisableMotionCompression     ()         const { return m_DisableMotionCompression; }
diff --git a/source/Lib/EncoderLib/EncLib.cpp b/source/Lib/EncoderLib/EncLib.cpp
index 3e4b088bdf4739f18998a6aede40021e8839bddb..5f0cdc5613c9418498a623bf9db84c66a1ce12d3 100644
--- a/source/Lib/EncoderLib/EncLib.cpp
+++ b/source/Lib/EncoderLib/EncLib.cpp
@@ -843,6 +843,9 @@ void EncLib::xInitSPS(SPS &sps)
   sps.getSpsNext().setUseIMV                ( m_ImvMode != IMV_OFF );
 #if !REMOVE_MV_ADAPT_PREC
   sps.getSpsNext().setUseHighPrecMv         ( m_highPrecMv );
+#endif
+#if JVET_L0256_BIO
+  sps.getSpsNext().setUseBIO                ( m_BIO );
 #endif
   sps.getSpsNext().setUseAffine             ( m_Affine );
   sps.getSpsNext().setUseAffineType         ( m_AffineType );
diff --git a/source/Lib/EncoderLib/VLCWriter.cpp b/source/Lib/EncoderLib/VLCWriter.cpp
index a7738748090c9813d940f9e5cc5c9726df36f990..7ad505f512d60bc9a81a7895d096f1ceb4ddee52 100644
--- a/source/Lib/EncoderLib/VLCWriter.cpp
+++ b/source/Lib/EncoderLib/VLCWriter.cpp
@@ -534,6 +534,9 @@ void HLSWriter::codeSPSNext( const SPSNext& spsNext, const bool usePCM )
   WRITE_FLAG( spsNext.getUseIMV() ? 1 : 0,                                                      "imv_enable_flag" );
 #if !REMOVE_MV_ADAPT_PREC
   WRITE_FLAG( spsNext.getUseHighPrecMv() ? 1 : 0,                                               "high_precision_motion_vectors");
+#endif
+#if JVET_L0256_BIO
+  WRITE_FLAG( spsNext.getUseBIO() ? 1 : 0,                                                      "bio_enable_flag" );
 #endif
   WRITE_FLAG( spsNext.getDisableMotCompress() ? 1 : 0,                                          "disable_motion_compression_flag" );
   WRITE_FLAG( spsNext.getUseLMChroma() ? 1 : 0,                                                 "lm_chroma_enabled_flag" );