diff --git a/cfg/encoder_lowdelay_vtm.cfg b/cfg/encoder_lowdelay_vtm.cfg
index e61a59988798c265db5a7693d30cf0e3e5c47215..9737a2854824f560bfef606585e012ef738f0307 100644
--- a/cfg/encoder_lowdelay_vtm.cfg
+++ b/cfg/encoder_lowdelay_vtm.cfg
@@ -127,6 +127,8 @@ LMChroma                     : 1      # use CCLM only
 DepQuant                     : 1
 IMV                          : 2
 ALF                          : 1
+GBi                          : 1 
+GBiFast                      : 1 
 
 # Fast tools
 PBIntraFast                  : 1
diff --git a/cfg/encoder_randomaccess_vtm.cfg b/cfg/encoder_randomaccess_vtm.cfg
index b2d2eddee705a14bf256652b90652740fa696edf..63ff58df9358f54e7567d150a6e8993a48ca04ea 100644
--- a/cfg/encoder_randomaccess_vtm.cfg
+++ b/cfg/encoder_randomaccess_vtm.cfg
@@ -141,6 +141,8 @@ LMChroma                     : 1      # use CCLM only
 DepQuant                     : 1
 IMV                          : 2
 ALF                          : 1
+GBi                          : 1 
+GBiFast                      : 1 
 
 # Fast tools
 PBIntraFast                  : 1
diff --git a/source/App/EncoderApp/EncApp.cpp b/source/App/EncoderApp/EncApp.cpp
index 93bf0194c23c886cb794efd1a729e6265099ec41..3c6efc53764d3d8d710daab9abefc93c43b8926d 100644
--- a/source/App/EncoderApp/EncApp.cpp
+++ b/source/App/EncoderApp/EncApp.cpp
@@ -239,6 +239,10 @@ void EncApp::xInitLibCfg()
   m_cEncLib.setInterEMT                                          ( ( m_EMT >> 1 ) & 1 );
   m_cEncLib.setFastInterEMT                                      ( ( m_FastEMT >> 1 ) & ( m_EMT >> 1 ) & 1 );
   m_cEncLib.setUseCompositeRef                                   ( m_compositeRefEnabled );
+#if JVET_L0646_GBI
+  m_cEncLib.setUseGBi                                            ( m_GBi );
+  m_cEncLib.setUseGBiFast                                        ( m_GBiFast );
+#endif
   // ADD_NEW_TOOL : (encoder app) add setting of tool enabling flags and associated parameters here
 
   m_cEncLib.setMaxCUWidth                                        ( m_QTBT ? m_uiCTUSize : m_uiMaxCUWidth );
diff --git a/source/App/EncoderApp/EncAppCfg.cpp b/source/App/EncoderApp/EncAppCfg.cpp
index 52be0b4aebf2d94a4cc514722f788aaa756f302f..0b7da7327be4a55d7219ebc19efa06771ef3f98a 100644
--- a/source/App/EncoderApp/EncAppCfg.cpp
+++ b/source/App/EncoderApp/EncAppCfg.cpp
@@ -844,6 +844,10 @@ bool EncAppCfg::parseCfg( int argc, char* argv[] )
     "\t2:  Enable fast methods only for Inter EMT\n"
     "\t3:  Enable fast methods for both Intra & Inter EMT\n")
   ("CompositeLTReference",                            m_compositeRefEnabled,                            false, "Enable Composite Long Term Reference Frame")
+#if JVET_L0646_GBI
+  ("GBi",                                             m_GBi,                                            false, "Enable Generalized Bi-prediction(GBi)")
+  ("GBiFast",                                         m_GBiFast,                                        false, "Fast methods for Generalized Bi-prediction(GBi)\n")
+#endif
   // ADD_NEW_TOOL : (encoder app) add parsing parameters here
 
   ("LCTUFast",                                        m_useFastLCTU,                                    false, "Fast methods for large CTU")
@@ -1916,7 +1920,11 @@ bool EncAppCfg::xCheckParameter()
     xConfirmPara( m_useFastLCTU, "Fast large CTU can only be applied when encoding with NEXT profile" );
     xConfirmPara( m_EMT, "EMT only allowed with NEXT profile" );
     xConfirmPara( m_FastEMT, "EMT only allowed with NEXT profile" );
-    xConfirmPara(m_compositeRefEnabled, "Composite Reference Frame is only allowed with NEXT profile");
+    xConfirmPara( m_compositeRefEnabled, "Composite Reference Frame is only allowed with NEXT profile" );
+#if JVET_L0646_GBI
+    xConfirmPara( m_GBi, "GBi is only allowed with NEXT profile" );
+    xConfirmPara( m_GBiFast, "GBiFast is only allowed with NEXT profile" );
+#endif
     // ADD_NEW_TOOL : (parameter check) add a check for next tools here
   }
   else
@@ -3111,6 +3119,10 @@ void EncAppCfg::xPrintParameter()
     msg( VERBOSE, "LMChroma:%d ", m_LMChroma );
     msg( VERBOSE, "EMT: %1d(intra) %1d(inter) ", m_EMT & 1, ( m_EMT >> 1 ) & 1 );
     msg(VERBOSE, "CompositeLTReference:%d ", m_compositeRefEnabled);
+#if JVET_L0646_GBI
+    msg( VERBOSE, "GBi:%d ", m_GBi );
+    msg( VERBOSE, "GBiFast:%d ", m_GBiFast );
+#endif
   }
   // ADD_NEW_TOOL (add some output indicating the usage of tools)
 
diff --git a/source/App/EncoderApp/EncAppCfg.h b/source/App/EncoderApp/EncAppCfg.h
index c30670b9872dad3e6d8897be13b525a73cc39825..a9eb00f22b16e1ccb7a754209c6b9110af5a4229 100644
--- a/source/App/EncoderApp/EncAppCfg.h
+++ b/source/App/EncoderApp/EncAppCfg.h
@@ -221,6 +221,10 @@ protected:
   int       m_FastEMT;                                        ///< XZ: Fast Methods of Enhanced Multiple Transform
 
   bool      m_compositeRefEnabled;
+#if JVET_L0646_GBI
+  bool      m_GBi;
+  bool      m_GBiFast;
+#endif
   // ADD_NEW_TOOL : (encoder app) add tool enabling flags and associated parameters here
 
   unsigned  m_uiMaxCUWidth;                                   ///< max. CU width in pixel
diff --git a/source/Lib/CommonLib/Buffer.cpp b/source/Lib/CommonLib/Buffer.cpp
index 4ad657d48214846ee476e4d8bcc4479e3fe61cf6..f31a22044ec3f8d59704b0ccf47387f1f513f70d 100644
--- a/source/Lib/CommonLib/Buffer.cpp
+++ b/source/Lib/CommonLib/Buffer.cpp
@@ -62,6 +62,40 @@ void addAvgCore( const T* src1, int src1Stride, const T* src2, int src2Stride, T
 #undef ADD_AVG_CORE_INC
 }
 
+#if ENABLE_SIMD_OPT_GBI && JVET_L0646_GBI
+// Default (non-SIMD) implementation installed in g_pelBufOP.removeWeightHighFreq4/8. In place, per sample:
+//   dst = ( dst * weight0 - src * weight1 + 2^15 ) >> 16, with weight0/weight1 in 16-bit fixed point.
+// gbiWeight must be non-zero; the shift argument is not used by this implementation.
+void removeWeightHighFreq(int16_t* dst, int dstStride, const int16_t* src, int srcStride, int width, int height, int shift, int gbiWeight)
+{
+  const int normalizer = ((1 << 16) + (gbiWeight > 0 ? (gbiWeight >> 1) : -(gbiWeight >> 1))) / gbiWeight; // rounded 2^16 / gbiWeight
+  // Multiply instead of left-shifting: normalizer is negative for negative weights and shifting a negative value left is undefined before C++20.
+  const int weight0 = normalizer * g_GbiWeightBase;
+  const int weight1 = (g_GbiWeightBase - gbiWeight) * normalizer;
+#define REM_HF_INC  \
+  src += srcStride; \
+  dst += dstStride;
+#define REM_HF_OP( ADDR )      dst[ADDR] =             (dst[ADDR]*weight0 - src[ADDR]*weight1 + (1<<15))>>16
+  SIZE_AWARE_PER_EL_OP(REM_HF_OP, REM_HF_INC);
+#undef REM_HF_INC
+#undef REM_HF_OP
+}
+
+void removeHighFreq(int16_t* dst, int dstStride, const int16_t* src, int srcStride, int width, int height)
+{
+#define REM_HF_INC  \
+  src += srcStride; \
+  dst += dstStride; \
+
+#define REM_HF_OP( ADDR )      dst[ADDR] =             2 * dst[ADDR] - src[ADDR]
+  // Unclipped in-place update dst = 2 * dst - src; SIZE_AWARE_PER_EL_OP (defined elsewhere) presumably walks the width x height area and uses REM_HF_INC to step rows.
+  SIZE_AWARE_PER_EL_OP(REM_HF_OP, REM_HF_INC);
+  // NOTE(review): REM_HF_OP_CLIP is never defined in this function, so its #undef below has no effect.
+#undef REM_HF_INC
+#undef REM_HF_OP
+#undef REM_HF_OP_CLIP
+}
+#endif
 
 template<typename T>
 void reconstructCore( const T* src1, int src1Stride, const T* src2, int src2Stride, T* dest, int dstStride, int width, int height, const ClpRng& clpRng )
@@ -103,6 +137,14 @@ PelBufferOps::PelBufferOps()
 
   linTf4 = linTfCore<Pel>;
   linTf8 = linTfCore<Pel>;
+
+#if ENABLE_SIMD_OPT_GBI
+  removeWeightHighFreq8 = removeWeightHighFreq;
+  removeWeightHighFreq4 = removeWeightHighFreq;
+  removeHighFreq8 = removeHighFreq;
+  removeHighFreq4 = removeHighFreq;
+#endif
+
 }
 
 PelBufferOps g_pelBufOP = PelBufferOps();
@@ -110,6 +152,37 @@ PelBufferOps g_pelBufOP = PelBufferOps();
 #endif
 #endif
 
+#if JVET_L0646_GBI
+template<>
+void AreaBuf<Pel>::addWeightedAvg(const AreaBuf<const Pel> &other1, const AreaBuf<const Pel> &other2, const ClpRng& clpRng, const int8_t gbiIdx)
+{
+  // GBi-weighted blend of two source buffers into this buffer, per sample:
+  //   this = ClipPel( rightShift( other1 * weightL0 + other2 * weightL1 + rounding, normShift ) )
+  const int8_t weightL0 = getGbiWeight(gbiIdx, REF_PIC_LIST_0);
+  const int8_t weightL1 = getGbiWeight(gbiIdx, REF_PIC_LIST_1);
+  // normShift = max( 2, IF_INTERNAL_PREC - bit depth ) + log2 of the weight base; rounding adds
+  // half of the final step plus IF_INTERNAL_OFFS scaled by the weight base.
+  const int bitDepth  = clpRng.bd;
+  const int normShift = std::max<int>(2, (IF_INTERNAL_PREC - bitDepth)) + g_GbiLog2WeightBase;
+  const int rounding  = (1 << (normShift - 1)) + (IF_INTERNAL_OFFS << g_GbiLog2WeightBase);
+
+  const Pel*     predL0    = other1.buf;
+  const Pel*     predL1    = other2.buf;
+  Pel*           blended   = buf;
+  const unsigned strideL0  = other1.stride;
+  const unsigned strideL1  = other2.stride;
+  const unsigned strideOut = stride;
+
+#define ADD_AVG_OP( ADDR ) blended[ADDR] = ClipPel( rightShift( ( predL0[ADDR]*weightL0 + predL1[ADDR]*weightL1 + rounding ), normShift ), clpRng )
+#define ADD_AVG_INC        \
+    predL0  += strideL0;   \
+    predL1  += strideL1;   \
+    blended += strideOut;
+  SIZE_AWARE_PER_EL_OP(ADD_AVG_OP, ADD_AVG_INC);
+#undef ADD_AVG_OP
+#undef ADD_AVG_INC
+}
+#endif
 
 template<>
 void AreaBuf<Pel>::addAvg( const AreaBuf<const Pel> &other1, const AreaBuf<const Pel> &other2, const ClpRng& clpRng)
diff --git a/source/Lib/CommonLib/Buffer.h b/source/Lib/CommonLib/Buffer.h
index 1763242e3dc0af2fd30165b09045a5028c460863..fdf3b962f774c18efc1c9433d496415ffcc74cf2 100644
--- a/source/Lib/CommonLib/Buffer.h
+++ b/source/Lib/CommonLib/Buffer.h
@@ -68,6 +68,12 @@ struct PelBufferOps
   void ( *reco8 )         ( const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, int width, int height,                                   const ClpRng& clpRng );
   void ( *linTf4 )        ( const Pel* src0, int src0Stride,                                  Pel *dst, int dstStride, int width, int height, int scale, int shift, int offset, const ClpRng& clpRng, bool bClip );
   void ( *linTf8 )        ( const Pel* src0, int src0Stride,                                  Pel *dst, int dstStride, int width, int height, int scale, int shift, int offset, const ClpRng& clpRng, bool bClip );
+#if ENABLE_SIMD_OPT_GBI
+  void ( *removeWeightHighFreq8)  ( Pel* src0, int src0Stride, const Pel* src1, int src1Stride, int width, int height, int shift, int gbiWeight);
+  void ( *removeWeightHighFreq4)  ( Pel* src0, int src0Stride, const Pel* src1, int src1Stride, int width, int height, int shift, int gbiWeight);
+  void ( *removeHighFreq8)        ( Pel* src0, int src0Stride, const Pel* src1, int src1Stride, int width, int height);
+  void ( *removeHighFreq4)        ( Pel* src0, int src0Stride, const Pel* src1, int src1Stride, int width, int height);
+#endif
 };
 
 extern PelBufferOps g_pelBufOP;
@@ -102,6 +108,10 @@ struct AreaBuf : public Size
   void subtract             ( const AreaBuf<const T> &other );
   void extendSingleBorderPel();
   void extendBorderPel      (  unsigned margin );
+#if JVET_L0646_GBI
+  void addWeightedAvg       ( const AreaBuf<const T> &other1, const AreaBuf<const T> &other2, const ClpRng& clpRng, const int8_t gbiIdx);
+  void removeWeightHighFreq ( const AreaBuf<T>& other, const bool bClip, const ClpRng& clpRng, const int8_t iGbiWeight);
+#endif
   void addAvg               ( const AreaBuf<const T> &other1, const AreaBuf<const T> &other2, const ClpRng& clpRng );
   void removeHighFreq       ( const AreaBuf<T>& other, const bool bClip, const ClpRng& clpRng);
   void updateHistogram      ( std::vector<int32_t>& hist ) const;
@@ -384,6 +394,59 @@ void AreaBuf<T>::toLast( const ClpRng& clpRng )
 template<>
 void AreaBuf<Pel>::toLast( const ClpRng& clpRng );
 
+#if JVET_L0646_GBI
+// In place, per sample: this = ( this * weight0 - other * weight1 + 2^15 ) >> 16, where weight0 and weight1 are 16-bit
+// fixed-point forms of g_GbiWeightBase / gbiWeight and ( g_GbiWeightBase - gbiWeight ) / gbiWeight. gbiWeight must be non-zero.
+template<typename T>
+void AreaBuf<T>::removeWeightHighFreq(const AreaBuf<T>& other, const bool bClip, const ClpRng& clpRng, const int8_t gbiWeight)
+{
+  const int8_t gbiWeightOther = g_GbiWeightBase - gbiWeight;
+  const int8_t log2WeightBase = g_GbiLog2WeightBase;
+  const Pel* src = other.buf;
+  const int  srcStride = other.stride;
+  Pel* dst = buf;
+  const int  dstStride = stride;
+
+#if ENABLE_SIMD_OPT_GBI
+  if(!bClip)
+  {
+    if(!(width & 7))
+      g_pelBufOP.removeWeightHighFreq8(dst, dstStride, src, srcStride, width, height, 16, gbiWeight);
+    else if(!(width & 3))
+      g_pelBufOP.removeWeightHighFreq4(dst, dstStride, src, srcStride, width, height, 16, gbiWeight);
+    else
+      CHECK(true, "Not supported");
+  }
+  else
+  {
+#endif
+    int normalizer = ((1 << 16) + (gbiWeight > 0 ? (gbiWeight >> 1) : -(gbiWeight >> 1))) / gbiWeight;
+    // Multiply instead of left-shifting: normalizer is negative for negative weights and shifting a negative value left is undefined before C++20.
+    int weight0 = normalizer * (1 << log2WeightBase);
+    int weight1 = gbiWeightOther * normalizer;
+#define REM_HF_INC  \
+  src += srcStride; \
+  dst += dstStride; \
+
+#define REM_HF_OP_CLIP( ADDR ) dst[ADDR] = ClipPel<T>( (dst[ADDR]*weight0 - src[ADDR]*weight1 + (1<<15))>>16, clpRng )
+#define REM_HF_OP( ADDR )      dst[ADDR] =             (dst[ADDR]*weight0 - src[ADDR]*weight1 + (1<<15))>>16
+
+    if(bClip)
+    {
+      SIZE_AWARE_PER_EL_OP(REM_HF_OP_CLIP, REM_HF_INC);
+    }
+    else
+    {
+      SIZE_AWARE_PER_EL_OP(REM_HF_OP, REM_HF_INC);
+    }
+#undef REM_HF_INC
+#undef REM_HF_OP
+#undef REM_HF_OP_CLIP
+#if ENABLE_SIMD_OPT_GBI
+  }
+#endif
+}
+#endif
 
 template<typename T>
 void AreaBuf<T>::removeHighFreq( const AreaBuf<T>& other, const bool bClip, const ClpRng& clpRng )
@@ -394,6 +457,20 @@ void AreaBuf<T>::removeHighFreq( const AreaBuf<T>& other, const bool bClip, cons
         T*  dst       = buf;
   const int dstStride = stride;
 
+#if ENABLE_SIMD_OPT_GBI && JVET_L0646_GBI
+  if (!bClip)
+  {
+    if(!(width & 7))
+      g_pelBufOP.removeHighFreq8(dst, dstStride, src, srcStride, width, height);
+    else if (!(width & 3))
+      g_pelBufOP.removeHighFreq4(dst, dstStride, src, srcStride, width, height);
+    else
+      CHECK(true, "Not supported");
+  }
+  else
+  {
+#endif
+
 #define REM_HF_INC  \
   src += srcStride; \
   dst += dstStride; \
@@ -413,6 +490,10 @@ void AreaBuf<T>::removeHighFreq( const AreaBuf<T>& other, const bool bClip, cons
 #undef REM_HF_INC
 #undef REM_HF_OP
 #undef REM_HF_OP_CLIP
+
+#if ENABLE_SIMD_OPT_GBI && JVET_L0646_GBI
+  }
+#endif
 }
 
 
@@ -576,10 +657,16 @@ struct UnitBuf
   void reconstruct          ( const UnitBuf<const T> &pred, const UnitBuf<const T> &resi, const ClpRngs& clpRngs );
   void copyClip             ( const UnitBuf<const T> &src, const ClpRngs& clpRngs );
   void subtract             ( const UnitBuf<const T> &other );
+#if JVET_L0646_GBI
+  void addWeightedAvg       ( const UnitBuf<const T> &other1, const UnitBuf<const T> &other2, const ClpRngs& clpRngs, const uint8_t gbiIdx = GBI_DEFAULT, const bool chromaOnly = false, const bool lumaOnly = false);
+#endif
   void addAvg               ( const UnitBuf<const T> &other1, const UnitBuf<const T> &other2, const ClpRngs& clpRngs, const bool chromaOnly = false, const bool lumaOnly = false);
   void extendSingleBorderPel();
   void extendBorderPel      ( unsigned margin );
   void removeHighFreq       ( const UnitBuf<T>& other, const bool bClip, const ClpRngs& clpRngs
+#if JVET_L0646_GBI
+                            , const int8_t gbiWeight = g_GbiWeights[GBI_DEFAULT]
+#endif
                             );
 
         UnitBuf<      T> subBuf (const UnitArea& subArea);
@@ -649,6 +736,21 @@ void UnitBuf<T>::reconstruct(const UnitBuf<const T> &pred, const UnitBuf<const T
   }
 }
 
+#if JVET_L0646_GBI
+template<typename T>
+void UnitBuf<T>::addWeightedAvg(const UnitBuf<const T> &other1, const UnitBuf<const T> &other2, const ClpRngs& clpRngs, const uint8_t gbiIdx /* = GBI_DEFAULT */, const bool chromaOnly /* = false */, const bool lumaOnly /* = false */)
+{
+  CHECK(lumaOnly && chromaOnly, "should not happen");
+
+  // chromaOnly skips component 0; lumaOnly stops after component 0.
+  const size_t firstComp = chromaOnly ? 1 : 0;
+  const size_t compEnd   = lumaOnly ? 1 : bufs.size();
+  for(size_t comp = firstComp; comp < compEnd; ++comp)
+  {
+    bufs[comp].addWeightedAvg(other1.bufs[comp], other2.bufs[comp], clpRngs.comp[comp], gbiIdx);
+  }
+}
+#endif
 
 template<typename T>
 void UnitBuf<T>::addAvg(const UnitBuf<const T> &other1, const UnitBuf<const T> &other2, const ClpRngs& clpRngs, const bool chromaOnly /* = false */, const bool lumaOnly /* = false */)
@@ -684,12 +786,25 @@ void UnitBuf<T>::extendBorderPel( unsigned margin )
 
 template<typename T>
 void UnitBuf<T>::removeHighFreq( const UnitBuf<T>& other, const bool bClip, const ClpRngs& clpRngs
+#if JVET_L0646_GBI
+                               , const int8_t gbiWeight
+#endif
                                )
 {
-  for( unsigned i = 0; i < bufs.size(); i++ )
+#if JVET_L0646_GBI 
+  if(gbiWeight != g_GbiWeights[GBI_DEFAULT]) // non-default weight: use the weighted variant on component 0 and stop
   {
-    bufs[i].removeHighFreq(other.bufs[i], bClip, clpRngs.comp[i] );
+    bufs[0].removeWeightHighFreq(other.bufs[0], bClip, clpRngs.comp[0], gbiWeight);
+    return;
   }
+  bufs[0].removeHighFreq(other.bufs[0], bClip, clpRngs.comp[0]); // NOTE(review): with GBi enabled only component 0 is processed, while the #else path loops over all components - confirm this is intended
+#else
+  for (unsigned i = 0; i <bufs.size(); i++)
+  {
+    bufs[i].removeHighFreq(other.bufs[i], bClip, clpRngs.comp[i]);
+  }
+#endif
+
 }
 
 template<typename T>
diff --git a/source/Lib/CommonLib/CodingStatistics.h b/source/Lib/CommonLib/CodingStatistics.h
index 57a3263fee2d321728311da4a35a347a03484ffc..be99dbbe7dc88237915100a14a634d2dcc445f33 100644
--- a/source/Lib/CommonLib/CodingStatistics.h
+++ b/source/Lib/CommonLib/CodingStatistics.h
@@ -100,6 +100,9 @@ enum CodingStatisticsType
   STATS__CABAC_BITS__PAR_FLAG,
   STATS__CABAC_BITS__ALF,
   STATS__CABAC_BITS__IMV_FLAG,
+#if JVET_L0646_GBI
+  STATS__CABAC_BITS__GBI_IDX,
+#endif
   STATS__CABAC_BITS__EMT_CU_FLAG,
   STATS__CABAC_BITS__EMT_TU_INDEX,
   STATS__TOOL_EMT,
@@ -170,6 +173,9 @@ static inline const char* getName(CodingStatisticsType name)
     "CABAC_BITS__ALIGNED_SIGN_BIT",
     "CABAC_BITS__ALIGNED_ESCAPE_BITS",
     "CABAC_BITS__IMV_FLAG",
+#if JVET_L0646_GBI
+    "CABAC_BITS__GBI_IDX",
+#endif
     "CABAC_BITS__EMT_CU_FLAG",
     "CABAC_BITS__EMT_TU_INDX",
     "CABAC_BITS__OTHER",
diff --git a/source/Lib/CommonLib/CommonDef.h b/source/Lib/CommonLib/CommonDef.h
index 55c0bd395a0544eafd1901fd1c9ff71e1f4311a4..14a5671cebd5d82bea22b83fc756ff5de6153f75 100644
--- a/source/Lib/CommonLib/CommonDef.h
+++ b/source/Lib/CommonLib/CommonDef.h
@@ -291,6 +291,11 @@ static const int AFFINE_MAX_NUM_V2 =                                2; ///< max
 static const int AFFINE_MAX_NUM_COMB =                             12; ///< max number of combined motion candidates
 static const int AFFINE_MIN_BLOCK_SIZE =                            4; ///< Minimum affine MC block size
 
+#if JVET_L0646_GBI
+static const int GBI_NUM =                                          5; ///< number of GBi weight options
+static const int GBI_DEFAULT =                                      ((uint8_t)(GBI_NUM >> 1)); ///< default weight index (middle option), representing w = 0.5
+static const int GBI_SIZE_CONSTRAINT =                            256; ///< GBi is disabled if the CU size is smaller than 256
+#endif
 
 #if W0038_DB_OPT
 static const int MAX_ENCODER_DEBLOCKING_QUALITY_LAYERS =           8 ;
diff --git a/source/Lib/CommonLib/ContextModelling.cpp b/source/Lib/CommonLib/ContextModelling.cpp
index cfd5e8b1adf6e7ff1ec4aa399ac229f548fa6bed..4f2296d89d8ddfc86a40b055842c345f539a6bce 100644
--- a/source/Lib/CommonLib/ContextModelling.cpp
+++ b/source/Lib/CommonLib/ContextModelling.cpp
@@ -347,5 +347,8 @@ void MergeCtx::setMergeInfo( PredictionUnit& pu, int candIdx )
   pu.mvpNum [REF_PIC_LIST_0] = NOT_VALID;
   pu.mvpNum [REF_PIC_LIST_1] = NOT_VALID;
 
-  
+#if JVET_L0646_GBI 
+  pu.cu->GBiIdx = ( interDirNeighbours[candIdx] == 3 ) ? GBiIdx[candIdx] : GBI_DEFAULT;
+#endif
+
 }
diff --git a/source/Lib/CommonLib/ContextModelling.h b/source/Lib/CommonLib/ContextModelling.h
index 681631922ed8df536bce7a8d1c95aaa84935ebb5..1a353fd6579319f32648e630c59674f13da1527e 100644
--- a/source/Lib/CommonLib/ContextModelling.h
+++ b/source/Lib/CommonLib/ContextModelling.h
@@ -265,6 +265,9 @@ public:
   ~MergeCtx() {}
 public:
   MvField       mvFieldNeighbours [ MRG_MAX_NUM_CANDS << 1 ]; // double length for mv of both lists
+#if JVET_L0646_GBI
+  uint8_t       GBiIdx            [ MRG_MAX_NUM_CANDS      ];
+#endif
   unsigned char interDirNeighbours[ MRG_MAX_NUM_CANDS      ];
   MergeType     mrgTypeNeighbours [ MRG_MAX_NUM_CANDS      ];
   int           numValidMergeCand;
diff --git a/source/Lib/CommonLib/Contexts.cpp b/source/Lib/CommonLib/Contexts.cpp
index c735ab2d932ef6917a7797c16786d6b45400e7c4..334ab7a792e461d545d1343bff92d887f2ee96b8 100644
--- a/source/Lib/CommonLib/Contexts.cpp
+++ b/source/Lib/CommonLib/Contexts.cpp
@@ -367,6 +367,15 @@ const CtxSet ContextSetCfg::AffineType = ContextSetCfg::addCtxSet
   { CNU, },
 });
 
+#if JVET_L0646_GBI
+const CtxSet ContextSetCfg::GBiIdx = ContextSetCfg::addCtxSet
+({
+  // Each row lists 7 init values: 4 contexts for the 1st bin, then 1 context for each of the remaining bins
+  { 95,  79,  63,  31,  31,  31,  31, },
+  { 95,  79,  63,  31,  31,  31,  31, },
+  { CNU, CNU, CNU, CNU, CNU, CNU, CNU, },
+  });
+#endif
 
 const CtxSet ContextSetCfg::Mvd = ContextSetCfg::addCtxSet
 ({
diff --git a/source/Lib/CommonLib/Contexts.h b/source/Lib/CommonLib/Contexts.h
index 8f18ae58e7bf974738cddbe5edfb4ebe96373c92..3e4679c06bd64c00f0f7611e8ca52e739902a135 100644
--- a/source/Lib/CommonLib/Contexts.h
+++ b/source/Lib/CommonLib/Contexts.h
@@ -191,6 +191,9 @@ public:
   static const CtxSet   ChromaQpAdjFlag;
   static const CtxSet   ChromaQpAdjIdc;
   static const CtxSet   ImvFlag;
+#if JVET_L0646_GBI
+  static const CtxSet   GBiIdx;
+#endif
   static const CtxSet   ctbAlfFlag;
   static const unsigned NumberOfContexts;
 
diff --git a/source/Lib/CommonLib/InterPrediction.cpp b/source/Lib/CommonLib/InterPrediction.cpp
index 8030555ab0e6e24a5f1525fcb53b65195d828a42..c6ea5e914dc34da5d0886b1afb10108142b98bf4 100644
--- a/source/Lib/CommonLib/InterPrediction.cpp
+++ b/source/Lib/CommonLib/InterPrediction.cpp
@@ -620,6 +620,13 @@ void InterPrediction::xWeightedAverage( const PredictionUnit& pu, const CPelUnit
 
   if( iRefIdx0 >= 0 && iRefIdx1 >= 0 )
   {
+#if JVET_L0646_GBI
+    if( pu.cu->GBiIdx != GBI_DEFAULT )
+    {
+      pcYuvDst.addWeightedAvg(pcYuvSrc0, pcYuvSrc1, clpRngs, pu.cu->GBiIdx);
+      return;
+    }
+#endif
     pcYuvDst.addAvg( pcYuvSrc0, pcYuvSrc1, clpRngs );
   }
   else if( iRefIdx0 >= 0 && iRefIdx1 < 0 )
diff --git a/source/Lib/CommonLib/MotionInfo.h b/source/Lib/CommonLib/MotionInfo.h
index d41a9371209cdb9661377de1858e540b6f8ef7c0..26fcb5a712797ddf09b3b2b262617542e8c8269d 100644
--- a/source/Lib/CommonLib/MotionInfo.h
+++ b/source/Lib/CommonLib/MotionInfo.h
@@ -140,5 +140,73 @@ struct MotionInfo
   }
 };
 
+#if JVET_L0646_GBI
+// Store of motion vectors and distortions per reference list and reference index (plain and affine), each entry with a read-only flag.
+class GBiMotionParam
+{
+  bool       m_readOnly[2][33];       // 2 RefLists, 33 RefFrames
+  Mv         m_mv[2][33];
+  Distortion m_dist[2][33];
+
+  bool       m_readOnlyAffine[2][2][33];   // leading index is the bP4 argument of the affine accessors
+  Mv         m_mvAffine[2][2][33][3];      // 3 vectors per entry
+  Distortion m_distAffine[2][2][33];
+
+public:
+  void reset()
+  {
+    Mv* pMv = &(m_mv[0][0]);
+    for (int ui = 0; ui < 1 * 2 * 33; ++ui, ++pMv)
+    {
+      pMv->set(std::numeric_limits<int16_t>::max(), std::numeric_limits<int16_t>::max());
+    }
+    // Every affine vector starts at (0, 0); the loop must advance pAffineMv, not pMv.
+    Mv* pAffineMv = &(m_mvAffine[0][0][0][0]);
+    for (int ui = 0; ui < 2 * 2 * 33 * 3; ++ui, ++pAffineMv)
+    {
+      pAffineMv->set(0, 0);
+    }
+    // Flags are cleared; memset with -1 sets every distortion byte to 0xFF.
+    memset(m_readOnly, false, 2 * 33 * sizeof(bool));
+    memset(m_dist, -1, 2 * 33 * sizeof(Distortion));
+    memset(m_readOnlyAffine, false, 2 * 2 * 33 * sizeof(bool));
+    memset(m_distAffine, -1, 2 * 2 * 33 * sizeof(Distortion));
+  }
+
+  void setReadMode(bool b, uint32_t uiRefList, uint32_t uiRefIdx) { m_readOnly[uiRefList][uiRefIdx] = b; }
+  bool isReadMode(uint32_t uiRefList, uint32_t uiRefIdx) { return m_readOnly[uiRefList][uiRefIdx]; }
+
+  void setReadModeAffine(bool b, uint32_t uiRefList, uint32_t uiRefIdx, int bP4) { m_readOnlyAffine[bP4][uiRefList][uiRefIdx] = b; }
+  bool isReadModeAffine(uint32_t uiRefList, uint32_t uiRefIdx, int bP4) { return m_readOnlyAffine[bP4][uiRefList][uiRefIdx]; }
+
+  Mv&  getMv(uint32_t uiRefList, uint32_t uiRefIdx) { return m_mv[uiRefList][uiRefIdx]; }
+
+  void copyFrom(Mv& rcMv, Distortion uiDist, uint32_t uiRefList, uint32_t uiRefIdx)
+  {
+    m_mv[uiRefList][uiRefIdx] = rcMv;
+    m_dist[uiRefList][uiRefIdx] = uiDist;
+  }
+
+  void copyTo(Mv& rcMv, Distortion& ruiDist, uint32_t uiRefList, uint32_t uiRefIdx)
+  {
+    rcMv = m_mv[uiRefList][uiRefIdx];
+    ruiDist = m_dist[uiRefList][uiRefIdx];
+  }
+
+  Mv& getAffineMv(uint32_t uiRefList, uint32_t uiRefIdx, uint32_t uiAffineMvIdx, int bP4) { return m_mvAffine[bP4][uiRefList][uiRefIdx][uiAffineMvIdx]; }
+
+  void copyAffineMvFrom(Mv(&racAffineMvs)[3], Distortion uiDist, uint32_t uiRefList, uint32_t uiRefIdx, int bP4)
+  {
+    memcpy(m_mvAffine[bP4][uiRefList][uiRefIdx], racAffineMvs, 3 * sizeof(Mv));
+    m_distAffine[bP4][uiRefList][uiRefIdx] = uiDist;
+  }
+
+  void copyAffineMvTo(Mv acAffineMvs[3], Distortion& ruiDist, uint32_t uiRefList, uint32_t uiRefIdx, int bP4)
+  {
+    memcpy(acAffineMvs, m_mvAffine[bP4][uiRefList][uiRefIdx], 3 * sizeof(Mv));
+    ruiDist = m_distAffine[bP4][uiRefList][uiRefIdx];
+  }
+};
+#endif
 
 #endif // __MOTIONINFO__
diff --git a/source/Lib/CommonLib/Rom.cpp b/source/Lib/CommonLib/Rom.cpp
index e2f8ccaca4af12c2862205ae42f2be8c6b255c1f..55061897d875aa7670c99123b2b5bfab943448b3 100644
--- a/source/Lib/CommonLib/Rom.cpp
+++ b/source/Lib/CommonLib/Rom.cpp
@@ -183,7 +183,71 @@ public:
 
 const int g_aiNonLMPosThrs[] = {  3,  1,  0 };
 
+#if JVET_L0646_GBI
+const int8_t g_GbiLog2WeightBase = 3; // log2 of the weight denominator
+const int8_t g_GbiWeightBase = (1 << g_GbiLog2WeightBase); // weights are expressed in units of 1/8
+const int8_t g_GbiWeights[GBI_NUM] = { -2, 3, 4, 5, 10 }; // weight w per GBi index, in units of 1/g_GbiWeightBase (see getGbiWeight)
+const int8_t g_GbiSearchOrder[GBI_NUM] = { GBI_DEFAULT, GBI_DEFAULT - 2, GBI_DEFAULT + 2, GBI_DEFAULT - 1, GBI_DEFAULT + 1 }; // NOTE(review): presumably the order in which the encoder tries the indices - confirm
+int8_t g_GbiCodingOrder[GBI_NUM]; // filled by resetGbiCodingOrder when not decoding: weight index -> position in the parsing order
+int8_t g_GbiParsingOrder[GBI_NUM]; // filled by resetGbiCodingOrder: position -> weight index
+
+int8_t getGbiWeight(uint8_t gbiIdx, uint8_t uhRefFrmList)
+{
+  // Model: P0 + w * (P1 - P0) = (1 - w) * P0 + w * P1; list 0 gets the (1 - w) share, any other list gets w.
+  const int8_t w = g_GbiWeights[gbiIdx];
+  return (uhRefFrmList != REF_PIC_LIST_0) ? w : g_GbiWeightBase - w;
+}
+
+void resetGbiCodingOrder(bool bRunDecoding, const CodingStructure &cs)
+{
+  // Parsing order fans out from the default: { GBI_DEFAULT, GBI_DEFAULT+1, GBI_DEFAULT-1, GBI_DEFAULT+2, GBI_DEFAULT-2, ... }
+  g_GbiParsingOrder[0] = GBI_DEFAULT;
+  for (int dist = 1, pos = 1; dist <= (GBI_NUM >> 1); ++dist, pos += 2)
+  {
+    g_GbiParsingOrder[pos]     = GBI_DEFAULT + (int8_t)dist;
+    g_GbiParsingOrder[pos + 1] = GBI_DEFAULT - (int8_t)dist;
+  }
+  // The coding order is the inverse mapping (weight index -> parsing position); it is left untouched when decoding.
+  if (bRunDecoding)
+  {
+    return;
+  }
+  for (int pos = 0; pos < GBI_NUM; ++pos)
+  {
+    g_GbiCodingOrder[(uint32_t)g_GbiParsingOrder[pos]] = pos;
+  }
+}
+
+uint32_t deriveWeightIdxBits(uint8_t gbiIdx) // Note: align this with TEncSbac::codeGbiIdx and TDecSbac::parseGbiIdx
+{
+  // Bin count of the truncated-unary binarisation of a GBi weight index:
+  //   coding index 0          -> 1 bin
+  //   coding index k, k >= 1  -> 1 + min( k, GBI_NUM - 2 ) bins
+  // where the coding index is looked up in g_GbiCodingOrder.
+  // The result is therefore at most GBI_NUM - 1 bins.
+  const uint8_t codingIdx = (uint8_t)g_GbiCodingOrder[gbiIdx];
+  uint32_t      bitCount  = 1;
 
+  if (GBI_NUM <= 2 || codingIdx == 0)
+  {
+    return bitCount;
+  }
+
+  // Truncated unary prefix: one bin per candidate position, stopping after the
+  // position that matches the coding index or once the prefix length is exhausted.
+  const int maxPrefixBins = GBI_NUM - 2;
+  for (int candidate = 1; candidate <= maxPrefixBins; ++candidate)
+  {
+    ++bitCount;
+    if (codingIdx == candidate)
+    {
+      break;
+    }
+  }
+
+  return bitCount;
+}
+#endif
 
 // initialize ROM variables
 void initROM()
diff --git a/source/Lib/CommonLib/Rom.h b/source/Lib/CommonLib/Rom.h
index e8c0c5acbd49b56815db5051c3782eaf898e49ea..e60c5127eab38cf836243de3a4e66220c365c514 100644
--- a/source/Lib/CommonLib/Rom.h
+++ b/source/Lib/CommonLib/Rom.h
@@ -212,6 +212,20 @@ extern MsgLevel g_verbosity;
 
 extern const int g_aiNonLMPosThrs[];
 
+#if JVET_L0646_GBI
+extern const int8_t g_GbiLog2WeightBase;
+extern const int8_t g_GbiWeightBase;
+extern const int8_t g_GbiWeights[GBI_NUM];
+extern const int8_t g_GbiSearchOrder[GBI_NUM];
+extern       int8_t g_GbiCodingOrder[GBI_NUM];   // written by resetGbiCodingOrder, read by deriveWeightIdxBits
+extern       int8_t g_GbiParsingOrder[GBI_NUM];  // written by resetGbiCodingOrder
+
+class CodingStructure;
+int8_t getGbiWeight(uint8_t gbiIdx, uint8_t uhRefFrmList);               // weight applied to the given reference list for a GBi index
+void resetGbiCodingOrder(bool bRunDecoding, const CodingStructure &cs);  // rebuilds the parsing order and, unless decoding, the coding order
+uint32_t deriveWeightIdxBits(uint8_t gbiIdx);                            // number of bins of the GBi index binarisation
+#endif 
+
 constexpr uint8_t g_tbMax[257] = { 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
diff --git a/source/Lib/CommonLib/Slice.h b/source/Lib/CommonLib/Slice.h
index 57acda967f0835978a34a19b2612f9a557d29927..d6ef610c7304e8361981d2524201642290e29fd9 100644
--- a/source/Lib/CommonLib/Slice.h
+++ b/source/Lib/CommonLib/Slice.h
@@ -810,6 +810,9 @@ private:
   bool              m_InterEMT;                   // 19
   bool              m_Affine;
   bool              m_AffineType;
+#if JVET_L0646_GBI
+  bool              m_GBi;                        //
+#endif
   bool              m_MTTEnabled;                 //
 #if ENABLE_WPP_PARALLELISM
   bool              m_NextDQP;
@@ -876,6 +879,10 @@ public:
   bool      getUseIntraEMT        ()                                      const     { return m_IntraEMT; }
   void      setUseInterEMT        ( bool b )                                        { m_InterEMT = b; }
   bool      getUseInterEMT        ()                                      const     { return m_InterEMT; }
+#if JVET_L0646_GBI
+  void      setUseGBi             ( bool b )                                        { m_GBi = b; }
+  bool      getUseGBi             ()                                      const     { return m_GBi; }
+#endif
   //=====  additional parameters  =====
   // qtbt
   void      setCTUSize            ( unsigned    ctuSize )                           { m_CTUSize = ctuSize; }
diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h
index 3ef736ae4442374b74280f6aa6d676a46b62d62c..d4a439ae7b4fdb1ef9bc0fd70367f5be1ce99a27 100644
--- a/source/Lib/CommonLib/TypeDef.h
+++ b/source/Lib/CommonLib/TypeDef.h
@@ -50,6 +50,8 @@
 #include <assert.h>
 #include <cassert>
 
+#define JVET_L0646_GBI                                    1 // Generalized bi-prediction (GBi)
+
 #define REUSE_CU_RESULTS                                  1
 
 #define REMOVE_MV_ADAPT_PREC                              1 // remove the high precision flag in the MV class
@@ -213,6 +215,10 @@
 #define ENABLE_SIMD_OPT_DIST                            ( 1 && ENABLE_SIMD_OPT )                            ///< SIMD optimization for the distortion calculations(SAD,SSE,HADAMARD), no impact on RD performance
 #define ENABLE_SIMD_OPT_AFFINE_ME                       ( 1 && ENABLE_SIMD_OPT )                            ///< SIMD optimization for affine ME, no impact on RD performance
 #define ENABLE_SIMD_OPT_ALF                             ( 1 && ENABLE_SIMD_OPT )                            ///< SIMD optimization for ALF
+#if ENABLE_SIMD_OPT_BUFFER
+#define ENABLE_SIMD_OPT_GBI                               1                                                 ///< SIMD optimization for GBi   
+#endif
+
 // End of SIMD optimizations
 
 
diff --git a/source/Lib/CommonLib/Unit.cpp b/source/Lib/CommonLib/Unit.cpp
index 7483af675d2db2af484d9ed6579645714d50ee37..6152c93f57b58f37865eb29ad338f314b8b6babd 100644
--- a/source/Lib/CommonLib/Unit.cpp
+++ b/source/Lib/CommonLib/Unit.cpp
@@ -266,6 +266,9 @@ CodingUnit& CodingUnit::operator=( const CodingUnit& other )
 #endif
   imv               = other.imv;
   imvNumCand        = other.imvNumCand;
+#if JVET_L0646_GBI
+  GBiIdx            = other.GBiIdx;
+#endif
   return *this;
 }
 
@@ -292,6 +295,9 @@ void CodingUnit::initData()
 #endif
   imv               = 0;
   imvNumCand        = 0;
+#if JVET_L0646_GBI
+  GBiIdx            = GBI_DEFAULT;
+#endif
 }
 
 
diff --git a/source/Lib/CommonLib/Unit.h b/source/Lib/CommonLib/Unit.h
index 9085d6e2285e5c72da206142e043edce2c2a6fab..93924a20065a649ddfe4e80adaf1b4553766616c 100644
--- a/source/Lib/CommonLib/Unit.h
+++ b/source/Lib/CommonLib/Unit.h
@@ -305,6 +305,10 @@ struct CodingUnit : public UnitArea
   uint32_t           tileIdx;
 #endif
   uint8_t          emtFlag;
+#if JVET_L0646_GBI
+  uint8_t         GBiIdx;
+  int             refIdxBi[2];
+#endif
   // needed for fast imv mode decisions
   int8_t          imvNumCand;
 
diff --git a/source/Lib/CommonLib/UnitTools.cpp b/source/Lib/CommonLib/UnitTools.cpp
index feb106da69a7e27d1bf3a27f4015b3240c911d77..6d0c6f999c0565746035886fd95d87b84e3aca24 100644
--- a/source/Lib/CommonLib/UnitTools.cpp
+++ b/source/Lib/CommonLib/UnitTools.cpp
@@ -507,6 +507,9 @@ void PU::getInterMergeCandidates( const PredictionUnit &pu, MergeCtx& mrgCtx, co
   for (uint32_t ui = 0; ui < maxNumMergeCand; ++ui)
   {
     isCandInter[ui] = false;
+#if JVET_L0646_GBI
+    mrgCtx.GBiIdx[ui] = GBI_DEFAULT;
+#endif
     mrgCtx.interDirNeighbours[ui] = 0;
     mrgCtx.mrgTypeNeighbours [ui] = MRG_TYPE_DEFAULT_N;
     mrgCtx.mvFieldNeighbours[(ui << 1)    ].refIdx = NOT_VALID;
@@ -536,6 +539,9 @@ void PU::getInterMergeCandidates( const PredictionUnit &pu, MergeCtx& mrgCtx, co
 
     // get Inter Dir
     mrgCtx.interDirNeighbours[cnt] = miLeft.interDir;
+#if JVET_L0646_GBI
+    mrgCtx.GBiIdx[cnt] = (mrgCtx.interDirNeighbours[cnt] == 3) ? puLeft->cu->GBiIdx : GBI_DEFAULT;
+#endif
     // get Mv from Left
     mrgCtx.mvFieldNeighbours[cnt << 1].setMvField(miLeft.mv[0], miLeft.refIdx[0]);
 
@@ -575,6 +581,9 @@ void PU::getInterMergeCandidates( const PredictionUnit &pu, MergeCtx& mrgCtx, co
       // get Inter Dir
       mrgCtx.interDirNeighbours[cnt] = miAbove.interDir;
       // get Mv from Above
+#if JVET_L0646_GBI
+      mrgCtx.GBiIdx[cnt] = (mrgCtx.interDirNeighbours[cnt] == 3) ? puAbove->cu->GBiIdx : GBI_DEFAULT;
+#endif
       mrgCtx.mvFieldNeighbours[cnt << 1].setMvField( miAbove.mv[0], miAbove.refIdx[0] );
 
       if( slice.isInterB() )
@@ -617,6 +626,9 @@ void PU::getInterMergeCandidates( const PredictionUnit &pu, MergeCtx& mrgCtx, co
       // get Inter Dir
       mrgCtx.interDirNeighbours[cnt] = miAboveRight.interDir;
       // get Mv from Above-right
+#if JVET_L0646_GBI
+      mrgCtx.GBiIdx[cnt] = (mrgCtx.interDirNeighbours[cnt] == 3) ? puAboveRight->cu->GBiIdx : GBI_DEFAULT;
+#endif
       mrgCtx.mvFieldNeighbours[cnt << 1].setMvField( miAboveRight.mv[0], miAboveRight.refIdx[0] );
 
       if( slice.isInterB() )
@@ -657,6 +669,9 @@ void PU::getInterMergeCandidates( const PredictionUnit &pu, MergeCtx& mrgCtx, co
 
       // get Inter Dir
       mrgCtx.interDirNeighbours[cnt] = miBelowLeft.interDir;
+#if JVET_L0646_GBI
+      mrgCtx.GBiIdx[cnt] = (mrgCtx.interDirNeighbours[cnt] == 3) ? puLeftBottom->cu->GBiIdx : GBI_DEFAULT;
+#endif
       // get Mv from Bottom-Left
       mrgCtx.mvFieldNeighbours[cnt << 1].setMvField( miBelowLeft.mv[0], miBelowLeft.refIdx[0] );
 
@@ -740,6 +755,9 @@ void PU::getInterMergeCandidates( const PredictionUnit &pu, MergeCtx& mrgCtx, co
 
         // get Inter Dir
         mrgCtx.interDirNeighbours[cnt] = miAboveLeft.interDir;
+#if JVET_L0646_GBI
+        mrgCtx.GBiIdx[cnt] = (mrgCtx.interDirNeighbours[cnt] == 3) ? puAboveLeft->cu->GBiIdx : GBI_DEFAULT;
+#endif
         // get Mv from Above-Left
         mrgCtx.mvFieldNeighbours[cnt << 1].setMvField( miAboveLeft.mv[0], miAboveLeft.refIdx[0] );
 
@@ -867,6 +885,9 @@ void PU::getInterMergeCandidates( const PredictionUnit &pu, MergeCtx& mrgCtx, co
       if( addTMvp )
       {
         mrgCtx.interDirNeighbours[uiArrayAddr] = dir;
+#if JVET_L0646_GBI
+        mrgCtx.GBiIdx            [uiArrayAddr] = GBI_DEFAULT;
+#endif
         isCandInter              [uiArrayAddr] = true;
 
         if( mrgCandIdx == cnt && canFastExit )
@@ -903,6 +924,9 @@ void PU::getInterMergeCandidates( const PredictionUnit &pu, MergeCtx& mrgCtx, co
       {
         isCandInter[uiArrayAddr] = true;
         mrgCtx.interDirNeighbours[uiArrayAddr] = 3;
+#if JVET_L0646_GBI
+        mrgCtx.GBiIdx[uiArrayAddr] = ((mrgCtx.interDirNeighbours[uiArrayAddr] == 3)) ? CU::deriveGbiIdx(mrgCtx.GBiIdx[i], mrgCtx.GBiIdx[j]) : GBI_DEFAULT;
+#endif
 
         // get Mv from cand[i] and cand[j]
         mrgCtx.mvFieldNeighbours[ uiArrayAddr << 1     ].setMvField(mrgCtx.mvFieldNeighbours[ i << 1     ].mv, mrgCtx.mvFieldNeighbours[ i << 1     ].refIdx);
@@ -937,6 +961,9 @@ void PU::getInterMergeCandidates( const PredictionUnit &pu, MergeCtx& mrgCtx, co
   {
     isCandInter               [uiArrayAddr     ] = true;
     mrgCtx.interDirNeighbours [uiArrayAddr     ] = 1;
+#if JVET_L0646_GBI
+    mrgCtx.GBiIdx             [uiArrayAddr     ] = GBI_DEFAULT;
+#endif
     mrgCtx.mvFieldNeighbours  [uiArrayAddr << 1].setMvField(Mv(0, 0), r);
 
     if (slice.isInterB())
@@ -1867,7 +1894,11 @@ bool PU::isAffineMrgFlagCoded( const PredictionUnit &pu )
   }
   return getFirstAvailableAffineNeighbour( pu ) != nullptr;
 }
+#if JVET_L0646_GBI
+void PU::getAffineMergeCand( const PredictionUnit &pu, MvField(*mvFieldNeighbours)[3], unsigned char &interDirNeighbours, unsigned char &gbiIdx, int &numValidMergeCand )
+#else
 void PU::getAffineMergeCand( const PredictionUnit &pu, MvField (*mvFieldNeighbours)[3], unsigned char &interDirNeighbours, int &numValidMergeCand )
+#endif
 {
   for ( int mvNum = 0; mvNum < 3; mvNum++ )
   {
@@ -1879,6 +1910,9 @@ void PU::getAffineMergeCand( const PredictionUnit &pu, MvField (*mvFieldNeighbou
   if( puFirstNeighbour == nullptr )
   {
     numValidMergeCand = -1;
+#if JVET_L0646_GBI
+    gbiIdx = GBI_DEFAULT;
+#endif
     return;
   }
   else
@@ -1913,6 +1947,9 @@ void PU::getAffineMergeCand( const PredictionUnit &pu, MvField (*mvFieldNeighbou
       }
     }
   }
+#if JVET_L0646_GBI
+  gbiIdx = puFirstNeighbour->cu->GBiIdx;
+#endif
 }
 
 void PU::setAllAffineMvField( PredictionUnit &pu, MvField *mvField, RefPicList eRefList )
@@ -2211,6 +2248,9 @@ bool PU::getInterMergeSubPuMvpCand(const PredictionUnit &pu, MergeCtx& mrgCtx, b
         mrgCtx.mvFieldNeighbours[(count << 1) + currRefListId].setMvField(cColMv, 0);
         mrgCtx.interDirNeighbours[count] |= (1 << currRefListId);
         LICFlag = tempLICFlag;
+#if JVET_L0646_GBI
+        mrgCtx.GBiIdx[count] = GBI_DEFAULT;
+#endif
         found = true;
       }
       else
@@ -2456,6 +2496,9 @@ void PU::restrictBiPredMergeCands( const PredictionUnit &pu, MergeCtx& mergeCtx
       {
         mergeCtx.interDirNeighbours[ mergeCand ] = 1;
         mergeCtx.mvFieldNeighbours[( mergeCand << 1 ) + 1].setMvField( Mv( 0, 0 ), -1 );
+#if JVET_L0646_GBI
+        mergeCtx.GBiIdx[mergeCand] = GBI_DEFAULT;
+#endif
       }
     }
   }
@@ -2572,9 +2615,108 @@ int CU::getMaxNeighboriMVCandNum( const CodingStructure& cs, const Position& pos
   return maxImvNumCand;
 }
 
+#if JVET_L0646_GBI
+// Tells whether a GBi index is signalled in the bitstream for this CU.
+// True only for a non-merge, bi-predicted (interDir == 3) CU that passes
+// the tool, prediction-mode/slice and block-size gates below.
+bool CU::isGBiIdxCoded( const CodingUnit &cu )
+{
+  const bool gbiEnabled = cu.cs->sps->getSpsNext().getUseGBi();
+  if( !gbiEnabled )
+  {
+    // With the tool off, a CU must not carry a non-default index.
+    CHECK(cu.GBiIdx != GBI_DEFAULT, "Error: cu.GBiIdx != GBI_DEFAULT");
+    return false;
+  }
 
+  // Intra CUs and CUs of slices reporting isInterP() never carry an index.
+  const bool intraOrP = ( cu.predMode == MODE_INTRA || cu.cs->slice->isInterP() );
+  if( intraOrP ) { return false; }
 
+  // Neither do blocks whose lwidth() x lheight() area is below the limit.
+  const auto blkArea = cu.lwidth() * cu.lheight();
+  if( blkArea < GBI_SIZE_CONSTRAINT ) { return false; }
 
+  // What is left is coded only for explicit (non-merge) bi-prediction.
+  const auto &pu = *cu.firstPU;
+  return pu.interDir == 3 && !pu.mergeFlag;
+}
+
+// Returns the GBi index that is in effect for this CU: the stored cu.GBiIdx
+// for a non-merge bi-predicted CU, GBI_DEFAULT in every other case.
+uint8_t CU::getValidGbiIdx( const CodingUnit &cu )
+{
+  const auto &pu  = *cu.firstPU;
+  const bool isBi = ( pu.interDir == 3 );
+  if( isBi && !pu.mergeFlag )
+  {
+    return cu.GBiIdx;
+  }
+
+  // A regular (MRG_TYPE_DEFAULT_N) bi-predicted merge CU is deliberately left
+  // unchecked; any other CU is required to hold GBI_DEFAULT already.
+  const bool regularBiMerge = isBi && pu.mergeFlag && pu.mergeType == MRG_TYPE_DEFAULT_N;
+  if( !regularBiMerge )
+  {
+    CHECK(cu.GBiIdx != GBI_DEFAULT, " cu.GBiIdx != GBI_DEFAULT ");
+  }
+
+  return GBI_DEFAULT;
+}
+
+// Stores the GBi index uh in cu. The index is kept only for a non-merge,
+// bi-predicted (interDir == 3) CU; for any other CU the closing CHECK fires.
+// Before that, cu.GBiIdx is reset to GBI_DEFAULT unless the CU is a regular
+// (MRG_TYPE_DEFAULT_N) bi-predicted merge, whose index is left untouched.
+void CU::setGbiIdx( CodingUnit &cu, uint8_t uh )
+{
+  const auto &pu        = *cu.firstPU;
+  const bool isBi       = ( pu.interDir == 3 );
+  const bool idxApplied = isBi && !pu.mergeFlag;
+
+  if( idxApplied )
+  {
+    cu.GBiIdx = uh;
+  }
+  else if( !( isBi && pu.mergeFlag && pu.mergeType == MRG_TYPE_DEFAULT_N ) )
+  {
+    // Neither explicit bi-prediction nor a regular bi-merge: use the default.
+    cu.GBiIdx = GBI_DEFAULT;
+  }
+
+  // Getting here without having stored uh is treated as an error, so this
+  // function may only be used on CUs for which the index is applied.
+  CHECK(!idxApplied, " uhCnt <= 0 ");
+}
+
+// Picks one GBi index out of two candidates. Equal indices pass through.
+// Otherwise the weight of gbiLO for REF_PIC_LIST_0 (w0) and of gbiL1 for
+// REF_PIC_LIST_1 (w1) decide: equal weights, or both weights on the same
+// side of half the weight base by more than one, give GBI_DEFAULT; else the
+// index of the larger weight is taken if that weight reaches half the base.
+uint8_t CU::deriveGbiIdx( uint8_t gbiLO, uint8_t gbiL1 )
+{
+  if( gbiLO == gbiL1 )
+  {
+    return gbiLO;
+  }
+
+  const int8_t w0     = getGbiWeight(gbiLO, REF_PIC_LIST_0);
+  const int8_t w1     = getGbiWeight(gbiL1, REF_PIC_LIST_1);
+  const int8_t half   = g_GbiWeightBase >> 1;
+  const int8_t margin = 1;
+
+  const bool bothBelow = ( w0 < (half - margin) ) && ( w1 < (half - margin) );
+  const bool bothAbove = ( w0 > (half + margin) ) && ( w1 > (half + margin) );
+  if( w0 == w1 || bothBelow || bothAbove )
+  {
+    return GBI_DEFAULT;
+  }
+
+  if( w0 > w1 ) { return ( w0 >= half ? gbiLO : gbiL1 ); }
+  return ( w1 >= half ? gbiL1 : gbiLO );
+}
+#endif
 
 // TU tools
 
diff --git a/source/Lib/CommonLib/UnitTools.h b/source/Lib/CommonLib/UnitTools.h
index 909810f89df86612343aa66f3b77dd680251744f..3247170c6f357a977fdc39d16be213adee1a28ee 100644
--- a/source/Lib/CommonLib/UnitTools.h
+++ b/source/Lib/CommonLib/UnitTools.h
@@ -84,6 +84,12 @@ namespace CU
   bool hasNonTsCodedBlock             (const CodingUnit& cu);
   uint32_t getNumNonZeroCoeffNonTs        (const CodingUnit& cu);
 
+#if JVET_L0646_GBI
+  bool  isGBiIdxCoded                 (const CodingUnit& cu);
+  uint8_t getValidGbiIdx              (const CodingUnit& cu);
+  void  setGbiIdx                     (CodingUnit& cu, uint8_t uh);
+  uint8_t deriveGbiIdx                (uint8_t gbiLO, uint8_t gbiL1);
+#endif
 
   PUTraverser traversePUs             (      CodingUnit& cu);
   TUTraverser traverseTUs             (      CodingUnit& cu);
@@ -115,7 +121,11 @@ namespace PU
   bool isBipredRestriction            (const PredictionUnit &pu);
   void spanMotionInfo                 (      PredictionUnit &pu, const MergeCtx &mrgCtx = MergeCtx() );
   void applyImv                       (      PredictionUnit &pu, MergeCtx &mrgCtx, InterPrediction *interPred = NULL );
+#if JVET_L0646_GBI
+  void getAffineMergeCand             (const PredictionUnit &pu, MvField(*mvFieldNeighbours)[3], unsigned char &interDirNeighbours, unsigned char &gbiIdx, int &numValidMergeCand);
+#else
   void getAffineMergeCand             (const PredictionUnit &pu, MvField (*mvFieldNeighbours)[3], unsigned char &interDirNeighbours, int &numValidMergeCand );
+#endif
   bool isAffineMrgFlagCoded           (const PredictionUnit &pu );
   void getAffineMergeCand             (const PredictionUnit &pu, MvField (*mvFieldNeighbours)[3], unsigned char &interDirNeighbours, int &numValidMergeCand );
   void setAllAffineMvField            (      PredictionUnit &pu, MvField *mvField, RefPicList eRefList );
diff --git a/source/Lib/CommonLib/x86/BufferX86.h b/source/Lib/CommonLib/x86/BufferX86.h
index d9693eedff4621d12ad655e76dbf118d991b4a52..ba4024fe55ad2b0412830bf5b57a1d303c96b12d 100644
--- a/source/Lib/CommonLib/x86/BufferX86.h
+++ b/source/Lib/CommonLib/x86/BufferX86.h
@@ -211,6 +211,171 @@ void reco_SSE( const int16_t* src0, int src0Stride, const int16_t* src1, int src
   }
 }
 
+#if ENABLE_SIMD_OPT_GBI
+template< X86_VEXT vext, int W > // W: samples handled per step; only 8 and 4 are supported, anything else throws
+void removeWeightHighFreq_SSE(int16_t* src0, int src0Stride, const int16_t* src1, int src1Stride, int width, int height, int shift, int gbiWeight) // in place on src0: src0 = sat16( ( src0*weight0 - src1*weight1 + offset ) >> shift )
+{
+  int normalizer = ((1 << 16) + (gbiWeight>0 ? (gbiWeight >> 1) : -(gbiWeight >> 1))) / gbiWeight; // 2^16 / gbiWeight with a half-weight rounding term; NOTE(review): gbiWeight == 0 would divide by zero - confirm callers never pass it
+  int weight0 = normalizer << g_GbiLog2WeightBase; // multiplier applied to src0
+  int weight1 = (g_GbiWeightBase - gbiWeight)*normalizer; // multiplier applied to src1
+  int offset = 1 << (shift - 1); // rounding offset for the final right shift; assumes shift >= 1 - TODO confirm
+  if (W == 8)
+  {
+#if 0//USE_AVX2
+    if (vext >= AVX2)
+    {
+      __m256i vzero = _mm256_setzero_si256();
+      __m256i voffset = _mm256_set1_epi32(offset);
+      __m256i vw0 = _mm256_set1_epi32(weight0);
+      __m256i vw1 = _mm256_set1_epi32(weight1);
+
+      for (int row = 0; row < height; row++)
+      {
+        for (int col = 0; col < width; col += 8)
+        {
+          __m256i vsrc0, vsrc1;
+          __m128i a = _mm_load_si128((const __m128i *)&src0[col]);
+          __m128i b = _mm_load_si128((const __m128i *)&src1[col]);
+
+          vsrc0 = _mm256_cvtepi16_epi32(a);
+          vsrc1 = _mm256_cvtepi16_epi32(b);
+          vsrc0 = _mm256_mullo_epi32(vsrc0, vw0);
+          vsrc1 = _mm256_mullo_epi32(vsrc1, vw1);
+          vsrc0 = _mm256_add_epi32(_mm256_sub_epi32(vsrc0, vsrc1), voffset);
+          vsrc0 = _mm256_srai_epi32(vsrc0, shift);
+
+          vsrc0 = _mm256_packs_epi32(vsrc0, vzero);
+
+          _mm_store_si128((__m128i *)&src0[col], _mm256_castsi256_si128(vsrc0));
+        }
+
+        src0 += src0Stride;
+        src1 += src1Stride;
+      }
+    }
+    else
+#endif
+    { // 128-bit path: the only one compiled, since the AVX2 branch above sits under "#if 0"
+      __m128i vzero = _mm_setzero_si128();
+      __m128i voffset = _mm_set1_epi32(offset);
+      __m128i vw0 = _mm_set1_epi32(weight0);
+      __m128i vw1 = _mm_set1_epi32(weight1);
+
+      for (int row = 0; row < height; row++)
+      {
+        for (int col = 0; col < width; col += 8) // eight samples per step
+        {
+          __m128i vsrc0 = _mm_load_si128((const __m128i *)&src0[col]); // aligned load: &src0[col] must be 16-byte aligned
+          __m128i vsrc1 = _mm_load_si128((const __m128i *)&src1[col]); // aligned load: &src1[col] must be 16-byte aligned
+
+          __m128i vtmp, vdst, vsrc;
+          vdst = _mm_cvtepi16_epi32(vsrc0); // sign-extend the low four samples to 32 bit
+          vsrc = _mm_cvtepi16_epi32(vsrc1);
+          vdst = _mm_mullo_epi32(vdst, vw0);
+          vsrc = _mm_mullo_epi32(vsrc, vw1);
+          vtmp = _mm_add_epi32(_mm_sub_epi32(vdst, vsrc), voffset);
+          vtmp = _mm_srai_epi32(vtmp, shift); // vtmp: results for samples 0..3
+
+          vsrc0 = _mm_unpackhi_epi64(vsrc0, vzero); // bring the upper four samples into the low half
+          vsrc1 = _mm_unpackhi_epi64(vsrc1, vzero);
+          vdst = _mm_cvtepi16_epi32(vsrc0);
+          vsrc = _mm_cvtepi16_epi32(vsrc1);
+          vdst = _mm_mullo_epi32(vdst, vw0);
+          vsrc = _mm_mullo_epi32(vsrc, vw1);
+          vdst = _mm_add_epi32(_mm_sub_epi32(vdst, vsrc), voffset);
+          vdst = _mm_srai_epi32(vdst, shift); // results for samples 4..7
+          vdst = _mm_packs_epi32(vtmp, vdst); // saturating pack of both halves back to eight 16-bit samples
+
+          _mm_store_si128((__m128i *)&src0[col], vdst); // the result overwrites src0
+        }
+
+        src0 += src0Stride;
+        src1 += src1Stride;
+      }
+    }
+  }
+  else if (W == 4) // only the first four samples of each row are processed; width is not read in this branch
+  {
+    __m128i vzero = _mm_setzero_si128();
+    __m128i voffset = _mm_set1_epi32(offset);
+    __m128i vw0 = _mm_set1_epi32(weight0);
+    __m128i vw1 = _mm_set1_epi32(weight1);
+
+    for (int row = 0; row < height; row++)
+    {
+      __m128i vsum = _mm_loadl_epi64((const __m128i *)src0); // 64-bit load: four samples of src0
+      __m128i vdst = _mm_loadl_epi64((const __m128i *)src1); // 64-bit load: four samples of src1
+
+      vsum = _mm_cvtepi16_epi32(vsum);
+      vdst = _mm_cvtepi16_epi32(vdst);
+      vsum = _mm_mullo_epi32(vsum, vw0);
+      vdst = _mm_mullo_epi32(vdst, vw1);
+      vsum = _mm_add_epi32(_mm_sub_epi32(vsum, vdst), voffset);
+      vsum = _mm_srai_epi32(vsum, shift);
+      vsum = _mm_packs_epi32(vsum, vzero); // saturate to 16 bit; the upper half is zero padding
+
+      _mm_storel_epi64((__m128i *)src0, vsum); // write the four results back to src0
+
+      src0 += src0Stride;
+      src1 += src1Stride;
+    }
+  }
+  else
+  {
+    THROW("Unsupported size");
+  }
+}
+
+template< X86_VEXT vext, int W > // W: samples handled per step; only 8 and 4 are supported, anything else throws
+void removeHighFreq_SSE(int16_t* src0, int src0Stride, const int16_t* src1, int src1Stride, int width, int height) // in place on src0: src0 = 2*src0 - src1 in 16-bit lanes
+{
+  if (W == 8)
+  {
+    // TODO: AVX2 impl
+    {
+      for (int row = 0; row < height; row++)
+      {
+        for (int col = 0; col < width; col += 8) // eight samples per step
+        {
+          __m128i vsrc0 = _mm_load_si128((const __m128i *)&src0[col]); // aligned load: &src0[col] must be 16-byte aligned
+          __m128i vsrc1 = _mm_load_si128((const __m128i *)&src1[col]); // aligned load: &src1[col] must be 16-byte aligned
+
+          vsrc0 = _mm_sub_epi16(_mm_slli_epi16(vsrc0, 1), vsrc1); // (src0 << 1) - src1, plain (non-saturating) 16-bit ops
+          _mm_store_si128((__m128i *)&src0[col], vsrc0); // the result overwrites src0
+        }
+
+        src0 += src0Stride;
+        src1 += src1Stride;
+      }
+    }
+  }
+  else if (W == 4) // four samples per row; width is not read in this branch
+  {
+    for (int row = 0; row < height; row += 2) // two rows per iteration; NOTE(review): assumes height is even - confirm
+    {
+      __m128i vsrc0 = _mm_loadl_epi64((const __m128i *)src0);
+      __m128i vsrc1 = _mm_loadl_epi64((const __m128i *)src1);
+      __m128i vsrc0_2 = _mm_loadl_epi64((const __m128i *)(src0 + src0Stride)); // next row of src0
+      __m128i vsrc1_2 = _mm_loadl_epi64((const __m128i *)(src1 + src1Stride)); // next row of src1
+
+      vsrc0 = _mm_unpacklo_epi64(vsrc0, vsrc0_2); // low half: current row, high half: next row
+      vsrc1 = _mm_unpacklo_epi64(vsrc1, vsrc1_2);
+
+      vsrc0 = _mm_sub_epi16(_mm_slli_epi16(vsrc0, 1), vsrc1); // (src0 << 1) - src1 for both rows at once
+      _mm_storel_epi64((__m128i *)src0, vsrc0); // low half back to the current row
+      _mm_storel_epi64((__m128i *)(src0 + src0Stride), _mm_unpackhi_epi64(vsrc0, vsrc0)); // high half back to the next row
+
+      src0 += (src0Stride << 1);
+      src1 += (src1Stride << 1);
+    }
+  }
+  else
+  {
+    THROW("Unsupported size");
+  }
+}
+#endif
+
 template<bool doShift, bool shiftR, typename T> static inline void do_shift( T &vreg, int num );
 #if USE_AVX2
 template<> inline void do_shift<true,  true , __m256i>( __m256i &vreg, int num ) { vreg = _mm256_srai_epi32( vreg, num ); }
@@ -373,6 +538,12 @@ void PelBufferOps::_initPelBufOpsX86()
 
   linTf8 = linTf_SSE_entry<vext, 8>;
   linTf4 = linTf_SSE_entry<vext, 4>;
+#if ENABLE_SIMD_OPT_GBI
+  removeWeightHighFreq8 = removeWeightHighFreq_SSE<vext, 8>;
+  removeWeightHighFreq4 = removeWeightHighFreq_SSE<vext, 4>;
+  removeHighFreq8 = removeHighFreq_SSE<vext, 8>;
+  removeHighFreq4 = removeHighFreq_SSE<vext, 4>;
+#endif
 }
 
 template void PelBufferOps::_initPelBufOpsX86<SIMDX86>();
diff --git a/source/Lib/DecoderLib/CABACReader.cpp b/source/Lib/DecoderLib/CABACReader.cpp
index 8d7ebb8d0b6104ca610c040e3f2709e9bbadc0d0..dc8bf0308500a1099a1f541513f1e976eff6a337 100644
--- a/source/Lib/DecoderLib/CABACReader.cpp
+++ b/source/Lib/DecoderLib/CABACReader.cpp
@@ -834,10 +834,62 @@ void CABACReader::cu_pred_data( CodingUnit &cu )
 
   imv_mode   ( cu, mrgCtx );
 
+#if JVET_L0646_GBI
+  cu_gbi_flag( cu );
+#endif
+
 }
 
+#if JVET_L0646_GBI
+void CABACReader::cu_gbi_flag(CodingUnit& cu) // parses the GBi index bins of cu and stores the result through CU::setGbiIdx
+{
+  if(!CU::isGBiIdxCoded(cu)) // nothing is read when no index is signalled for this CU
+  {
+    return;
+  }
+
+  uint8_t gbiIdx = GBI_DEFAULT;
+
+  CHECK(!(GBI_NUM > 1 && (GBI_NUM == 2 || (GBI_NUM & 0x01) == 1)), " !( GBI_NUM > 1 && ( GBI_NUM == 2 || ( GBI_NUM & 0x01 ) == 1 ) ) "); // GBI_NUM must be 2 or an odd value above 1
+
+  RExt__DECODER_DEBUG_BIT_STATISTICS_CREATE_SET(STATS__CABAC_BITS__GBI_IDX);
+
+  int ctxId = 0; // context of the first bin
+
+  uint32_t idx = 0; // position later looked up in g_GbiParsingOrder; stays 0 when the first bin is 1
+  uint32_t symbol;
+
+  symbol = (m_BinDecoder.decodeBin(Ctx::GBiIdx(ctxId)));
+
+  int32_t numGBi = (cu.slice->getCheckLDC()) ? 5 : 3; // 5 when the slice's getCheckLDC() is set, otherwise 3
 
+  if(symbol == 0) // first bin 0: a prefix of up to numGBi - 2 further bins follows
+  {
+    uint32_t prefixNumBits = numGBi - 2;
+    uint32_t step = 1;
+
+    unsigned ctxIdGBi = 4; // prefix bins use contexts starting at 4
+    idx = 1;
 
+    for(int ui = 0; ui < prefixNumBits; ++ui)
+    {
+      symbol = (m_BinDecoder.decodeBin(Ctx::GBiIdx(ctxIdGBi)));
+
+      if (symbol == 1) // a 1 bin ends the prefix
+      {
+        break;
+      }
+      ctxIdGBi += step; // each 0 bin advances both the context and idx
+      idx += step;
+    }
+  }
+
+  gbiIdx = (uint8_t)g_GbiParsingOrder[idx]; // translate the parsed position into the GBi index
+  CU::setGbiIdx(cu, gbiIdx);
+
+  DTRACE(g_trace_ctx, D_SYNTAX, "cu_gbi_flag() gbi_idx=%d\n", cu.GBiIdx ? 1 : 0); // the trace records only whether cu.GBiIdx is non-zero
+}
+#endif
 
 void CABACReader::intra_luma_pred_modes( CodingUnit &cu )
 {
@@ -1153,6 +1205,9 @@ void CABACReader::prediction_unit( PredictionUnit& pu, MergeCtx& mrgCtx )
     pu.mv    [REF_PIC_LIST_1] = Mv(0, 0);
     pu.refIdx[REF_PIC_LIST_1] = -1;
     pu.interDir               =  1;
+#if JVET_L0646_GBI
+    pu.cu->GBiIdx = GBI_DEFAULT;
+#endif
   }
 
   PU::spanMotionInfo( pu, mrgCtx );
diff --git a/source/Lib/DecoderLib/CABACReader.h b/source/Lib/DecoderLib/CABACReader.h
index 611b1a7b5332cf03c86c5037153b969d612397ad..fd194650c2275ac547e9cff453b7c7f04d167b64 100644
--- a/source/Lib/DecoderLib/CABACReader.h
+++ b/source/Lib/DecoderLib/CABACReader.h
@@ -80,6 +80,9 @@ public:
   void        pred_mode                 ( CodingUnit&                   cu );
   void        pcm_flag                  ( CodingUnit&                   cu );
   void        cu_pred_data              ( CodingUnit&                   cu );
+#if JVET_L0646_GBI
+  void        cu_gbi_flag               ( CodingUnit&                   cu );
+#endif
   void        intra_luma_pred_modes     ( CodingUnit&                   cu );
   void        intra_chroma_pred_modes   ( CodingUnit&                   cu );
   bool        intra_chroma_lmc_mode     ( PredictionUnit&               pu );
diff --git a/source/Lib/DecoderLib/DecCu.cpp b/source/Lib/DecoderLib/DecCu.cpp
index 458230ba81d26716989eb8dacd7fe25925349d64..8fb1c75ba89f278c1a621440ce9e8fd16f45bade 100644
--- a/source/Lib/DecoderLib/DecCu.cpp
+++ b/source/Lib/DecoderLib/DecCu.cpp
@@ -407,6 +407,11 @@ void DecCu::xDeriveCUMV( CodingUnit &cu )
       CodingStatistics::IncrementStatisticTool( CodingStatisticsClassType{ STATS__TOOL_AFF, pu.Y().width, pu.Y().height } );
     }
 #endif
+
+#if JVET_L0646_GBI
+    uint8_t gbiIdx = GBI_DEFAULT;
+#endif
+
     if( pu.mergeFlag )
     {
       {
@@ -416,7 +421,11 @@ void DecCu::xDeriveCUMV( CodingUnit &cu )
           MvField       affineMvField[2][3];
           unsigned char interDirNeighbours;
           int           numValidMergeCand;
+#if JVET_L0646_GBI
+          PU::getAffineMergeCand( pu, affineMvField, interDirNeighbours, gbiIdx, numValidMergeCand);
+#else
           PU::getAffineMergeCand( pu, affineMvField, interDirNeighbours, numValidMergeCand );
+#endif
           pu.interDir = interDirNeighbours;
           for( int i = 0; i < 2; ++i )
           {
@@ -428,6 +437,9 @@ void DecCu::xDeriveCUMV( CodingUnit &cu )
               pu.mvpNum[i] = 0;
               pu.mvd[i]    = Mv();
               PU::setAllAffineMvField( pu, mvField, RefPicList( i ) );
+#if JVET_L0646_GBI
+              pu.cu->GBiIdx = gbiIdx;
+#endif
             }
           }
           PU::spanMotionInfo( pu, mrgCtx );
@@ -466,6 +478,9 @@ void DecCu::xDeriveCUMV( CodingUnit &cu )
             pu.mv    [REF_PIC_LIST_1] = Mv(0, 0);
             pu.refIdx[REF_PIC_LIST_1] = -1;
             pu.interDir               =  1;
+#if JVET_L0646_GBI
+            pu.cu->GBiIdx = GBI_DEFAULT;
+#endif
           }
 
           PU::spanMotionInfo( pu, mrgCtx );
diff --git a/source/Lib/DecoderLib/DecSlice.cpp b/source/Lib/DecoderLib/DecSlice.cpp
index 806a506bc1b4f62659d92f4e085486ad07c7cb74..3ad162dd6c0d53af6c3f6ef882f28afc768c69fc 100644
--- a/source/Lib/DecoderLib/DecSlice.cpp
+++ b/source/Lib/DecoderLib/DecSlice.cpp
@@ -222,7 +222,13 @@ void DecSlice::decompressSlice( Slice* slice, InputBitstream* bitstream )
     }
 #endif
 
-
+#if JVET_L0646_GBI
+    bool updateGbiCodingOrder = cs.slice->getSliceType() == B_SLICE && ctuTsAddr == startCtuTsAddr;
+    if(updateGbiCodingOrder)
+    {
+      resetGbiCodingOrder(true, cs);
+    }
+#endif
 
     isLastCtuOfSliceSegment = cabacReader.coding_tree_unit( cs, ctuArea, pic->m_prevQP, ctuRsAddr );
 
diff --git a/source/Lib/DecoderLib/VLCReader.cpp b/source/Lib/DecoderLib/VLCReader.cpp
index d6fb31df22c0cc30ce33710a9052b897ee778e5c..2a5117e4604f87bbb6bc884da33fbd54d5556ed6 100644
--- a/source/Lib/DecoderLib/VLCReader.cpp
+++ b/source/Lib/DecoderLib/VLCReader.cpp
@@ -804,6 +804,9 @@ void HLSyntaxReader::parseSPSNext( SPSNext& spsNext, const bool usePCM )
   {
     READ_FLAG( symbol,  "affine_type_flag" );                       spsNext.setUseAffineType          ( symbol != 0 );
   }
+#if JVET_L0646_GBI
+  READ_FLAG( symbol,    "gbi_flag" );                               spsNext.setUseGBi                 ( symbol != 0 );
+#endif
   for( int k = 0; k < SPSNext::NumReservedFlags; k++ )
   {
     READ_FLAG( symbol,  "reserved_flag" );                          if( symbol != 0 ) EXIT("Incompatible version: SPSNext reserved flag not equal to zero (bitstream was probably created with newer software version)" );
diff --git a/source/Lib/EncoderLib/CABACWriter.cpp b/source/Lib/EncoderLib/CABACWriter.cpp
index c46fc745e2604d26a310858acd373faa5256f81e..0b1bc55580cd8ef6258887272f67bfbea4cc70e9 100644
--- a/source/Lib/EncoderLib/CABACWriter.cpp
+++ b/source/Lib/EncoderLib/CABACWriter.cpp
@@ -708,9 +708,56 @@ void CABACWriter::cu_pred_data( const CodingUnit& cu )
 
   imv_mode   ( cu );
 
+#if JVET_L0646_GBI
+  cu_gbi_flag( cu );
+#endif
+
 }
 
+#if JVET_L0646_GBI
+void CABACWriter::cu_gbi_flag(const CodingUnit& cu) // writes the GBi index bins of cu
+{
+  if(!CU::isGBiIdxCoded(cu)) // nothing is written when no index is signalled for this CU
+  {
+    return;
+  }
+
+  CHECK(!(GBI_NUM > 1 && (GBI_NUM == 2 || (GBI_NUM & 0x01) == 1)), " !( GBI_NUM > 1 && ( GBI_NUM == 2 || ( GBI_NUM & 0x01 ) == 1 ) ) "); // GBI_NUM must be 2 or an odd value above 1
+  const uint8_t gbiCodingIdx = (uint8_t)g_GbiCodingOrder[CU::getValidGbiIdx(cu)]; // map the CU's effective index through g_GbiCodingOrder to the value that is binarized
+
+  int ctxId = 0; // context of the first bin
+
+  int32_t numGBi = (cu.slice->getCheckLDC()) ? 5 : 3; // 5 when the slice's getCheckLDC() is set, otherwise 3
+
+  m_BinEncoder.encodeBin((gbiCodingIdx == 0 ? 1 : 0), Ctx::GBiIdx(ctxId)); // first bin: 1 when gbiCodingIdx is 0, else 0
+
+  if(numGBi > 2 && gbiCodingIdx != 0) // non-zero values get a prefix of up to numGBi - 2 bins
+  {
+    uint32_t prefixNumBits = numGBi - 2;
+    uint32_t step = 1;
+    uint8_t prefixSymbol = gbiCodingIdx;
+
+    int ctxIdGBi = 4; // prefix bins use contexts starting at 4
+    uint8_t idx = 1;
+    for(int ui = 0; ui < prefixNumBits; ++ui)
+    {
+      if (prefixSymbol == idx) // value reached: a 1 bin ends the prefix
+      {
+        m_BinEncoder.encodeBin(1, Ctx::GBiIdx(ctxIdGBi));
+        break;
+      }
+      else
+      {
+        m_BinEncoder.encodeBin(0, Ctx::GBiIdx(ctxIdGBi)); // each 0 bin advances both the context and idx
+        ctxIdGBi += step;
+        idx += step;
+      }
+    }
+  }
 
+  DTRACE(g_trace_ctx, D_SYNTAX, "cu_gbi_flag() gbi_idx=%d\n", cu.GBiIdx ? 1 : 0); // the trace records only whether cu.GBiIdx is non-zero
+}
+#endif
 
 void CABACWriter::intra_luma_pred_modes( const CodingUnit& cu )
 {
diff --git a/source/Lib/EncoderLib/CABACWriter.h b/source/Lib/EncoderLib/CABACWriter.h
index ce9b4423c4dc77901215dc9afa80a9cc61d69c0d..b17bfadeef0a8c6034921de77b5838e451e51257 100644
--- a/source/Lib/EncoderLib/CABACWriter.h
+++ b/source/Lib/EncoderLib/CABACWriter.h
@@ -93,6 +93,9 @@ public:
   void        pcm_data                  ( const CodingUnit&             cu );
   void        pcm_flag                  ( const CodingUnit&             cu );
   void        cu_pred_data              ( const CodingUnit&             cu );
+#if JVET_L0646_GBI
+  void        cu_gbi_flag               ( const CodingUnit&             cu );
+#endif
   void        intra_luma_pred_modes     ( const CodingUnit&             cu );
   void        intra_luma_pred_mode      ( const PredictionUnit&         pu );
   void        intra_chroma_pred_modes   ( const CodingUnit&             cu );
diff --git a/source/Lib/EncoderLib/EncCfg.h b/source/Lib/EncoderLib/EncCfg.h
index 871fca519e72a4a6a1eb1d704544cb7a097f89bf..71f50a481222dc17cdb80a41932e80d642a51ef8 100644
--- a/source/Lib/EncoderLib/EncCfg.h
+++ b/source/Lib/EncoderLib/EncCfg.h
@@ -204,6 +204,10 @@ protected:
   bool      m_AltDQPCoding;
 #endif
   bool      m_compositeRefEnabled;        //composite reference
+#if JVET_L0646_GBI
+  bool      m_GBi;
+  bool      m_GBiFast;
+#endif
   // ADD_NEW_TOOL : (encoder lib) add tool enabling flags and associated parameters here
 
   bool      m_useFastLCTU;
@@ -641,7 +645,12 @@ public:
 
   void      setUseCompositeRef              (bool b)         { m_compositeRefEnabled = b; }
   bool      getUseCompositeRef              ()         const { return m_compositeRefEnabled; }
-
+#if JVET_L0646_GBI
+  void      setUseGBi                       ( bool b )       { m_GBi = b; }              // stores the m_GBi flag
+  bool      getUseGBi                       ()         const { return m_GBi; }           // returns the m_GBi flag
+  void      setUseGBiFast                   ( bool b )       { m_GBiFast = b; }          // stores the m_GBiFast flag; takes bool to match the bool member and setUseGBi
+  bool      getUseGBiFast                   ()         const { return m_GBiFast; }       // returns the m_GBiFast flag
+#endif
   // ADD_NEW_TOOL : (encoder lib) add access functions here
 
   void      setMaxCUWidth                   ( uint32_t  u )      { m_maxCUWidth  = u; }
@@ -767,7 +776,7 @@ public:
 #if X0038_LAMBDA_FROM_QP_CAPABILITY
   int       getIntraQPOffset                () const    { return  m_intraQPOffset; }
   int       getLambdaFromQPEnable           () const    { return  m_lambdaFromQPEnable; }
-#if ENABLE_QPA
+#if ENABLE_QPA | JVET_L0646_GBI
 public:
 #else
 protected:
diff --git a/source/Lib/EncoderLib/EncCu.cpp b/source/Lib/EncoderLib/EncCu.cpp
index a78001f3f5777a2bcd780fec484fce72f5387497..926841c1057fb2740c447752eab355c7ab3f2bc3 100644
--- a/source/Lib/EncoderLib/EncCu.cpp
+++ b/source/Lib/EncoderLib/EncCu.cpp
@@ -1641,6 +1641,9 @@ void EncCu::xCheckRDCostAffineMerge2Nx2N( CodingStructure *&tempCS, CodingStruct
   unsigned char interDirNeighbours;
   int           numValidMergeCand;
   bool          hasNoResidual = false;
+#if JVET_L0646_GBI
+  uint8_t       gbiIdx = GBI_DEFAULT;
+#endif
 
 
   tempCS->initStructData( encTestMode.qp, encTestMode.lossless );
@@ -1664,7 +1667,11 @@ void EncCu::xCheckRDCostAffineMerge2Nx2N( CodingStructure *&tempCS, CodingStruct
 
   cu.firstPU->mergeFlag = true;
   cu.firstPU->mergeIdx  = 0;
+#if JVET_L0646_GBI
+  PU::getAffineMergeCand( *cu.firstPU, affineMvField, interDirNeighbours, gbiIdx, numValidMergeCand );
+#else
   PU::getAffineMergeCand( *cu.firstPU, affineMvField, interDirNeighbours, numValidMergeCand );
+#endif
   if( numValidMergeCand == -1 )
   {
     return;
@@ -1673,6 +1680,9 @@ void EncCu::xCheckRDCostAffineMerge2Nx2N( CodingStructure *&tempCS, CodingStruct
   cu.firstPU->interDir = interDirNeighbours;
   PU::setAllAffineMvField( *cu.firstPU, affineMvField[REF_PIC_LIST_0], REF_PIC_LIST_0 );
   PU::setAllAffineMvField( *cu.firstPU, affineMvField[REF_PIC_LIST_1], REF_PIC_LIST_1 );
+#if JVET_L0646_GBI
+  cu.GBiIdx = gbiIdx;
+#endif
 
   PU::spanMotionInfo( *cu.firstPU );
 
@@ -1699,6 +1709,53 @@ void EncCu::xCheckRDCostInter( CodingStructure *&tempCS, CodingStructure *&bestC
 {
   tempCS->initStructData( encTestMode.qp, encTestMode.lossless );
 
+#if JVET_L0646_GBI
+  
+  m_pcInterSearch->setAffineModeSelected(false);
+
+  if( tempCS->slice->getCheckLDC() )
+  {
+    m_bestGbiCost[0] = m_bestGbiCost[1] = std::numeric_limits<double>::max();
+    m_bestGbiIdx[0] = m_bestGbiIdx[1] = -1;
+  }
+
+  m_pcInterSearch->resetBufferedUniMotions();
+  int gbiLoopNum = (tempCS->slice->isInterB() ? GBI_NUM : 1);
+  gbiLoopNum = (tempCS->sps->getSpsNext().getUseGBi() ? gbiLoopNum : 1);
+
+  if( tempCS->area.lwidth() * tempCS->area.lheight() < GBI_SIZE_CONSTRAINT )
+  {
+    gbiLoopNum = 1;
+  }
+
+  double curBestCost = bestCS->cost;
+  double equGBiCost = MAX_DOUBLE;
+
+  for( int gbiLoopIdx = 0; gbiLoopIdx < gbiLoopNum; gbiLoopIdx++ )
+  {
+    if( m_pcEncCfg->getUseGBiFast() )
+    {
+      auto blkCache = dynamic_cast< CacheBlkInfoCtrl* >(m_modeCtrl);
+
+      if( blkCache )
+      {
+        bool isBestInter = blkCache->getInter(bestCS->area);
+        uint8_t bestGBiIdx = blkCache->getGbiIdx(bestCS->area);
+
+        if( isBestInter && g_GbiSearchOrder[gbiLoopIdx] != GBI_DEFAULT && g_GbiSearchOrder[gbiLoopIdx] != bestGBiIdx )
+        {
+          continue;
+        }
+      }
+    }
+    if( !tempCS->slice->getCheckLDC() )
+    {
+      if( gbiLoopIdx != 0 && gbiLoopIdx != 3 && gbiLoopIdx != 4 )
+      {
+        continue;
+      }
+    }
+#endif
 
   CodingUnit &cu      = tempCS->addCU( tempCS->area, partitioner.chType );
 
@@ -1716,18 +1773,70 @@ void EncCu::xCheckRDCostInter( CodingStructure *&tempCS, CodingStructure *&bestC
   cu.qp               = encTestMode.qp;
   CU::addPUs( cu );
 
+#if JVET_L0646_GBI
+  cu.GBiIdx = g_GbiSearchOrder[gbiLoopIdx];
+  uint8_t gbiIdx = cu.GBiIdx;
+  bool  testGbi = (gbiIdx != GBI_DEFAULT);
+#endif
 
   m_pcInterSearch->predInterSearch( cu, partitioner );
 
   const unsigned wIdx = gp_sizeIdxInfo->idxFrom( tempCS->area.lwidth () );
 
+#if JVET_L0646_GBI
+  gbiIdx = CU::getValidGbiIdx(cu);
+  if( testGbi && gbiIdx == GBI_DEFAULT ) // Enabled GBi but the search results is uni.
+  {
+    tempCS->initStructData(encTestMode.qp, encTestMode.lossless);
+    continue;
+  }
+  CHECK(!(testGbi || (!testGbi && gbiIdx == GBI_DEFAULT)), " !( bTestGbi || (!bTestGbi && gbiIdx == GBI_DEFAULT ) )");
+
+  bool isEqualUni = false;
+  if( m_pcEncCfg->getUseGBiFast() )
+  {
+    if( cu.firstPU->interDir != 3 && testGbi == 0 )
+    {
+      isEqualUni = true;
+    }
+  }
+#endif
 
   xEncodeInterResidual( tempCS, bestCS, partitioner, encTestMode, 0
     , m_pImvTempCS ? m_pImvTempCS[wIdx][encTestMode.partSize] : NULL
     , 1
     , 0
+#if JVET_L0646_GBI
+    , &equGBiCost
+#endif
   );
 
+#if JVET_L0646_GBI
+  if( g_GbiSearchOrder[gbiLoopIdx] == GBI_DEFAULT )
+    m_pcInterSearch->setAffineModeSelected((bestCS->cus.front()->affine && !(bestCS->cus.front()->firstPU->mergeFlag)));
+
+  tempCS->initStructData(encTestMode.qp, encTestMode.lossless);
+
+  double skipTH = MAX_DOUBLE;
+  skipTH = (m_pcEncCfg->getUseGBiFast() ? 1.05 : MAX_DOUBLE);
+  if( equGBiCost > curBestCost * skipTH )
+  {
+    break;
+  }
+
+  if( m_pcEncCfg->getUseGBiFast() )
+  {
+    if( isEqualUni == true && m_pcEncCfg->getIntraPeriod() == -1 )
+    {
+      break;
+    }
+  }
+  if( g_GbiSearchOrder[gbiLoopIdx] == GBI_DEFAULT && xIsGBiSkip(cu) && m_pcEncCfg->getUseGBiFast() )
+  {
+    break;
+  }
+ }  // for( UChar gbiLoopIdx = 0; gbiLoopIdx < gbiLoopNum; gbiLoopIdx++ )
+#endif
 }
 
 
@@ -1737,6 +1846,9 @@ void EncCu::xCheckRDCostInter( CodingStructure *&tempCS, CodingStructure *&bestC
 bool EncCu::xCheckRDCostInterIMV( CodingStructure *&tempCS, CodingStructure *&bestCS, Partitioner &partitioner, const EncTestMode& encTestMode )
 {
   int iIMV = int( ( encTestMode.opts & ETO_IMV ) >> ETO_IMV_SHIFT );
+#if JVET_L0646_GBI
+  m_pcInterSearch->setAffineModeSelected(false);
+#endif
   // Only int-Pel, 4-Pel and fast 4-Pel allowed
   CHECK( iIMV != 1 && iIMV != 2 && iIMV != 3, "Unsupported IMV Mode" );
   // Fast 4-Pel Mode
@@ -1762,6 +1874,53 @@ bool EncCu::xCheckRDCostInterIMV( CodingStructure *&tempCS, CodingStructure *&be
     }
   }
 
+#if JVET_L0646_GBI
+  m_pcInterSearch->resetBufferedUniMotions();
+  int gbiLoopNum = (tempCS->slice->isInterB() ? GBI_NUM : 1);
+  gbiLoopNum = (pcCUInfo2Reuse != NULL ? 1 : gbiLoopNum);
+  gbiLoopNum = (tempCS->slice->getSPS()->getSpsNext().getUseGBi() ? gbiLoopNum : 1);
+
+  if( tempCS->area.lwidth() * tempCS->area.lheight() < GBI_SIZE_CONSTRAINT )
+  {
+    gbiLoopNum = 1;
+  }
+
+  double curBestCost = bestCS->cost;
+  double equGBiCost = MAX_DOUBLE;
+
+  for( int gbiLoopIdx = 0; gbiLoopIdx < gbiLoopNum; gbiLoopIdx++ )
+  {
+    if( m_pcEncCfg->getUseGBiFast() )
+    {
+      auto blkCache = dynamic_cast< CacheBlkInfoCtrl* >(m_modeCtrl);
+
+      if( blkCache )
+      {
+        bool isBestInter = blkCache->getInter(bestCS->area);
+        uint8_t bestGBiIdx = blkCache->getGbiIdx(bestCS->area);
+
+        if( isBestInter && g_GbiSearchOrder[gbiLoopIdx] != GBI_DEFAULT && g_GbiSearchOrder[gbiLoopIdx] != bestGBiIdx )
+        {
+          continue;
+        }
+      }
+    }
+
+    if( !tempCS->slice->getCheckLDC() )
+    {
+      if( gbiLoopIdx != 0 && gbiLoopIdx != 3 && gbiLoopIdx != 4 )
+      {
+        continue;
+      }
+    }
+
+    if( m_pcEncCfg->getUseGBiFast() && tempCS->slice->getCheckLDC() && g_GbiSearchOrder[gbiLoopIdx] != GBI_DEFAULT
+      && (m_bestGbiIdx[0] >= 0 && g_GbiSearchOrder[gbiLoopIdx] != m_bestGbiIdx[0])
+      && (m_bestGbiIdx[1] >= 0 && g_GbiSearchOrder[gbiLoopIdx] != m_bestGbiIdx[1]))
+    {
+      continue;
+    }
+#endif
 
   CodingUnit &cu = ( pcCUInfo2Reuse != nullptr ) ? *tempCS->getCU( partitioner.chType ) : tempCS->addCU( tempCS->area, partitioner.chType );
 
@@ -1795,12 +1954,21 @@ bool EncCu::xCheckRDCostInterIMV( CodingStructure *&tempCS, CodingStructure *&be
   cu.imv      = iIMV > 1 ? 2 : 1;
   cu.emtFlag  = false;
 
+#if JVET_L0646_GBI
+  bool testGbi;
+  uint8_t gbiIdx;
+#endif
   
   if( pcCUInfo2Reuse != nullptr )
   {
     // reuse the motion info from pcCUInfo2Reuse
     CU::resetMVDandMV2Int( cu, m_pcInterSearch );
 
+#if JVET_L0646_GBI
+    CHECK(cu.GBiIdx < 0 || cu.GBiIdx >= GBI_NUM, "cu.GBiIdx < 0 || cu.GBiIdx >= GBI_NUM");
+    gbiIdx = CU::getValidGbiIdx(cu);
+    testGbi = (gbiIdx != GBI_DEFAULT);
+#endif
 
     if( !CU::hasSubCUNonZeroMVd( cu ) )
     {
@@ -1814,11 +1982,36 @@ bool EncCu::xCheckRDCostInterIMV( CodingStructure *&tempCS, CodingStructure *&be
   }
   else
   {
+#if JVET_L0646_GBI 
+    cu.GBiIdx = g_GbiSearchOrder[gbiLoopIdx];
+    gbiIdx = cu.GBiIdx;
+    testGbi = (gbiIdx != GBI_DEFAULT);
+#endif
 
     m_pcInterSearch->predInterSearch( cu, partitioner );
 
+#if JVET_L0646_GBI
+    gbiIdx = CU::getValidGbiIdx(cu);
+#endif
   }
 
+#if JVET_L0646_GBI
+  if( testGbi && gbiIdx == GBI_DEFAULT ) // A non-default GBi weight was tested, but the search result is uni-prediction.
+  {
+    tempCS->initStructData(encTestMode.qp, encTestMode.lossless);
+    continue;
+  }
+  CHECK(!(testGbi || (!testGbi && gbiIdx == GBI_DEFAULT)), " !( bTestGbi || (!bTestGbi && gbiIdx == GBI_DEFAULT ) )");
+
+  bool isEqualUni = false;
+  if( m_pcEncCfg->getUseGBiFast() )
+  {
+    if( cu.firstPU->interDir != 3 && testGbi == 0 )
+    {
+      isEqualUni = true;
+    }
+  }
+#endif
 
   if( !CU::hasSubCUNonZeroMVd( cu ) )
   {
@@ -1830,8 +2023,35 @@ bool EncCu::xCheckRDCostInterIMV( CodingStructure *&tempCS, CodingStructure *&be
     , NULL
     , true
     , 0
+#if JVET_L0646_GBI
+    , &equGBiCost
+#endif
   );
 
+#if JVET_L0646_GBI
+  tempCS->initStructData(encTestMode.qp, encTestMode.lossless);
+
+  double skipTH = MAX_DOUBLE;
+  skipTH = (m_pcEncCfg->getUseGBiFast() ? 1.05 : MAX_DOUBLE);
+  if( equGBiCost > curBestCost * skipTH )
+  {
+    break;
+  }
+
+  if( m_pcEncCfg->getUseGBiFast() )
+  {
+    if( isEqualUni == true && m_pcEncCfg->getIntraPeriod() == -1 )
+    {
+      break;
+    }
+  }
+  if( g_GbiSearchOrder[gbiLoopIdx] == GBI_DEFAULT && xIsGBiSkip(cu) && m_pcEncCfg->getUseGBiFast() )
+  {
+    break;
+  }
+ } // for( int gbiLoopIdx = 0; gbiLoopIdx < gbiLoopNum; gbiLoopIdx++ )
+#endif
+
   return true;
 }
 
@@ -1839,6 +2059,9 @@ void EncCu::xEncodeInterResidual( CodingStructure *&tempCS, CodingStructure *&be
   , CodingStructure* imvCS
   , int emtMode
   , bool* bestHasNonResi
+#if JVET_L0646_GBI
+  , double* equGBiCost
+#endif
   )
 {
   if( residualPass == 1 && encTestMode.lossless )
@@ -1904,6 +2127,37 @@ void EncCu::xEncodeInterResidual( CodingStructure *&tempCS, CodingStructure *&be
 
     xCheckDQP( *tempCS, partitioner );
 
+#if JVET_L0646_GBI
+    if( ETM_INTER_ME == encTestMode.type )
+    {
+      if( equGBiCost != NULL )
+      {
+        if( tempCS->cost < (*equGBiCost) && cu->GBiIdx == GBI_DEFAULT )
+        {
+          (*equGBiCost) = tempCS->cost;
+        }
+      }
+      else
+      {
+        CHECK(equGBiCost == NULL, "equGBiCost == NULL");
+      }
+      if( tempCS->slice->getCheckLDC() && !cu->imv && cu->GBiIdx != GBI_DEFAULT && tempCS->cost < m_bestGbiCost[1] )
+      {
+        if( tempCS->cost < m_bestGbiCost[0] )
+        {
+          m_bestGbiCost[1] = m_bestGbiCost[0];
+          m_bestGbiCost[0] = tempCS->cost;
+          m_bestGbiIdx[1] = m_bestGbiIdx[0];
+          m_bestGbiIdx[0] = cu->GBiIdx;
+        }
+        else
+        {
+          m_bestGbiCost[1] = tempCS->cost;
+          m_bestGbiIdx[1] = cu->GBiIdx;
+        }
+      }
+    }
+#endif
 
     double emtFirstPassCost = tempCS->cost;
     if( imvCS && (tempCS->cost < imvCS->cost) )
@@ -2038,4 +2292,5 @@ void EncCu::xReuseCachedResult( CodingStructure *&tempCS, CodingStructure *&best
 
 #endif
 
+
 //! \}
diff --git a/source/Lib/EncoderLib/EncCu.h b/source/Lib/EncoderLib/EncCu.h
index da5ca4d9be280affd153c924895ac9dc33009ce0..d8131f4532a7f5b5e278e8284af1d8cf9275b42c 100644
--- a/source/Lib/EncoderLib/EncCu.h
+++ b/source/Lib/EncoderLib/EncCu.h
@@ -118,7 +118,10 @@ private:
 #if ENABLE_SPLIT_PARALLELISM || ENABLE_WPP_PARALLELISM
   EncLib*               m_pcEncLib;
 #endif
-
+#if JVET_L0646_GBI
+  int                   m_bestGbiIdx[2];
+  double                m_bestGbiCost[2];
+#endif
 #if SHARP_LUMA_DELTA_QP
   void    updateLambda      ( Slice* slice, double dQP );
 #endif
@@ -192,10 +195,23 @@ protected:
     , CodingStructure* imvCS = NULL
     , int emtMode = 1
     , bool* bestHasNonResi = NULL
+#if JVET_L0646_GBI
+    , double* equGBiCost = NULL
+#endif
   );
 #if REUSE_CU_RESULTS
   void xReuseCachedResult     ( CodingStructure *&tempCS, CodingStructure *&bestCS, Partitioner &Partitioner );
 #endif
+
+#if JVET_L0646_GBI
+  bool xIsGBiSkip(const CodingUnit& cu) // Predicate: true when the configured base QP exceeds 32 AND one of the two conditions below holds.
+  {
+    return((m_pcEncCfg->getBaseQP() > 32) && ((cu.slice->getTLayer() >= 4) // (a) the slice's temporal layer is 4 or higher
+       || ((cu.refIdxBi[0] >= 0 && cu.refIdxBi[1] >= 0) // (b) both cu.refIdxBi entries are non-negative ...
+       && (abs(cu.slice->getPOC() - cu.slice->getRefPOC(REF_PIC_LIST_0, cu.refIdxBi[0])) == 1 // ... and the list-0 reference is at POC distance 1 from the current slice
+       ||  abs(cu.slice->getPOC() - cu.slice->getRefPOC(REF_PIC_LIST_1, cu.refIdxBi[1])) == 1)))); // ... or the list-1 reference is at POC distance 1
+  }
+#endif
 };
 
 //! \}
diff --git a/source/Lib/EncoderLib/EncLib.cpp b/source/Lib/EncoderLib/EncLib.cpp
index 3460e0c9f90b54afe65158eaddff8021b9bd8ced..38661820cc96a0e04fc02b0db5244357eca8fc3c 100644
--- a/source/Lib/EncoderLib/EncLib.cpp
+++ b/source/Lib/EncoderLib/EncLib.cpp
@@ -852,7 +852,9 @@ void EncLib::xInitSPS(SPS &sps)
   sps.getSpsNext().setUseIntraEMT           ( m_IntraEMT );
   sps.getSpsNext().setUseInterEMT           ( m_InterEMT );
   sps.getSpsNext().setUseCompositeRef       ( m_compositeRefEnabled );
-
+#if JVET_L0646_GBI
+  sps.getSpsNext().setUseGBi                ( m_GBi );
+#endif
   // ADD_NEW_TOOL : (encoder lib) set tool enabling flags and associated parameters here
 
   int minCUSize = ( /*sps.getSpsNext().getUseQTBT() ? 1 << MIN_CU_LOG2 :*/ sps.getMaxCUWidth() >> sps.getLog2DiffMaxMinCodingBlockSize() );
diff --git a/source/Lib/EncoderLib/EncModeCtrl.cpp b/source/Lib/EncoderLib/EncModeCtrl.cpp
index 2871f8c41a76a533b22ed10e07a4fc7bb9c6f006..db0ef5bf6b7d7b56b9aaafd1edfb1007e80262f6 100644
--- a/source/Lib/EncoderLib/EncModeCtrl.cpp
+++ b/source/Lib/EncoderLib/EncModeCtrl.cpp
@@ -503,6 +503,29 @@ bool CacheBlkInfoCtrl::getMv( const UnitArea& area, const RefPicList refPicList,
   return m_codedCUInfo[idx1][idx2][idx3][idx4]->validMv[refPicList][iRefIdx];
 }
 
+#if JVET_L0646_GBI 
+bool CacheBlkInfoCtrl::getInter(const UnitArea& area) // Returns the isInter flag cached for the luma block of 'area'.
+{
+  unsigned idx1, idx2, idx3, idx4;
+  getAreaIdx(area.Y(), *m_slice_chblk->getPPS()->pcv, idx1, idx2, idx3, idx4); // fills idx1..idx4, the four indices into m_codedCUInfo
+  // NOTE(review): no null check on the addressed entry; assumes it has been allocated for this area - confirm.
+  return m_codedCUInfo[idx1][idx2][idx3][idx4]->isInter;
+}
+void CacheBlkInfoCtrl::setGbiIdx(const UnitArea& area, uint8_t gBiIdx) // Stores 'gBiIdx' in the cache entry for the luma block of 'area'.
+{
+  unsigned idx1, idx2, idx3, idx4;
+  getAreaIdx(area.Y(), *m_slice_chblk->getPPS()->pcv, idx1, idx2, idx3, idx4); // fills idx1..idx4, the four indices into m_codedCUInfo
+  // NOTE(review): no null check on the addressed entry; assumes it has been allocated for this area - confirm.
+  m_codedCUInfo[idx1][idx2][idx3][idx4]->GBiIdx = gBiIdx;
+}
+uint8_t CacheBlkInfoCtrl::getGbiIdx(const UnitArea& area) // Returns the GBiIdx value cached for the luma block of 'area'.
+{
+  unsigned idx1, idx2, idx3, idx4;
+  getAreaIdx(area.Y(), *m_slice_chblk->getPPS()->pcv, idx1, idx2, idx3, idx4); // fills idx1..idx4, the four indices into m_codedCUInfo
+  // NOTE(review): no null check on the addressed entry; assumes it has been allocated for this area - confirm.
+  return m_codedCUInfo[idx1][idx2][idx3][idx4]->GBiIdx;
+}
+#endif
 
 #if REUSE_CU_RESULTS
 static bool isTheSameNbHood( const CodingUnit &cu, const Partitioner &partitioner )
@@ -1482,6 +1505,9 @@ bool EncModeCtrlMTnoRQT::tryMode( const EncTestMode& encTestmode, const CodingSt
           relatedCU.isSkip   |= bestCU->skip;
 #else
           relatedCU.isSkip    = bestCU->skip;
+#endif
+#if JVET_L0646_GBI
+          relatedCU.GBiIdx    = bestCU->GBiIdx;
 #endif
         }
         else if( CU::isIntra( *bestCU ) )
diff --git a/source/Lib/EncoderLib/EncModeCtrl.h b/source/Lib/EncoderLib/EncModeCtrl.h
index 7209db6e34492a8aa9c97f5987d7785dc8f00344..987c6a6750dc5ff2fb66ab023c83c529c5324bdd 100644
--- a/source/Lib/EncoderLib/EncModeCtrl.h
+++ b/source/Lib/EncoderLib/EncModeCtrl.h
@@ -321,6 +321,9 @@ struct CodedCUInfo
   bool validMv[NUM_REF_PIC_LIST_01][MAX_STORED_CU_INFO_REFS];
   Mv   saveMv [NUM_REF_PIC_LIST_01][MAX_STORED_CU_INFO_REFS];
 
+#if JVET_L0646_GBI   
+  uint8_t GBiIdx;
+#endif
 
 #if ENABLE_SPLIT_PARALLELISM
 
@@ -369,6 +372,11 @@ public:
   bool getMv  ( const UnitArea& area, const RefPicList refPicList, const int iRefIdx,       Mv& rMv ) const;
   void setMv  ( const UnitArea& area, const RefPicList refPicList, const int iRefIdx, const Mv& rMv );
 
+#if JVET_L0646_GBI 
+  bool  getInter( const UnitArea& area );
+  void  setGbiIdx( const UnitArea& area, uint8_t gBiIdx );
+  uint8_t getGbiIdx( const UnitArea& area );
+#endif
 };
 
 #if REUSE_CU_RESULTS
diff --git a/source/Lib/EncoderLib/EncSlice.cpp b/source/Lib/EncoderLib/EncSlice.cpp
index dd02a24b0caba44506d5d2c488319e37f9e9e3ff..c9791afa2acfb7ad0632ce30a6cf5a2896ec3257 100644
--- a/source/Lib/EncoderLib/EncSlice.cpp
+++ b/source/Lib/EncoderLib/EncSlice.cpp
@@ -1615,7 +1615,14 @@ void EncSlice::encodeCtus( Picture* pcPic, const bool bCompressEntireSlice, cons
     }
 #endif
 
-
+#if JVET_L0646_GBI
+    bool updateGbiCodingOrder = cs.slice->getSliceType() == B_SLICE && ctuTsAddr == startCtuTsAddr;
+    if( updateGbiCodingOrder )
+    {
+      resetGbiCodingOrder(false, cs);
+      m_pcInterSearch->initWeightIdxBits();
+    }
+#endif
 
 #if ENABLE_WPP_PARALLELISM
     pEncLib->getCuEncoder( dataId )->compressCtu( cs, ctuArea, ctuRsAddr, prevQP, currQP );
diff --git a/source/Lib/EncoderLib/InterSearch.cpp b/source/Lib/EncoderLib/InterSearch.cpp
index bfd32f0cb2b4dbf7671eebb0b8c4bdc13801aed8..2bff5100b06f4d8e3edc7999211bed1c84235e5a 100644
--- a/source/Lib/EncoderLib/InterSearch.cpp
+++ b/source/Lib/EncoderLib/InterSearch.cpp
@@ -727,6 +727,12 @@ void InterSearch::xMergeEstimation( PredictionUnit& pu, PelUnitBuf& origBuf, int
       uiMergeIdx = uiMergeCand;
     }
   }
+#if JVET_L0646_GBI
+  if( pu.cu->GBiIdx != GBI_DEFAULT )
+  {
+    pu.cu->GBiIdx = GBI_DEFAULT; // Reset to default for the remaining modes.
+  }
+#endif
 
 }
 
@@ -772,7 +778,10 @@ void InterSearch::predInterSearch(CodingUnit& cu, Partitioner& partitioner)
   int          bestBiPMvpL1    = 0;
   Distortion   biPDistTemp     = std::numeric_limits<Distortion>::max();
 
-
+#if JVET_L0646_GBI
+  uint8_t      gbiIdx          = (cu.cs->slice->isInterB() ? cu.GBiIdx : GBI_DEFAULT);
+  bool         enforceGBiPred = false;
+#endif
   MergeCtx     mergeCtx;
 
   // Loop over Prediction Units
@@ -875,6 +884,14 @@ void InterSearch::predInterSearch(CodingUnit& cu, Partitioner& partitioner)
           {
             xMotionEstimation( pu, origBuf, eRefPicList, cMvPred[iRefList][iRefIdxTemp], iRefIdxTemp, cMvTemp[iRefList][iRefIdxTemp], aaiMvpIdx[iRefList][iRefIdxTemp], uiBitsTemp, uiCostTemp, amvp[eRefPicList] );
           }
+#if JVET_L0646_GBI
+          if( cu.cs->sps->getSpsNext().getUseGBi() && cu.GBiIdx == GBI_DEFAULT && cu.cs->slice->isInterB() )
+          {
+            const bool checkIdentical = true;
+            m_cUniMotions.setReadMode(checkIdentical, (uint32_t)iRefList, (uint32_t)iRefIdxTemp);
+            m_cUniMotions.copyFrom(cMvTemp[iRefList][iRefIdxTemp], uiCostTemp - m_pcRdCost->getCost(uiBitsTemp), (uint32_t)iRefList, (uint32_t)iRefIdxTemp);
+          }
+#endif
           xCopyAMVPInfo( &amvp[eRefPicList], &aacAMVPInfo[iRefList][iRefIdxTemp]); // must always be done ( also when AMVP_MODE = AM_NONE )
           xCheckBestMVP( eRefPicList, cMvTemp[iRefList][iRefIdxTemp], cMvPred[iRefList][iRefIdxTemp], aaiMvpIdx[iRefList][iRefIdxTemp], amvp[eRefPicList], uiBitsTemp, uiCostTemp, pu.cu->imv );
 
@@ -916,7 +933,11 @@ void InterSearch::predInterSearch(CodingUnit& cu, Partitioner& partitioner)
         ::memcpy( cMvHevcTemp, cMvTemp, sizeof( cMvTemp ) );
       }
       //  Bi-predictive Motion estimation
-      if( ( cs.slice->isInterB() ) && ( PU::isBipredRestriction( pu ) == false ) )
+      if( ( cs.slice->isInterB() ) && ( PU::isBipredRestriction( pu ) == false ) 
+#if JVET_L0646_GBI
+        && (cu.slice->getCheckLDC() || gbiIdx == GBI_DEFAULT || !m_affineModeSelected || !m_pcEncCfg->getUseGBiFast())
+#endif 
+        )
       {
         cMvBi[0] = cMv[0];
         cMvBi[1] = cMv[1];
@@ -981,6 +1002,9 @@ void InterSearch::predInterSearch(CodingUnit& cu, Partitioner& partitioner)
           iNumIter = 1;
         }
 
+#if JVET_L0646_GBI
+        enforceGBiPred = (gbiIdx != GBI_DEFAULT);
+#endif
         for ( int iIter = 0; iIter < iNumIter; iIter++ )
         {
           int         iRefList    = iIter % 2;
@@ -995,6 +1019,12 @@ void InterSearch::predInterSearch(CodingUnit& cu, Partitioner& partitioner)
             {
               iRefList = 0;
             }
+#if JVET_L0646_GBI
+            if( gbiIdx != GBI_DEFAULT )
+            {
+              iRefList = ( abs( getGbiWeight(gbiIdx, REF_PIC_LIST_0 ) ) > abs( getGbiWeight(gbiIdx, REF_PIC_LIST_1 ) ) ? 1 : 0 );
+            }
+#endif
           }
           else if ( iIter == 0 )
           {
@@ -1025,9 +1055,20 @@ void InterSearch::predInterSearch(CodingUnit& cu, Partitioner& partitioner)
 
           iRefStart = 0;
           iRefEnd   = cs.slice->getNumRefIdx(eRefPicList)-1;
-          for ( int iRefIdxTemp = iRefStart; iRefIdxTemp <= iRefEnd; iRefIdxTemp++ )
+          for( int iRefIdxTemp = iRefStart; iRefIdxTemp <= iRefEnd; iRefIdxTemp++ )
           {
+#if JVET_L0646_GBI
+            if( m_pcEncCfg->getUseGBiFast() && (gbiIdx != GBI_DEFAULT)
+              && (pu.cu->slice->getRefPic(eRefPicList, iRefIdxTemp)->getPOC() == pu.cu->slice->getRefPic(RefPicList(1 - iRefList), pu.refIdx[1 - iRefList])->getPOC())
+              && (!pu.cu->imv && pu.cu->slice->getTLayer()>1))
+            {
+              continue;
+            }
+#endif
             uiBitsTemp = uiMbBits[2] + uiMotBits[1-iRefList];
+#if JVET_L0646_GBI
+            uiBitsTemp += ((cs.slice->getSPS()->getSpsNext().getUseGBi() == true) ? getWeightIdxBits(gbiIdx) : 0);
+#endif
             if ( cs.slice->getNumRefIdx(eRefPicList) > 1 )
             {
               uiBitsTemp += iRefIdxTemp+1;
@@ -1051,6 +1092,9 @@ void InterSearch::predInterSearch(CodingUnit& cu, Partitioner& partitioner)
 
               uiCostBi            = uiCostTemp;
               uiMotBits[iRefList] = uiBitsTemp - uiMbBits[2] - uiMotBits[1-iRefList];
+#if JVET_L0646_GBI
+              uiMotBits[iRefList] -= ((cs.slice->getSPS()->getSpsNext().getUseGBi() == true) ? getWeightIdxBits(gbiIdx) : 0);
+#endif
               uiBits[2]           = uiBitsTemp;
 
               if(iNumIter!=1)
@@ -1071,7 +1115,11 @@ void InterSearch::predInterSearch(CodingUnit& cu, Partitioner& partitioner)
 
           if ( !bChanged )
           {
+#if JVET_L0646_GBI
+            if ((uiCostBi <= uiCost[0] && uiCostBi <= uiCost[1]) || enforceGBiPred)
+#else
             if ( uiCostBi <= uiCost[0] && uiCostBi <= uiCost[1] )
+#endif
             {
               xCopyAMVPInfo(&aacAMVPInfo[0][iRefIdxBi[0]], &amvp[REF_PIC_LIST_0]);
               xCheckBestMVP( REF_PIC_LIST_0, cMvBi[0], cMvPredBi[0][iRefIdxBi[0]], aaiMvpIdxBi[0][iRefIdxBi[0]], amvp[eRefPicList], uiBits[2], uiCostBi, pu.cu->imv);
@@ -1084,6 +1132,10 @@ void InterSearch::predInterSearch(CodingUnit& cu, Partitioner& partitioner)
             break;
           }
         } // for loop-iter
+#if JVET_L0646_GBI
+        cu.refIdxBi[0] = iRefIdxBi[0];
+        cu.refIdxBi[1] = iRefIdxBi[1];
+#endif
       } // if (B_SLICE)
 
 
@@ -1110,6 +1162,12 @@ void InterSearch::predInterSearch(CodingUnit& cu, Partitioner& partitioner)
     uiBits [1] = bitsValidList1;
     uiCost [1] = costValidList1;
 
+#if JVET_L0646_GBI
+    if( enforceGBiPred )
+    {
+      uiCost[0] = uiCost[1] = MAX_UINT;
+    }
+#endif
 
       uiLastModeTemp = uiLastMode;
       if ( uiCostBi <= uiCost[0] && uiCostBi <= uiCost[1])
@@ -1168,6 +1226,12 @@ void InterSearch::predInterSearch(CodingUnit& cu, Partitioner& partitioner)
         uiMEBits = uiBits[1];
       }
 
+#if JVET_L0646_GBI 
+      if( gbiIdx != GBI_DEFAULT )
+      {
+        cu.GBiIdx = GBI_DEFAULT; // Reset to default for the Non-NormalMC modes.
+      }
+#endif
 
     if ( cu.partSize != SIZE_2Nx2N )
     {
@@ -1201,7 +1265,11 @@ void InterSearch::predInterSearch(CodingUnit& cu, Partitioner& partitioner)
       uiHevcCost = ( uiCostBi <= uiCost[0] && uiCostBi <= uiCost[1] ) ? uiCostBi : ( ( uiCost[0] <= uiCost[1] ) ? uiCost[0] : uiCost[1] );
     }
     CHECK( !( !cu.cs->pcv->only2Nx2N || cu.partSize == SIZE_2Nx2N ), "Unexpected part size for QTBT." );
-    if (cu.Y().width > 8 && cu.Y().height > 8 && cu.partSize == SIZE_2Nx2N && cu.slice->getSPS()->getSpsNext().getUseAffine() && cu.imv == 0)
+    if (cu.Y().width > 8 && cu.Y().height > 8 && cu.partSize == SIZE_2Nx2N && cu.slice->getSPS()->getSpsNext().getUseAffine() && cu.imv == 0
+#if JVET_L0646_GBI
+      && (gbiIdx == GBI_DEFAULT || m_affineModeSelected || !m_pcEncCfg->getUseGBiFast())
+#endif      
+      )
     {
       // save normal hevc result
       uint32_t uiMRGIndex = pu.mergeIdx;
@@ -1226,7 +1294,12 @@ void InterSearch::predInterSearch(CodingUnit& cu, Partitioner& partitioner)
       Mv acMvAffine4Para[2][33][3];
       int refIdx4Para[2] = { -1, -1 };
 
+#if JVET_L0646_GBI
+      xPredAffineInterSearch(pu, origBuf, puIdx, uiLastModeTemp, uiAffineCost, cMvHevcTemp, acMvAffine4Para, refIdx4Para, gbiIdx, enforceGBiPred,
+        ((cu.slice->getSPS()->getSpsNext().getUseGBi() == true) ? getWeightIdxBits(gbiIdx) : 0));
+#else
       xPredAffineInterSearch(pu, origBuf, puIdx, uiLastModeTemp, uiAffineCost, cMvHevcTemp, acMvAffine4Para, refIdx4Para);
+#endif
       if ( cu.slice->getSPS()->getSpsNext().getUseAffineType() )
       {
         if ( uiAffineCost < uiHevcCost * 1.05 ) ///< condition for 6 parameter affine ME
@@ -1261,7 +1334,12 @@ void InterSearch::predInterSearch(CodingUnit& cu, Partitioner& partitioner)
 
           Distortion uiAffine6Cost = std::numeric_limits<Distortion>::max();
           cu.affineType = AFFINEMODEL_6PARAM;
+#if JVET_L0646_GBI
+          xPredAffineInterSearch(pu, origBuf, puIdx, uiLastModeTemp, uiAffine6Cost, cMvHevcTemp, acMvAffine4Para, refIdx4Para, gbiIdx, enforceGBiPred,
+            ((cu.slice->getSPS()->getSpsNext().getUseGBi() == true) ? getWeightIdxBits(gbiIdx) : 0));
+#else
           xPredAffineInterSearch(pu, origBuf, puIdx, uiLastModeTemp, uiAffine6Cost, cMvHevcTemp, acMvAffine4Para, refIdx4Para);
+#endif
 
           // reset to 4 parameter affine inter mode
           if ( uiAffineCost <= uiAffine6Cost )
@@ -1326,7 +1404,15 @@ void InterSearch::predInterSearch(CodingUnit& cu, Partitioner& partitioner)
       }
     }
 
-
+#if JVET_L0646_GBI
+    if( cu.firstPU->interDir == 3 && !cu.firstPU->mergeFlag )
+    {
+      if (gbiIdx != GBI_DEFAULT)
+      {
+        cu.GBiIdx = gbiIdx;
+      }
+    }
+#endif
     m_maxCompIDToPred = MAX_NUM_COMPONENT;
 
     {
@@ -1573,6 +1659,12 @@ Distortion InterSearch::xGetAffineTemplateCost( PredictionUnit& pu, PelUnitBuf&
 
 void InterSearch::xMotionEstimation(PredictionUnit& pu, PelUnitBuf& origBuf, RefPicList eRefPicList, Mv& rcMvPred, int iRefIdxPred, Mv& rcMv, int& riMVPIdx, uint32_t& ruiBits, Distortion& ruiCost, const AMVPInfo& amvpInfo, bool bBi)
 {
+#if JVET_L0646_GBI
+  if( pu.cu->cs->sps->getSpsNext().getUseGBi() && pu.cu->GBiIdx != GBI_DEFAULT && !bBi && xReadBufferedUniMv(pu, eRefPicList, iRefIdxPred, rcMvPred, rcMv, ruiBits, ruiCost) )
+  {
+    return;
+  }
+#endif
 
   Mv cMvHalf, cMvQter;
 
@@ -1591,10 +1683,17 @@ void InterSearch::xMotionEstimation(PredictionUnit& pu, PelUnitBuf& origBuf, Ref
     PelUnitBuf otherBuf = m_tmpPredStorage[1 - (int)eRefPicList].getBuf( UnitAreaRelative(*pu.cu, pu ));
     origBufTmp.copyFrom(origBuf);
     origBufTmp.removeHighFreq( otherBuf, m_pcEncCfg->getClipForBiPredMeEnabled(), pu.cu->slice->clpRngs()
+#if JVET_L0646_GBI
+                              ,getGbiWeight( pu.cu->GBiIdx, eRefPicList )
+#endif 
                               );
     pBuf = &origBufTmp;
 
+#if JVET_L0646_GBI
+    fWeight = xGetMEDistortionWeight( pu.cu->GBiIdx, eRefPicList );
+#else
     fWeight = 0.5;
+#endif
   }
   m_cDistParam.isBiPred = bBi;
 
@@ -2457,6 +2556,11 @@ void InterSearch::xPredAffineInterSearch( PredictionUnit&       pu,
                                           Mv                    hevcMv[2][33]
                                         , Mv                    mvAffine4Para[2][33][3]
                                         , int                   refIdx4Para[2]
+#if JVET_L0646_GBI 
+                                        , uint8_t               gbiIdx
+                                        , bool                  enforceGBiPred
+                                        , uint32_t              gbiIdxBits
+#endif
                                          )
 {
   const Slice &slice = *pu.cu->slice;
@@ -2522,6 +2626,12 @@ void InterSearch::xPredAffineInterSearch( PredictionUnit&       pu,
   pu.cu->affine = true;
   pu.mergeFlag = false;
 
+#if JVET_L0646_GBI
+  if( gbiIdx != GBI_DEFAULT )
+  {
+    pu.cu->GBiIdx = gbiIdx;
+  }
+#endif
 
   // Uni-directional prediction
   for ( int iRefList = 0; iRefList < iNumPredDir; iRefList++ )
@@ -2665,6 +2775,14 @@ void InterSearch::xPredAffineInterSearch( PredictionUnit&       pu,
       {
         xAffineMotionEstimation( pu, origBuf, eRefPicList, cMvPred[iRefList][iRefIdxTemp], iRefIdxTemp, cMvTemp[iRefList][iRefIdxTemp], uiBitsTemp, uiCostTemp );
       }
+#if JVET_L0646_GBI
+      if(pu.cu->cs->sps->getSpsNext().getUseGBi() && pu.cu->GBiIdx == GBI_DEFAULT && pu.cu->slice->isInterB())
+      {
+        m_cUniMotions.setReadModeAffine(true, (uint8_t)iRefList, (uint8_t)iRefIdxTemp, pu.cu->affineType);
+        m_cUniMotions.copyAffineMvFrom(cMvTemp[iRefList][iRefIdxTemp], uiCostTemp - m_pcRdCost->getCost(uiBitsTemp), (uint8_t)iRefList, (uint8_t)iRefIdxTemp, pu.cu->affineType
+        );
+      }
+#endif
       // Set best AMVP Index
       xCopyAffineAMVPInfo( affiAMVPInfoTemp[eRefPicList], aacAffineAMVPInfo[iRefList][iRefIdxTemp] );
       xCheckBestAffineMVP( pu, affiAMVPInfoTemp[eRefPicList], eRefPicList, cMvTemp[iRefList][iRefIdxTemp], cMvPred[iRefList][iRefIdxTemp], aaiMvpIdx[iRefList][iRefIdxTemp], uiBitsTemp, uiCostTemp );
@@ -2785,6 +2903,12 @@ void InterSearch::xPredAffineInterSearch( PredictionUnit&       pu,
         {
           iRefList = 0;
         }
+#if JVET_L0646_GBI
+        if( gbiIdx != GBI_DEFAULT )
+        {
+          iRefList = ( abs( getGbiWeight( gbiIdx, REF_PIC_LIST_0 ) ) > abs( getGbiWeight( gbiIdx, REF_PIC_LIST_1 ) ) ? 1 : 0 );
+        }
+#endif
       }
       else if ( iIter == 0 )
       {
@@ -2823,9 +2947,19 @@ void InterSearch::xPredAffineInterSearch( PredictionUnit&       pu,
         {
           continue;
         }
-
+#if JVET_L0646_GBI
+        if(m_pcEncCfg->getUseGBiFast() && (gbiIdx != GBI_DEFAULT)
+          && (pu.cu->slice->getRefPic(eRefPicList, iRefIdxTemp)->getPOC() == pu.cu->slice->getRefPic(RefPicList(1 - iRefList), pu.refIdx[1 - iRefList])->getPOC())
+          && (pu.cu->affineType == AFFINEMODEL_4PARAM && pu.cu->slice->getTLayer()>1))
+        {
+          continue;
+        }
+#endif
         // update bits
         uiBitsTemp = uiMbBits[2] + uiMotBits[1-iRefList];
+#if JVET_L0646_GBI 
+        uiBitsTemp += ((pu.cu->slice->getSPS()->getSpsNext().getUseGBi() == true) ? gbiIdxBits : 0);
+#endif
         if( slice.getNumRefIdx(eRefPicList) > 1 )
         {
           uiBitsTemp += iRefIdxTemp+1;
@@ -2849,6 +2983,9 @@ void InterSearch::xPredAffineInterSearch( PredictionUnit&       pu,
 
           uiCostBi            = uiCostTemp;
           uiMotBits[iRefList] = uiBitsTemp - uiMbBits[2] - uiMotBits[1-iRefList];
+#if JVET_L0646_GBI 
+          uiMotBits[iRefList] -= ((pu.cu->slice->getSPS()->getSpsNext().getUseGBi() == true) ? gbiIdxBits : 0);
+#endif
           uiBits[2]           = uiBitsTemp;
 
           if ( iNumIter != 1 ) // MC for next iter
@@ -2868,7 +3005,11 @@ void InterSearch::xPredAffineInterSearch( PredictionUnit&       pu,
 
       if ( !bChanged )
       {
+#if JVET_L0646_GBI 
+        if ((uiCostBi <= uiCost[0] && uiCostBi <= uiCost[1]) || enforceGBiPred)
+#else
         if ( uiCostBi <= uiCost[0] && uiCostBi <= uiCost[1] )
+#endif
         {
           xCopyAffineAMVPInfo( aacAffineAMVPInfo[0][iRefIdxBi[0]], affiAMVPInfoTemp[REF_PIC_LIST_0] );
           xCheckBestAffineMVP( pu, affiAMVPInfoTemp[REF_PIC_LIST_0], REF_PIC_LIST_0, cMvBi[0], cMvPredBi[0][iRefIdxBi[0]], aaiMvpIdxBi[0][iRefIdxBi[0]], uiBits[2], uiCostBi );
@@ -2907,6 +3048,12 @@ void InterSearch::xPredAffineInterSearch( PredictionUnit&       pu,
   uiBits[1]  = bitsValidList1;
   uiCost[1]  = costValidList1;
 
+#if JVET_L0646_GBI 
+  if( enforceGBiPred )
+  {
+    uiCost[0] = uiCost[1] = MAX_UINT;
+  }
+#endif
 
   // Affine ME result set
   if ( uiCostBi <= uiCost[0] && uiCostBi <= uiCost[1] ) // Bi
@@ -2995,6 +3142,12 @@ void InterSearch::xPredAffineInterSearch( PredictionUnit&       pu,
     pu.mvpIdx[REF_PIC_LIST_1] = aaiMvpIdx[1][iRefIdx[1]];
     pu.mvpNum[REF_PIC_LIST_1] = aaiMvpNum[1][iRefIdx[1]];
   }
+#if JVET_L0646_GBI
+  if( gbiIdx != GBI_DEFAULT )
+  {
+    pu.cu->GBiIdx = GBI_DEFAULT;
+  }
+#endif
 }
 
 void solveEqual( double** dEqualCoeff, int iOrder, double* dAffinePara )
@@ -3161,6 +3314,12 @@ void InterSearch::xAffineMotionEstimation( PredictionUnit& pu,
                                            Distortion&     ruiCost,
                                            bool            bBi )
 {
+#if JVET_L0646_GBI
+  if( pu.cu->cs->sps->getSpsNext().getUseGBi() && pu.cu->GBiIdx != GBI_DEFAULT && !bBi && xReadBufferedAffineUniMv(pu, eRefPicList, iRefIdxPred, acMvPred, acMv, ruiBits, ruiCost) )
+  {
+    return;
+  }
+#endif
 
   const int width  = pu.Y().width;
   const int height = pu.Y().height;
@@ -3180,10 +3339,17 @@ void InterSearch::xAffineMotionEstimation( PredictionUnit& pu,
     PelUnitBuf otherBuf = m_tmpPredStorage[1 - (int)eRefPicList].getBuf( UnitAreaRelative( *pu.cu, pu ) );
     origBufTmp.copyFrom(origBuf);
     origBufTmp.removeHighFreq(otherBuf, m_pcEncCfg->getClipForBiPredMeEnabled(), pu.cu->slice->clpRngs()
+#if JVET_L0646_GBI
+                             ,getGbiWeight(pu.cu->GBiIdx, eRefPicList)
+#endif 
                              );
     pBuf = &origBufTmp;
 
+#if JVET_L0646_GBI
+    fWeight = xGetMEDistortionWeight( pu.cu->GBiIdx, eRefPicList );
+#else
     fWeight = 0.5;
+#endif
   }
 
   // pred YUV
@@ -4547,3 +4713,69 @@ uint64_t InterSearch::xGetSymbolFracBitsInter(CodingStructure &cs, Partitioner &
   return fracBits;
 }
 
+#if JVET_L0646_GBI
+double InterSearch::xGetMEDistortionWeight(uint8_t gbiIdx, RefPicList eRefPicList) // Weight used as fWeight by the bi-prediction branch of the (affine) motion estimation.
+{
+  if( gbiIdx != GBI_DEFAULT )
+  {
+    return fabs((double)getGbiWeight(gbiIdx, eRefPicList) / (double)g_GbiWeightBase); // magnitude of this list's GBi weight as a fraction of the weight base
+  }
+  else
+  {
+    return 0.5; // default weight index: same constant the non-GBi build uses
+  }
+}
+bool InterSearch::xReadBufferedUniMv(PredictionUnit& pu, RefPicList eRefPicList, int32_t iRefIdx, Mv& pcMvPred, Mv& rcMv, uint32_t& ruiBits, Distortion& ruiCost) // Reuses a buffered uni-prediction ME result; returns false if none is readable.
+{
+  if (m_cUniMotions.isReadMode((uint32_t)eRefPicList, (uint32_t)iRefIdx)) // a result for this (list, refIdx) is marked readable
+  {
+    m_cUniMotions.copyTo(rcMv, ruiCost, (uint32_t)eRefPicList, (uint32_t)iRefIdx); // outputs: buffered MV and buffered cost
+    // The MV rate is recomputed below against the predictor supplied by this call.
+    m_pcRdCost->setPredictor(pcMvPred);
+    m_pcRdCost->setCostScale(0);
+    // imvShift passes the CU's imv mode (doubled) as the shift argument of the MV bit estimate.
+    unsigned imvShift = pu.cu->imv << 1;
+    uint32_t uiMvBits = m_pcRdCost->getBitsOfVectorWithPredictor(rcMv.getHor(), rcMv.getVer(), imvShift);
+    // ruiBits is in/out: the caller's incoming bit count is extended by the MV bits, then the cost of the total is added.
+    ruiBits += uiMvBits;
+    ruiCost += m_pcRdCost->getCost(ruiBits); // predInterSearch buffers the cost with its bit-cost term subtracted, so it is re-added here
+    return true;
+  }
+  return false; // nothing buffered: the caller continues with the regular motion search
+}
+bool InterSearch::xReadBufferedAffineUniMv(PredictionUnit& pu, RefPicList eRefPicList, int32_t iRefIdx, Mv acMvPred[3], Mv acMv[3], uint32_t& ruiBits, Distortion& ruiCost) // Affine counterpart: reuses buffered control-point MVs; returns false if none is readable.
+{
+  if (m_cUniMotions.isReadModeAffine((uint32_t)eRefPicList, (uint32_t)iRefIdx, pu.cu->affineType)) // readable entry for this (list, refIdx, affine type)
+  {
+    m_cUniMotions.copyAffineMvTo(acMv, ruiCost, (uint32_t)eRefPicList, (uint32_t)iRefIdx, pu.cu->affineType); // outputs: buffered control-point MVs and buffered cost
+    m_pcRdCost->setCostScale(0);
+    // Recompute the MV rate of every control point against the predictors supplied by this call.
+    uint32_t uiMvBits = 0;
+    for (int iVerIdx = 0; iVerIdx<(pu.cu->affineType ? 3 : 2); iVerIdx++) // 3 control points when affineType is non-zero, otherwise 2
+    {
+      if (iVerIdx)
+      {
+        m_pcRdCost->setPredictor(acMvPred[iVerIdx] + acMv[0] - acMvPred[0]); // later control points: predictor offset by the first control point's MV difference
+      }
+      else
+      {
+        m_pcRdCost->setPredictor(acMvPred[iVerIdx]); // first control point: predictor used as given
+      }
+      const int shift = VCEG_AZ07_MV_ADD_PRECISION_BIT_FOR_STORE; // MV components are right-shifted by this amount before the bit estimate
+      uiMvBits += m_pcRdCost->getBitsOfVectorWithPredictor(acMv[iVerIdx].getHor() >> shift, acMv[iVerIdx].getVer() >> shift, 0);
+    }
+    ruiBits += uiMvBits; // ruiBits is in/out: incoming bits plus the control-point MV bits
+    ruiCost += m_pcRdCost->getCost(ruiBits); // xPredAffineInterSearch buffers the cost with its bit-cost term subtracted, so it is re-added here
+    return true;
+  }
+  return false; // nothing buffered: the caller continues with the regular affine motion search
+}
+void InterSearch::initWeightIdxBits() // Fills the per-index bit-estimate table that getWeightIdxBits() reads.
+{
+  for (int n = 0; n < GBI_NUM; ++n) // one entry per GBi weight index
+  {
+    m_auiEstWeightIdxBits[n] = deriveWeightIdxBits(n); // cache the value so the search does not call deriveWeightIdxBits each time
+  }
+}
+#endif
+
diff --git a/source/Lib/EncoderLib/InterSearch.h b/source/Lib/EncoderLib/InterSearch.h
index bcb4c92613cf90b6f40310669717e379f5510e8a..c4689cca5f16cec59d86c53a2985a2174cd6d6cc 100644
--- a/source/Lib/EncoderLib/InterSearch.h
+++ b/source/Lib/EncoderLib/InterSearch.h
@@ -81,6 +81,11 @@ private:
   CodingStructure **m_pSaveCS;
 
   ClpRng          m_lumaClpRng;
+#if JVET_L0646_GBI 
+  uint32_t        m_auiEstWeightIdxBits[GBI_NUM];
+  GBiMotionParam  m_cUniMotions;
+  bool            m_affineModeSelected;
+#endif
 
 protected:
   // interface to option
@@ -133,7 +138,9 @@ public:
 #if ENABLE_SPLIT_PARALLELISM
   void copyState                    ( const InterSearch& other );
 #endif
-
+#if JVET_L0646_GBI
+  void setAffineModeSelected        ( bool flag) { m_affineModeSelected = flag; } // stores the flag that predInterSearch consults in its GBi conditions
+#endif
 protected:
 
   /// sub-function for motion vector refinement used in fractional-pel accuracy
@@ -316,6 +323,11 @@ protected:
                                     Mv                    hevcMv[2][33]
                                   , Mv                    mvAffine4Para[2][33][3]
                                   , int                   refIdx4Para[2]
+#if JVET_L0646_GBI 
+                                  , uint8_t               gbiIdx = GBI_DEFAULT
+                                  , bool                  enforceGBiPred = false
+                                  , uint32_t              gbiIdxBits = 0
+#endif
                                   );
 
   void xAffineMotionEstimation    ( PredictionUnit& pu,
@@ -343,7 +355,16 @@ protected:
   void xCopyAffineAMVPInfo        ( AffineAMVPInfo& src, AffineAMVPInfo& dst );
   void xCheckBestAffineMVP        ( PredictionUnit &pu, AffineAMVPInfo &affineAMVPInfo, RefPicList eRefPicList, Mv acMv[3], Mv acMvPred[3], int& riMVPIdx, uint32_t& ruiBits, Distortion& ruiCost );
 
-
+#if JVET_L0646_GBI 
+  bool xReadBufferedAffineUniMv   ( PredictionUnit& pu, RefPicList eRefPicList, int32_t iRefIdx, Mv acMvPred[3], Mv acMv[3], uint32_t& ruiBits, Distortion& ruiCost);  ///< true when a buffered affine result was used; ruiBits / ruiCost are then updated
+  double xGetMEDistortionWeight   ( uint8_t gbiIdx, RefPicList eRefPicList);
+  bool xReadBufferedUniMv         ( PredictionUnit& pu, RefPicList eRefPicList, int32_t iRefIdx, Mv& pcMvPred, Mv& rcMv, uint32_t& ruiBits, Distortion& ruiCost);
+public:  // the three helpers below are public; access returns to protected right after them
+  void resetBufferedUniMotions    () { m_cUniMotions.reset(); }  ///< calls reset() on the m_cUniMotions buffer
+  uint32_t getWeightIdxBits       ( uint8_t gbiIdx ) { return m_auiEstWeightIdxBits[gbiIdx]; }  ///< plain table lookup, no range check: gbiIdx must be < GBI_NUM
+  void initWeightIdxBits          ();  ///< fills m_auiEstWeightIdxBits[]; NOTE(review): presumably has to run before getWeightIdxBits() is used -- confirm call order
+protected:  // back to the access level of the surrounding section
+#endif
 
   void xExtDIFUpSamplingH         ( CPelBuf* pcPattern );
   void xExtDIFUpSamplingQ         ( CPelBuf* pcPatternKey, Mv halfPelRef );
diff --git a/source/Lib/EncoderLib/VLCWriter.cpp b/source/Lib/EncoderLib/VLCWriter.cpp
index 31d205d12e0b73173c23ec2217a0abcb0d107b7c..5d4a3cee0c213fc2d9754746002536a9a2bde442 100644
--- a/source/Lib/EncoderLib/VLCWriter.cpp
+++ b/source/Lib/EncoderLib/VLCWriter.cpp
@@ -544,7 +544,9 @@ void HLSWriter::codeSPSNext( const SPSNext& spsNext, const bool usePCM )
   {
     WRITE_FLAG( spsNext.getUseAffineType() ? 1 : 0,                                             "affine_type_flag" );
   }
-
+#if JVET_L0646_GBI
+  WRITE_FLAG( spsNext.getUseGBi() ? 1 : 0,                                                      "gbi_flag" );
+#endif
   for( int k = 0; k < SPSNext::NumReservedFlags; k++ )
   {
     WRITE_FLAG( 0,                                                                              "reserved_flag" );