diff --git a/source/App/DecoderApp/decmain.cpp b/source/App/DecoderApp/decmain.cpp
index c8a6e3bd7070cdcbd867a95d3f39c27cca6739e1..0a664aa461019162caa0c5c7d1d09092bce5eb47 100644
--- a/source/App/DecoderApp/decmain.cpp
+++ b/source/App/DecoderApp/decmain.cpp
@@ -54,7 +54,7 @@ int main(int argc, char* argv[])
 
   // print information
   fprintf( stdout, "\n" );
-  fprintf( stdout, "VVCSoftware: VTM Decoder Version %s ", VTM_VERSION );
+  fprintf( stdout, "VVCSoftware: ECM Decoder Version %s (VTM-%s) ", ECM_VERSION, VTM_VERSION );
   fprintf( stdout, NVM_ONOS );
   fprintf( stdout, NVM_COMPILEDBY );
   fprintf( stdout, NVM_BITS );
diff --git a/source/App/EncoderApp/EncApp.cpp b/source/App/EncoderApp/EncApp.cpp
index 02bedf8fa1bb0f68541d7a1267d166976744de50..378561a7a65918bc4935c41dab9ab3711c3d642f 100644
--- a/source/App/EncoderApp/EncApp.cpp
+++ b/source/App/EncoderApp/EncApp.cpp
@@ -788,9 +788,9 @@ void EncApp::xInitLibCfg()
 
   m_cEncLib.setUseWrapAround                                     ( m_wrapAround );
   m_cEncLib.setWrapAroundOffset                                  ( m_wrapAroundOffset );
-#if IDCC_TPM_JEM
-  m_cEncLib.setUseIntraTMP(m_IntraTMP);
-  m_cEncLib.setIntraTMPMaxSize(m_IntraTMP_MaxSize);
+#if JVET_V0130_INTRA_TMP
+  m_cEncLib.setUseIntraTMP                                       ( m_intraTMP );
+  m_cEncLib.setIntraTMPMaxSize                                   ( m_intraTmpMaxSize );
 #endif
 #if JVET_V0094_BILATERAL_FILTER
   m_cEncLib.setUseBIF                                            ( m_BIF );
diff --git a/source/App/EncoderApp/EncAppCfg.cpp b/source/App/EncoderApp/EncAppCfg.cpp
index 21890e532d76457d352cbe6d6015d70421bf7758..33be266cc7f211f1f44df7dd5c67d63811482e8e 100644
--- a/source/App/EncoderApp/EncAppCfg.cpp
+++ b/source/App/EncoderApp/EncAppCfg.cpp
@@ -1044,9 +1044,9 @@ bool EncAppCfg::parseCfg( int argc, char* argv[] )
   ("AdditionalInterHypRefFrames",                     m_maxNumAddHypRefFrames,                              4, "max. number of ref frames for additional inter hypotheseis")
   ("AdditionalInterHypTries",                         m_addHypTries,                                        1, "number of tries for additional inter prediction hypotheseis")
 #endif
-#if IDCC_TPM_JEM
-  ("IntraTMP",                                        m_IntraTMP,                                        false, "intra Template Matching (0: off, 1:on)  [default: on]")
-  ("IntraTMPMaxSize",                                 m_IntraTMP_MaxSize,                                 64u, "intra Template Matching max CU size  [default: 64]")
+#if JVET_V0130_INTRA_TMP
+  ("IntraTMP",                                        m_intraTMP,                                       false, "intra Template Matching (0: off, 1:on)  [default: off]")
+  ("IntraTMPMaxSize",                                 m_intraTmpMaxSize,                                  64u, "intra Template Matching max CU size  [default: 64]")
 #endif
 #if JVET_V0094_BILATERAL_FILTER
   ("BIF",                                             m_BIF,                                            true, "bilateral filter   (0: off, 1:on)  [default: on]")
@@ -4176,12 +4176,12 @@ void EncAppCfg::xPrintParameter()
     }
 #endif
   }
-#if IDCC_TPM_JEM
-  msg(DETAILS, "Intra TMP: %d\n", m_IntraTMP);
-  msg(DETAILS, "Max CU size of TMP: %d\n", m_IntraTMP_MaxSize);
+#if JVET_V0130_INTRA_TMP
+  msg(DETAILS, "Intra TMP: %d\n", m_intraTMP);
+  msg(DETAILS, "Max CU size of TMP: %d\n", m_intraTmpMaxSize);
   msg(DETAILS, "dynamic search range with fixed comparison per pixel: \n");
-  msg(DETAILS, "	searchRangeWidth = %d*Width \n", IDCC_SearchRangeMultFactor);
-  msg(DETAILS, "	searchRangeHeight = %d*Heigh \n", IDCC_SearchRangeMultFactor);
+  msg(DETAILS, "	searchRangeWidth = %d*Width \n", TMP_SEARCH_RANGE_MULT_FACTOR );
+  msg(DETAILS, "	searchRangeHeight = %d*Height \n", TMP_SEARCH_RANGE_MULT_FACTOR );
 #endif
 
   msg( DETAILS, "Max Num Merge Candidates               : %d\n", m_maxNumMergeCand );
@@ -4316,9 +4316,9 @@ void EncAppCfg::xPrintParameter()
   {
     msg( VERBOSE, "WrapAroundOffset:%d ", m_wrapAroundOffset );
   }
-#if IDCC_TPM_JEM
-  msg( VERBOSE, "IntraTMP:%d ", m_IntraTMP);
-  msg( VERBOSE, "IntraTMP_MaxSize:%d ", m_IntraTMP_MaxSize);
+#if JVET_V0130_INTRA_TMP
+  msg( VERBOSE, "IntraTMP:%d ", m_intraTMP);
+  msg( VERBOSE, "IntraTmpMaxSize:%d ", m_intraTmpMaxSize);
 #endif
 #if JVET_V0094_BILATERAL_FILTER
   msg( VERBOSE, "BIF:%d ", m_BIF);
diff --git a/source/App/EncoderApp/EncAppCfg.h b/source/App/EncoderApp/EncAppCfg.h
index a035b4d4c277f54a8360ef341a66000d81015815..83b5967d6cfb4d41bb9ed786457ce56db66ab722 100644
--- a/source/App/EncoderApp/EncAppCfg.h
+++ b/source/App/EncoderApp/EncAppCfg.h
@@ -411,9 +411,9 @@ protected:
   int       m_maxNumAddHypRefFrames;                          ///< max. number of ref frames for additional inter hypotheseis
   int       m_addHypTries;                                    ///< max. number of tries for additional inter hypotheseis
 #endif
-#if IDCC_TPM_JEM
-  bool      m_IntraTMP;                                       ///< intra Template Matching 
-  unsigned  m_IntraTMP_MaxSize;                               ///< max CU size for which intra TMP is allowed
+#if JVET_V0130_INTRA_TMP
+  bool      m_intraTMP;                                       ///< intra Template Matching 
+  unsigned  m_intraTmpMaxSize;                               ///< max CU size for which intra TMP is allowed
 #endif
 #if JVET_V0094_BILATERAL_FILTER
   bool      m_BIF;                                            ///< bilateral filter
diff --git a/source/App/EncoderApp/encmain.cpp b/source/App/EncoderApp/encmain.cpp
index 0dfefebecaeb29ce3ebcea6a9e55c2bbe2672ab9..44a68dc8f4bbf7432d8622841ed5abd2db83a58b 100644
--- a/source/App/EncoderApp/encmain.cpp
+++ b/source/App/EncoderApp/encmain.cpp
@@ -85,7 +85,7 @@ int main(int argc, char* argv[])
 {
   // print information
   fprintf( stdout, "\n" );
-  fprintf( stdout, "VVCSoftware: VTM Encoder Version %s ", VTM_VERSION );
+  fprintf( stdout, "VVCSoftware: ECM Encoder Version %s (VTM-%s) ", ECM_VERSION, VTM_VERSION );
   fprintf( stdout, NVM_ONOS );
   fprintf( stdout, NVM_COMPILEDBY );
   fprintf( stdout, NVM_BITS );
diff --git a/source/Lib/CommonLib/BilateralFilter.cpp b/source/Lib/CommonLib/BilateralFilter.cpp
index f5b7f0f3151c1666053530dbdf09ab11980649b2..38058a4da2d79cee6a2c2678f08320ee32b93a45 100755
--- a/source/Lib/CommonLib/BilateralFilter.cpp
+++ b/source/Lib/CommonLib/BilateralFilter.cpp
@@ -50,6 +50,13 @@
 
 BilateralFilter::BilateralFilter()
 {
+  m_bilateralFilterDiamond5x5 = blockBilateralFilterDiamond5x5;
+
+#if ENABLE_SIMD_BILATERAL_FILTER
+#ifdef TARGET_SIMD_X86
+  initBilateralFilterX86();
+#endif
+#endif
 }
 
 BilateralFilter::~BilateralFilter()
@@ -64,292 +71,193 @@ void BilateralFilter::destroy()
 {
 }
 
-void BilateralFilter::simdFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO)
+const char* BilateralFilter::getFilterLutParameters( const int size, const PredMode predMode, const int32_t qp, int& bfac )
 {
-  int pad = 2;
-  int padwidth = iWidthExtSIMD;
-
-  __m128i center, left, right, up, down, lu, ld, ru, rd, diffabs, four, fifteen, lut, acc, temp, round_add, clipmin, clipmax, inputVals;
-  __m128i ll, rr, uu, dd;
-
-  four = _mm_set1_epi16(4);
-  fifteen = _mm_set1_epi16(15);
-  round_add = _mm_set1_epi16(bif_round_add);
-  clipmin = _mm_set1_epi16(clpRng.min);
-  clipmax = _mm_set1_epi16(clpRng.max);
+  if( size <= 4 )
+  {
+    bfac = 3;
+  }
+  else if( size >= 16 )
+  {
+    bfac = 1;
+  }
+  else
+  {
+    bfac = 2;
+  }
 
-  lut = _mm_loadu_si128((__m128i*)(LUTrowPtr));
-  acc = _mm_set1_epi32(0);
-  
-  // Copy back parameters
-  Pel *tempBlockPtr = (short*)blkFilt + (((padwidth+4) << 1) + 2);
-  int tempBlockStride = padwidth+4;
-  
-  
-  for (int col = 0; col < uiWidth; col += 8)
+  if( predMode == MODE_INTER )
   {
-    for (int row = 0; row < uiHeight; row++)
+    if( size <= 4 )
     {
-      acc = _mm_set1_epi32(0);
-      int16_t *point = &block[(row + pad)*padwidth + pad + col];
-      
-      center = _mm_loadu_si128((__m128i*)(point));
-      
-      //load neighbours
-      left = _mm_loadu_si128((__m128i*)(point - 1));
-      right = _mm_loadu_si128((__m128i*)(point + 1));
-      up = _mm_loadu_si128((__m128i*)(point - padwidth));
-      down = _mm_loadu_si128((__m128i*)(point + padwidth));
-      
-      lu = _mm_loadu_si128((__m128i*)(point - 1 - padwidth));
-      ld = _mm_loadu_si128((__m128i*)(point - 1 + padwidth));
-      ru = _mm_loadu_si128((__m128i*)(point + 1 - padwidth));
-      rd = _mm_loadu_si128((__m128i*)(point + 1 + padwidth));
-
-      ll = _mm_loadu_si128((__m128i*)(point - 2));
-      rr = _mm_loadu_si128((__m128i*)(point + 2));
-      uu = _mm_loadu_si128((__m128i*)(point - 2*padwidth));
-      dd = _mm_loadu_si128((__m128i*)(point + 2*padwidth));
-      
-      //calculate diffs
-      left = _mm_sub_epi16(left, center);
-      right = _mm_sub_epi16(right, center);
-      up = _mm_sub_epi16(up, center);
-      down = _mm_sub_epi16(down, center);
-      
-      lu = _mm_sub_epi16(lu, center);
-      ld = _mm_sub_epi16(ld, center);
-      ru = _mm_sub_epi16(ru, center);
-      rd = _mm_sub_epi16(rd, center);
-
-      ll = _mm_sub_epi16(ll, center);
-      rr = _mm_sub_epi16(rr, center);
-      uu = _mm_sub_epi16(uu, center);
-      dd = _mm_sub_epi16(dd, center);
-      
-      //LEFT!
-      //calculate abs
-      diffabs = _mm_abs_epi16(left); //abs
-      diffabs = _mm_add_epi16(diffabs, four); //+4
-      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
-      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
-      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
-      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
-      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
-      diffabs = _mm_sign_epi16(diffabs, left);//fix sign!
-      acc = _mm_add_epi16(diffabs, acc); //add to acc
-      //RIGHT!
-      //calculate abs
-      diffabs = _mm_abs_epi16(right); //abs
-      diffabs = _mm_add_epi16(diffabs, four); //+4
-      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
-      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
-      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
-      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
-      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
-      diffabs = _mm_sign_epi16(diffabs, right);//fix sign!
-      acc = _mm_add_epi16(diffabs, acc); //add to acc
-      //UP!
-      //calculate abs
-      diffabs = _mm_abs_epi16(up); //abs
-      diffabs = _mm_add_epi16(diffabs, four); //+4
-      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
-      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
-      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
-      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
-      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
-      diffabs = _mm_sign_epi16(diffabs, up);//fix sign!
-      acc = _mm_add_epi16(diffabs, acc); //add to acc
-      
-      //DOWN!
-      //calculate abs
-      diffabs = _mm_abs_epi16(down); //abs
-      diffabs = _mm_add_epi16(diffabs, four); //+4
-      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
-      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
-      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
-      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
-      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
-      diffabs = _mm_sign_epi16(diffabs, down);//fix sign!
-      acc = _mm_add_epi16(diffabs, acc); //add to acc
-      
-      //lu!
-      //calculate abs
-      diffabs = _mm_abs_epi16(lu); //abs
-      diffabs = _mm_add_epi16(diffabs, four); //+4
-      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
-      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
-      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
-      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
-      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
-      diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift!
-      diffabs = _mm_sign_epi16(diffabs, lu);//fix sign!
-      acc = _mm_add_epi16(diffabs, acc); //add to acc
-      //ld!
-      //calculate abs
-      diffabs = _mm_abs_epi16(ld); //abs
-      diffabs = _mm_add_epi16(diffabs, four); //+4
-      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
-      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
-      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
-      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
-      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
-      diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift!
-      diffabs = _mm_sign_epi16(diffabs, ld);//fix sign!
-      acc = _mm_add_epi16(diffabs, acc); //add to acc
-      //ru!
-      //calculate abs
-      diffabs = _mm_abs_epi16(ru); //abs
-      diffabs = _mm_add_epi16(diffabs, four); //+4
-      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
-      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
-      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
-      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
-      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
-      diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift!
-      diffabs = _mm_sign_epi16(diffabs, ru);//fix sign!
-      acc = _mm_add_epi16(diffabs, acc); //add to acc
-      //rd!
-      //calculate abs
-      diffabs = _mm_abs_epi16(rd); //abs
-      diffabs = _mm_add_epi16(diffabs, four); //+4
-      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
-      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
-      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
-      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
-      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
-      diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift!
-      diffabs = _mm_sign_epi16(diffabs, rd);//fix sign!
-      acc = _mm_add_epi16(diffabs, acc); //add to acc
-
-      //ll!
-      //calculate abs
-      diffabs = _mm_abs_epi16(ll); //abs
-      diffabs = _mm_add_epi16(diffabs, four); //+4
-      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
-      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
-      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
-      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
-      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
-      diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift!
-      diffabs = _mm_sign_epi16(diffabs, ll);//fix sign!
-      acc = _mm_add_epi16(diffabs, acc); //add to acc
-      //rr!
-      //calculate abs
-      diffabs = _mm_abs_epi16(rr); //abs
-      diffabs = _mm_add_epi16(diffabs, four); //+4
-      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
-      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
-      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
-      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
-      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
-      diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift!
-      diffabs = _mm_sign_epi16(diffabs, rr);//fix sign!
-      acc = _mm_add_epi16(diffabs, acc); //add to acc
-      //uu!
-      //calculate abs
-      diffabs = _mm_abs_epi16(uu); //abs
-      diffabs = _mm_add_epi16(diffabs, four); //+4
-      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
-      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
-      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
-      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
-      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
-      diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift!
-      diffabs = _mm_sign_epi16(diffabs, uu);//fix sign!
-      acc = _mm_add_epi16(diffabs, acc); //add to acc
-      //dd!
-      //calculate abs
-      diffabs = _mm_abs_epi16(dd); //abs
-      diffabs = _mm_add_epi16(diffabs, four); //+4
-      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
-      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
-      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
-      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
-      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
-      diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift!
-      diffabs = _mm_sign_epi16(diffabs, dd);//fix sign!
-      acc = _mm_add_epi16(diffabs, acc); //add to acc
-      
-      if (bfac == 2)
-      {
-        acc = _mm_slli_epi16(acc, 1);   // Shift left to get 2*
-      }
-      else if (bfac == 3)
-      {
-        temp = _mm_slli_epi16(acc, 1);  // Multiply by two by shifting left
-        acc = _mm_add_epi16(acc, temp); // Add original value to get 3*
-      }
-      
-      // Add 16 and shift 5
-      acc = _mm_add_epi16(acc, round_add);
-      acc = _mm_srai_epi16(acc, bif_round_shift);
-      
-      // Instead we add our input values to the delta
-      if(isRDO)
-      {
-        acc = _mm_add_epi16(acc, center);
-      }
-      else
-      {
-        int16_t *recpoint = &recPtr[row * recStride + col];
-        inputVals = _mm_loadu_si128((__m128i*)(recpoint));
-        acc = _mm_add_epi16(acc, inputVals);
-      }
-      
-      // Clip
-      acc = _mm_max_epi16(acc, clipmin);
-      acc = _mm_min_epi16(acc, clipmax);
-
-      _mm_store_si128((__m128i*)(blkFilt + (row + pad) * (padwidth + 4) + col + pad), acc);
+      bfac = 2;
+    }
+    else if( size >= 16 )
+    {
+      bfac = 1;
+    }
+    else
+    {
+      bfac = 2;
     }
   }
-  
-  // Copy back from tempbufFilter to recBuf
-  int onerow = uiWidth * sizeof(Pel);
-  for(uint32_t yy = 0; yy < uiHeight; yy++)
+
+  int sqp = qp;
+
+  if( sqp < 17 )
+  {
+    sqp = 17;
+  }
+
+  if( sqp > 42 )
   {
-    std::memcpy(recPtr, tempBlockPtr, onerow);
-    recPtr += recStride;
-    tempBlockPtr += tempBlockStride;
+    sqp = 42;
   }
+
+  return m_wBIF[sqp - 17];
 }
 
-void BilateralFilter::blockBilateralFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO)
+void BilateralFilter::blockBilateralFilterDiamond5x5( uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO, const char* LUTrowPtr )
 {
   int pad = 2;
 
-#ifdef TARGET_SIMD_X86
-  if ((uiWidth >= 8) || (!isRDO && (uiWidth >= 4)))
+  int padwidth = iWidthExtSIMD;
+  int downbuffer[64];
+  int downleftbuffer[65];
+  int downrightbuffer[2][65];
+  int Shift, sg0, v0, idx, w0;
+  Shift = sizeof( int ) * 8 - 1;
+  downbuffer[0] = 0;
+
+  for( int x = 0; x < uiWidth; x++ )
   {
-    simdFilterDiamond5x5(uiWidth, uiHeight, block, blkFilt, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bif_round_add, bif_round_shift, isRDO);
+    int pixel = block[(-1 + pad)*padwidth + x + pad];
+    int below = block[(-1 + pad + 1)*padwidth + x + pad];
+    int diff = below - pixel;
+    sg0 = diff >> Shift;
+    v0 = (diff + sg0) ^ sg0;
+    v0 = (v0 + 4) >> 3;
+    idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift));
+    w0 = LUTrowPtr[idx];
+    int mod = (w0 + sg0) ^ sg0;
+    downbuffer[x] = mod;
+
+    int belowright = block[(-1 + pad + 1)*padwidth + x + pad + 1];
+    diff = belowright - pixel;
+    sg0 = diff >> Shift;
+    v0 = (diff + sg0) ^ sg0;
+    v0 = (v0 + 4) >> 3;
+    idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift));
+    w0 = LUTrowPtr[idx] >> 1;
+    mod = (w0 + sg0) ^ sg0;
+    downrightbuffer[1][x + 1] = mod;
+
+    int belowleft = block[(-1 + pad + 1)*padwidth + x + pad - 1];
+    diff = belowleft - pixel;
+    sg0 = diff >> Shift;
+    v0 = (diff + sg0) ^ sg0;
+    v0 = (v0 + 4) >> 3;
+    idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift));
+    w0 = LUTrowPtr[idx] >> 1;
+    mod = (w0 + sg0) ^ sg0;
+    downleftbuffer[x] = mod;
   }
-  else
-#endif
+  int width = uiWidth;
+  for( int y = 0; y < uiHeight; y++ )
   {
-    
-    int padwidth = iWidthExtSIMD;
-    int downbuffer[64];
-    int downleftbuffer[65];
-    int downrightbuffer[2][65];
-    int Shift, sg0, v0, idx, w0;
-    Shift = sizeof(int) * 8 - 1;
-    downbuffer[0] = 0;
-      
-    for (int x = 0; x < uiWidth; x++)
-    {
-      int pixel = block[(-1 + pad)*padwidth + x + pad];
-      int below = block[(-1 + pad + 1)*padwidth + x + pad];
-      int diff = below - pixel;
+    int diff;
+
+    int16_t *rowStart = &block[(y + pad)*padwidth + pad];
+
+    int pixel = rowStart[-1];
+
+    int right = rowStart[0];
+    diff = right - pixel;
+    sg0 = diff >> Shift;
+    v0 = (diff + sg0) ^ sg0;
+    v0 = (v0 + 4) >> 3;
+    idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift));
+    w0 = LUTrowPtr[idx];
+    int mod = (w0 + sg0) ^ sg0;
+    int rightmod = mod;
+
+    pixel = rowStart[-padwidth - 1];
+    int belowright = right;
+    diff = belowright - pixel;
+    sg0 = diff >> Shift;
+    v0 = (diff + sg0) ^ sg0;
+    v0 = (v0 + 4) >> 3;
+    idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift));
+    w0 = LUTrowPtr[idx] >> 1;
+    mod = (w0 + sg0) ^ sg0;
+    downrightbuffer[(y + 1) % 2][0] = mod;
+
+    pixel = rowStart[-padwidth + width];
+    int belowleft = rowStart[width - 1];
+    diff = belowleft - pixel;
+    sg0 = diff >> Shift;
+    v0 = (diff + sg0) ^ sg0;
+    v0 = (v0 + 4) >> 3;
+    idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift));
+    w0 = LUTrowPtr[idx] >> 1;
+    mod = (w0 + sg0) ^ sg0;
+    downleftbuffer[width] = mod;
+
+    for( int x = 0; x < uiWidth; x++ )
+    {
+      pixel = rowStart[x];
+      int modsum = 0;
+
+      int abovemod = -downbuffer[x];
+      modsum += abovemod;
+
+      int leftmod = -rightmod;
+      modsum += leftmod;
+
+      right = rowStart[x + 1];
+      diff = right - pixel;
       sg0 = diff >> Shift;
       v0 = (diff + sg0) ^ sg0;
       v0 = (v0 + 4) >> 3;
       idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift));
       w0 = LUTrowPtr[idx];
-      int mod = (w0 + sg0) ^ sg0;
+      mod = (w0 + sg0) ^ sg0;
+
+      modsum += mod;
+      rightmod = mod;
+
+      int below = rowStart[x + padwidth];
+      diff = below - pixel;
+      sg0 = diff >> Shift;
+      v0 = (diff + sg0) ^ sg0;
+      v0 = (v0 + 4) >> 3;
+      idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift));
+      w0 = LUTrowPtr[idx];
+      mod = (w0 + sg0) ^ sg0;
+      modsum += mod;
       downbuffer[x] = mod;
-      
-      int belowright = block[(-1 + pad + 1)*padwidth + x + pad + 1];
+
+      int aboverightmod = -downleftbuffer[x + 1];
+      // modsum += ((int16_t)((uint16_t)((aboverightmod) >> 1)));
+      modsum += aboverightmod;
+
+      int aboveleftmod = -downrightbuffer[(y + 1) % 2][x];
+      // modsum += ((int16_t)((uint16_t)((aboveleftmod) >> 1)));
+      modsum += aboveleftmod;
+
+      int belowleft = rowStart[x + padwidth - 1];
+      diff = belowleft - pixel;
+      sg0 = diff >> Shift;
+      v0 = (diff + sg0) ^ sg0;
+      v0 = (v0 + 4) >> 3;
+      idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift));
+      w0 = LUTrowPtr[idx] >> 1;
+      mod = (w0 + sg0) ^ sg0;
+      // modsum += ((int16_t)((uint16_t)((mod) >> 1)));
+      modsum += mod;
+      downleftbuffer[x] = mod;
+
+      int belowright = rowStart[x + padwidth + 1];
       diff = belowright - pixel;
       sg0 = diff >> Shift;
       v0 = (diff + sg0) ^ sg0;
@@ -357,207 +265,87 @@ void BilateralFilter::blockBilateralFilterDiamond5x5(uint32_t uiWidth, uint32_t
       idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift));
       w0 = LUTrowPtr[idx] >> 1;
       mod = (w0 + sg0) ^ sg0;
-      downrightbuffer[1][x + 1] = mod;
-      
-      int belowleft = block[(-1 + pad + 1)*padwidth + x + pad - 1];
-      diff = belowleft - pixel;
+      //modsum += ((int16_t)((uint16_t)((mod) >> 1)));
+      modsum += mod;
+      downrightbuffer[y % 2][x + 1] = mod;
+
+      // For samples two pixels out, we do not reuse previously calculated
+      // values even though that is possible. Doing so would likely increase
+      // speed when SIMD is turned off.
+
+      int above = rowStart[x - 2 * padwidth];
+      diff = above - pixel;
       sg0 = diff >> Shift;
       v0 = (diff + sg0) ^ sg0;
       v0 = (v0 + 4) >> 3;
       idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift));
       w0 = LUTrowPtr[idx] >> 1;
       mod = (w0 + sg0) ^ sg0;
-      downleftbuffer[x] = mod;
-    }
-    int width = uiWidth;
-    for (int y = 0; y < uiHeight; y++)
-    {
-      int diff;
-      
-      int16_t *rowStart = &block[(y + pad)*padwidth + pad];
-      
-      int pixel = rowStart[-1];
-      
-      int right = rowStart[0];
-      diff = right - pixel;
+      modsum += mod;
+
+      below = rowStart[x + 2 * padwidth];
+      diff = below - pixel;
       sg0 = diff >> Shift;
       v0 = (diff + sg0) ^ sg0;
       v0 = (v0 + 4) >> 3;
       idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift));
-      w0 = LUTrowPtr[idx];
-      int mod = (w0 + sg0) ^ sg0;
-      int rightmod = mod;
-      
-      pixel = rowStart[-padwidth - 1];
-      int belowright = right;
-      diff = belowright - pixel;
+      w0 = LUTrowPtr[idx] >> 1;
+      mod = (w0 + sg0) ^ sg0;
+      modsum += mod;
+
+      int left = rowStart[x - 2];
+      diff = left - pixel;
       sg0 = diff >> Shift;
       v0 = (diff + sg0) ^ sg0;
       v0 = (v0 + 4) >> 3;
       idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift));
       w0 = LUTrowPtr[idx] >> 1;
       mod = (w0 + sg0) ^ sg0;
-      downrightbuffer[(y + 1) % 2][0] = mod;
-      
-      pixel = rowStart[-padwidth + width];
-      int belowleft = rowStart[width - 1];
-      diff = belowleft - pixel;
+      modsum += mod;
+
+      right = rowStart[x + 2];
+      diff = right - pixel;
       sg0 = diff >> Shift;
       v0 = (diff + sg0) ^ sg0;
       v0 = (v0 + 4) >> 3;
       idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift));
       w0 = LUTrowPtr[idx] >> 1;
       mod = (w0 + sg0) ^ sg0;
-      downleftbuffer[width] = mod;
-      
-      
-      for (int x = 0; x < uiWidth; x++)
-      {
-        
-        pixel = rowStart[x];
-        
-        int modsum = 0;
-        
-        
-        int abovemod = -downbuffer[x];
-        modsum += abovemod;
-        
-        int leftmod = -rightmod;
-        modsum += leftmod;
-        
-        right = rowStart[x + 1];
-        diff = right - pixel;
-        sg0 = diff >> Shift;
-        v0 = (diff + sg0) ^ sg0;
-        v0 = (v0 + 4) >> 3;
-        idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift));
-        w0 = LUTrowPtr[idx];
-        mod = (w0 + sg0) ^ sg0;
-        
-        modsum += mod;
-        rightmod = mod;
-        
-        int below = rowStart[x + padwidth];
-        diff = below - pixel;
-        sg0 = diff >> Shift;
-        v0 = (diff + sg0) ^ sg0;
-        v0 = (v0 + 4) >> 3;
-        idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift));
-        w0 = LUTrowPtr[idx];
-        mod = (w0 + sg0) ^ sg0;
-        modsum += mod;
-        downbuffer[x] = mod;
-        
-        int aboverightmod = -downleftbuffer[x + 1];
-        // modsum += ((int16_t)((uint16_t)((aboverightmod) >> 1)));
-        modsum += aboverightmod;
-        
-        int aboveleftmod = -downrightbuffer[(y + 1) % 2][x];
-        // modsum += ((int16_t)((uint16_t)((aboveleftmod) >> 1)));
-        modsum += aboveleftmod;
-        
-        int belowleft = rowStart[x + padwidth - 1];
-        diff = belowleft - pixel;
-        sg0 = diff >> Shift;
-        v0 = (diff + sg0) ^ sg0;
-        v0 = (v0 + 4) >> 3;
-        idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift));
-        w0 = LUTrowPtr[idx] >> 1;
-        mod = (w0 + sg0) ^ sg0;
-        // modsum += ((int16_t)((uint16_t)((mod) >> 1)));
-        modsum += mod;
-        downleftbuffer[x] = mod;
-        
-        int belowright = rowStart[x + padwidth + 1];
-        diff = belowright - pixel;
-        sg0 = diff >> Shift;
-        v0 = (diff + sg0) ^ sg0;
-        v0 = (v0 + 4) >> 3;
-        idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift));
-        w0 = LUTrowPtr[idx] >> 1;
-        mod = (w0 + sg0) ^ sg0;
-        //modsum += ((int16_t)((uint16_t)((mod) >> 1)));
-        modsum += mod;
-        downrightbuffer[y % 2][x + 1] = mod;
-
-        // For samples two pixels out, we do not reuse previously calculated
-        // values even though that is possible. Doing so would likely increase
-        // speed when SIMD is turned off.
-        
-        int above = rowStart[x - 2*padwidth];
-        diff = above - pixel;
-        sg0 = diff >> Shift;
-        v0 = (diff + sg0) ^ sg0;
-        v0 = (v0 + 4) >> 3;
-        idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift));
-        w0 = LUTrowPtr[idx] >> 1;
-        mod = (w0 + sg0) ^ sg0;
-        modsum += mod;
-        
-        below = rowStart[x + 2*padwidth];
-        diff = below - pixel;
-        sg0 = diff >> Shift;
-        v0 = (diff + sg0) ^ sg0;
-        v0 = (v0 + 4) >> 3;
-        idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift));
-        w0 = LUTrowPtr[idx] >> 1;
-        mod = (w0 + sg0) ^ sg0;
-        modsum += mod;
-        
-        int left = rowStart[x - 2];
-        diff = left - pixel;
-        sg0 = diff >> Shift;
-        v0 = (diff + sg0) ^ sg0;
-        v0 = (v0 + 4) >> 3;
-        idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift));
-        w0 = LUTrowPtr[idx] >> 1;
-        mod = (w0 + sg0) ^ sg0;
-        modsum += mod;
-        
-        right = rowStart[x + 2];
-        diff = right - pixel;
-        sg0 = diff >> Shift;
-        v0 = (diff + sg0) ^ sg0;
-        v0 = (v0 + 4) >> 3;
-        idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift));
-        w0 = LUTrowPtr[idx] >> 1;
-        mod = (w0 + sg0) ^ sg0;
-        modsum += mod;
-        
-        blkFilt[(y + pad)*(padwidth+4) + x + pad] = ((int16_t)((uint16_t)((modsum*bfac + bif_round_add) >> bif_round_shift)));
-      }
+      modsum += mod;
+
+      blkFilt[(y + pad)*(padwidth + 4) + x + pad] = (( int16_t ) (( uint16_t ) ((modsum*bfac + bif_round_add) >> bif_round_shift)));
     }
+  }
 
-    // Copy back
-    Pel *tempBlockPtr = (short*)blkFilt + (((padwidth+4) << 1) + 2);
-    int tempBlockStride = padwidth+4;
-    if(isRDO)
+  // Copy back
+  Pel *tempBlockPtr = ( short* ) blkFilt + (((padwidth + 4) << 1) + 2);
+  int tempBlockStride = padwidth + 4;
+  if( isRDO )
+  {
+    Pel *srcBlockPtr = ( short* ) block + (((padwidth) << 1) + 2);
+    int srcBlockStride = padwidth;
+    for( uint32_t yy = 0; yy < uiHeight; yy++ )
     {
-      Pel *srcBlockPtr = (short*)block + (((padwidth) << 1) + 2);
-      int srcBlockStride = padwidth;
-      for(uint32_t yy = 0; yy < uiHeight; yy++)
+      for( uint32_t xx = 0; xx < uiWidth; xx++ )
       {
-        for(uint32_t xx = 0; xx < uiWidth; xx++)
-        {
-          recPtr[xx] = ClipPel(srcBlockPtr[xx] + tempBlockPtr[xx], clpRng);
-        }
-        recPtr += recStride;
-        tempBlockPtr += tempBlockStride;
-        srcBlockPtr += srcBlockStride;
+        recPtr[xx] = ClipPel( srcBlockPtr[xx] + tempBlockPtr[xx], clpRng );
       }
+      recPtr += recStride;
+      tempBlockPtr += tempBlockStride;
+      srcBlockPtr += srcBlockStride;
     }
-    else
+  }
+  else
+  {
+    for( uint32_t yy = 0; yy < uiHeight; yy++ )
     {
-      for(uint32_t yy = 0; yy < uiHeight; yy++)
+      for( uint32_t xx = 0; xx < uiWidth; xx++ )
       {
-        for(uint32_t xx = 0; xx < uiWidth; xx++)
-        {
-          // new result = old result (which is SAO-treated already) + diff due to bilateral filtering
-          recPtr[xx] = ClipPel<int>(recPtr[xx] + tempBlockPtr[xx], clpRng);
-        }
-        recPtr += recStride;
-        tempBlockPtr += tempBlockStride;
+        // new result = old result (which is SAO-treated already) + diff due to bilateral filtering
+        recPtr[xx] = ClipPel<int>( recPtr[xx] + tempBlockPtr[xx], clpRng );
       }
+      recPtr += recStride;
+      tempBlockPtr += tempBlockStride;
     }
   }
 }
@@ -568,34 +356,10 @@ void BilateralFilter::bilateralFilterRDOdiamond5x5(PelBuf& resiBuf, const CPelBu
   const unsigned uiHeight = predBuf.height;
   
   int bfac = 1;
-  
-  int size = std::min(uiWidth, uiHeight);
-  if (size <= 4)
-    bfac = 3;
-  else if (size >= 16)
-    bfac = 1;
-  else
-    bfac = 2;
-  if (currTU.cu->predMode == MODE_INTER)
-  {
-    if (size <= 4)
-      bfac = 2;
-    else if (size >= 16)
-      bfac = 1;
-    else
-      bfac = 2;
-  }
-  
-  qp = qp + currTU.cs->pps->getBIFQPOffset();
   int bif_round_add = (BIF_ROUND_ADD) >> (currTU.cs->pps->getBIFStrength());
-  int bif_round_shift = (BIF_ROUND_SHIFT) - (currTU.cs->pps->getBIFStrength());
-  
-  int sqp = qp;
-  if(sqp<17)
-    sqp = 17;
-  if(sqp>42)
-    sqp = 42;
-  LUTrowPtr = wBIF[sqp-17];
+  int bif_round_shift = ( BIF_ROUND_SHIFT ) -(currTU.cs->pps->getBIFStrength());
+
+  const char* LUTrowPtr = getFilterLutParameters( std::min( uiWidth, uiHeight ), currTU.cu->predMode, qp + currTU.cs->pps->getBIFQPOffset(), bfac );
 
   const unsigned uiPredStride = predBuf.stride;
   const unsigned uiStrideRes = resiBuf.stride;
@@ -612,7 +376,6 @@ void BilateralFilter::bilateralFilterRDOdiamond5x5(PelBuf& resiBuf, const CPelBu
   Pel *piRecoTemp = piReco;
   // Reco = Pred + Resi
   
-  
   Pel *tempBlockPtr;
   
   uint32_t   uiWidthExt = uiWidth + (NUMBER_PADDED_SAMPLES << 1);
@@ -702,9 +465,13 @@ void BilateralFilter::bilateralFilterRDOdiamond5x5(PelBuf& resiBuf, const CPelBu
       {
         // copy 4 pixels one line above block from block to blockx + 3
         std::copy(piRecIPred - (uiRecIPredStride)+blockx, piRecIPred - (uiRecIPredStride)+blockx + 1, tempblock + 2 + uiWidthExt + blockx);
-        if(doReshape)
-          for(int xx = 0; xx < 1; xx++)
-            tempblock[2+uiWidthExt+blockx+xx] = pLUT[tempblock[2+uiWidthExt+blockx+xx]];
+        if( doReshape )
+        {
+          for( int xx = 0; xx < 1; xx++ )
+          {
+            tempblock[2 + uiWidthExt + blockx + xx] = pLUT[tempblock[2 + uiWidthExt + blockx + xx]];
+          }
+        }
       }
     }
     else if (subTuHor)
@@ -716,13 +483,23 @@ void BilateralFilter::bilateralFilterRDOdiamond5x5(PelBuf& resiBuf, const CPelBu
       const Pel *earlierPel = earlierHalfBuf.buf + (currTU.prev->lheight() - 1)*earlierStride;
       
       std::copy(earlierPel, earlierPel + area.width, tempblock + 2 + uiWidthExt);
-      if(doReshape)
-        for(int xx = 0; xx < area.width; xx++)
-          tempblock[2+uiWidthExt+xx] = pLUT[tempblock[2+uiWidthExt+xx]];
+      if( doReshape )
+      {
+        for( int xx = 0; xx < area.width; xx++ )
+        {
+          tempblock[2 + uiWidthExt + xx] = pLUT[tempblock[2 + uiWidthExt + xx]];
+        }
+      }
+
       std::copy(earlierPel - earlierStride, earlierPel - earlierStride + area.width, tempblock + 2);
-      if(doReshape)
-        for(int xx = 0; xx < area.width; xx++)
-          tempblock[2+xx] = pLUT[tempblock[2+xx]];
+
+      if( doReshape )
+      {
+        for( int xx = 0; xx < area.width; xx++ )
+        {
+          tempblock[2 + xx] = pLUT[tempblock[2 + xx]];
+        }
+      }
     }
     // left column
     if (leftAvailable)
@@ -767,7 +544,7 @@ void BilateralFilter::bilateralFilterRDOdiamond5x5(PelBuf& resiBuf, const CPelBu
   std::copy(tempblock  + uiWidthExt, tempblock + uiWidthExt + uiWidthExt, tempblock);
   std::copy(tempblock  + uiWidthExt*(uiHeightExt-2), tempblock  + uiWidthExt*(uiHeightExt-2) + uiWidthExt, tempblock + uiWidthExt*(uiHeightExt-1));
 
-  blockBilateralFilterDiamond5x5(uiWidth, uiHeight, tempblock, tempblockFiltered, clpRng, piReco, uiRecStride, uiWidth + 4, bfac, bif_round_add, bif_round_shift, true);
+  m_bilateralFilterDiamond5x5(uiWidth, uiHeight, tempblock, tempblockFiltered, clpRng, piReco, uiRecStride, uiWidth + 4, bfac, bif_round_add, bif_round_shift, true, LUTrowPtr );
 
   if (!useReco)
   {
@@ -802,36 +579,9 @@ void BilateralFilter::bilateralFilterDiamond5x5(const CPelUnitBuf& src, PelUnitB
   
   int recStride = rec.get(COMPONENT_Y).stride;
   Pel *recPtr = rec.get(COMPONENT_Y).bufAt(compArea);
-
-  int size = std::min(uiWidth, uiHeight);
   
-  int bfac = 1;
-  
-  if (size <= 4)
-    bfac = 3;
-  else if (size >= 16)
-    bfac = 1;
-  else
-    bfac = 2;
-  if (currTU.cu->predMode == MODE_INTER)
-  {
-    if (size <= 4)
-      bfac = 2;
-    else if (size >= 16)
-      bfac = 1;
-    else
-      bfac = 2;
-  }
-  
-  // Offset qp before deciding on LUT:
-  qp = qp + currTU.cs->pps->getBIFQPOffset();
-  
-  int sqp = qp;
-  if(sqp<17)
-    sqp = 17;
-  if(sqp>42)
-    sqp = 42;
-  LUTrowPtr = wBIF[sqp-17];
+  int bfac = 1;  
+  const char* LUTrowPtr = getFilterLutParameters( std::min( uiWidth, uiHeight ), currTU.cu->predMode, qp + currTU.cs->pps->getBIFQPOffset(), bfac );
   
   int bif_round_add = (BIF_ROUND_ADD) >> (currTU.cs->pps->getBIFStrength());
   int bif_round_shift = (BIF_ROUND_SHIFT) - (currTU.cs->pps->getBIFStrength());
@@ -846,8 +596,10 @@ void BilateralFilter::bilateralFilterDiamond5x5(const CPelUnitBuf& src, PelUnitB
   uint32_t   uiHeightExt = uiHeight + (NUMBER_PADDED_SAMPLES << 1);
   
   int iWidthExtSIMD = uiWidthExt;
-  if(uiWidth < 8)
+  if( uiWidth < 8 )
+  {
     iWidthExtSIMD = 8 + (NUMBER_PADDED_SAMPLES << 1);
+  }
   
   Pel *tempBlockPtr;
   
@@ -902,7 +654,7 @@ void BilateralFilter::bilateralFilterDiamond5x5(const CPelUnitBuf& src, PelUnitB
     {
       std::memcpy(tempBlockPtr, srcPtr, (uiWidthExt) * sizeof(Pel));
     }
-    return blockBilateralFilterDiamond5x5(uiWidth, uiHeight, tempblock, tempblockFiltered, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bif_round_add, bif_round_shift, false);
+    return m_bilateralFilterDiamond5x5(uiWidth, uiHeight, tempblock, tempblockFiltered, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bif_round_add, bif_round_shift, false, LUTrowPtr );
   }
   else
   {
@@ -1034,7 +786,7 @@ void BilateralFilter::bilateralFilterDiamond5x5(const CPelUnitBuf& src, PelUnitB
     }
   }
   
-  blockBilateralFilterDiamond5x5(uiWidth, uiHeight, tempblock, tempblockFiltered, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bif_round_add, bif_round_shift, false);
+  m_bilateralFilterDiamond5x5(uiWidth, uiHeight, tempblock, tempblockFiltered, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bif_round_add, bif_round_shift, false, LUTrowPtr );
 }
 
 void BilateralFilter::clipNotBilaterallyFilteredBlocks(const CPelUnitBuf& src, PelUnitBuf& rec, const ClpRng& clpRng, TransformUnit & currTU)
@@ -1225,10 +977,14 @@ void BilateralFilter::bilateralFilterPicRDOperCTU(CodingStructure& cs, PelUnitBu
     rec.copyFrom(src);
   }
 
-  if (bifParams.frmOn == 0)
-    std::fill(bifParams.ctuOn.begin(), bifParams.ctuOn.end(), 0);
-  else if (bifParams.allCtuOn)
-    std::fill(bifParams.ctuOn.begin(), bifParams.ctuOn.end(), 1);
+  if( bifParams.frmOn == 0 )
+  {
+    std::fill( bifParams.ctuOn.begin(), bifParams.ctuOn.end(), 0 );
+  }
+  else if( bifParams.allCtuOn )
+  {
+    std::fill( bifParams.ctuOn.begin(), bifParams.ctuOn.end(), 1 );
+  }
 }
 
 #endif
diff --git a/source/Lib/CommonLib/BilateralFilter.h b/source/Lib/CommonLib/BilateralFilter.h
index 0504013e76fec51da5db3ac5c9ff019188456647..4cbd41be7cb19150bafba652de4e71334c4ec209 100755
--- a/source/Lib/CommonLib/BilateralFilter.h
+++ b/source/Lib/CommonLib/BilateralFilter.h
@@ -64,12 +64,10 @@ private:
   // = 2313 128-bit words which has been rounded up to 2320 above. 
   short *tempblockFiltered = &tempblockFilteredTemp[-2];
 
-  void blockBilateralFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO);
-  void simdFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO);
-
-  char *LUTrowPtr;
+  void (*m_bilateralFilterDiamond5x5)( uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO, const char* LUTrowPtr);
+  static void blockBilateralFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO, const char* LUTrowPtr );
   
-  char wBIF[26][16] = {
+  char m_wBIF[26][16] = {
   {  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, },
   {  0,   1,   1,   1,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, },
   {  0,   2,   2,   2,   1,   1,   0,   1,   0,   0,   0,   0,   0,   0,   0,   0, },
@@ -112,6 +110,19 @@ public:
   void bilateralFilterDiamond5x5(const CPelUnitBuf& src, PelUnitBuf& rec, int32_t qp, const ClpRng& clpRng, TransformUnit & currTU);
   void clipNotBilaterallyFilteredBlocks(const CPelUnitBuf& src, PelUnitBuf& rec, const ClpRng& clpRng, TransformUnit & currTU);
 
+  const char* getFilterLutParameters( const int size, const PredMode predMode, const int qp, int& bfac );
+
+#if ENABLE_SIMD_BILATERAL_FILTER
+#ifdef TARGET_SIMD_X86
+  template<X86_VEXT vext>
+  static void simdFilterDiamond5x5( uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO, const char* LUTrowPtr );
+
+  void    initBilateralFilterX86();
+  template <X86_VEXT vext>
+  void    _initBilateralFilterX86();
+#endif
+#endif
+
 };
 
 #endif
diff --git a/source/Lib/CommonLib/CommonDef.h b/source/Lib/CommonLib/CommonDef.h
index 7210ce71c7abed003447fe9083e549086a3dfcc7..7a26296aafdd575c98bcc7e5a92fffb67e651a08 100644
--- a/source/Lib/CommonLib/CommonDef.h
+++ b/source/Lib/CommonLib/CommonDef.h
@@ -477,11 +477,6 @@ static const int ALF_VB_POS_ABOVE_CTUROW_CHMA = 2;
 static const int MAX_ENCODER_DEBLOCKING_QUALITY_LAYERS =           8 ;
 #endif
 
-#if IDCC_TPM_JEM
-static const int USE_MORE_BLOCKSIZE_DEPTH_MAX = IDCC_TMP_MaxSize_Depth - 1;
-static const int INIT_THRESHOULD_SHIFTBITS = 2;  ///< (default 2) Early skip threshold for checking distance.
-#endif
-
 #if SHARP_LUMA_DELTA_QP
 static const uint32_t LUMA_LEVEL_TO_DQP_LUT_MAXSIZE =                1024; ///< max LUT size for QP offset based on luma
 
@@ -955,4 +950,12 @@ static const int MAX_FILTER_LENGTH_FIXED = 13;
 static const int FIX_FILTER_NUM_COEFF    = 42;
 #endif
 
+#if JVET_V0130_INTRA_TMP
+static const int TMP_TEMPLATE_SIZE =            4; // must be multiple of 4 for SIMD
+static const int TMP_MAXSIZE_DEPTH =            6; // log2 of the largest TMP block size (64), not of TMP_TEMPLATE_SIZE: keep as 6 to avoid any error
+static const int USE_MORE_BLOCKSIZE_DEPTH_MAX = TMP_MAXSIZE_DEPTH - 1;
+static const int INIT_THRESHOULD_SHIFTBITS =    2;  ///< (default 2) Early skip threshold for checking distance.
+static const int TMP_SEARCH_RANGE_MULT_FACTOR = 5;
+#endif
+
 #endif // end of #ifndef  __COMMONDEF__
diff --git a/source/Lib/CommonLib/ContextModelling.cpp b/source/Lib/CommonLib/ContextModelling.cpp
index 7855d636f03d74734a51f7f92acb5460e63cabfc..895e739d4da85b93e85c63642f9cebff728cca06 100644
--- a/source/Lib/CommonLib/ContextModelling.cpp
+++ b/source/Lib/CommonLib/ContextModelling.cpp
@@ -778,17 +778,17 @@ void MergeCtx::setMmvdMergeCandiInfo(PredictionUnit& pu, int candIdx)
   PU::restrictBiPredMergeCandsOne(pu);
 }
 
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
 unsigned DeriveCtx::CtxTmpFlag(const CodingUnit& cu)
 {
 	const CodingStructure* cs = cu.cs;
 	unsigned ctxId = 0;
 
 	const CodingUnit* cuLeft = cs->getCURestricted(cu.lumaPos().offset(-1, 0), cu, CH_L);
-	ctxId = (cuLeft && cuLeft->TmpFlag) ? 1 : 0;
+	ctxId = (cuLeft && cuLeft->tmpFlag) ? 1 : 0;
 
 	const CodingUnit* cuAbove = cs->getCURestricted(cu.lumaPos().offset(0, -1), cu, CH_L);
-	ctxId += (cuAbove && cuAbove->TmpFlag) ? 1 : 0;
+	ctxId += (cuAbove && cuAbove->tmpFlag) ? 1 : 0;
 
 	ctxId = (cu.lwidth() > 2 * cu.lheight() || cu.lheight() > 2 * cu.lwidth()) ? 3 : ctxId;
 
diff --git a/source/Lib/CommonLib/ContextModelling.h b/source/Lib/CommonLib/ContextModelling.h
index 6e3ad4616ecfce51ec1eacf7418eb2608fce7406..834e03af34a3c377352cdbcc5d6afa0f90870959 100644
--- a/source/Lib/CommonLib/ContextModelling.h
+++ b/source/Lib/CommonLib/ContextModelling.h
@@ -611,7 +611,7 @@ unsigned CtxAffineFlag( const CodingUnit& cu );
 unsigned CtxPredModeFlag( const CodingUnit& cu );
 unsigned CtxIBCFlag(const CodingUnit& cu);
 unsigned CtxMipFlag   ( const CodingUnit& cu );
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
 unsigned CtxTmpFlag(const CodingUnit& cu);
 #endif
 unsigned CtxPltCopyFlag( const unsigned prevRunType, const unsigned dist );
diff --git a/source/Lib/CommonLib/Contexts.cpp b/source/Lib/CommonLib/Contexts.cpp
index 5b61e1b97c98302457599341a7218b96a4771802..dfce0eb5fa9ac8c61ce60203397cadb392f9f0dd 100644
--- a/source/Lib/CommonLib/Contexts.cpp
+++ b/source/Lib/CommonLib/Contexts.cpp
@@ -1000,15 +1000,15 @@ const CtxSet ContextSetCfg::MipFlag = ContextSetCfg::addCtxSet
 	{ 9,  9,  8,  6 },
 	{ 10, 10,  9,  6 }
 });
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
 const CtxSet ContextSetCfg::TmpFlag = ContextSetCfg::addCtxSet
 ({
   {  CNU,  CNU,  CNU,  CNU, },
   {  CNU,  CNU,  CNU,  CNU, },
   {  CNU,  CNU,  CNU,  CNU, },
-  {   DWS,  DWS,   DWS,   DWS, },
-  {   DWS,  DWS,   DWS,   DWS, },
-  {   DWS,  DWS,   DWS,   DWS, },
+  {  DWS,  DWS,  DWS,  DWS, },
+  {  DWS,  DWS,  DWS,  DWS, },
+  {  DWS,  DWS,  DWS,  DWS, },
 	});
 #endif
 
@@ -2089,13 +2089,13 @@ const CtxSet ContextSetCfg::MipFlag = ContextSetCfg::addCtxSet
   {  33,  49,  50,  25, },
   {   9,  10,   9,   6, },
 });
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
 const CtxSet ContextSetCfg::TmpFlag = ContextSetCfg::addCtxSet
 ({
   {  CNU,  CNU,  CNU,  CNU, },
   {  CNU,  CNU,  CNU,  CNU, },
   {  CNU,  CNU,  CNU,  CNU, },
-  {   DWS,  DWS,   DWS,   DWS, },
+  {  DWS,  DWS,  DWS,  DWS, },
 	});
 #endif
 
diff --git a/source/Lib/CommonLib/Contexts.h b/source/Lib/CommonLib/Contexts.h
index 0506ecf1983adfc686a63adacecea0f47646e06f..8993e11bbff8dccf4930e823bf6bb63ff935030c 100644
--- a/source/Lib/CommonLib/Contexts.h
+++ b/source/Lib/CommonLib/Contexts.h
@@ -245,7 +245,7 @@ public:
   static const CtxSet   CclmModeIdx;
   static const CtxSet   IntraChromaPredMode;
   static const CtxSet   MipFlag;
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
   static const CtxSet   TmpFlag;
 #endif
 #if MMLM
diff --git a/source/Lib/CommonLib/IntraPrediction.cpp b/source/Lib/CommonLib/IntraPrediction.cpp
index 4df14069b127922b9e232a2631d6b7e8356ddf01..bc5c898ed0d09a8e270f0db2f8d5176041b8e4dc 100644
--- a/source/Lib/CommonLib/IntraPrediction.cpp
+++ b/source/Lib/CommonLib/IntraPrediction.cpp
@@ -686,7 +686,7 @@ void IntraPrediction::initPredIntraParams(const PredictionUnit & pu, const CompA
   if(   sps.getSpsRangeExtension().getIntraSmoothingDisabledFlag()
     || !isLuma( chType )
     || useISP
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
 	  || PU::isTmp(pu, chType)
 #endif
     || PU::isMIP( pu, chType )
@@ -1394,7 +1394,7 @@ void IntraPrediction::initIntraPatternChTypeISP(const CodingUnit& cu, const Comp
   }
 }
 
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
 bool IntraPrediction::isRefTemplateAvailable(CodingUnit& cu, CompArea& area)
 {
 	const ChannelType      chType = toChannelType(area.compID);
@@ -1402,14 +1402,12 @@ bool IntraPrediction::isRefTemplateAvailable(CodingUnit& cu, CompArea& area)
 	const SPS& sps = *cs.sps;
 	const PreCalcValues& pcv = *cs.pcv;
 
-
 	const int  tuWidth = area.width;
 	const int  tuHeight = area.height;
 	const int  predSize = m_topRefLength;
 	const int  predHSize = m_leftRefLength;
 	//const int predStride = predSize;
 
-
 	const int  unitWidth = pcv.minCUWidth >> getComponentScaleX(area.compID, sps.getChromaFormatIdc());
 	const int  unitHeight = pcv.minCUHeight >> getComponentScaleY(area.compID, sps.getChromaFormatIdc());
 
@@ -1421,8 +1419,10 @@ bool IntraPrediction::isRefTemplateAvailable(CodingUnit& cu, CompArea& area)
 	const int  numAboveRightUnits = totalAboveUnits - numAboveUnits;
 	const int  numLeftBelowUnits = totalLeftUnits - numLeftUnits;
 
-	if (numAboveUnits <= 0 || numLeftUnits <= 0 || numAboveRightUnits <= 0 || numLeftBelowUnits <= 0)
-		return false;
+  if( numAboveUnits <= 0 || numLeftUnits <= 0 || numAboveRightUnits <= 0 || numLeftBelowUnits <= 0 )
+  {
+    return false;
+  }
 
 	// ----- Step 1: analyze neighborhood -----
 	const Position posLT = area;
diff --git a/source/Lib/CommonLib/IntraPrediction.h b/source/Lib/CommonLib/IntraPrediction.h
index 5d7ae3173634dd702b00ccf6248c3021e7e5f51e..95a98729e8b67700dfec4de8d5f5f4ad4614de51 100644
--- a/source/Lib/CommonLib/IntraPrediction.h
+++ b/source/Lib/CommonLib/IntraPrediction.h
@@ -144,7 +144,7 @@ protected:
 
   void xPredIntraBDPCM            ( const CPelBuf &pSrc, PelBuf &pDst, const uint32_t dirMode, const ClpRng& clpRng );
   Pel  xGetPredValDc              ( const CPelBuf &pSrc, const Size &dstSize );
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
   bool isRefTemplateAvailable(CodingUnit& cu, CompArea& area);
 #endif
 
diff --git a/source/Lib/CommonLib/Slice.cpp b/source/Lib/CommonLib/Slice.cpp
index 94ef8a9abcb4733eb8b16024af158ac1cfdbf6a5..934d14c39ac50bc84650f081b54be54510cfe0e7 100644
--- a/source/Lib/CommonLib/Slice.cpp
+++ b/source/Lib/CommonLib/Slice.cpp
@@ -3113,9 +3113,9 @@ SPS::SPS()
 #if ENABLE_DIMD
 , m_dimd                      ( false )
 #endif
-#if IDCC_TPM_JEM
-, m_IntraTMP                  ( false )
-, m_IntraTMP_MaxSize          ( 64 )                             
+#if JVET_V0130_INTRA_TMP
+, m_intraTMP                  ( false )
+, m_intraTmpMaxSize           ( 64 )                             
 #endif
 #if ENABLE_OBMC
 , m_OBMC                      ( false )
diff --git a/source/Lib/CommonLib/Slice.h b/source/Lib/CommonLib/Slice.h
index ec74b504c951a6289b65f4852c511a86dc6f5efe..f53244b1dbbedad876acfc6f5385cf719faa4399 100644
--- a/source/Lib/CommonLib/Slice.h
+++ b/source/Lib/CommonLib/Slice.h
@@ -1651,9 +1651,9 @@ private:
 #if ENABLE_DIMD
   bool              m_dimd;
 #endif
-#if IDCC_TPM_JEM
-  bool              m_IntraTMP;                                       ///< intra Template Matching 
-  unsigned          m_IntraTMP_MaxSize;                               ///< max CU size for which intra TMP is allowed
+#if JVET_V0130_INTRA_TMP
+  bool              m_intraTMP;                                       ///< intra Template Matching 
+  unsigned          m_intraTmpMaxSize;                               ///< max CU size for which intra TMP is allowed
 #endif
 #if ENABLE_OBMC
   bool              m_OBMC;
@@ -2077,11 +2077,11 @@ void                    setCCALFEnabledFlag( bool b )
   void      setUseDimd         ( bool b )                                        { m_dimd = b; }
   bool      getUseDimd         ()                                      const     { return m_dimd; }
 #endif
-#if IDCC_TPM_JEM
-  void      setUseIntraTMP(bool b) { m_IntraTMP = b; }
-  bool      getUseIntraTMP() const { return m_IntraTMP; }
-  void      setIntraTMPMaxSize(unsigned n) { m_IntraTMP_MaxSize = n; }
-  unsigned  getIntraTMPMaxSize() const { return m_IntraTMP_MaxSize; }
+#if JVET_V0130_INTRA_TMP
+  void      setUseIntraTMP     (bool b)                                          { m_intraTMP = b; }
+  bool      getUseIntraTMP     ()                                      const     { return m_intraTMP; }
+  void      setIntraTMPMaxSize (unsigned n)                                      { m_intraTmpMaxSize = n; }
+  unsigned  getIntraTMPMaxSize ()                                      const     { return m_intraTmpMaxSize; }
 #endif
 #if ENABLE_OBMC
   void      setUseOBMC         ( bool b )                                        { m_OBMC = b; }
diff --git a/source/Lib/CommonLib/TrQuant.cpp b/source/Lib/CommonLib/TrQuant.cpp
index c74ab4c431e67373d95234b32e93db2c78fd177f..5889375a2e2f21adafaf099111dfe9810d04b936 100644
--- a/source/Lib/CommonLib/TrQuant.cpp
+++ b/source/Lib/CommonLib/TrQuant.cpp
@@ -55,16 +55,14 @@
 #include "CommonLib/CodingStatistics.h"
 #endif
 
-#if IDCC_TMP_SIMD
+#if ENABLE_SIMD_TMP
 #include "CommonDefX86.h"
 #endif
 
-#if IDCC_TPM_JEM
-
+#if JVET_V0130_INTRA_TMP
 unsigned int g_uiDepth2Width[5] = { 4, 8, 16, 32, 64 };
 #endif
 
-
 struct coeffGroupRDStats
 {
   int    iNNZbeforePos0;
@@ -197,8 +195,8 @@ TrQuant::TrQuant() : m_quant( nullptr )
     m_fwdICT[-2]  = fwdTransformCbCr<-2>;
     m_fwdICT[ 3]  = fwdTransformCbCr< 3>;
     m_fwdICT[-3]  = fwdTransformCbCr<-3>;
-#if IDCC_TPM_JEM
-	m_pppTarPatch = NULL;
+#if JVET_V0130_INTRA_TMP
+	  m_pppTarPatch = NULL;
 #endif
   }
 }
@@ -210,17 +208,15 @@ TrQuant::~TrQuant()
     delete m_quant;
     m_quant = nullptr;
   }
-#if IDCC_TPM_JEM
-#endif
 
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
   if (m_pppTarPatch != NULL)
   {
 	  for (unsigned int uiDepth = 0; uiDepth < USE_MORE_BLOCKSIZE_DEPTH_MAX; uiDepth++)
 	  {
 		  unsigned int blkSize = g_uiDepth2Width[uiDepth];
 
-		  unsigned int patchSize = blkSize + IDCC_TemplateSize;
+		  unsigned int patchSize = blkSize + TMP_TEMPLATE_SIZE;
 		  for (unsigned int uiRow = 0; uiRow < patchSize; uiRow++)
 		  {
 			  if (m_pppTarPatch[uiDepth][uiRow] != NULL)
@@ -275,7 +271,7 @@ void TrQuant::init( const Quant* otherQuant,
   }
 
 
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
   unsigned int blkSize;
   
   if (m_pppTarPatch == NULL)
@@ -285,7 +281,7 @@ void TrQuant::init( const Quant* otherQuant,
 	  {
 		  blkSize = g_uiDepth2Width[uiDepth];
 
-		  unsigned int patchSize = blkSize + IDCC_TemplateSize;
+		  unsigned int patchSize = blkSize + TMP_TEMPLATE_SIZE;
 		  m_pppTarPatch[uiDepth] = new Pel * [patchSize];
 		  for (unsigned int uiRow = 0; uiRow < patchSize; uiRow++)
 		  {
@@ -328,6 +324,9 @@ void TrQuant::init( const Quant* otherQuant,
 #if ENABLE_SIMD_SIGN_PREDICTION
   m_computeSAD = xComputeSAD;
 #endif
+#if JVET_V0130_INTRA_TMP
+  m_calcTemplateDiff = calcTemplateDiff; // default implementation; guard matches the JVET_V0130_INTRA_TMP call sites so the pointer is always set
+#endif
 
 #if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT
 #ifdef TARGET_SIMD_X86
@@ -435,32 +434,30 @@ void TrQuant::invLfnstNxN( int* src, int* dst, const uint32_t mode, const uint32
   }
 }
 
-#if IDCC_TPM_JEM
-void insertNode(DistType diff, int& iXOffset, int& iYOffset, DistType& pDiff, int& pX, int& pY, short& pId, unsigned int& setId)
+#if JVET_V0130_INTRA_TMP
+void insertNode(int diff, int& iXOffset, int& iYOffset, int& pDiff, int& pX, int& pY, short& pId, unsigned int& setId)
 {
 	pDiff = diff;
 	pX = iXOffset;
 	pY = iYOffset;
 	pId = setId;
 }
-#if IDCC_TPM_JEM
+
 void clipMvIntraConstraint(CodingUnit* pcCU, int regionId, int& iHorMin, int& iHorMax, int& iVerMin, int& iVerMax, unsigned int uiTemplateSize, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int iCurrY, int iCurrX, int offsetLCUY, int offsetLCUX)
 {
-	int SearchRange_Height, SearchRange_Width;
-	
-	SearchRange_Width = IDCC_SearchRangeMultFactor * uiBlkWidth;
-	SearchRange_Height = IDCC_SearchRangeMultFactor * uiBlkHeight;
-	int  iMvShift = 0;
+	int searchRangeWidth  = TMP_SEARCH_RANGE_MULT_FACTOR * uiBlkWidth;
+	int searchRangeHeight = TMP_SEARCH_RANGE_MULT_FACTOR * uiBlkHeight;
+	int iMvShift = 0;
 	int iTemplateSize = uiTemplateSize;
 	int iBlkWidth = uiBlkWidth;
 	int iBlkHeight = uiBlkHeight;
 	if (regionId == 0) //above outside LCU
 	{
-		iHorMax = std::min((iCurrX + SearchRange_Width) << iMvShift, (int)((pcCU->cs->sps->getMaxPicWidthInLumaSamples() - iBlkWidth) << iMvShift));
-		iHorMin = std::max((iTemplateSize) << iMvShift, (iCurrX - SearchRange_Width) << iMvShift);
+		iHorMax = std::min((iCurrX + searchRangeWidth) << iMvShift, (int)((pcCU->cs->sps->getMaxPicWidthInLumaSamples() - iBlkWidth) << iMvShift));
+		iHorMin = std::max((iTemplateSize) << iMvShift, (iCurrX - searchRangeWidth) << iMvShift);
 
 		iVerMax = (iCurrY - iBlkHeight - offsetLCUY) << iMvShift;
-		iVerMin = std::max(((iTemplateSize) << iMvShift), ((iCurrY - SearchRange_Height) << iMvShift));
+		iVerMin = std::max(((iTemplateSize) << iMvShift), ((iCurrY - searchRangeHeight) << iMvShift));
 
 		iHorMin = iHorMin - iCurrX;
 		iHorMax = iHorMax - iCurrX;
@@ -470,7 +467,7 @@ void clipMvIntraConstraint(CodingUnit* pcCU, int regionId, int& iHorMin, int& iH
 	else if (regionId == 1) //left outside LCU
 	{
 		iHorMax = (iCurrX - offsetLCUX - iBlkWidth) << iMvShift;
-		iHorMin = std::max((iTemplateSize) << iMvShift, (iCurrX - SearchRange_Width) << iMvShift);
+		iHorMin = std::max((iTemplateSize) << iMvShift, (iCurrX - searchRangeWidth) << iMvShift);
 
 		iVerMin = std::max((iTemplateSize) << iMvShift, (iCurrY - iBlkHeight - offsetLCUY) << iMvShift);
 		iVerMax = (iCurrY) << iMvShift;
@@ -482,7 +479,7 @@ void clipMvIntraConstraint(CodingUnit* pcCU, int regionId, int& iHorMin, int& iH
 	}
 	else if (regionId == 2) //left outside LCU (can reach the bottom row of LCU)
 	{
-		iHorMin = std::max((iTemplateSize) << iMvShift, (iCurrX - SearchRange_Width) << iMvShift);
+		iHorMin = std::max((iTemplateSize) << iMvShift, (iCurrX - searchRangeWidth) << iMvShift);
 		iHorMax = (iCurrX - offsetLCUX - iBlkWidth) << iMvShift;
 		iVerMin = (iCurrY + 1) << iMvShift;
 		iVerMax = std::min(pcCU->cs->sps->getMaxPicHeightInLumaSamples() - iBlkHeight, (iCurrY - offsetLCUY + pcCU->cs->sps->getCTUSize() - iBlkHeight) << iMvShift);
@@ -493,10 +490,7 @@ void clipMvIntraConstraint(CodingUnit* pcCU, int regionId, int& iHorMin, int& iH
 		iVerMin = iVerMin - iCurrY;
 	}
 }
-#endif
-#endif
 
-#if IDCC_TPM_JEM
 TempLibFast::TempLibFast()
 {
 }
@@ -504,12 +498,10 @@ TempLibFast::TempLibFast()
 TempLibFast::~TempLibFast()
 {
 }
-#endif
 
-#if IDCC_TPM_JEM
 void TempLibFast::initTemplateDiff(unsigned int uiPatchWidth, unsigned int uiPatchHeight, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int bitDepth)
 {
-	DistType maxValue = ((1 << bitDepth) >> (INIT_THRESHOULD_SHIFTBITS)) * (uiPatchHeight * uiPatchWidth - uiBlkHeight * uiBlkWidth);
+	int maxValue = ((1 << bitDepth) >> (INIT_THRESHOULD_SHIFTBITS)) * (uiPatchHeight * uiPatchWidth - uiBlkHeight * uiBlkWidth);
 	m_diffMax = maxValue;
 	{
 		m_pDiff = maxValue;
@@ -519,8 +511,8 @@ void TempLibFast::initTemplateDiff(unsigned int uiPatchWidth, unsigned int uiPat
 void TrQuant::getTargetTemplate(CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight)
 {
 	const ComponentID compID = COMPONENT_Y;
-	unsigned int uiPatchWidth = uiBlkWidth + IDCC_TemplateSize;
-	unsigned int uiPatchHeight = uiBlkHeight + IDCC_TemplateSize;
+	unsigned int uiPatchWidth = uiBlkWidth + TMP_TEMPLATE_SIZE;
+	unsigned int uiPatchHeight = uiBlkHeight + TMP_TEMPLATE_SIZE;
 	unsigned int uiTarDepth = floorLog2(std::max(uiBlkHeight, uiBlkWidth)) - 2;
 	Pel** tarPatch = m_pppTarPatch[uiTarDepth];
 	CompArea area = pcCU->blocks[compID];
@@ -528,13 +520,11 @@ void TrQuant::getTargetTemplate(CodingUnit* pcCU, unsigned int uiBlkWidth, unsig
 	unsigned int  uiPicStride = pcCU->cs->picture->getRecoBuf(compID).stride;
 	unsigned int uiY, uiX;
 
-
-
 	//fill template
 	//up-left & up 
 	Pel* tarTemp;
-	Pel* pCurrTemp = pCurrStart - IDCC_TemplateSize * uiPicStride - IDCC_TemplateSize;
-	for (uiY = 0; uiY < IDCC_TemplateSize; uiY++)
+	Pel* pCurrTemp = pCurrStart - TMP_TEMPLATE_SIZE * uiPicStride - TMP_TEMPLATE_SIZE;
+	for (uiY = 0; uiY < TMP_TEMPLATE_SIZE; uiY++)
 	{
 		tarTemp = tarPatch[uiY]; 
 		for (uiX = 0; uiX < uiPatchWidth; uiX++)
@@ -544,10 +534,10 @@ void TrQuant::getTargetTemplate(CodingUnit* pcCU, unsigned int uiBlkWidth, unsig
 		pCurrTemp += uiPicStride;
 	}
 	//left
-	for (uiY = IDCC_TemplateSize; uiY < uiPatchHeight; uiY++)
+	for (uiY = TMP_TEMPLATE_SIZE; uiY < uiPatchHeight; uiY++)
 	{
 		tarTemp = tarPatch[uiY];
-		for (uiX = 0; uiX < IDCC_TemplateSize; uiX++)
+		for (uiX = 0; uiX < TMP_TEMPLATE_SIZE; uiX++)
 		{
 			tarTemp[uiX] = pCurrTemp[uiX];
 		}
@@ -559,8 +549,8 @@ void TrQuant::candidateSearchIntra(CodingUnit* pcCU, unsigned int uiBlkWidth, un
 {
 	const ComponentID compID = COMPONENT_Y;
 	const int channelBitDepth = pcCU->cs->sps->getBitDepth(toChannelType(compID));
-	unsigned int uiPatchWidth = uiBlkWidth + IDCC_TemplateSize;
-	unsigned int uiPatchHeight = uiBlkHeight + IDCC_TemplateSize;
+	unsigned int uiPatchWidth = uiBlkWidth + TMP_TEMPLATE_SIZE;
+	unsigned int uiPatchHeight = uiBlkHeight + TMP_TEMPLATE_SIZE;
 	unsigned int uiTarDepth = floorLog2(std::max(uiBlkWidth, uiBlkHeight)) - 2;
 	Pel** tarPatch = getTargetPatch(uiTarDepth);
 	//Initialize the library for saving the best candidates
@@ -568,25 +558,29 @@ void TrQuant::candidateSearchIntra(CodingUnit* pcCU, unsigned int uiBlkWidth, un
 	short setId = 0; //record the reference picture.
 	searchCandidateFromOnePicIntra(pcCU, tarPatch, uiPatchWidth, uiPatchHeight, setId);
 	//count collected candidate number
-	DistType pDiff = m_tempLibFast.getDiff();
-	DistType maxDiff = m_tempLibFast.getDiffMax();
+	int pDiff = m_tempLibFast.getDiff();
+	int maxDiff = m_tempLibFast.getDiffMax();
 	
 
-	if (pDiff < maxDiff)
-		m_uiVaildCandiNum = 1;
-	else
-		m_uiVaildCandiNum = 0;
+  if( pDiff < maxDiff )
+  {
+    m_uiVaildCandiNum = 1;
+  }
+  else
+  {
+    m_uiVaildCandiNum = 0;
+  }
 }
 
 void  TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, unsigned int setId)
 {
 	const ComponentID compID = COMPONENT_Y;
-	unsigned int uiBlkWidth = uiPatchWidth - IDCC_TemplateSize;
-	unsigned int uiBlkHeight = uiPatchHeight - IDCC_TemplateSize;
+	unsigned int uiBlkWidth = uiPatchWidth - TMP_TEMPLATE_SIZE;
+	unsigned int uiBlkHeight = uiPatchHeight - TMP_TEMPLATE_SIZE;
 
 	int pX = m_tempLibFast.getX();
 	int pY = m_tempLibFast.getY();
-	DistType pDiff = m_tempLibFast.getDiff();
+	int pDiff = m_tempLibFast.getDiff();
 	short pId = m_tempLibFast.getId();
 	CompArea area = pcCU->blocks[compID];
 	int  refStride = pcCU->cs->picture->getRecoBuf(compID).stride;
@@ -594,9 +588,7 @@ void  TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch,
 	Pel* ref = pcCU->cs->picture->getRecoBuf(area).buf;
 	
 	setRefPicUsed(ref); //facilitate the access of each candidate point 
-	
 	setStride(refStride);
-
 	
 	Mv cTmpMvPred;
 	cTmpMvPred.setZero();
@@ -614,27 +606,23 @@ void  TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch,
 
 
 	int iYOffset, iXOffset;
-	DistType diff;
+	int diff;
 	Pel* refCurr;
 
-
-#define REGION_NUM 3
-	int mvYMins[REGION_NUM];
-	int mvYMaxs[REGION_NUM];
-	int mvXMins[REGION_NUM];
-	int mvXMaxs[REGION_NUM];
-	int regionNum = REGION_NUM;
+	const int regionNum = 3;
+	int mvYMins[regionNum];
+	int mvYMaxs[regionNum];
+	int mvXMins[regionNum];
+	int mvXMaxs[regionNum];
 	int regionId = 0;
 
-
 	//1. check the near pixels within LCU
 	//above pixels in LCU
-	int iTemplateSize = IDCC_TemplateSize;
+	int iTemplateSize = TMP_TEMPLATE_SIZE;
 	int iBlkWidth = uiBlkWidth;
 	int iBlkHeight = uiBlkHeight;
 	regionId = 0;
 	int iMvShift = 0;
-	
 
 	int iVerMin = std::max(((iTemplateSize) << iMvShift), (iCurrY - offsetLCUY - iBlkHeight + 1) << iMvShift);
 	int iVerMax = (iCurrY - iBlkHeight) << iMvShift; 
@@ -646,8 +634,6 @@ void  TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch,
 	mvYMins[regionId] = iVerMin - iCurrY;
 	mvYMaxs[regionId] = iVerMax - iCurrY;
 
-
-
 	//check within CTU pixels
 	for (regionId = 0; regionId < 1; regionId++)
 	{
@@ -659,12 +645,13 @@ void  TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch,
 		{
 			continue;
 		}
+
 		for (iYOffset = mvYMax; iYOffset >= mvYMin; iYOffset--)
 		{
 			for (iXOffset = mvXMax; iXOffset >= mvXMin; iXOffset--)
 			{
 				refCurr = ref + iYOffset * refStride + iXOffset;
-				diff = calcTemplateDiff(refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, pDiff);
+				diff = m_calcTemplateDiff(refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, pDiff);
 				if (diff < (pDiff))
 				{
 					insertNode(diff, iXOffset, iYOffset, pDiff, pX, pY, pId, setId); 
@@ -680,8 +667,9 @@ void  TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch,
 	//2. check the pixels outside CTU
 	for (regionId = 0; regionId < regionNum; regionId++)
 	{// this function fills in the range the template matching for pixels outside the current CTU
-		clipMvIntraConstraint(pcCU, regionId, mvXMins[regionId], mvXMaxs[regionId], mvYMins[regionId], mvYMaxs[regionId], IDCC_TemplateSize, uiBlkWidth, uiBlkHeight, iCurrY, iCurrX, offsetLCUY, offsetLCUX);
+		clipMvIntraConstraint(pcCU, regionId, mvXMins[regionId], mvXMaxs[regionId], mvYMins[regionId], mvYMaxs[regionId], TMP_TEMPLATE_SIZE, uiBlkWidth, uiBlkHeight, iCurrY, iCurrX, offsetLCUY, offsetLCUX);
 	}
+
 	for (regionId = 0; regionId < regionNum; regionId++)
 	{
 		int mvYMin = mvYMins[regionId];
@@ -697,11 +685,13 @@ void  TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch,
 			for (iXOffset = mvXMax; iXOffset >= mvXMin; iXOffset--)
 			{
 				refCurr = ref + iYOffset * refStride + iXOffset;
-				diff = calcTemplateDiff(refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, pDiff);
+				diff = m_calcTemplateDiff(refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, pDiff);
+
 				if (diff < (pDiff))
 				{
 					insertNode(diff, iXOffset, iYOffset, pDiff, pX, pY, pId, setId);
 				}
+
         if (pDiff == 0)
         {
           regionId = regionNum;
@@ -709,6 +699,7 @@ void  TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch,
 			}
 		}
 	}
+
 	m_tempLibFast.m_pX = pX;
 	m_tempLibFast.m_pY = pY;
 	m_tempLibFast.m_pDiff = pDiff;
@@ -717,8 +708,8 @@ void  TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch,
 bool TrQuant::generateTMPrediction(Pel* piPred, unsigned int uiStride, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int& foundCandiNum)
 {
 	bool bSucceedFlag = true;
-	unsigned int uiPatchWidth = uiBlkWidth + IDCC_TemplateSize;
-	unsigned int uiPatchHeight = uiBlkHeight + IDCC_TemplateSize;
+	unsigned int uiPatchWidth = uiBlkWidth + TMP_TEMPLATE_SIZE;
+	unsigned int uiPatchHeight = uiBlkHeight + TMP_TEMPLATE_SIZE;
 
 	foundCandiNum = m_uiVaildCandiNum;
 	if (foundCandiNum < 1)
@@ -732,8 +723,8 @@ bool TrQuant::generateTMPrediction(Pel* piPred, unsigned int uiStride, unsigned
 	int picStride = getStride();
 	int iOffsetY, iOffsetX;
 	Pel* refTarget;
-	unsigned int uiHeight = uiPatchHeight - IDCC_TemplateSize;
-	unsigned int uiWidth = uiPatchWidth - IDCC_TemplateSize;
+	unsigned int uiHeight = uiPatchHeight - TMP_TEMPLATE_SIZE;
+	unsigned int uiWidth = uiPatchWidth - TMP_TEMPLATE_SIZE;
 
 	//the data center: we use the prediction block as the center now.
 	//collect the candidates
@@ -755,128 +746,46 @@ bool TrQuant::generateTMPrediction(Pel* piPred, unsigned int uiStride, unsigned
 	return bSucceedFlag;
 }
 
-DistType TrQuant::calcTemplateDiff(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, DistType iMax)
+int TrQuant::calcTemplateDiff(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax)
 {
-	DistType iDiffSum = 0;
-	int iY;
-	Pel* refPatchRow = ref - IDCC_TemplateSize * uiStride - IDCC_TemplateSize;
-	Pel* tarPatchRow;
-
-	uint32_t uiSum;
-	// horizontal difference
-	for (iY = 0; iY < IDCC_TemplateSize; iY++)
-	{
-		tarPatchRow = tarPatch[iY];
-		const short* pSrc1 = (const short*)tarPatchRow;
-		const short* pSrc2 = (const short*)refPatchRow;
-
-		// SIMD difference
-		//int  iRows = uiPatchHeight;
-		int  iCols = uiPatchWidth;
-		if ((iCols & 7) == 0)
-		{
-			// Do with step of 8
-			__m128i vzero = _mm_setzero_si128();
-			__m128i vsum32 = vzero;
-			//for (int iY = 0; iY < iRows; iY += iSubStep)
-			{
-				__m128i vsum16 = vzero;
-				for (int iX = 0; iX < iCols; iX += 8)
-				{
-					__m128i vsrc1 = _mm_loadu_si128((const __m128i*)(&pSrc1[iX]));
-					__m128i vsrc2 = _mm_lddqu_si128((const __m128i*)(&pSrc2[iX]));
-					vsum16 = _mm_add_epi16(vsum16, _mm_abs_epi16(_mm_sub_epi16(vsrc1, vsrc2)));
-				}
-				__m128i vsumtemp = _mm_add_epi32(_mm_unpacklo_epi16(vsum16, vzero), _mm_unpackhi_epi16(vsum16, vzero));
-				vsum32 = _mm_add_epi32(vsum32, vsumtemp);
-				//pSrc1 += iStrideSrc1;
-				//pSrc2 += iStrideSrc2;
-			}
-			vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e));   // 01001110
-			vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1));   // 10110001
-			uiSum = _mm_cvtsi128_si32(vsum32);
-		}
-		else
-		{
-			// Do with step of 4
-			__m128i vzero = _mm_setzero_si128();
-			__m128i vsum32 = vzero;
-			//for (int iY = 0; iY < iRows; iY += iSubStep)
-			{
-				__m128i vsum16 = vzero;
-				for (int iX = 0; iX < iCols; iX += 4)
-				{
-					__m128i vsrc1 = _mm_loadl_epi64((const __m128i*) & pSrc1[iX]);
-					__m128i vsrc2 = _mm_loadl_epi64((const __m128i*) & pSrc2[iX]);
-					vsum16 = _mm_add_epi16(vsum16, _mm_abs_epi16(_mm_sub_epi16(vsrc1, vsrc2)));
-				}
-				__m128i vsumtemp = _mm_add_epi32(_mm_unpacklo_epi16(vsum16, vzero), _mm_unpackhi_epi16(vsum16, vzero));
-				vsum32 = _mm_add_epi32(vsum32, vsumtemp);
-				//pSrc1 += iStrideSrc1;
-				//pSrc2 += iStrideSrc2;
-			}
-			vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e));   // 01001110
-			vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1));   // 10110001
-			uiSum = _mm_cvtsi128_si32(vsum32);
-		}
-		iDiffSum += uiSum;
-
-		if (iDiffSum > iMax) //for speeding up
-		{
-			return iDiffSum;
-		}
-		// update location
-		refPatchRow += uiStride;
-	}
+  int iDiffSum = 0;
+  Pel* refPatchRow = ref - TMP_TEMPLATE_SIZE * uiStride - TMP_TEMPLATE_SIZE;
+  Pel* tarPatchRow;
 
-	// vertical difference
-	int  iCols = IDCC_TemplateSize;
-	for (iY = IDCC_TemplateSize; iY < uiPatchHeight; iY++)
-	{
-		tarPatchRow = tarPatch[iY];
-		const short* pSrc1 = (const short*)tarPatchRow;
-		const short* pSrc2 = (const short*)refPatchRow ;
-
-		// SIMD difference
-
-		// Do with step of 4
-		__m128i vzero = _mm_setzero_si128();
-		__m128i vsum32 = vzero;
-		//for (int iY = 0; iY < iRows; iY += iSubStep)
-		{
-			__m128i vsum16 = vzero;
-			for (int iX = 0; iX < iCols; iX += 4)
-			{
-				__m128i vsrc1 = _mm_loadl_epi64((const __m128i*) & pSrc1[iX]);
-				__m128i vsrc2 = _mm_loadl_epi64((const __m128i*) & pSrc2[iX]);
-				vsum16 = _mm_add_epi16(vsum16, _mm_abs_epi16(_mm_sub_epi16(vsrc1, vsrc2)));
-			}
-			__m128i vsumtemp = _mm_add_epi32(_mm_unpacklo_epi16(vsum16, vzero), _mm_unpackhi_epi16(vsum16, vzero));
-			vsum32 = _mm_add_epi32(vsum32, vsumtemp);
-			//pSrc1 += iStrideSrc1;
-			//pSrc2 += iStrideSrc2;
-		}
-		vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e));   // 01001110
-		vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1));   // 10110001
-		uiSum = _mm_cvtsi128_si32(vsum32);
+  // horizontal difference
+  for( int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++ )
+  {
+    tarPatchRow = tarPatch[iY];
+    for( int iX = 0; iX < uiPatchWidth; iX++ )
+    {
+      iDiffSum += abs( refPatchRow[iX] - tarPatchRow[iX] );
+    }
+    if( iDiffSum > iMax ) //for speeding up
+    {
+      return iDiffSum;
+    }
+    refPatchRow += uiStride;
+  }
 
-		iDiffSum += uiSum;
+  // vertical difference
+  for( int iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++ )
+  {
+    tarPatchRow = tarPatch[iY];
+    for( int iX = 0; iX < TMP_TEMPLATE_SIZE; iX++ )
+    {
+      iDiffSum += abs( refPatchRow[iX] - tarPatchRow[iX] );
+    }
+    if( iDiffSum > iMax ) //for speeding up
+    {
+      return iDiffSum;
+    }
+    refPatchRow += uiStride;
+  }
 
-		if (iDiffSum > iMax) //for speeding up
-		{
-			return iDiffSum;
-		}
-		// update location
-		refPatchRow += uiStride;
-	}
-	
-	return iDiffSum;
-	
+  return iDiffSum;
 }
 #endif
 
-
-
 uint32_t TrQuant::getLFNSTIntraMode( int wideAngPredMode )
 {
   uint32_t intraMode;
@@ -930,11 +839,11 @@ void TrQuant::xInvLfnst( const TransformUnit &tu, const ComponentID compID )
     {
       intraMode = PLANAR_IDX;
     }
-#if IDCC_TPM_JEM
-	if (PU::isTmp(*tu.cs->getPU(area.pos(), toChannelType(compID)), toChannelType(compID)))
-	{
-		intraMode = PLANAR_IDX;
-  }
+#if JVET_V0130_INTRA_TMP
+    if( PU::isTmp( *tu.cs->getPU( area.pos(), toChannelType( compID ) ), toChannelType( compID ) ) )
+    {
+      intraMode = PLANAR_IDX;
+    }
 #endif
     CHECK( intraMode >= NUM_INTRA_MODE - 1, "Invalid intra mode" );
 
@@ -1076,11 +985,11 @@ void TrQuant::xFwdLfnst( const TransformUnit &tu, const ComponentID compID, cons
     {
       intraMode = PLANAR_IDX;
     }
-#if IDCC_TPM_JEM
-	if (PU::isTmp(*tu.cs->getPU(area.pos(), toChannelType(compID)), toChannelType(compID)))
-	{
-		intraMode = PLANAR_IDX;
-  }
+#if JVET_V0130_INTRA_TMP
+    if( PU::isTmp( *tu.cs->getPU( area.pos(), toChannelType( compID ) ), toChannelType( compID ) ) )
+    {
+      intraMode = PLANAR_IDX;
+    }
 #endif
     CHECK( intraMode >= NUM_INTRA_MODE - 1, "Invalid intra mode" );
 
@@ -1319,8 +1228,8 @@ void TrQuant::getTrTypes(const TransformUnit tu, const ComponentID compID, int &
     return;
   }
 
-#if IDCC_TPM_JEM
-  if (isImplicitMTS || isISP || tu.cu->TmpFlag)
+#if JVET_V0130_INTRA_TMP
+  if (isImplicitMTS || isISP || tu.cu->tmpFlag)
 #else
   if (isImplicitMTS || isISP)
 #endif
diff --git a/source/Lib/CommonLib/TrQuant.h b/source/Lib/CommonLib/TrQuant.h
index f116d83e256f8ab45ccf75ef0296a4a6ca14bcf1..8556825e4e00241ab087347b8a5609876dfea999 100644
--- a/source/Lib/CommonLib/TrQuant.h
+++ b/source/Lib/CommonLib/TrQuant.h
@@ -57,10 +57,7 @@ typedef void InvTrans(const TCoeff*, TCoeff*, int, int, int, int, const TCoeff,
 
 
 
-#if IDCC_TPM_JEM
-
-
-#define MAX_1DTRANS_LEN         (1 << (((USE_MORE_BLOCKSIZE_DEPTH_MAX) + 1) << 1)) ///< 4x4 = 16, 8x8 = 64, 16x16=256, 32x32 = 1024
+#if JVET_V0130_INTRA_TMP
 extern unsigned int g_uiDepth2Width[5];
 extern unsigned int g_uiDepth2MaxCandiNum[5];
 
@@ -71,13 +68,13 @@ public:
 	int m_pY;    //offset Y
 	int m_pXInteger;    //offset X for integer pixel search
 	int m_pYInteger;    //offset Y for integer pixel search
-	DistType m_pDiffInteger;
+	int m_pDiffInteger;
 	int getXInteger() { return m_pXInteger; }
 	int getYInteger() { return m_pYInteger; }
-	DistType getDiffInteger() { return m_pDiffInteger; }
+	int getDiffInteger() { return m_pDiffInteger; }
 	short m_pIdInteger; //frame id
 	short getIdInteger() { return m_pIdInteger; }
-	DistType m_pDiff; //mse
+	int m_pDiff; //mse
 	short m_pId; //frame id
 	
 
@@ -86,7 +83,7 @@ public:
 	//void init();
 	int getX() { return m_pX; }
 	int getY() { return m_pY; }
-	DistType getDiff() { return m_pDiff; }
+	int getDiff() { return m_pDiff; }
 	short getId() { return m_pId; }
 	/*void initDiff(unsigned int uiPatchSize, int bitDepth);
 	void initDiff(unsigned int uiPatchSize, int bitDepth, int iCandiNumber);*/
@@ -132,13 +129,14 @@ public:
   void fwdLfnstNxN( int* src, int* dst, const uint32_t mode, const uint32_t index, const uint32_t size, int zeroOutSize );
   void invLfnstNxN( int* src, int* dst, const uint32_t mode, const uint32_t index, const uint32_t size, int zeroOutSize );
 #endif
-#if IDCC_TPM_JEM
-  DistType calcTemplateDiff(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, DistType iMax);
-  Pel** getTargetPatch(unsigned int uiDepth) { return m_pppTarPatch[uiDepth]; }
-  Pel* getRefPicUsed() { return m_refPicUsed; }
-  void setRefPicUsed(Pel* ref) { m_refPicUsed = ref; }
-  unsigned int getStride() { return m_uiPicStride; }
-  void setStride(unsigned int uiPicStride) { m_uiPicStride = uiPicStride; }
+#if JVET_V0130_INTRA_TMP
+  int ( *m_calcTemplateDiff )(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax);
+  static int calcTemplateDiff(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax);
+  Pel** getTargetPatch(unsigned int uiDepth)       { return m_pppTarPatch[uiDepth]; }
+  Pel* getRefPicUsed()                             { return m_refPicUsed; }
+  void setRefPicUsed(Pel* ref)                     { m_refPicUsed = ref; }
+  unsigned int getStride()                         { return m_uiPicStride; }
+  void         setStride(unsigned int uiPicStride) { m_uiPicStride = uiPicStride; }
 
   void searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, unsigned int setId);
   void candidateSearchIntra(CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight);
@@ -199,14 +197,14 @@ public:
 
 protected:
   TCoeff   m_tempCoeff[MAX_TB_SIZEY * MAX_TB_SIZEY];
-#if IDCC_TPM_JEM
-  int m_uiPartLibSize;
-  TempLibFast m_tempLibFast;
-  Pel* m_refPicUsed;
-  Picture* m_refPicBuf;
+#if JVET_V0130_INTRA_TMP
+  int          m_uiPartLibSize;
+  TempLibFast  m_tempLibFast;
+  Pel*         m_refPicUsed;
+  Picture*     m_refPicBuf;
   unsigned int m_uiPicStride;
   unsigned int m_uiVaildCandiNum;
-  Pel*** m_pppTarPatch;
+  Pel***       m_pppTarPatch;
 #endif
 #if SIGN_PREDICTION
   Pel      m_tempSignPredResid[SIGN_PRED_MAX_BS * SIGN_PRED_MAX_BS * 2]{0};
@@ -282,7 +280,7 @@ private:
   static void fastInverseTransform_SIMD( const TCoeff *coeff, TCoeff *block, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum );
 #endif
 
-#if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT
+#if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT || ENABLE_SIMD_TMP
 #ifdef TARGET_SIMD_X86
   void    initTrQuantX86();
   template <X86_VEXT vext>
diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h
index 67363bc713e5b5816e9dff58f580e9e4cea20e5a..9cb01a37576d9d26be8fbad57ed503f2b3f6a32d 100644
--- a/source/Lib/CommonLib/TypeDef.h
+++ b/source/Lib/CommonLib/TypeDef.h
@@ -1,4 +1,4 @@
-/* The copyright in this software is being made available under the BSD
+/* The copyright in this software is being made available under the BSD
  * License, included below. This software may be subject to other third party
  * and contributor rights, including patent rights, and no such rights are
  * granted under this license.
@@ -86,25 +86,6 @@
 #define INTRA_RM_SMALL_BLOCK_SIZE_CONSTRAINTS             1 // Enable 2xN and Nx2 block by removing SCIPU constraints
 #define CCLM_LATENCY_RESTRICTION_RMV                      1 // remove the latency between luma and chroma restriction of CCLM
 #define LMS_LINEAR_MODEL                                  1 // LMS for parameters derivation of CCLM and MMLM mode, Remove constraint in derivation of neighbouring samples
-#define IDCC_TPM_JEM									  1 // template matching prediction as implemented in JEM-7.2
-
-#if IDCC_TPM_JEM
-
-#define IDCC_TMP_SIMD									1
-
-#define IDCC_SearchRangeMultFactor						5
-
-#if IDCC_TMP_SIMD
-#define IDCC_TemplateSize								4 // must be multiple of 4 for SIMD
-#else
-#define IDCC_TemplateSize								4
-#endif
-
-#define IDCC_TMP_MaxSize_Depth							6 // should be log2(IDCC_TMP_MaxSize): keep as 6 to avoid any error
-
-typedef       int             DistType;
-#endif
-
 
 //-- inter
 #define CIIP_RM_BLOCK_SIZE_CONSTRAINTS                    1 // Remove the 64x64 restriction and enable 8x4/4x8 block for CIIP
@@ -129,6 +110,7 @@ typedef       int             DistType;
 #define SECONDARY_MPM                                     1 // Primary MPM and Secondary MPM: Add neighbouring modes into MPMs from positions AR, BL, AL, derived modes
 #define ENABLE_DIMD                                       1 // Decoder side intra mode derivation
 #define JVET_V0087_DIMD_NO_ISP                            ENABLE_DIMD // disallow combination of DIMD and ISP
+#define JVET_V0130_INTRA_TMP                              1 // JVET-V0130: template matching prediction
 
 // Inter
 #define CIIP_PDPC                                         1 // apply pdpc to megre prediction as a new CIIP mode (CIIP_PDPC) additional to CIIP mode
@@ -167,6 +149,12 @@ typedef       int             DistType;
 #if SIGN_PREDICTION
 #define ENABLE_SIMD_SIGN_PREDICTION                       1
 #endif
+#if JVET_V0130_INTRA_TMP
+#define ENABLE_SIMD_TMP									                  1
+#endif
+#if JVET_V0094_BILATERAL_FILTER
+#define ENABLE_SIMD_BILATERAL_FILTER                      1
+#endif
 
 #endif // tools
 
diff --git a/source/Lib/CommonLib/Unit.cpp b/source/Lib/CommonLib/Unit.cpp
index c281acf6ffe27554228e07829e54c7becf1f8a15..0d5f7f2a3ca47b9c20ed0a550dc52c9a4388ad19 100644
--- a/source/Lib/CommonLib/Unit.cpp
+++ b/source/Lib/CommonLib/Unit.cpp
@@ -303,8 +303,8 @@ CodingUnit& CodingUnit::operator=( const CodingUnit& other )
   smvdMode        = other.smvdMode;
   ispMode           = other.ispMode;
   mipFlag           = other.mipFlag;
-#if IDCC_TPM_JEM
-  TmpFlag = other.TmpFlag;
+#if JVET_V0130_INTRA_TMP
+  tmpFlag           = other.tmpFlag;
 #endif
 #if INTER_LIC
   LICFlag           = other.LICFlag;
@@ -387,8 +387,8 @@ void CodingUnit::initData()
   smvdMode        = 0;
   ispMode           = 0;
   mipFlag           = false;
-#if IDCC_TPM_JEM
-  TmpFlag = false;
+#if JVET_V0130_INTRA_TMP
+  tmpFlag = false;
 #endif
 #if INTER_LIC
   LICFlag = false;
diff --git a/source/Lib/CommonLib/Unit.h b/source/Lib/CommonLib/Unit.h
index e24b8c58bc8be25c0646fcbe72bf6af3eb96bd8a..31fd09ac0aff9313a9fc4a67fee1363dc61d2db2 100644
--- a/source/Lib/CommonLib/Unit.h
+++ b/source/Lib/CommonLib/Unit.h
@@ -334,8 +334,8 @@ struct CodingUnit : public UnitArea
   uint8_t         BcwIdx;
   int8_t          refIdxBi[2];
   bool           mipFlag;
-#if IDCC_TPM_JEM
-  bool			 TmpFlag;
+#if JVET_V0130_INTRA_TMP
+  bool		    	 tmpFlag;
 #endif
 #if INTER_LIC
   bool           LICFlag;
diff --git a/source/Lib/CommonLib/UnitTools.cpp b/source/Lib/CommonLib/UnitTools.cpp
index 9f6b464441c3aac5f52ae109540b67491cf1415b..ce87e544860d11b5b58c14eb9b268e068afcf705 100644
--- a/source/Lib/CommonLib/UnitTools.cpp
+++ b/source/Lib/CommonLib/UnitTools.cpp
@@ -333,7 +333,7 @@ uint32_t CU::getCtuAddr( const CodingUnit &cu )
 {
   return getCtuAddr( cu.blocks[cu.chType].lumaPos(), *cu.cs->pcv );
 }
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
 Position CU::getCtuXYAddr(const CodingUnit& cu)
 {
 	return Position((cu.blocks[cu.chType].lumaPos().x >> cu.cs->pcv->maxCUWidthLog2) << cu.cs->pcv->maxCUWidthLog2, (cu.blocks[cu.chType].lumaPos().y >> cu.cs->pcv->maxCUHeightLog2) << cu.cs->pcv->maxCUHeightLog2);
@@ -944,10 +944,10 @@ bool PU::isMIP(const PredictionUnit &pu, const ChannelType &chType)
     return isDMChromaMIP(pu) && (pu.intraDir[CHANNEL_TYPE_CHROMA] == DM_CHROMA_IDX);
   }
 }
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
 bool PU::isTmp(const PredictionUnit& pu, const ChannelType& chType)
 {
-	return (chType == CHANNEL_TYPE_LUMA && pu.cu->TmpFlag);
+	return (chType == CHANNEL_TYPE_LUMA && pu.cu->tmpFlag);
 }
 #endif
 bool PU::isDMChromaMIP(const PredictionUnit &pu)
@@ -961,7 +961,7 @@ bool PU::isDMChromaMIP(const PredictionUnit &pu)
 
 uint32_t PU::getIntraDirLuma( const PredictionUnit &pu )
 {
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
 	if (isMIP(pu) || isTmp(pu))
 #else
   if (isMIP(pu))
@@ -4984,8 +4984,8 @@ bool CU::isMTSAllowed(const CodingUnit &cu, const ComponentID compID)
   mtsAllowed &= cuWidth <= maxSize && cuHeight <= maxSize;
   mtsAllowed &= !cu.ispMode;
   mtsAllowed &= !cu.sbtInfo;
-#if IDCC_TMP_ImplicitMTS
-  mtsAllowed &= !cu.TmpFlag;
+#if JVET_V0130_INTRA_TMP
+  mtsAllowed &= !cu.tmpFlag;
 #endif
   mtsAllowed &= !(cu.bdpcmMode && cuWidth <= tsMaxSize && cuHeight <= tsMaxSize);
   return mtsAllowed;
@@ -5321,8 +5321,8 @@ bool allowLfnstWithMip(const Size& block)
   }
   return false;
 }
-#if IDCC_TPM_JEM
-bool allowLfnstWithTpm()
+#if JVET_V0130_INTRA_TMP
+bool allowLfnstWithTmp()
 {
 	return true;
 }
diff --git a/source/Lib/CommonLib/UnitTools.h b/source/Lib/CommonLib/UnitTools.h
index a9c47df9814bcf6a67ce72f5f47f0f8dae446d50..ab2c96231fb0bfa826aa208253437a3eac1656c5 100644
--- a/source/Lib/CommonLib/UnitTools.h
+++ b/source/Lib/CommonLib/UnitTools.h
@@ -70,8 +70,8 @@ namespace CU
   bool isSameSubPic                   (const CodingUnit &cu, const CodingUnit &cu2);
   bool isLastSubCUOfCtu               (const CodingUnit &cu);
   uint32_t getCtuAddr                     (const CodingUnit &cu);
-#if IDCC_TPM_JEM
-  Position getCtuXYAddr(const CodingUnit& cu);
+#if JVET_V0130_INTRA_TMP
+  Position getCtuXYAddr               (const CodingUnit& cu);
 #endif
   int  predictQP                      (const CodingUnit& cu, const int prevQP );
 
@@ -141,7 +141,7 @@ namespace PU
   int  getIntraMPMs(const PredictionUnit &pu, unsigned *mpm, const ChannelType &channelType = CHANNEL_TYPE_LUMA);
 #endif
   bool          isMIP                 (const PredictionUnit &pu, const ChannelType &chType = CHANNEL_TYPE_LUMA);
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
   bool          isTmp(const PredictionUnit& pu, const ChannelType& chType = CHANNEL_TYPE_LUMA);
 #endif
   bool          isDMChromaMIP         (const PredictionUnit &pu);
@@ -274,8 +274,8 @@ uint32_t getCtuAddr        (const Position& pos, const PreCalcValues &pcv);
 int  getNumModesMip   (const Size& block);
 int getMipSizeId      (const Size& block);
 bool allowLfnstWithMip(const Size& block);
-#if IDCC_TPM_JEM
-bool allowLfnstWithTpm();
+#if JVET_V0130_INTRA_TMP
+bool allowLfnstWithTmp();
 #endif
 
 template<typename T, size_t N>
diff --git a/source/Lib/CommonLib/dtrace_blockstatistics.cpp b/source/Lib/CommonLib/dtrace_blockstatistics.cpp
index a5e071fbcb35f0572326c2367eaedf59a7de3cd4..2ec1079052779c5c57c9d79f01fa7c9e6bde5b2c 100644
--- a/source/Lib/CommonLib/dtrace_blockstatistics.cpp
+++ b/source/Lib/CommonLib/dtrace_blockstatistics.cpp
@@ -884,8 +884,8 @@ void writeAllData(const CodingStructure& cs, const UnitArea& ctuArea)
           if(chType == CHANNEL_TYPE_LUMA)
           {
             DTRACE_BLOCK_SCALAR(g_trace_ctx, D_BLOCK_STATISTICS_ALL, cu, GetBlockStatisticName(BlockStatistic::MIPFlag), cu.mipFlag);
-#if IDCC_TPM_JEM
-			DTRACE_BLOCK_SCALAR(g_trace_ctx, D_BLOCK_STATISTICS_ALL, cu, GetBlockStatisticName(BlockStatistic::TmpFlag), cu.TmpFlag);
+#if JVET_V0130_INTRA_TMP
+            DTRACE_BLOCK_SCALAR(g_trace_ctx, D_BLOCK_STATISTICS_ALL, cu, GetBlockStatisticName(BlockStatistic::TmpFlag), cu.tmpFlag);
 #endif
             DTRACE_BLOCK_SCALAR(g_trace_ctx, D_BLOCK_STATISTICS_ALL, cu, GetBlockStatisticName(BlockStatistic::ISPMode), cu.ispMode);
           }
diff --git a/source/Lib/CommonLib/dtrace_blockstatistics.h b/source/Lib/CommonLib/dtrace_blockstatistics.h
index a416227b6dbfe3abf96f6b3f34b7ae77b0fcd397..fe3032dd9323b1abe8443f40833b807eaf6ac2a9 100644
--- a/source/Lib/CommonLib/dtrace_blockstatistics.h
+++ b/source/Lib/CommonLib/dtrace_blockstatistics.h
@@ -78,7 +78,7 @@ enum class BlockStatistic {
   Chroma_IntraMode,
   MultiRefIdx,
   MIPFlag,
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
   TmpFlag,
 #endif
   ISPMode,
@@ -173,7 +173,7 @@ static const std::map<BlockStatistic, std::tuple<std::string, BlockStatisticType
   { BlockStatistic::JointCbCr,              std::tuple<std::string, BlockStatisticType, std::string>{"JointCbCr",                   BlockStatisticType::Flag,                   ""}},
 
   { BlockStatistic::MIPFlag,                std::tuple<std::string, BlockStatisticType, std::string>{"MIPFlag",                     BlockStatisticType::Flag,                   ""}},
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
   { BlockStatistic::TmpFlag,                std::tuple<std::string, BlockStatisticType, std::string>{"TmpFlag",                     BlockStatisticType::Flag,                   ""}},
 #endif
   { BlockStatistic::ISPMode,                std::tuple<std::string, BlockStatisticType, std::string>{"ISPMode",                     BlockStatisticType::Integer,                "[0, " + std::to_string(NUM_INTRA_SUBPARTITIONS_MODES) + "]"}},
diff --git a/source/Lib/CommonLib/version.h b/source/Lib/CommonLib/version.h
index 250f2cd9fd501badc15831fc0c5ffc13aeb723e1..70ad90a766d6e3ce8d2c18d989ca90082b7ce57a 100644
--- a/source/Lib/CommonLib/version.h
+++ b/source/Lib/CommonLib/version.h
@@ -1,3 +1,6 @@
 #if ! defined( VTM_VERSION )
 #define VTM_VERSION "10.0"
 #endif
+#if ! defined( ECM_VERSION )
+#define ECM_VERSION "0.0"
+#endif
diff --git a/source/Lib/CommonLib/x86/BilateralFilterX86.h b/source/Lib/CommonLib/x86/BilateralFilterX86.h
new file mode 100644
index 0000000000000000000000000000000000000000..c0ff50f957dc3f99f8e0c257b9402817955fc418
--- /dev/null
+++ b/source/Lib/CommonLib/x86/BilateralFilterX86.h
@@ -0,0 +1,311 @@
+/* The copyright in this software is being made available under the BSD
+ * License, included below. This software may be subject to other third party
+ * and contributor rights, including patent rights, and no such rights are
+ * granted under this license.
+ *
+ * Copyright (c) 2010-2021, ITU/ISO/IEC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *  * Neither the name of the ITU/ISO/IEC nor the names of its contributors may
+ *    be used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include "CommonDefX86.h"
+#include "../BilateralFilter.h"
+
+#ifdef TARGET_SIMD_X86
+#if defined _MSC_VER
+#include <tmmintrin.h>
+#else
+#include <x86intrin.h>
+#endif
+
+#if ENABLE_SIMD_BILATERAL_FILTER
+template<X86_VEXT vext>
+void BilateralFilter::simdFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO, const char* LUTrowPtr )
+{
+  if( uiWidth < 4 || ( uiWidth < 8 && isRDO ) )
+  {
+    return blockBilateralFilterDiamond5x5(uiWidth, uiHeight, block, blkFilt, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bif_round_add, bif_round_shift, isRDO, LUTrowPtr );
+  }
+
+  int pad = 2;
+  int padwidth = iWidthExtSIMD;
+
+  __m128i center, left, right, up, down, lu, ld, ru, rd, diffabs, four, fifteen, lut, acc, temp, round_add, clipmin, clipmax, inputVals;
+  __m128i ll, rr, uu, dd;
+
+  four = _mm_set1_epi16(4);
+  fifteen = _mm_set1_epi16(15);
+  round_add = _mm_set1_epi16(bif_round_add);
+  clipmin = _mm_set1_epi16(clpRng.min);
+  clipmax = _mm_set1_epi16(clpRng.max);
+
+  lut = _mm_loadu_si128((__m128i*)(LUTrowPtr));
+  acc = _mm_set1_epi32(0);
+  
+  // Copy back parameters
+  Pel *tempBlockPtr = (short*)blkFilt + (((padwidth+4) << 1) + 2);
+  int tempBlockStride = padwidth+4;  
+  
+  for (int col = 0; col < uiWidth; col += 8)
+  {
+    for (int row = 0; row < uiHeight; row++)
+    {
+      acc = _mm_set1_epi32(0);
+      int16_t *point = &block[(row + pad)*padwidth + pad + col];
+      
+      center = _mm_loadu_si128((__m128i*)(point));
+      
+      //load neighbours
+      left = _mm_loadu_si128((__m128i*)(point - 1));
+      right = _mm_loadu_si128((__m128i*)(point + 1));
+      up = _mm_loadu_si128((__m128i*)(point - padwidth));
+      down = _mm_loadu_si128((__m128i*)(point + padwidth));
+      
+      lu = _mm_loadu_si128((__m128i*)(point - 1 - padwidth));
+      ld = _mm_loadu_si128((__m128i*)(point - 1 + padwidth));
+      ru = _mm_loadu_si128((__m128i*)(point + 1 - padwidth));
+      rd = _mm_loadu_si128((__m128i*)(point + 1 + padwidth));
+
+      ll = _mm_loadu_si128((__m128i*)(point - 2));
+      rr = _mm_loadu_si128((__m128i*)(point + 2));
+      uu = _mm_loadu_si128((__m128i*)(point - 2*padwidth));
+      dd = _mm_loadu_si128((__m128i*)(point + 2*padwidth));
+      
+      //calculate diffs
+      left = _mm_sub_epi16(left, center);
+      right = _mm_sub_epi16(right, center);
+      up = _mm_sub_epi16(up, center);
+      down = _mm_sub_epi16(down, center);
+      
+      lu = _mm_sub_epi16(lu, center);
+      ld = _mm_sub_epi16(ld, center);
+      ru = _mm_sub_epi16(ru, center);
+      rd = _mm_sub_epi16(rd, center);
+
+      ll = _mm_sub_epi16(ll, center);
+      rr = _mm_sub_epi16(rr, center);
+      uu = _mm_sub_epi16(uu, center);
+      dd = _mm_sub_epi16(dd, center);
+      
+      //LEFT!
+      //calculate abs
+      diffabs = _mm_abs_epi16(left); //abs
+      diffabs = _mm_add_epi16(diffabs, four); //+4
+      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
+      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
+      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
+      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
+      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
+      diffabs = _mm_sign_epi16(diffabs, left);//fix sign!
+      acc = _mm_add_epi16(diffabs, acc); //add to acc
+      //RIGHT!
+      //calculate abs
+      diffabs = _mm_abs_epi16(right); //abs
+      diffabs = _mm_add_epi16(diffabs, four); //+4
+      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
+      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
+      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
+      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
+      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
+      diffabs = _mm_sign_epi16(diffabs, right);//fix sign!
+      acc = _mm_add_epi16(diffabs, acc); //add to acc
+      //UP!
+      //calculate abs
+      diffabs = _mm_abs_epi16(up); //abs
+      diffabs = _mm_add_epi16(diffabs, four); //+4
+      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
+      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
+      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
+      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
+      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
+      diffabs = _mm_sign_epi16(diffabs, up);//fix sign!
+      acc = _mm_add_epi16(diffabs, acc); //add to acc
+      
+      //DOWN!
+      //calculate abs
+      diffabs = _mm_abs_epi16(down); //abs
+      diffabs = _mm_add_epi16(diffabs, four); //+4
+      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
+      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
+      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
+      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
+      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
+      diffabs = _mm_sign_epi16(diffabs, down);//fix sign!
+      acc = _mm_add_epi16(diffabs, acc); //add to acc
+      
+      //lu!
+      //calculate abs
+      diffabs = _mm_abs_epi16(lu); //abs
+      diffabs = _mm_add_epi16(diffabs, four); //+4
+      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
+      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
+      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
+      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
+      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
+      diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift!
+      diffabs = _mm_sign_epi16(diffabs, lu);//fix sign!
+      acc = _mm_add_epi16(diffabs, acc); //add to acc
+      //ld!
+      //calculate abs
+      diffabs = _mm_abs_epi16(ld); //abs
+      diffabs = _mm_add_epi16(diffabs, four); //+4
+      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
+      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
+      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
+      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
+      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
+      diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift!
+      diffabs = _mm_sign_epi16(diffabs, ld);//fix sign!
+      acc = _mm_add_epi16(diffabs, acc); //add to acc
+      //ru!
+      //calculate abs
+      diffabs = _mm_abs_epi16(ru); //abs
+      diffabs = _mm_add_epi16(diffabs, four); //+4
+      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
+      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
+      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
+      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
+      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
+      diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift!
+      diffabs = _mm_sign_epi16(diffabs, ru);//fix sign!
+      acc = _mm_add_epi16(diffabs, acc); //add to acc
+      //rd!
+      //calculate abs
+      diffabs = _mm_abs_epi16(rd); //abs
+      diffabs = _mm_add_epi16(diffabs, four); //+4
+      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
+      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
+      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
+      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
+      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
+      diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift!
+      diffabs = _mm_sign_epi16(diffabs, rd);//fix sign!
+      acc = _mm_add_epi16(diffabs, acc); //add to acc
+
+      //ll!
+      //calculate abs
+      diffabs = _mm_abs_epi16(ll); //abs
+      diffabs = _mm_add_epi16(diffabs, four); //+4
+      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
+      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
+      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
+      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
+      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
+      diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift!
+      diffabs = _mm_sign_epi16(diffabs, ll);//fix sign!
+      acc = _mm_add_epi16(diffabs, acc); //add to acc
+      //rr!
+      //calculate abs
+      diffabs = _mm_abs_epi16(rr); //abs
+      diffabs = _mm_add_epi16(diffabs, four); //+4
+      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
+      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
+      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
+      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
+      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
+      diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift!
+      diffabs = _mm_sign_epi16(diffabs, rr);//fix sign!
+      acc = _mm_add_epi16(diffabs, acc); //add to acc
+      //uu!
+      //calculate abs
+      diffabs = _mm_abs_epi16(uu); //abs
+      diffabs = _mm_add_epi16(diffabs, four); //+4
+      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
+      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
+      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
+      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
+      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
+      diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift!
+      diffabs = _mm_sign_epi16(diffabs, uu);//fix sign!
+      acc = _mm_add_epi16(diffabs, acc); //add to acc
+      //dd!
+      //calculate abs
+      diffabs = _mm_abs_epi16(dd); //abs
+      diffabs = _mm_add_epi16(diffabs, four); //+4
+      diffabs = _mm_srai_epi16(diffabs, 3); //>>3
+      diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15)
+      diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8
+      diffabs = _mm_shuffle_epi8(lut, diffabs);//lut
+      diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit
+      diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift!
+      diffabs = _mm_sign_epi16(diffabs, dd);//fix sign!
+      acc = _mm_add_epi16(diffabs, acc); //add to acc
+      
+      if (bfac == 2)
+      {
+        acc = _mm_slli_epi16(acc, 1);   // Shift left to get 2*
+      }
+      else if (bfac == 3)
+      {
+        temp = _mm_slli_epi16(acc, 1);  // Multiply by two by shifting left
+        acc = _mm_add_epi16(acc, temp); // Add original value to get 3*
+      }
+      
+      // Add 16 and shift 5
+      acc = _mm_add_epi16(acc, round_add);
+      acc = _mm_srai_epi16(acc, bif_round_shift);
+      
+      // Instead we add our input values to the delta
+      if(isRDO)
+      {
+        acc = _mm_add_epi16(acc, center);
+      }
+      else
+      {
+        int16_t *recpoint = &recPtr[row * recStride + col];
+        inputVals = _mm_loadu_si128((__m128i*)(recpoint));
+        acc = _mm_add_epi16(acc, inputVals);
+      }
+      
+      // Clip
+      acc = _mm_max_epi16(acc, clipmin);
+      acc = _mm_min_epi16(acc, clipmax);
+
+      _mm_store_si128((__m128i*)(blkFilt + (row + pad) * (padwidth + 4) + col + pad), acc);
+    }
+  }
+  
+  // Copy back from tempbufFilter to recBuf
+  int onerow = uiWidth * sizeof(Pel);
+  for(uint32_t yy = 0; yy < uiHeight; yy++)
+  {
+    std::memcpy(recPtr, tempBlockPtr, onerow);
+    recPtr += recStride;
+    tempBlockPtr += tempBlockStride;
+  }
+}
+
+template <X86_VEXT vext>   // vext: the X86_VEXT level that simdFilterDiamond5x5 is instantiated with below
+void BilateralFilter::_initBilateralFilterX86()   // stores the SIMD kernel in m_bilateralFilterDiamond5x5
+{
+  m_bilateralFilterDiamond5x5 = simdFilterDiamond5x5<vext>;   // later uses of m_bilateralFilterDiamond5x5 reach the intrinsics implementation
+}
+
+template void BilateralFilter::_initBilateralFilterX86<SIMDX86>();   // explicit instantiation; SIMDX86 is presumably supplied per translation unit by the build -- TODO confirm
+#endif
+#endif   // TARGET_SIMD_X86
diff --git a/source/Lib/CommonLib/x86/InitX86.cpp b/source/Lib/CommonLib/x86/InitX86.cpp
index 9511379d34cd6fb40cc6a9159316acba912bfa7e..b8e30fbdd77ecd10595385509d689aeff69652ba 100644
--- a/source/Lib/CommonLib/x86/InitX86.cpp
+++ b/source/Lib/CommonLib/x86/InitX86.cpp
@@ -50,9 +50,12 @@
 
 #include "CommonLib/IbcHashMap.h"
 
-#if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT
+#if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT || ENABLE_SIMD_TMP
 #include "CommonLib/TrQuant.h"
 #endif
+#if ENABLE_SIMD_BILATERAL_FILTER
+#include "CommonLib/BilateralFilter.h"
+#endif
 
 #ifdef TARGET_SIMD_X86
 
@@ -190,7 +193,7 @@ void IbcHashMap::initIbcHashMapX86()
 }
 #endif
 
-#if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT
+#if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT || ENABLE_SIMD_TMP
 void TrQuant::initTrQuantX86()
 {
   auto vext = read_x86_extension_flags();
@@ -213,5 +216,28 @@ void TrQuant::initTrQuantX86()
 }
 #endif
 
+#if ENABLE_SIMD_BILATERAL_FILTER
+void BilateralFilter::initBilateralFilterX86()   // picks the _initBilateralFilterX86<> instantiation matching the reported x86 extension level
+{
+  auto vext = read_x86_extension_flags();   // extension level as reported by read_x86_extension_flags()
+  switch( vext )
+  {
+  case AVX512:   // no dedicated AVX512 instantiation: falls through to AVX2
+  case AVX2:
+    _initBilateralFilterX86<AVX2>();
+    break;
+  case AVX:
+    _initBilateralFilterX86<AVX>();
+    break;
+  case SSE42:   // falls through: SSE42 is served by the SSE41 instantiation
+  case SSE41:
+    _initBilateralFilterX86<SSE41>();
+    break;
+  default:   // any other level: nothing is installed by this function
+    break;
+  }
+}
+#endif
+
 #endif
 
diff --git a/source/Lib/CommonLib/x86/TrQuantX86.h b/source/Lib/CommonLib/x86/TrQuantX86.h
index 5239521da29f73107d0f9773c23f0da7039316f2..5f7238fc60ea6aa9385a797399af4a7ccb3b524f 100644
--- a/source/Lib/CommonLib/x86/TrQuantX86.h
+++ b/source/Lib/CommonLib/x86/TrQuantX86.h
@@ -410,7 +410,129 @@ uint32_t computeSAD_SIMD( const Pel* ref, const Pel* cur, const int size )
 }
 #endif
 
-#if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT
+#if ENABLE_SIMD_TMP
+template< X86_VEXT vext >   // vext is not referenced in the body; only 128-bit _mm_* intrinsics are used
+int calcTemplateDiffSIMD( Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax )   // sum of absolute differences between the tarPatch rows and the reference samples up-left of ref
+{   // returns the accumulated sum, or the partial sum as soon as it exceeds iMax
+  int iDiffSum = 0;   // running sum over all rows processed so far
+  int iY;
+  Pel* refPatchRow = ref - TMP_TEMPLATE_SIZE * uiStride - TMP_TEMPLATE_SIZE;   // start TMP_TEMPLATE_SIZE rows above and TMP_TEMPLATE_SIZE samples left of ref
+  Pel* tarPatchRow;
+  uint32_t uiSum;   // sum of absolute differences of the current row
+
+  // top part: the first TMP_TEMPLATE_SIZE rows, uiPatchWidth samples per row
+  for( iY = 0; iY < TMP_TEMPLATE_SIZE; iY++ )
+  {
+    tarPatchRow = tarPatch[iY];   // tarPatch is indexed as one pointer per row
+    const short* pSrc1 = ( const short* ) tarPatchRow;   // samples are read as 16-bit values (assumes Pel is 16 bits wide -- TODO confirm)
+    const short* pSrc2 = ( const short* ) refPatchRow;
+
+    // SIMD difference
+    //int  iRows = uiPatchHeight;
+    int  iCols = uiPatchWidth;
+    if( (iCols & 7) == 0 )   // width is a multiple of 8: full 128-bit loads can be used
+    {
+      // Do with step of 8
+      __m128i vzero = _mm_setzero_si128();
+      __m128i vsum32 = vzero;
+      //for (int iY = 0; iY < iRows; iY += iSubStep)  -- disabled: this scope handles a single row
+      {
+        __m128i vsum16 = vzero;   // eight 16-bit lane sums; NOTE(review): _mm_add_epi16 wraps -- confirm row width and bit depth keep each lane in range
+        for( int iX = 0; iX < iCols; iX += 8 )
+        {
+          __m128i vsrc1 = _mm_loadu_si128( (const __m128i*)(&pSrc1[iX]) );   // unaligned load of 8 target samples
+          __m128i vsrc2 = _mm_lddqu_si128( (const __m128i*)(&pSrc2[iX]) );   // unaligned load of 8 reference samples
+          vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );   // accumulate |target - reference| per lane
+        }
+        __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vzero ), _mm_unpackhi_epi16( vsum16, vzero ) );   // zero-extend the 16-bit lanes to 32 bits, add low and high halves
+        vsum32 = _mm_add_epi32( vsum32, vsumtemp );
+        //pSrc1 += iStrideSrc1;
+        //pSrc2 += iStrideSrc2;
+      }
+      vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0x4e ) );   // 01001110: add the swapped 64-bit halves
+      vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0xb1 ) );   // 10110001: add the swapped adjacent 32-bit lanes, every lane now holds the total
+      uiSum = _mm_cvtsi128_si32( vsum32 );   // low 32-bit lane = row total
+    }
+    else
+    {
+      // Do with step of 4
+      __m128i vzero = _mm_setzero_si128();
+      __m128i vsum32 = vzero;
+      //for (int iY = 0; iY < iRows; iY += iSubStep)  -- disabled: this scope handles a single row
+      {
+        __m128i vsum16 = vzero;
+        for( int iX = 0; iX < iCols; iX += 4 )   // NOTE(review): reads 4 samples per step; presumably uiPatchWidth is a multiple of 4 -- confirm
+        {
+          __m128i vsrc1 = _mm_loadl_epi64( (const __m128i*) & pSrc1[iX] );   // 64-bit load: 4 samples in the low half, upper half zero
+          __m128i vsrc2 = _mm_loadl_epi64( (const __m128i*) & pSrc2[iX] );
+          vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );   // upper four lanes stay zero
+        }
+        __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vzero ), _mm_unpackhi_epi16( vsum16, vzero ) );   // zero-extend to 32 bits and add halves
+        vsum32 = _mm_add_epi32( vsum32, vsumtemp );
+        //pSrc1 += iStrideSrc1;
+        //pSrc2 += iStrideSrc2;
+      }
+      vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0x4e ) );   // 01001110: add the swapped 64-bit halves
+      vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0xb1 ) );   // 10110001: add the swapped adjacent 32-bit lanes
+      uiSum = _mm_cvtsi128_si32( vsum32 );   // low 32-bit lane = row total
+    }
+    iDiffSum += uiSum;
+
+    if( iDiffSum > iMax ) // early exit: the sum is already above the caller-supplied bound
+    {
+      return iDiffSum;
+    }
+    // advance the reference pointer to the next row
+    refPatchRow += uiStride;
+  }
+
+  // left part: rows TMP_TEMPLATE_SIZE..uiPatchHeight-1, TMP_TEMPLATE_SIZE samples per row; refPatchRow continues from the loop above
+  int  iCols = TMP_TEMPLATE_SIZE;
+
+  for( iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++ )   // signed iY is compared against unsigned uiPatchHeight
+  {
+    tarPatchRow = tarPatch[iY];
+    const short* pSrc1 = ( const short* ) tarPatchRow;
+    const short* pSrc2 = ( const short* ) refPatchRow;
+
+    // SIMD difference
+
+    // Do with step of 4
+    __m128i vzero = _mm_setzero_si128();
+    __m128i vsum32 = vzero;
+    //for (int iY = 0; iY < iRows; iY += iSubStep)  -- disabled: this scope handles a single row
+    {
+      __m128i vsum16 = vzero;
+      for( int iX = 0; iX < iCols; iX += 4 )   // NOTE(review): reads 4 samples per step; presumably TMP_TEMPLATE_SIZE is a multiple of 4 -- confirm
+      {
+        __m128i vsrc1 = _mm_loadl_epi64( (const __m128i*) & pSrc1[iX] );   // 64-bit load: 4 samples, upper half zero
+        __m128i vsrc2 = _mm_loadl_epi64( (const __m128i*) & pSrc2[iX] );
+        vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );   // accumulate |target - reference| per lane
+      }
+      __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vzero ), _mm_unpackhi_epi16( vsum16, vzero ) );   // zero-extend to 32 bits and add halves
+      vsum32 = _mm_add_epi32( vsum32, vsumtemp );
+      //pSrc1 += iStrideSrc1;
+      //pSrc2 += iStrideSrc2;
+    }
+    vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0x4e ) );   // 01001110: add the swapped 64-bit halves
+    vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0xb1 ) );   // 10110001: add the swapped adjacent 32-bit lanes
+    uiSum = _mm_cvtsi128_si32( vsum32 );   // low 32-bit lane = row total
+
+    iDiffSum += uiSum;
+
+    if( iDiffSum > iMax ) // early exit: the sum is already above the caller-supplied bound
+    {
+      return iDiffSum;
+    }
+    // advance the reference pointer to the next row
+    refPatchRow += uiStride;
+  }
+
+  return iDiffSum;   // full sum over both parts
+}
+#endif
+
+#if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT || ENABLE_SIMD_TMP
 template <X86_VEXT vext>
 void TrQuant::_initTrQuantX86()
 {
@@ -545,6 +667,10 @@ void TrQuant::_initTrQuantX86()
   fastInvTrans[2][5] = fastInverseTransform_SIMD<DST7, 64>;
 #endif
 #endif
+
+#if ENABLE_SIMD_TMP
+  m_calcTemplateDiff = calcTemplateDiffSIMD<vext>;
+#endif
 }
 
 template void TrQuant::_initTrQuantX86<SIMDX86>();
diff --git a/source/Lib/CommonLib/x86/avx/BilateralFilter_avx.cpp b/source/Lib/CommonLib/x86/avx/BilateralFilter_avx.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bb696a1fef690da0fc3fc6830bdcb7189a3a11a3
--- /dev/null
+++ b/source/Lib/CommonLib/x86/avx/BilateralFilter_avx.cpp
@@ -0,0 +1 @@
+#include "../BilateralFilterX86.h"
diff --git a/source/Lib/CommonLib/x86/avx2/BilateralFilter_avx2.cpp b/source/Lib/CommonLib/x86/avx2/BilateralFilter_avx2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bb696a1fef690da0fc3fc6830bdcb7189a3a11a3
--- /dev/null
+++ b/source/Lib/CommonLib/x86/avx2/BilateralFilter_avx2.cpp
@@ -0,0 +1 @@
+#include "../BilateralFilterX86.h"
diff --git a/source/Lib/CommonLib/x86/sse41/BilateralFilter_sse41.cpp b/source/Lib/CommonLib/x86/sse41/BilateralFilter_sse41.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bb696a1fef690da0fc3fc6830bdcb7189a3a11a3
--- /dev/null
+++ b/source/Lib/CommonLib/x86/sse41/BilateralFilter_sse41.cpp
@@ -0,0 +1 @@
+#include "../BilateralFilterX86.h"
diff --git a/source/Lib/CommonLib/x86/sse42/BilateralFilater_sse42.cpp b/source/Lib/CommonLib/x86/sse42/BilateralFilater_sse42.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bb696a1fef690da0fc3fc6830bdcb7189a3a11a3
--- /dev/null
+++ b/source/Lib/CommonLib/x86/sse42/BilateralFilater_sse42.cpp
@@ -0,0 +1 @@
+#include "../BilateralFilterX86.h"
diff --git a/source/Lib/DecoderLib/CABACReader.cpp b/source/Lib/DecoderLib/CABACReader.cpp
index 10f7f72b23ee698e39a7cea978eaf391eabd11e2..0539150a948c6a7aab9dad25b7c846d7787f8e4a 100644
--- a/source/Lib/DecoderLib/CABACReader.cpp
+++ b/source/Lib/DecoderLib/CABACReader.cpp
@@ -1606,16 +1606,20 @@ void CABACReader::intra_luma_pred_modes( CodingUnit &cu )
     cu.firstPU->intraDir[0] = cu.bdpcmMode == 2? VER_IDX : HOR_IDX;
     return;
   }
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
   int TMP_MaxSize=cu.cs->sps->getIntraTMPMaxSize();
   if (cu.lwidth() <= TMP_MaxSize && cu.lheight() <= TMP_MaxSize)
   {
-	  Tmp_Flag(cu);
-	  if (cu.TmpFlag)
-		  return;
+	  tmp_flag(cu);
+    if( cu.tmpFlag )
+    {
+      return;
+    }
   }
   else
-	  cu.TmpFlag = 0;
+  {
+    cu.tmpFlag = 0;
+  }
 #endif
   mip_flag(cu);
   if (cu.mipFlag)
@@ -3862,10 +3866,10 @@ void CABACReader::residual_lfnst_mode( CodingUnit& cu,  CUCtx& cuCtx  )
   int chIdx = cu.isSepTree() && cu.chType == CHANNEL_TYPE_CHROMA ? 1 : 0;
 #endif
   if ((cu.ispMode && !CU::canUseLfnstWithISP(cu, cu.chType))
-#if IDCC_TPM_JEM
-   || (cu.cs->sps->getUseLFNST() && CU::isIntra(cu) && ((cu.mipFlag && !allowLfnstWithMip(cu.firstPU->lumaSize())) || (cu.TmpFlag && !allowLfnstWithTpm())))
+#if JVET_V0130_INTRA_TMP
+    || (cu.cs->sps->getUseLFNST() && CU::isIntra(cu) && ((cu.mipFlag && !allowLfnstWithMip(cu.firstPU->lumaSize())) || (cu.tmpFlag && !allowLfnstWithTmp())))
 #else
-      || (cu.cs->sps->getUseLFNST() && CU::isIntra(cu) && cu.mipFlag && !allowLfnstWithMip(cu.firstPU->lumaSize()))
+    || (cu.cs->sps->getUseLFNST() && CU::isIntra(cu) && cu.mipFlag && !allowLfnstWithMip(cu.firstPU->lumaSize()))
 #endif
 #if INTRA_RM_SMALL_BLOCK_SIZE_CONSTRAINTS
     || (CS::isDualITree(*cu.cs) && cu.chType == CHANNEL_TYPE_CHROMA && std::min(cu.blocks[1].width, cu.blocks[1].height) < 4)
@@ -4506,8 +4510,8 @@ unsigned CABACReader::code_unary_fixed( unsigned ctxId, unsigned unary_max, unsi
   }
   return idx;
 }
-#if IDCC_TPM_JEM
-void CABACReader::Tmp_Flag(CodingUnit& cu)
+#if JVET_V0130_INTRA_TMP
+void CABACReader::tmp_flag(CodingUnit& cu)
 {
 	RExt__DECODER_DEBUG_BIT_STATISTICS_CREATE_SET(STATS__CABAC_BITS__OTHER);
 
@@ -4518,13 +4522,13 @@ void CABACReader::Tmp_Flag(CodingUnit& cu)
 
   if( !cu.cs->sps->getUseIntraTMP() )
   {
-    cu.TmpFlag = false;
+    cu.tmpFlag = false;
     return;
   }
 
 	unsigned ctxId = DeriveCtx::CtxTmpFlag(cu);
-	cu.TmpFlag = m_BinDecoder.decodeBin(Ctx::TmpFlag(ctxId));
-	DTRACE(g_trace_ctx, D_SYNTAX, "Tmp_Flag() pos=(%d,%d) mode=%d\n", cu.lumaPos().x, cu.lumaPos().y, cu.TmpFlag ? 1 : 0);
+	cu.tmpFlag = m_BinDecoder.decodeBin(Ctx::TmpFlag(ctxId));
+	DTRACE(g_trace_ctx, D_SYNTAX, "tmp_flag() pos=(%d,%d) mode=%d\n", cu.lumaPos().x, cu.lumaPos().y, cu.tmpFlag ? 1 : 0);
 }
 #endif
 void CABACReader::mip_flag( CodingUnit& cu )
diff --git a/source/Lib/DecoderLib/CABACReader.h b/source/Lib/DecoderLib/CABACReader.h
index 5513be42e0602be439420806bd6e1531a62a82cb..59607c833b83baa691603e354f34bc03f1ea175e 100644
--- a/source/Lib/DecoderLib/CABACReader.h
+++ b/source/Lib/DecoderLib/CABACReader.h
@@ -107,8 +107,8 @@ public:
   void        adaptive_color_transform(CodingUnit&             cu);
   void        sbt_mode                  ( CodingUnit&                   cu );
   void        end_of_ctu                ( CodingUnit&                   cu,     CUCtx&          cuCtx );
-#if IDCC_TPM_JEM
-  void        Tmp_Flag(CodingUnit& cu);
+#if JVET_V0130_INTRA_TMP
+  void        tmp_flag                  ( CodingUnit&                   cu );
 #endif
   void        mip_flag                  ( CodingUnit&                   cu );
   void        mip_pred_modes            ( CodingUnit&                   cu );
diff --git a/source/Lib/DecoderLib/DecCu.cpp b/source/Lib/DecoderLib/DecCu.cpp
index b20d81b6efc70f8061829531de46d87bc5ba5fec..19f5affab59e9e932206a0c7ec8a21a8304935e8 100644
--- a/source/Lib/DecoderLib/DecCu.cpp
+++ b/source/Lib/DecoderLib/DecCu.cpp
@@ -307,7 +307,7 @@ void DecCu::xIntraRecBlk( TransformUnit& tu, const ComponentID compID )
   }
   else
   {
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
 	  if (PU::isTmp(pu, chType))
 	  {
 		  int foundCandiNum;
@@ -512,11 +512,11 @@ void DecCu::xIntraRecACTBlk(TransformUnit& tu)
 
     PelBuf piPred = cs.getPredBuf(area);
     m_pcIntraPred->initIntraPatternChType(*tu.cu, area);
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
 	if (PU::isTmp(pu, chType))
 	{
 		int foundCandiNum;
-		const unsigned int           uiStride = cs.picture->getRecoBuf(COMPONENT_Y).stride;
+		const unsigned int uiStride = cs.picture->getRecoBuf(COMPONENT_Y).stride;
 		m_pcTrQuant->getTargetTemplate(tu.cu, pu.lwidth(), pu.lheight());
 		m_pcTrQuant->candidateSearchIntra(tu.cu, pu.lwidth(), pu.lheight());
 		m_pcTrQuant->generateTMPrediction(piPred.buf, uiStride, pu.lwidth(), pu.lheight(), foundCandiNum);
diff --git a/source/Lib/DecoderLib/VLCReader.cpp b/source/Lib/DecoderLib/VLCReader.cpp
index d8a529556478f0dfcd2abd49c7bc895579a9e39b..b41bffd482821ebc8174ffe69f8abc181fe6215b 100644
--- a/source/Lib/DecoderLib/VLCReader.cpp
+++ b/source/Lib/DecoderLib/VLCReader.cpp
@@ -2260,7 +2260,7 @@ void HLSyntaxReader::parseSPS(SPS* pcSPS)
 #if ENABLE_DIMD
   READ_FLAG(uiCode, "sps_dimd_enabled_flag");                           pcSPS->setUseDimd(uiCode != 0);
 #endif
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
   READ_FLAG(uiCode, "sps_intraTMP_enabled_flag");                   pcSPS->setUseIntraTMP( uiCode != 0 );
   if(pcSPS->getUseIntraTMP())
   {
diff --git a/source/Lib/EncoderLib/CABACWriter.cpp b/source/Lib/EncoderLib/CABACWriter.cpp
index 20225abd53984d871c9e6aedd1a193a17597ee7f..391f28a2fbac914005f1b5bf19360e80dc8806a9 100644
--- a/source/Lib/EncoderLib/CABACWriter.cpp
+++ b/source/Lib/EncoderLib/CABACWriter.cpp
@@ -1200,13 +1200,15 @@ void CABACWriter::intra_luma_pred_modes( const CodingUnit& cu )
     cu.firstPU->intraDir[0] = cu.bdpcmMode == 2? VER_IDX : HOR_IDX;
     return;
   }
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
   int TMP_MaxSize=cu.cs->sps->getIntraTMPMaxSize();
   if (cu.lwidth() <= TMP_MaxSize && cu.lheight() <= TMP_MaxSize)
   {
-	  Tmp_Flag(cu);
-	  if (cu.TmpFlag)
-		  return;
+	  tmp_flag(cu);
+    if( cu.tmpFlag )
+    {
+      return;
+    }
   }
 #endif
   mip_flag(cu);
@@ -1389,15 +1391,17 @@ void CABACWriter::intra_luma_pred_mode( const PredictionUnit& pu )
 {
 
   if( pu.cu->bdpcmMode ) return;
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
   // check if sufficient search range is available
   //bool bCheck = pu.cu->
   int TMP_MaxSize=pu.cu->cs->sps->getIntraTMPMaxSize();
   if (pu.cu->lwidth() <= TMP_MaxSize && pu.cu->lheight() <= TMP_MaxSize)
   {
-	  Tmp_Flag(*pu.cu);
-	  if (pu.cu->TmpFlag)
-		  return;
+	  tmp_flag(*pu.cu);
+    if( pu.cu->tmpFlag )
+    {
+      return;
+    }
   }
 #endif
   mip_flag(*pu.cu);
@@ -3592,10 +3596,10 @@ void CABACWriter::residual_lfnst_mode( const CodingUnit& cu, CUCtx& cuCtx )
   int chIdx = cu.isSepTree() && cu.chType == CHANNEL_TYPE_CHROMA ? 1 : 0;
 #endif
   if( ( cu.ispMode && !CU::canUseLfnstWithISP( cu, cu.chType ) ) ||
-#if IDCC_TPM_JEM
-  (cu.cs->sps->getUseLFNST() && CU::isIntra(cu) && ((cu.mipFlag && !allowLfnstWithMip(cu.firstPU->lumaSize())) || (cu.TmpFlag && !allowLfnstWithTpm()))) ||
+#if JVET_V0130_INTRA_TMP
+    (cu.cs->sps->getUseLFNST() && CU::isIntra(cu) && ((cu.mipFlag && !allowLfnstWithMip(cu.firstPU->lumaSize())) || (cu.tmpFlag && !allowLfnstWithTmp()))) ||
 #else
-      (cu.cs->sps->getUseLFNST() && CU::isIntra(cu) && cu.mipFlag && !allowLfnstWithMip(cu.firstPU->lumaSize())) ||
+    (cu.cs->sps->getUseLFNST() && CU::isIntra(cu) && cu.mipFlag && !allowLfnstWithMip(cu.firstPU->lumaSize())) ||
 #endif
 #if INTRA_RM_SMALL_BLOCK_SIZE_CONSTRAINTS
     (CS::isDualITree(*cu.cs) && cu.chType == CHANNEL_TYPE_CHROMA && std::min(cu.blocks[1].width, cu.blocks[1].height) < 4)
@@ -4222,8 +4226,8 @@ void CABACWriter::code_unary_fixed( unsigned symbol, unsigned ctxId, unsigned un
   }
 }
 
-#if IDCC_TPM_JEM
-void CABACWriter::Tmp_Flag(const CodingUnit& cu)
+#if JVET_V0130_INTRA_TMP
+void CABACWriter::tmp_flag(const CodingUnit& cu)
 {
 	if (!cu.Y().valid())
 	{
@@ -4236,8 +4240,8 @@ void CABACWriter::Tmp_Flag(const CodingUnit& cu)
   }
 
 	unsigned ctxId = DeriveCtx::CtxTmpFlag(cu);
-	m_BinEncoder.encodeBin(cu.TmpFlag, Ctx::TmpFlag(ctxId));
-	DTRACE(g_trace_ctx, D_SYNTAX, "Tmp_Flag() pos=(%d,%d) mode=%d\n", cu.lumaPos().x, cu.lumaPos().y, cu.TmpFlag ? 1 : 0);
+	m_BinEncoder.encodeBin(cu.tmpFlag, Ctx::TmpFlag(ctxId));
+	DTRACE(g_trace_ctx, D_SYNTAX, "tmp_flag() pos=(%d,%d) mode=%d\n", cu.lumaPos().x, cu.lumaPos().y, cu.tmpFlag ? 1 : 0);
 }
 #endif
 
diff --git a/source/Lib/EncoderLib/CABACWriter.h b/source/Lib/EncoderLib/CABACWriter.h
index 6fd5d20e5c5551d20001297e47eae3fdb0fd6643..afbf6ed7c1a280ea67cfcaadaabca11236e8cdb6 100644
--- a/source/Lib/EncoderLib/CABACWriter.h
+++ b/source/Lib/EncoderLib/CABACWriter.h
@@ -116,8 +116,8 @@ public:
   void        adaptive_color_transform(const CodingUnit&             cu);
   void        sbt_mode                  ( const CodingUnit&             cu );
   void        end_of_ctu                ( const CodingUnit&             cu,       CUCtx&            cuCtx );
-#if IDCC_TPM_JEM
-  void        Tmp_Flag(const CodingUnit& cu);
+#if JVET_V0130_INTRA_TMP
+  void        tmp_flag                  ( const CodingUnit& cu );
 #endif
   void        mip_flag                  ( const CodingUnit&             cu );
   void        mip_pred_modes            ( const CodingUnit&             cu );
diff --git a/source/Lib/EncoderLib/EncCfg.h b/source/Lib/EncoderLib/EncCfg.h
index 55d4aaef581fad47017085d9a45b98f31ef67cf5..a0740d6c0da4a2a4250e539130baa9fa9b919772 100644
--- a/source/Lib/EncoderLib/EncCfg.h
+++ b/source/Lib/EncoderLib/EncCfg.h
@@ -806,9 +806,9 @@ protected:
 
   bool        m_alf;                                          ///< Adaptive Loop Filter
 
-#if IDCC_TPM_JEM
-  bool      m_IntraTMP;                                       ///< intra Template Matching 
-  unsigned  m_IntraTMP_MaxSize;                               ///< max CU size for which intra TMP is allowed
+#if JVET_V0130_INTRA_TMP
+  bool        m_intraTMP;                                       ///< intra Template Matching 
+  unsigned    m_intraTmpMaxSize;                               ///< max CU size for which intra TMP is allowed
 #endif
 #if JVET_V0094_BILATERAL_FILTER
   bool        m_BIF;
@@ -1287,11 +1287,11 @@ public:
   bool      getUseWrapAround                ()         const { return m_wrapAround; }
   void      setWrapAroundOffset             ( unsigned u )   { m_wrapAroundOffset = u; }
   unsigned  getWrapAroundOffset             ()         const { return m_wrapAroundOffset; }
-#if IDCC_TPM_JEM
-  void      setUseIntraTMP(bool b) { m_IntraTMP = b; }
-  bool      getUseIntraTMP() { return m_IntraTMP; }
-  void      setIntraTMPMaxSize(unsigned n) { m_IntraTMP_MaxSize = n; }
-  unsigned  getIntraTMPMaxSize() { return m_IntraTMP_MaxSize; }
+#if JVET_V0130_INTRA_TMP
+  void      setUseIntraTMP                  (bool b)         { m_intraTMP = b; }   // sets the intra template matching flag (m_intraTMP)
+  bool      getUseIntraTMP()                           const { return m_intraTMP; }   // returns the intra template matching flag (m_intraTMP)
+  void      setIntraTMPMaxSize              (unsigned n)     { m_intraTmpMaxSize = n; }   // sets the max CU size for which intra TMP is allowed
+  unsigned  getIntraTMPMaxSize()                       const { return m_intraTmpMaxSize; }   // returns the max CU size for which intra TMP is allowed
 #endif
 #if JVET_V0094_BILATERAL_FILTER
   void      setUseBIF                       ( bool b )       { m_BIF = b; }
diff --git a/source/Lib/EncoderLib/EncCu.cpp b/source/Lib/EncoderLib/EncCu.cpp
index 24f7f15146b2d4f0d3587cf2b2870839a6913383..59dbb6c05e0ec253279e2b27f0db3d0cb8cbef52 100644
--- a/source/Lib/EncoderLib/EncCu.cpp
+++ b/source/Lib/EncoderLib/EncCu.cpp
@@ -2008,8 +2008,8 @@ bool EncCu::xCheckRDCostIntra(CodingStructure *&tempCS, CodingStructure *&bestCS
               m_modeCtrl->setISPMode(cu.ispMode);
               m_modeCtrl->setISPLfnstIdx(cu.lfnstIdx);
               m_modeCtrl->setMIPFlagISPPass(cu.mipFlag);
-#if IDCC_TPM_JEM
-			  m_modeCtrl->setTPMFlagISPPass(cu.TmpFlag);
+#if JVET_V0130_INTRA_TMP
+			        m_modeCtrl->setTPMFlagISPPass(cu.tmpFlag);
 #endif
               m_modeCtrl->setBestISPIntraModeRelCU(cu.ispMode ? PU::getFinalIntraMode(*cu.firstPU, CHANNEL_TYPE_LUMA) : UINT8_MAX);
               m_modeCtrl->setBestDCT2NonISPCostRelCU(m_modeCtrl->getMtsFirstPassNoIspCost());
@@ -3881,8 +3881,8 @@ void EncCu::xCheckRDCostMergeGeo2Nx2N(CodingStructure *&tempCS, CodingStructure
   cu.mmvdSkip = false;
   cu.skip = false;
   cu.mipFlag = false;
-#if IDCC_TPM_JEM
-  cu.TmpFlag = false;
+#if JVET_V0130_INTRA_TMP
+  cu.tmpFlag = false;
 #endif
   cu.bdpcmMode = 0;
 
@@ -4109,8 +4109,8 @@ void EncCu::xCheckRDCostMergeGeo2Nx2N(CodingStructure *&tempCS, CodingStructure
       cu.mmvdSkip = false;
       cu.skip = false;
       cu.mipFlag = false;
-#if IDCC_TPM_JEM
-	  cu.TmpFlag = false;
+#if JVET_V0130_INTRA_TMP
+	    cu.tmpFlag = false;
 #endif
       cu.bdpcmMode = 0;
       PredictionUnit &pu = tempCS->addPU(cu, pm.chType);
@@ -4808,8 +4808,8 @@ void EncCu::xCheckSATDCostGeoMerge(CodingStructure *&tempCS, CodingUnit &cu, Pre
   cu.mmvdSkip = false;
   cu.skip = false;
   cu.mipFlag = false;
-#if IDCC_TPM_JEM
-  cu.TmpFlag = false;
+#if JVET_V0130_INTRA_TMP
+  cu.tmpFlag = false;
 #endif
   cu.bdpcmMode = 0;
   pu.mergeFlag = true;
@@ -7677,8 +7677,6 @@ void EncCu::xCheckRDCostInterMultiHyp2Nx2N(CodingStructure *&tempCS, CodingStruc
   std::stable_sort(mhResults.begin(), mhResults.end(), RDCostComp);
 
   // actual testing with "true" RD costs
-  if (std::min((int)mhResults.size(), m_pcEncCfg->getAddHypTries()) > 1)
-    printf("nbTries=%d\n", std::min((int)mhResults.size(), m_pcEncCfg->getAddHypTries()));
   for (int i = 0; i < std::min((int)mhResults.size(), m_pcEncCfg->getAddHypTries()); ++i)
   {
     tempCS->initStructData(encTestMode.qp);
diff --git a/source/Lib/EncoderLib/EncLib.cpp b/source/Lib/EncoderLib/EncLib.cpp
index d8c7306aed0e4a034ebd6b4e227375c665733f32..b76d81cf34849a886353165abf4603baf1914d66 100644
--- a/source/Lib/EncoderLib/EncLib.cpp
+++ b/source/Lib/EncoderLib/EncLib.cpp
@@ -1429,9 +1429,9 @@ void EncLib::xInitSPS( SPS& sps )
   sps.setNumAddHypWeights(m_numAddHypWeights);
   sps.setMaxNumAddHypRefFrames(m_maxNumAddHypRefFrames);
 #endif
-#if IDCC_TPM_JEM
-  sps.setUseIntraTMP(m_IntraTMP);
-  sps.setIntraTMPMaxSize(m_IntraTMP_MaxSize);
+#if JVET_V0130_INTRA_TMP
+  sps.setUseIntraTMP(m_intraTMP);
+  sps.setIntraTMPMaxSize(m_intraTmpMaxSize);
 #endif
   // ADD_NEW_TOOL : (encoder lib) set tool enabling flags and associated parameters here
   sps.setUseISP                             ( m_ISP );
diff --git a/source/Lib/EncoderLib/EncModeCtrl.cpp b/source/Lib/EncoderLib/EncModeCtrl.cpp
index 67b0825473d1807c127628dfe14551828a02688c..007b25ea102de18be0060bb5673c422cff62630a 100644
--- a/source/Lib/EncoderLib/EncModeCtrl.cpp
+++ b/source/Lib/EncoderLib/EncModeCtrl.cpp
@@ -2209,8 +2209,8 @@ bool EncModeCtrlMTnoRQT::tryMode( const EncTestMode& encTestmode, const CodingSt
             int bit4 = cuECtx.ispLfnstIdx == 2;
             int bit5 = cuECtx.mipFlag;
             int bit6 = cuECtx.bestCostIsp < cuECtx.bestNonDCT2Cost * 0.95;
-#if IDCC_TPM_JEM
-			int bit7 = cuECtx.TmpFlag;
+#if JVET_V0130_INTRA_TMP
+            int bit7 = cuECtx.tmpFlag;
 #endif
             int val =
               (bit0) |
@@ -2220,8 +2220,8 @@ bool EncModeCtrlMTnoRQT::tryMode( const EncTestMode& encTestmode, const CodingSt
               (bit4 << 4) |
               (bit5 << 5) |
               (bit6 << 6) |
-#if IDCC_TPM_JEM
-			  (bit7 << 7) |
+#if JVET_V0130_INTRA_TMP
+              (bit7 << 7) |
 #endif
               ( cuECtx.bestPredModeDCT2 << 9 );
             relatedCU.ispPredModeVal     = val;
diff --git a/source/Lib/EncoderLib/EncModeCtrl.h b/source/Lib/EncoderLib/EncModeCtrl.h
index 5b62615ea0b94bb1cf4ee74eebe77560cf92ad82..528ddb8da068916047865280d04109ada77ef12c 100644
--- a/source/Lib/EncoderLib/EncModeCtrl.h
+++ b/source/Lib/EncoderLib/EncModeCtrl.h
@@ -238,8 +238,8 @@ struct ComprCUCtx
                     ( MAX_DOUBLE )
     , bestISPIntraMode
                     ( UINT8_MAX )
-#if IDCC_TPM_JEM
-	  , TmpFlag(false)
+#if JVET_V0130_INTRA_TMP
+	  , tmpFlag       (false)
 #endif
     , mipFlag       ( false )
     , ispMode       ( NOT_INTRA_SUBPARTITIONS )
@@ -286,8 +286,8 @@ struct ComprCUCtx
   double                            bestDCT2NonISPCost;
   double                            bestNonDCT2Cost;
   uint8_t                           bestISPIntraMode;
-#if IDCC_TPM_JEM
-  bool								TmpFlag;
+#if JVET_V0130_INTRA_TMP
+  bool							              	tmpFlag;
 #endif
   bool                              mipFlag;
   uint8_t                           ispMode;
@@ -396,8 +396,8 @@ public:
   void   setBestNonDCT2Cost           ( double val )            { m_ComprCUCtxList.back().bestNonDCT2Cost = val; }
   uint8_t getBestISPIntraModeRelCU    ()                  const { return m_ComprCUCtxList.back().bestISPIntraMode; }
   void   setBestISPIntraModeRelCU     ( uint8_t val )           { m_ComprCUCtxList.back().bestISPIntraMode = val; }
-#if IDCC_TPM_JEM
-  void   setTPMFlagISPPass(bool val) { m_ComprCUCtxList.back().TmpFlag = val; }
+#if JVET_V0130_INTRA_TMP
+  void   setTPMFlagISPPass            (bool val)                { m_ComprCUCtxList.back().tmpFlag = val; }   // stores val as tmpFlag of the last entry in m_ComprCUCtxList
 #endif
   void   setMIPFlagISPPass            ( bool val )              { m_ComprCUCtxList.back().mipFlag = val; }
   void   setISPMode                   ( uint8_t val )           { m_ComprCUCtxList.back().ispMode = val; }
diff --git a/source/Lib/EncoderLib/IntraSearch.cpp b/source/Lib/EncoderLib/IntraSearch.cpp
index 033ecbab76951161f7ef7acad5885c14694b0540..46474299c2704bb447f3af87fb6398c124173acd 100644
--- a/source/Lib/EncoderLib/IntraSearch.cpp
+++ b/source/Lib/EncoderLib/IntraSearch.cpp
@@ -394,7 +394,7 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c
 
   const TempCtx ctxStart          ( m_CtxCache, m_CABACEstimator->getCtx() );
   const TempCtx ctxStartMipFlag    ( m_CtxCache, SubCtx( Ctx::MipFlag,          m_CABACEstimator->getCtx() ) );
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
   const TempCtx ctxStartTpmFlag(m_CtxCache, SubCtx(Ctx::TmpFlag, m_CABACEstimator->getCtx()));
 #endif
   const TempCtx ctxStartIspMode    ( m_CtxCache, SubCtx( Ctx::ISPMode,          m_CABACEstimator->getCtx() ) );
@@ -494,9 +494,9 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c
     const bool mipAllowed = sps.getUseMIP() && isLuma(partitioner.chType) && ((cu.lfnstIdx == 0) || allowLfnstWithMip(cu.firstPU->lumaSize()));
     const bool testMip = mipAllowed && !(cu.lwidth() > (8 * cu.lheight()) || cu.lheight() > (8 * cu.lwidth()));
     const bool supportedMipBlkSize = pu.lwidth() <= MIP_MAX_WIDTH && pu.lheight() <= MIP_MAX_HEIGHT;
-#if IDCC_TPM_JEM
-	const bool tpmAllowed = sps.getUseIntraTMP() && isLuma(partitioner.chType) && ((cu.lfnstIdx == 0) || allowLfnstWithTpm());
-	const bool testTpm = tpmAllowed && (cu.lwidth() <= sps.getIntraTMPMaxSize() && cu.lheight() <= sps.getIntraTMPMaxSize());
+#if JVET_V0130_INTRA_TMP
+    const bool tpmAllowed = sps.getUseIntraTMP() && isLuma(partitioner.chType) && ((cu.lfnstIdx == 0) || allowLfnstWithTmp());
+    const bool testTpm = tpmAllowed && (cu.lwidth() <= sps.getIntraTMPMaxSize() && cu.lheight() <= sps.getIntraTMPMaxSize());
 #endif
 
     static_vector<ModeInfo, FAST_UDI_MAX_RDMODE_NUM> uiRdModeList;
@@ -573,19 +573,20 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c
                                    ? std::max(numModesForFullRD, floorLog2(std::min(pu.lwidth(), pu.lheight())) - 1)
                                    : numModesForFullRD;
           }
-#if IDCC_TPM_JEM
-		  if (testTpm)
-			  numModesForFullRD += 1; // testing tpm
-		  const int numHadCand = (testMip ? 2 : 1) * 3 + testTpm;
+#if JVET_V0130_INTRA_TMP
+          if( testTpm )
+          {
+            numModesForFullRD += 1; // testing tpm
+          }
+          const int numHadCand = (testMip ? 2 : 1) * 3 + testTpm;
+          
+          cu.tmpFlag = false;
 #else
           const int numHadCand = (testMip ? 2 : 1) * 3;
 #endif
 
           //*** Derive (regular) candidates using Hadamard
           cu.mipFlag = false;
-#if IDCC_TPM_JEM
-		  cu.TmpFlag = false;
-#endif
 
           //===== init pattern for luma prediction =====
           initIntraPatternChType(cu, pu.Y(), true);
@@ -616,8 +617,8 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c
               minSadHad += std::min(distParamSad.distFunc(distParamSad) * 2, distParamHad.distFunc(distParamHad));
 
               // NB xFracModeBitsIntra will not affect the mode for chroma that may have already been pre-estimated.
-#if IDCC_TPM_JEM
-			  m_CABACEstimator->getCtx() = SubCtx(Ctx::TmpFlag, ctxStartTpmFlag);
+#if JVET_V0130_INTRA_TMP
+              m_CABACEstimator->getCtx() = SubCtx( Ctx::TmpFlag, ctxStartTpmFlag );
 #endif
               m_CABACEstimator->getCtx() = SubCtx( Ctx::MipFlag, ctxStartMipFlag );
               m_CABACEstimator->getCtx() = SubCtx( Ctx::ISPMode, ctxStartIspMode );
@@ -693,8 +694,8 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c
 
                     // NB xFracModeBitsIntra will not affect the mode for chroma that may have already been
                     // pre-estimated.
-#if IDCC_TPM_JEM
-					m_CABACEstimator->getCtx() = SubCtx(Ctx::TmpFlag, ctxStartTpmFlag);
+#if JVET_V0130_INTRA_TMP
+                    m_CABACEstimator->getCtx() = SubCtx( Ctx::TmpFlag, ctxStartTpmFlag );
 #endif
                     m_CABACEstimator->getCtx() = SubCtx(Ctx::MipFlag, ctxStartMipFlag);
                     m_CABACEstimator->getCtx() = SubCtx(Ctx::ISPMode, ctxStartIspMode);
@@ -761,8 +762,8 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c
                     std::min(distParamSad.distFunc(distParamSad) * 2, distParamHad.distFunc(distParamHad));
 
                   // NB xFracModeBitsIntra will not affect the mode for chroma that may have already been pre-estimated.
-#if IDCC_TPM_JEM
-				  m_CABACEstimator->getCtx() = SubCtx(Ctx::TmpFlag, ctxStartTpmFlag);
+#if JVET_V0130_INTRA_TMP
+                  m_CABACEstimator->getCtx() = SubCtx( Ctx::TmpFlag, ctxStartTpmFlag );
 #endif
                   m_CABACEstimator->getCtx() = SubCtx(Ctx::MipFlag, ctxStartMipFlag);
                   m_CABACEstimator->getCtx() = SubCtx(Ctx::ISPMode, ctxStartIspMode);
@@ -806,47 +807,41 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c
               m_dSavedHadListLFNST.resize(3);
               LFNSTSaveFlag = false;
             }
-#if IDCC_TPM_JEM
-			// derive TPM candidate using hadamard
-			if (testTpm)
-			{
-				cu.TmpFlag = true;
-				cu.mipFlag = false;
-				pu.multiRefIdx = 0;
-
-
-
-				int foundCandiNum = 0;
-				bool bsuccessfull = 0;
-				CodingUnit cu_cpy = cu;
-
-				if (isRefTemplateAvailable(cu_cpy, cu_cpy.blocks[COMPONENT_Y]))
-				{
-					m_pcTrQuant->getTargetTemplate(&cu_cpy, pu.lwidth(), pu.lheight());
-					m_pcTrQuant->candidateSearchIntra(&cu_cpy, pu.lwidth(), pu.lheight());
-					bsuccessfull = m_pcTrQuant->generateTMPrediction(piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum);
-				}
-				if (bsuccessfull && foundCandiNum >= 1)
-				{
-					
-					Distortion minSadHad =
-						std::min(distParamSad.distFunc(distParamSad) * 2, distParamHad.distFunc(distParamHad));
-
-					m_CABACEstimator->getCtx() = SubCtx(Ctx::TmpFlag, ctxStartTpmFlag);
-
-					uint64_t fracModeBits = xFracModeBitsIntra(pu, 0, CHANNEL_TYPE_LUMA);
-
-					double cost = double(minSadHad) + double(fracModeBits) * sqrtLambdaForFirstPass;
-					DTRACE(g_trace_ctx, D_INTRA_COST, "IntraTPM: %u, %llu, %f (%d)\n", minSadHad, fracModeBits, cost,
-						0);
-
-					updateCandList(ModeInfo(0, 0, 0, NOT_INTRA_SUBPARTITIONS, 0, 1), cost, uiRdModeList,
-						CandCostList, numModesForFullRD);
-					updateCandList(ModeInfo(0, 0, 0, NOT_INTRA_SUBPARTITIONS, 0, 1),
-						0.8 * double(minSadHad), uiHadModeList, CandHadList, numHadCand);
-				}
-				
-			}
+#if JVET_V0130_INTRA_TMP
+            // derive intra TMP candidate using Hadamard
+            if( testTpm )
+            {
+              cu.tmpFlag = true;
+              cu.mipFlag = false;
+              pu.multiRefIdx = 0;
+
+              int foundCandiNum = 0;
+              bool bsuccessfull = 0;
+              CodingUnit cu_cpy = cu;
+
+              if( isRefTemplateAvailable( cu_cpy, cu_cpy.blocks[COMPONENT_Y] ) )
+              {
+                m_pcTrQuant->getTargetTemplate( &cu_cpy, pu.lwidth(), pu.lheight() );
+                m_pcTrQuant->candidateSearchIntra( &cu_cpy, pu.lwidth(), pu.lheight() );
+                bsuccessfull = m_pcTrQuant->generateTMPrediction( piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum );
+              }
+              if( bsuccessfull && foundCandiNum >= 1 )
+              {
+
+                Distortion minSadHad =
+                  std::min( distParamSad.distFunc( distParamSad ) * 2, distParamHad.distFunc( distParamHad ) );
+
+                m_CABACEstimator->getCtx() = SubCtx( Ctx::TmpFlag, ctxStartTpmFlag );
+
+                uint64_t fracModeBits = xFracModeBitsIntra( pu, 0, CHANNEL_TYPE_LUMA );
+
+                double cost = double( minSadHad ) + double( fracModeBits ) * sqrtLambdaForFirstPass;
+                DTRACE( g_trace_ctx, D_INTRA_COST, "IntraTPM: %u, %llu, %f (%d)\n", minSadHad, fracModeBits, cost, 0 );
+
+                updateCandList( ModeInfo( 0, 0, 0, NOT_INTRA_SUBPARTITIONS, 0, 1 ), cost, uiRdModeList, CandCostList, numModesForFullRD );
+                updateCandList( ModeInfo( 0, 0, 0, NOT_INTRA_SUBPARTITIONS, 0, 1 ), 0.8 * double( minSadHad ), uiHadModeList, CandHadList, numHadCand );
+              }
+            }
 #endif
             //*** Derive MIP candidates using Hadamard
             if (testMip && !supportedMipBlkSize)
@@ -866,8 +861,8 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c
             }
             else if (testMip)
             {
-#if IDCC_TPM_JEM
-				cu.TmpFlag = 0;
+#if JVET_V0130_INTRA_TMP
+              cu.tmpFlag = 0;
 #endif
               cu.mipFlag     = true;
               pu.multiRefIdx = 0;
@@ -1097,8 +1092,8 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c
           cs.interHad = 0;
 
           //===== reset context models =====
-#if IDCC_TPM_JEM
-		  m_CABACEstimator->getCtx() = SubCtx(Ctx::TmpFlag, ctxStartTpmFlag);
+#if JVET_V0130_INTRA_TMP
+          m_CABACEstimator->getCtx() = SubCtx( Ctx::TmpFlag, ctxStartTpmFlag );
 #endif
           m_CABACEstimator->getCtx() = SubCtx(Ctx::MipFlag, ctxStartMipFlag);
           m_CABACEstimator->getCtx() = SubCtx(Ctx::ISPMode, ctxStartIspMode);
@@ -1204,8 +1199,8 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c
         cu.dimd = true;
       }
 #endif
-#if IDCC_TPM_JEM
-	  cu.TmpFlag = uiOrgMode.tpmFlg;
+#if JVET_V0130_INTRA_TMP
+      cu.tmpFlag = uiOrgMode.tmpFlag;
 #endif
       cu.mipFlag                     = uiOrgMode.mipFlg;
       pu.mipTransposedFlag           = uiOrgMode.mipTrFlg;
@@ -1218,10 +1213,10 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c
       CHECK(cu.ispMode && cu.mipFlag, "Error: combination of ISP and MIP not supported");
       CHECK(cu.ispMode && pu.multiRefIdx, "Error: combination of ISP and MRL not supported");
       CHECK(cu.ispMode&& cu.colorTransform, "Error: combination of ISP and ACT not supported");
-#if IDCC_TPM_JEM
-	  CHECK(cu.mipFlag&& cu.TmpFlag, "Error: combination of MIP and TPM not supported");
-	  CHECK(cu.TmpFlag&& cu.ispMode, "Error: combination of TPM and ISP not supported");
-	  CHECK(cu.TmpFlag&& pu.multiRefIdx, "Error: combination of TPM and MRL not supported");
+#if JVET_V0130_INTRA_TMP
+      CHECK( cu.mipFlag && cu.tmpFlag, "Error: combination of MIP and TPM not supported" );
+      CHECK( cu.tmpFlag && cu.ispMode, "Error: combination of TPM and ISP not supported" );
+      CHECK( cu.tmpFlag && pu.multiRefIdx, "Error: combination of TPM and MRL not supported" );
 #endif
 #if ENABLE_DIMD && JVET_V0087_DIMD_NO_ISP
       CHECK(cu.ispMode && cu.dimd, "Error: combination of ISP and DIMD not supported");
@@ -1265,14 +1260,14 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c
             uiBestPUMode.ispMod, mtsCheckRangeFlag, mtsFirstCheckId, mtsLastCheckId, moreProbMTSIdxFirst);
         }
       }
-#if IDCC_TPM_JEM
-	  if (!cu.ispMode && !cu.mtsFlag && !cu.lfnstIdx && !cu.bdpcmMode && !pu.multiRefIdx && !cu.mipFlag && !cu.TmpFlag && testISP)
+#if JVET_V0130_INTRA_TMP
+      if( !cu.ispMode && !cu.mtsFlag && !cu.lfnstIdx && !cu.bdpcmMode && !pu.multiRefIdx && !cu.mipFlag && !cu.tmpFlag && testISP )
 #else
       if (!cu.ispMode && !cu.mtsFlag && !cu.lfnstIdx && !cu.bdpcmMode && !pu.multiRefIdx && !cu.mipFlag && testISP)
 #endif
       {
-#if IDCC_TPM_JEM
-		  m_regIntraRDListWithCosts.push_back(ModeInfoWithCost(cu.mipFlag, pu.mipTransposedFlag, pu.multiRefIdx, cu.ispMode, uiOrgMode.modeId, cu.TmpFlag, csTemp->cost));
+#if JVET_V0130_INTRA_TMP
+        m_regIntraRDListWithCosts.push_back( ModeInfoWithCost( cu.mipFlag, pu.mipTransposedFlag, pu.multiRefIdx, cu.ispMode, uiOrgMode.modeId, cu.tmpFlag, csTemp->cost ) );
 #else
         m_regIntraRDListWithCosts.push_back( ModeInfoWithCost( cu.mipFlag, pu.mipTransposedFlag, pu.multiRefIdx, cu.ispMode, uiOrgMode.modeId, csTemp->cost ) );
 #endif
@@ -1290,10 +1285,10 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c
       {
         m_modeCostStore[lfnstIdx][mode] = tmpValidReturn ? csTemp->cost : (MAX_DOUBLE / 2.0); //(MAX_DOUBLE / 2.0) ??
       }
-#if IDCC_TPM_JEM
-	  DTRACE(g_trace_ctx, D_INTRA_COST, "IntraCost T [x=%d,y=%d,w=%d,h=%d] %f (%d,%d,%d,%d,%d,%d,%d) \n", cu.blocks[0].x,
-		  cu.blocks[0].y, (int)width, (int)height, csTemp->cost, uiOrgMode.modeId, uiOrgMode.ispMod,
-		  pu.multiRefIdx, cu.TmpFlag, cu.mipFlag, cu.lfnstIdx, cu.mtsFlag);
+#if JVET_V0130_INTRA_TMP
+      DTRACE( g_trace_ctx, D_INTRA_COST, "IntraCost T [x=%d,y=%d,w=%d,h=%d] %f (%d,%d,%d,%d,%d,%d,%d) \n", cu.blocks[0].x,
+              cu.blocks[0].y, ( int ) width, ( int ) height, csTemp->cost, uiOrgMode.modeId, uiOrgMode.ispMod,
+              pu.multiRefIdx, cu.tmpFlag, cu.mipFlag, cu.lfnstIdx, cu.mtsFlag );
 #else
       DTRACE(g_trace_ctx, D_INTRA_COST, "IntraCost T [x=%d,y=%d,w=%d,h=%d] %f (%d,%d,%d,%d,%d,%d) \n", cu.blocks[0].x,
              cu.blocks[0].y, (int) width, (int) height, csTemp->cost, uiOrgMode.modeId, uiOrgMode.ispMod,
@@ -1395,8 +1390,8 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c
     if( validReturn )
     {
       //=== update PU data ====
-#if IDCC_TPM_JEM
-		cu.TmpFlag = uiBestPUMode.tpmFlg;
+#if JVET_V0130_INTRA_TMP
+      cu.tmpFlag = uiBestPUMode.tmpFlag;
 #endif
       cu.mipFlag = uiBestPUMode.mipFlg;
       pu.mipTransposedFlag             = uiBestPUMode.mipTrFlg;
@@ -3352,16 +3347,16 @@ void IntraSearch::xIntraCodingTUBlock(TransformUnit &tu, const ComponentID &comp
       }
       else
       {
-#if IDCC_TPM_JEM
-		  if (PU::isTmp(pu, chType))
-		  {
-			  int foundCandiNum;
-			  m_pcTrQuant->getTargetTemplate(tu.cu, pu.lwidth(), pu.lheight());
-			  m_pcTrQuant->candidateSearchIntra(tu.cu, pu.lwidth(), pu.lheight());
-			  m_pcTrQuant->generateTMPrediction(piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum);
-			  assert(foundCandiNum >= 1);
-		  }
-		  else if (PU::isMIP(pu, chType))
+#if JVET_V0130_INTRA_TMP
+        if( PU::isTmp( pu, chType ) )
+        {
+          int foundCandiNum;
+          m_pcTrQuant->getTargetTemplate( tu.cu, pu.lwidth(), pu.lheight() );
+          m_pcTrQuant->candidateSearchIntra( tu.cu, pu.lwidth(), pu.lheight() );
+          m_pcTrQuant->generateTMPrediction( piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum );
+          CHECK( foundCandiNum < 1, "" );
+        }
+        else if( PU::isMIP( pu, chType ) )
 #else
         if( PU::isMIP( pu, chType ) )
 #endif
@@ -4586,17 +4581,17 @@ bool IntraSearch::xRecurIntraCodingACTQT(CodingStructure &cs, Partitioner &parti
       PelBuf         piResi = resiBuf.bufs[compID];
 
       initIntraPatternChType(*tu.cu, area);
-#if IDCC_TPM_JEM
-	  if (PU::isTmp(pu, chType))
-	  {
-		  int foundCandiNum;
-		  m_pcTrQuant->getTargetTemplate(pu.cu, pu.lwidth(), pu.lheight());
-		  m_pcTrQuant->candidateSearchIntra(pu.cu, pu.lwidth(), pu.lheight());
-		  m_pcTrQuant->generateTMPrediction(piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum);
-		  assert(foundCandiNum >= 1);
-
-	  }
-	  else if (PU::isMIP(pu, chType))
+#if JVET_V0130_INTRA_TMP
+      if( PU::isTmp( pu, chType ) )
+      {
+        int foundCandiNum;
+        m_pcTrQuant->getTargetTemplate( pu.cu, pu.lwidth(), pu.lheight() );
+        m_pcTrQuant->candidateSearchIntra( pu.cu, pu.lwidth(), pu.lheight() );
+        m_pcTrQuant->generateTMPrediction( piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum );
+        CHECK( foundCandiNum < 1, "" );
+
+      }
+      else if( PU::isMIP( pu, chType ) )
 #else
       if (PU::isMIP(pu, chType))
 #endif
diff --git a/source/Lib/EncoderLib/IntraSearch.h b/source/Lib/EncoderLib/IntraSearch.h
index 615408ac82f72010b877b5d9c8161ffd28b0b124..31b813dabccf8007b977a5b4d48c8ed6f2e78a89 100644
--- a/source/Lib/EncoderLib/IntraSearch.h
+++ b/source/Lib/EncoderLib/IntraSearch.h
@@ -221,13 +221,12 @@ private:
     int      mRefId; // PU::multiRefIdx
     uint8_t  ispMod; // CU::ispMode
     uint32_t modeId; // PU::intraDir[CHANNEL_TYPE_LUMA]
-#if IDCC_TPM_JEM
-	bool     tpmFlg; // CU::TmpFlag
-#endif
-#if IDCC_TPM_JEM
-	ModeInfo() : mipFlg(false), mipTrFlg(false), mRefId(0), ispMod(NOT_INTRA_SUBPARTITIONS), modeId(0), tpmFlg(0) {}
-	ModeInfo(const bool mipf, const bool miptf, const int mrid, const uint8_t ispm, const uint32_t mode, const bool tpmf = 0) : mipFlg(mipf), mipTrFlg(miptf), mRefId(mrid), ispMod(ispm), modeId(mode), tpmFlg(tpmf) {}
-	bool operator==(const ModeInfo cmp) const { return (mipFlg == cmp.mipFlg && mipTrFlg == cmp.mipTrFlg && mRefId == cmp.mRefId && ispMod == cmp.ispMod && modeId == cmp.modeId && tpmFlg == cmp.tpmFlg); }
+#if JVET_V0130_INTRA_TMP
+	  bool     tmpFlag; // CU::tmpFlag
+
+	  ModeInfo() : mipFlg(false), mipTrFlg(false), mRefId(0), ispMod(NOT_INTRA_SUBPARTITIONS), modeId(0), tmpFlag(0) {}
+	  ModeInfo(const bool mipf, const bool miptf, const int mrid, const uint8_t ispm, const uint32_t mode, const bool tpmf = 0) : mipFlg(mipf), mipTrFlg(miptf), mRefId(mrid), ispMod(ispm), modeId(mode), tmpFlag(tpmf) {}
+	  bool operator==(const ModeInfo cmp) const { return (mipFlg == cmp.mipFlg && mipTrFlg == cmp.mipTrFlg && mRefId == cmp.mRefId && ispMod == cmp.ispMod && modeId == cmp.modeId && tmpFlag == cmp.tmpFlag); }
 #else
     ModeInfo() : mipFlg(false), mipTrFlg(false), mRefId(0), ispMod(NOT_INTRA_SUBPARTITIONS), modeId(0) {}
     ModeInfo(const bool mipf, const bool miptf, const int mrid, const uint8_t ispm, const uint32_t mode) : mipFlg(mipf), mipTrFlg(miptf), mRefId(mrid), ispMod(ispm), modeId(mode) {}
@@ -238,9 +237,9 @@ private:
   {
     double rdCost;
     ModeInfoWithCost() : ModeInfo(), rdCost(MAX_DOUBLE) {}
-#if IDCC_TPM_JEM
-	ModeInfoWithCost(const bool mipf, const bool miptf, const int mrid, const uint8_t ispm, const uint32_t mode, const bool tpmf, double cost) : ModeInfo(mipf, miptf, mrid, ispm, mode, tpmf), rdCost(cost) {}
-	bool operator==(const ModeInfoWithCost cmp) const { return (mipFlg == cmp.mipFlg && mipTrFlg == cmp.mipTrFlg && mRefId == cmp.mRefId && ispMod == cmp.ispMod && modeId == cmp.modeId && tpmFlg == cmp.tpmFlg && rdCost == cmp.rdCost); }
+#if JVET_V0130_INTRA_TMP
+	  ModeInfoWithCost(const bool mipf, const bool miptf, const int mrid, const uint8_t ispm, const uint32_t mode, const bool tpmf, double cost) : ModeInfo(mipf, miptf, mrid, ispm, mode, tpmf), rdCost(cost) {}
+	  bool operator==(const ModeInfoWithCost cmp) const { return (mipFlg == cmp.mipFlg && mipTrFlg == cmp.mipTrFlg && mRefId == cmp.mRefId && ispMod == cmp.ispMod && modeId == cmp.modeId && tmpFlag == cmp.tmpFlag && rdCost == cmp.rdCost); }
 #else
     ModeInfoWithCost(const bool mipf, const bool miptf, const int mrid, const uint8_t ispm, const uint32_t mode, double cost) : ModeInfo(mipf, miptf, mrid, ispm, mode), rdCost(cost) {}
     bool operator==(const ModeInfoWithCost cmp) const { return (mipFlg == cmp.mipFlg && mipTrFlg == cmp.mipTrFlg && mRefId == cmp.mRefId && ispMod == cmp.ispMod && modeId == cmp.modeId && rdCost == cmp.rdCost); }
diff --git a/source/Lib/EncoderLib/VLCWriter.cpp b/source/Lib/EncoderLib/VLCWriter.cpp
index 4d123fa1f371870762637556c002c8b8fdde1232..cec7da12760889e3721451dbd17fe3b50b56380e 100644
--- a/source/Lib/EncoderLib/VLCWriter.cpp
+++ b/source/Lib/EncoderLib/VLCWriter.cpp
@@ -1384,11 +1384,11 @@ void HLSWriter::codeSPS( const SPS* pcSPS )
 #if ENABLE_DIMD
   WRITE_FLAG( pcSPS->getUseDimd() ? 1 : 0,                                             "sps_dimd_enabled_flag");
 #endif
-#if IDCC_TPM_JEM
+#if JVET_V0130_INTRA_TMP
   WRITE_FLAG( pcSPS->getUseIntraTMP() ? 1 : 0,                                         "sps_intraTMP_enabled_flag");
   if(pcSPS->getUseIntraTMP())
   {
-    WRITE_UVLC(floorLog2(pcSPS->getIntraTMPMaxSize()), "sps_log2_intra_tmp_max_size");
+    WRITE_UVLC(floorLog2(pcSPS->getIntraTMPMaxSize()),                                 "sps_log2_intra_tmp_max_size");
   }
 #endif