From 1c1b257d552e984e2953b31d4b0b2d203c664a8a Mon Sep 17 00:00:00 2001
From: Vadim Seregin <vseregin@qti.qualcomm.com>
Date: Thu, 21 Oct 2021 08:49:46 -0700
Subject: [PATCH] Lossless code cleanup, mainly TMP related

---
 source/Lib/CommonLib/InterPrediction.cpp      |   4 +
 source/Lib/CommonLib/IntraPrediction.cpp      | 546 +++++++++++++++++-
 source/Lib/CommonLib/IntraPrediction.h        |  87 ++-
 source/Lib/CommonLib/TrQuant.cpp              | 543 -----------------
 source/Lib/CommonLib/TrQuant.h                |  83 +--
 source/Lib/CommonLib/TypeDef.h                |  16 +-
 source/Lib/CommonLib/x86/InitX86.cpp          |  25 +-
 source/Lib/CommonLib/x86/IntraX86.h           | 321 ++++++++++
 source/Lib/CommonLib/x86/TrQuantX86.h         | 270 +--------
 source/Lib/CommonLib/x86/avx/Intra_avx.cpp    |   1 +
 source/Lib/CommonLib/x86/avx2/Intra_avx2.cpp  |   1 +
 .../Lib/CommonLib/x86/sse41/Intra_sse41.cpp   |   1 +
 .../Lib/CommonLib/x86/sse42/Intra_sse42.cpp   |   1 +
 source/Lib/DecoderLib/DecCu.cpp               |  25 +-
 source/Lib/EncoderLib/IntraSearch.cpp         |  42 +-
 15 files changed, 1030 insertions(+), 936 deletions(-)
 create mode 100644 source/Lib/CommonLib/x86/IntraX86.h
 create mode 100644 source/Lib/CommonLib/x86/avx/Intra_avx.cpp
 create mode 100644 source/Lib/CommonLib/x86/avx2/Intra_avx2.cpp
 create mode 100644 source/Lib/CommonLib/x86/sse41/Intra_sse41.cpp
 create mode 100644 source/Lib/CommonLib/x86/sse42/Intra_sse42.cpp

diff --git a/source/Lib/CommonLib/InterPrediction.cpp b/source/Lib/CommonLib/InterPrediction.cpp
index 7fe099653..e4873b7a5 100644
--- a/source/Lib/CommonLib/InterPrediction.cpp
+++ b/source/Lib/CommonLib/InterPrediction.cpp
@@ -48,6 +48,10 @@
 #include "Reshape.h"
 #endif
 
+#if ENABLE_SIMD_TMP
+#include "CommonDefX86.h"
+#endif
+
 //! \ingroup CommonLib
 //! \{
 
diff --git a/source/Lib/CommonLib/IntraPrediction.cpp b/source/Lib/CommonLib/IntraPrediction.cpp
index 5e5a66392..f16ddaa80 100644
--- a/source/Lib/CommonLib/IntraPrediction.cpp
+++ b/source/Lib/CommonLib/IntraPrediction.cpp
@@ -107,6 +107,9 @@ IntraPrediction::IntraPrediction()
 #if MMLM
   m_encPreRDRun = false;
 #endif
+#if JVET_V0130_INTRA_TMP
+  m_pppTarPatch = NULL;
+#endif
 }
 
 IntraPrediction::~IntraPrediction()
@@ -140,6 +143,31 @@ void IntraPrediction::destroy()
     buffer.destroy();
   }
   m_tempBuffer.clear();
+
+#if JVET_V0130_INTRA_TMP
+  if( m_pppTarPatch != NULL )
+  {
+    for( unsigned int uiDepth = 0; uiDepth < USE_MORE_BLOCKSIZE_DEPTH_MAX; uiDepth++ )
+    {
+      unsigned int blkSize = g_uiDepth2Width[uiDepth];
+
+      unsigned int patchSize = blkSize + TMP_TEMPLATE_SIZE;
+      for( unsigned int uiRow = 0; uiRow < patchSize; uiRow++ )
+      {
+        if( m_pppTarPatch[uiDepth][uiRow] != NULL )
+        {
+          delete[]m_pppTarPatch[uiDepth][uiRow]; m_pppTarPatch[uiDepth][uiRow] = NULL;
+        }
+      }
+      if( m_pppTarPatch[uiDepth] != NULL )
+      {
+        delete[]m_pppTarPatch[uiDepth]; m_pppTarPatch[uiDepth] = NULL;
+      }
+    }
+    delete[] m_pppTarPatch;
+    m_pppTarPatch = NULL;
+  }
+#endif
 }
 
 void IntraPrediction::init(ChromaFormat chromaFormatIDC, const unsigned bitDepthY)
@@ -210,6 +238,34 @@ void IntraPrediction::init(ChromaFormat chromaFormatIDC, const unsigned bitDepth
   {
     buffer.create( chromaFormatIDC, Area( 0, 0, MAX_CU_SIZE, MAX_CU_SIZE ) );
   }
+
+#if JVET_V0130_INTRA_TMP
+  unsigned int blkSize;
+
+  if( m_pppTarPatch == NULL )
+  {
+    m_pppTarPatch = new Pel * *[USE_MORE_BLOCKSIZE_DEPTH_MAX];
+    for( unsigned int uiDepth = 0; uiDepth < USE_MORE_BLOCKSIZE_DEPTH_MAX; uiDepth++ )
+    {
+      blkSize = g_uiDepth2Width[uiDepth];
+
+      unsigned int patchSize = blkSize + TMP_TEMPLATE_SIZE;
+      m_pppTarPatch[uiDepth] = new Pel *[patchSize];
+      for( unsigned int uiRow = 0; uiRow < patchSize; uiRow++ )
+      {
+        m_pppTarPatch[uiDepth][uiRow] = new Pel[patchSize];
+      }
+    }
+  }
+
+  m_calcTemplateDiff = calcTemplateDiff;
+#endif
+
+#if ENABLE_SIMD_TMP
+#ifdef TARGET_SIMD_X86
+  initIntraX86();
+#endif
+#endif
 }
 
 #if JVET_W0123_TIMD_FUSION
@@ -1984,7 +2040,7 @@ void IntraPrediction::initIntraPatternChTypeISP(const CodingUnit& cu, const Comp
 
 #if JVET_V0130_INTRA_TMP
 #if JVET_W0069_TMP_BOUNDARY
-RefTemplateType IntraPrediction::GetRefTemplateType(CodingUnit& cu, CompArea& area)
+RefTemplateType IntraPrediction::getRefTemplateType(CodingUnit& cu, CompArea& area)
 #else
 bool IntraPrediction::isRefTemplateAvailable(CodingUnit& cu, CompArea& area)
 #endif
@@ -5556,4 +5612,492 @@ int &a, int &b, int &iShift)
 }
 #endif
 
+#if JVET_V0130_INTRA_TMP
+void insertNode( int diff, int& iXOffset, int& iYOffset, int& pDiff, int& pX, int& pY, short& pId, unsigned int& setId ) // Unconditionally replaces the stored candidate (pDiff, pX, pY, pId) with (diff, iXOffset, iYOffset, setId); the callers in this file do the "diff < pDiff" test first.
+{
+  pDiff = diff;   // cost of the candidate
+  pX = iXOffset;  // horizontal offset of the candidate
+  pY = iYOffset;  // vertical offset of the candidate
+  pId = setId;    // unsigned int is implicitly converted to short here
+}
+
+void clipMvIntraConstraint( CodingUnit* pcCU, int regionId, int& iHorMin, int& iHorMax, int& iVerMin, int& iVerMax, unsigned int uiTemplateSize, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int iCurrY, int iCurrX, int offsetLCUY, int offsetLCUX ) // Computes the search window of region regionId (0, 1 or 2) and returns it through iHorMin/iHorMax/iVerMin/iVerMax as offsets relative to (iCurrX, iCurrY); any other regionId leaves the four outputs untouched.
+{
+  int searchRangeWidth = TMP_SEARCH_RANGE_MULT_FACTOR * uiBlkWidth;   // horizontal search range is a multiple of the block width
+  int searchRangeHeight = TMP_SEARCH_RANGE_MULT_FACTOR * uiBlkHeight; // vertical search range is a multiple of the block height
+  int iMvShift = 0;                   // always 0 in this function, so every "<< iMvShift" below leaves its operand unchanged
+  int iTemplateSize = uiTemplateSize; // signed copies of the unsigned parameters, used in the int arithmetic below
+  int iBlkWidth = uiBlkWidth;
+  int iBlkHeight = uiBlkHeight;
+  if( regionId == 0 ) //above outside LCU
+  {
+    iHorMax = std::min( (iCurrX + searchRangeWidth) << iMvShift, ( int ) ((pcCU->cs->pps->getPicWidthInLumaSamples() - iBlkWidth) << iMvShift) ); // right limit: search range, capped at picture width minus block width
+    iHorMin = std::max( (iTemplateSize) << iMvShift, (iCurrX - searchRangeWidth) << iMvShift ); // left limit: search range, but never closer than the template size to x = 0
+
+    iVerMax = (iCurrY - iBlkHeight - offsetLCUY) << iMvShift; // the caller in this file passes offsetLCUY = iCurrY - CTU y, so this is (CTU y - block height)
+    iVerMin = std::max( ((iTemplateSize) << iMvShift), ((iCurrY - searchRangeHeight) << iMvShift) ); // top limit: search range, but never closer than the template size to y = 0
+
+    iHorMin = iHorMin - iCurrX; // convert the absolute limits into offsets relative to the current position
+    iHorMax = iHorMax - iCurrX;
+    iVerMax = iVerMax - iCurrY;
+    iVerMin = iVerMin - iCurrY;
+  }
+  else if( regionId == 1 ) //left outside LCU
+  {
+    iHorMax = (iCurrX - offsetLCUX - iBlkWidth) << iMvShift; // with offsetLCUX = iCurrX - CTU x (as passed by the caller in this file) this is (CTU x - block width)
+    iHorMin = std::max( (iTemplateSize) << iMvShift, (iCurrX - searchRangeWidth) << iMvShift );
+
+    iVerMin = std::max( (iTemplateSize) << iMvShift, (iCurrY - iBlkHeight - offsetLCUY) << iMvShift ); // starts at (CTU y - block height), clamped to the template size
+    iVerMax = (iCurrY) << iMvShift; // down to the current row
+
+    iHorMin = iHorMin - iCurrX; // convert the absolute limits into offsets relative to the current position
+    iHorMax = iHorMax - iCurrX;
+    iVerMax = iVerMax - iCurrY;
+    iVerMin = iVerMin - iCurrY;
+  }
+  else if( regionId == 2 ) //left outside LCU (can reach the bottom row of LCU)
+  {
+    iHorMin = std::max( (iTemplateSize) << iMvShift, (iCurrX - searchRangeWidth) << iMvShift );
+    iHorMax = (iCurrX - offsetLCUX - iBlkWidth) << iMvShift; // same horizontal window as region 1
+    iVerMin = (iCurrY + 1) << iMvShift; // starts on the row right below the current one
+    iVerMax = std::min( pcCU->cs->pps->getPicHeightInLumaSamples() - iBlkHeight, (iCurrY - offsetLCUY + pcCU->cs->sps->getCTUSize() - iBlkHeight) << iMvShift ); // NOTE: the first operand is not shifted by iMvShift, unlike the other limits (no effect while iMvShift is 0)
+
+    iHorMin = iHorMin - iCurrX; // convert the absolute limits into offsets relative to the current position
+    iHorMax = iHorMax - iCurrX;
+    iVerMax = iVerMax - iCurrY;
+    iVerMin = iVerMin - iCurrY;
+  }
+}
+
+TempLibFast::TempLibFast() // Zero every field: searchCandidateFromOnePicIntra() reads getX()/getY()/getId() before anything has been stored, which would otherwise read indeterminate values.
+  : m_pX( 0 ), m_pY( 0 ), m_pXInteger( 0 ), m_pYInteger( 0 ), m_pDiffInteger( 0 ), m_pIdInteger( 0 ), m_pDiff( 0 ), m_pId( 0 ), m_diffMax( 0 ) // listed in declaration order
+{}
+
+TempLibFast::~TempLibFast() // Nothing to release: the class declares only int and short data members.
+{
+}
+
+void TempLibFast::initTemplateDiff( unsigned int uiPatchWidth, unsigned int uiPatchHeight, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int bitDepth ) // Resets the best cost (m_pDiff) and the acceptance threshold (m_diffMax) to the same starting value.
+{
+  int maxValue = ((1 << bitDepth) >> (INIT_THRESHOULD_SHIFTBITS)) * (uiPatchHeight * uiPatchWidth - uiBlkHeight * uiBlkWidth); // per-sample bound times the number of patch samples that lie outside the block
+  m_diffMax = maxValue; // candidateSearchIntra() compares the final best cost against this value
+  {
+    m_pDiff = maxValue; // the search only accepts a candidate whose cost is strictly below the current m_pDiff
+  }
+}
+
+#if JVET_W0069_TMP_BOUNDARY
+void IntraPrediction::getTargetTemplate( CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight, RefTemplateType tempType )
+#else
+void IntraPrediction::getTargetTemplate( CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight )
+#endif
+{ // Copies the template samples next to the current luma block from the picture's getRecoBuf() buffer into the pre-allocated patch m_pppTarPatch[uiTarDepth].
+  const ComponentID compID = COMPONENT_Y;
+  unsigned int uiPatchWidth = uiBlkWidth + TMP_TEMPLATE_SIZE;   // patch = block plus template columns
+  unsigned int uiPatchHeight = uiBlkHeight + TMP_TEMPLATE_SIZE; // patch = block plus template rows
+  unsigned int uiTarDepth = floorLog2( std::max( uiBlkHeight, uiBlkWidth ) ) - 2; // patch index from the larger block side; presumably 4 -> 0 ... 64 -> 4, matching g_uiDepth2Width
+  Pel** tarPatch = m_pppTarPatch[uiTarDepth];
+  CompArea area = pcCU->blocks[compID];
+  Pel* pCurrStart = pcCU->cs->picture->getRecoBuf( area ).buf; // buffer pointer for the CU's luma area
+  unsigned int  uiPicStride = pcCU->cs->picture->getRecoBuf( compID ).stride;
+  unsigned int uiY, uiX;
+
+  //fill template
+  //up-left & up rows of the template
+  Pel* tarTemp;
+#if JVET_W0069_TMP_BOUNDARY
+  if( tempType == L_SHAPE_TEMPLATE )
+  {
+#endif
+    Pel* pCurrTemp = pCurrStart - TMP_TEMPLATE_SIZE * uiPicStride - TMP_TEMPLATE_SIZE; // TMP_TEMPLATE_SIZE rows above and columns left of pCurrStart
+    for( uiY = 0; uiY < TMP_TEMPLATE_SIZE; uiY++ )
+    {
+      tarTemp = tarPatch[uiY];
+      for( uiX = 0; uiX < uiPatchWidth; uiX++ ) // full patch width: corner plus the samples above the block
+      {
+        tarTemp[uiX] = pCurrTemp[uiX];
+      }
+      pCurrTemp += uiPicStride;
+    }
+    //left columns; pCurrTemp continues from the row reached by the loop above
+    for( uiY = TMP_TEMPLATE_SIZE; uiY < uiPatchHeight; uiY++ )
+    {
+      tarTemp = tarPatch[uiY];
+      for( uiX = 0; uiX < TMP_TEMPLATE_SIZE; uiX++ ) // only the first TMP_TEMPLATE_SIZE columns of each remaining row
+      {
+        tarTemp[uiX] = pCurrTemp[uiX];
+      }
+      pCurrTemp += uiPicStride;
+    }
+#if JVET_W0069_TMP_BOUNDARY
+  }
+  else if( tempType == ABOVE_TEMPLATE )
+  {
+    Pel* pCurrTemp = pCurrStart - TMP_TEMPLATE_SIZE * uiPicStride; // TMP_TEMPLATE_SIZE rows above pCurrStart, same column
+    for( uiY = 0; uiY < TMP_TEMPLATE_SIZE; uiY++ )
+    {
+      tarTemp = tarPatch[uiY];
+      for( uiX = 0; uiX < uiBlkWidth; uiX++ ) // block width only: no corner samples are copied
+      {
+        tarTemp[uiX] = pCurrTemp[uiX];
+      }
+      pCurrTemp += uiPicStride;
+    }
+  }
+  else if( tempType == LEFT_TEMPLATE )
+  {
+    Pel* pCurrTemp = pCurrStart - TMP_TEMPLATE_SIZE; // TMP_TEMPLATE_SIZE columns left of pCurrStart, same row
+    for( uiY = TMP_TEMPLATE_SIZE; uiY < uiPatchHeight; uiY++ ) // stored in patch rows TMP_TEMPLATE_SIZE .. uiPatchHeight-1; the first rows are not written
+    {
+      tarTemp = tarPatch[uiY];
+      for( uiX = 0; uiX < TMP_TEMPLATE_SIZE; uiX++ )
+      {
+        tarTemp[uiX] = pCurrTemp[uiX];
+      }
+      pCurrTemp += uiPicStride;
+    }
+  }
+#endif
+}
+
+#if JVET_W0069_TMP_BOUNDARY
+void IntraPrediction::candidateSearchIntra( CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight, RefTemplateType tempType )
+#else
+void IntraPrediction::candidateSearchIntra( CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight )
+#endif
+{ // Resets the cost threshold, runs the template search for the current block and records in m_uiVaildCandiNum whether a candidate was accepted.
+  const ComponentID compID = COMPONENT_Y;
+  const int channelBitDepth = pcCU->cs->sps->getBitDepth( toChannelType( compID ) );
+  unsigned int uiPatchWidth = uiBlkWidth + TMP_TEMPLATE_SIZE;
+  unsigned int uiPatchHeight = uiBlkHeight + TMP_TEMPLATE_SIZE;
+  unsigned int uiTarDepth = floorLog2( std::max( uiBlkWidth, uiBlkHeight ) ) - 2; // same patch index formula as getTargetTemplate()
+  Pel** tarPatch = getTargetPatch( uiTarDepth ); // this function does not fill the patch; it only reads the pointer
+  //Initialize the library for saving the best candidates
+  m_tempLibFast.initTemplateDiff( uiPatchWidth, uiPatchHeight, uiBlkWidth, uiBlkHeight, channelBitDepth );
+  short setId = 0; //record the reference picture; converted to the unsigned int parameter of the call below
+#if JVET_W0069_TMP_BOUNDARY
+  searchCandidateFromOnePicIntra( pcCU, tarPatch, uiPatchWidth, uiPatchHeight, setId, tempType );
+#else
+  searchCandidateFromOnePicIntra( pcCU, tarPatch, uiPatchWidth, uiPatchHeight, setId );
+#endif
+  //count collected candidate number
+  int pDiff = m_tempLibFast.getDiff();      // best cost after the search
+  int maxDiff = m_tempLibFast.getDiffMax(); // starting value set by initTemplateDiff()
+
+
+  if( pDiff < maxDiff ) // the cost dropped below its starting value, i.e. at least one candidate was accepted
+  {
+    m_uiVaildCandiNum = 1;
+  }
+  else
+  {
+    m_uiVaildCandiNum = 0; // generateTMPrediction() returns false in this case
+  }
+}
+
+#if JVET_W0069_TMP_BOUNDARY
+void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, unsigned int setId, RefTemplateType tempType )
+#else
+void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, unsigned int setId )
+#endif
+{ // Evaluates m_calcTemplateDiff at every offset of the search windows (one window inside the CTU, then three from clipMvIntraConstraint) and keeps the lowest-cost offset in m_tempLibFast.
+  const ComponentID compID = COMPONENT_Y;
+  unsigned int uiBlkWidth = uiPatchWidth - TMP_TEMPLATE_SIZE;   // recover the block size from the patch size
+  unsigned int uiBlkHeight = uiPatchHeight - TMP_TEMPLATE_SIZE;
+
+  int pX = m_tempLibFast.getX();       // local working copies of the best candidate; written back at the end of the function
+  int pY = m_tempLibFast.getY();
+  int pDiff = m_tempLibFast.getDiff(); // current best cost; candidateSearchIntra() resets it via initTemplateDiff() before calling
+  short pId = m_tempLibFast.getId();
+  CompArea area = pcCU->blocks[compID];
+  int  refStride = pcCU->cs->picture->getRecoBuf( compID ).stride;
+
+  Pel* ref = pcCU->cs->picture->getRecoBuf( area ).buf; // buffer pointer for the CU's luma area; all offsets below are relative to it
+
+  setRefPicUsed( ref ); //remembered so that generateTMPrediction() can address the chosen candidate later
+  setStride( refStride );
+
+  Mv cTmpMvPred;        // set to zero below and not read again in this function
+  cTmpMvPred.setZero();
+
+  unsigned int uiCUPelY = area.pos().y;
+  unsigned int uiCUPelX = area.pos().x;
+  int blkX = 0;
+  int blkY = 0;
+  int iCurrY = uiCUPelY + blkY; // blkX/blkY are 0, so this is the CU position itself
+  int iCurrX = uiCUPelX + blkX;
+
+  Position  ctuRsAddr = CU::getCtuXYAddr( *pcCU );
+  int offsetLCUY = iCurrY - ctuRsAddr.y; // CU position minus the value returned by getCtuXYAddr (presumably the CTU's top-left corner)
+  int offsetLCUX = iCurrX - ctuRsAddr.x;
+
+  int iYOffset, iXOffset;
+  int diff;
+  Pel* refCurr;
+
+  const int regionNum = 3;
+  int mvYMins[regionNum]; // per-region search limits, stored as offsets relative to (iCurrX, iCurrY)
+  int mvYMaxs[regionNum];
+  int mvXMins[regionNum];
+  int mvXMaxs[regionNum];
+  int regionId = 0;
+
+  //1. check the near pixels within LCU
+  //above pixels in LCU
+  int iTemplateSize = TMP_TEMPLATE_SIZE;
+  int iBlkWidth = uiBlkWidth;
+  int iBlkHeight = uiBlkHeight;
+  regionId = 0;
+  int iMvShift = 0; // always 0, so the shifts below leave their operands unchanged
+
+  int iVerMin = std::max( ((iTemplateSize) << iMvShift), (iCurrY - offsetLCUY - iBlkHeight + 1) << iMvShift ); // (ctuRsAddr.y - block height + 1), clamped to the template size
+  int iVerMax = (iCurrY - iBlkHeight) << iMvShift; // one block height above the current position
+  int iHorMin = std::max( (iTemplateSize) << iMvShift, (iCurrX - offsetLCUX - iBlkWidth + 1) << iMvShift ); // (ctuRsAddr.x - block width + 1), clamped to the template size
+  int iHorMax = (iCurrX - iBlkWidth); // one block width left of the current position (written without the shift)
+
+  mvXMins[regionId] = iHorMin - iCurrX; // convert to offsets; only entry 0 is filled for this first stage
+  mvXMaxs[regionId] = iHorMax - iCurrX;
+  mvYMins[regionId] = iVerMin - iCurrY;
+  mvYMaxs[regionId] = iVerMax - iCurrY;
+
+  //check within CTU pixels
+  for( regionId = 0; regionId < 1; regionId++ ) // single pass over entry 0
+  {
+    int mvYMin = mvYMins[regionId];
+    int mvYMax = mvYMaxs[regionId];
+    int mvXMin = mvXMins[regionId];
+    int mvXMax = mvXMaxs[regionId];
+    if( mvYMax < mvYMin || mvXMax < mvXMin ) // empty window
+    {
+      continue;
+    }
+
+    for( iYOffset = mvYMax; iYOffset >= mvYMin; iYOffset-- ) // scan from the largest offset down to the smallest
+    {
+      for( iXOffset = mvXMax; iXOffset >= mvXMin; iXOffset-- )
+      {
+        refCurr = ref + iYOffset * refStride + iXOffset; // candidate block position
+#if JVET_W0069_TMP_BOUNDARY
+        diff = m_calcTemplateDiff( refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, pDiff, tempType ); // pDiff is passed as iMax
+#else
+        diff = m_calcTemplateDiff( refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, pDiff ); // pDiff is passed as iMax
+#endif
+        if( diff < (pDiff) ) // strictly better only, so on equal cost the earlier visited offset is kept
+        {
+          insertNode( diff, iXOffset, iYOffset, pDiff, pX, pY, pId, setId );
+        }
+        if( pDiff == 0 )
+        {
+          regionId++; // NOTE: only bumps the outer loop counter; the two offset loops above still run to completion
+        }
+      }
+    }
+  }
+
+  //2. check the pixels outside CTU
+  for( regionId = 0; regionId < regionNum; regionId++ )
+  {
+    // this function fills in the range the template matching for pixels outside the current CTU (entry 0 from stage 1 is overwritten here)
+    clipMvIntraConstraint( pcCU, regionId, mvXMins[regionId], mvXMaxs[regionId], mvYMins[regionId], mvYMaxs[regionId], TMP_TEMPLATE_SIZE, uiBlkWidth, uiBlkHeight, iCurrY, iCurrX, offsetLCUY, offsetLCUX );
+  }
+
+  for( regionId = 0; regionId < regionNum; regionId++ )
+  {
+    int mvYMin = mvYMins[regionId];
+    int mvYMax = mvYMaxs[regionId];
+    int mvXMin = mvXMins[regionId];
+    int mvXMax = mvXMaxs[regionId];
+    if( mvYMax < mvYMin || mvXMax < mvXMin ) // empty window for this region
+    {
+      continue;
+    }
+    for( iYOffset = mvYMax; iYOffset >= mvYMin; iYOffset-- )
+    {
+      for( iXOffset = mvXMax; iXOffset >= mvXMin; iXOffset-- )
+      {
+        refCurr = ref + iYOffset * refStride + iXOffset; // candidate block position
+#if JVET_W0069_TMP_BOUNDARY
+        diff = m_calcTemplateDiff( refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, pDiff, tempType ); // pDiff is passed as iMax
+#else
+        diff = m_calcTemplateDiff( refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, pDiff ); // pDiff is passed as iMax
+#endif
+
+        if( diff < (pDiff) ) // strictly better only
+        {
+          insertNode( diff, iXOffset, iYOffset, pDiff, pX, pY, pId, setId );
+        }
+
+        if( pDiff == 0 )
+        {
+          regionId = regionNum; // NOTE: skips the remaining regions, but the two offset loops of this region still run to completion
+        }
+      }
+    }
+  }
+
+  m_tempLibFast.m_pX = pX; // publish the best candidate found
+  m_tempLibFast.m_pY = pY;
+  m_tempLibFast.m_pDiff = pDiff;
+  m_tempLibFast.m_pId = pId;
+}
+bool IntraPrediction::generateTMPrediction( Pel* piPred, unsigned int uiStride, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int& foundCandiNum ) // Copies the block at the best offset found by the search into piPred; returns false (and writes nothing to piPred) when no candidate was accepted.
+{
+  bool bSucceedFlag = true; // never changed below; the only failure path is the early return
+  unsigned int uiPatchWidth = uiBlkWidth + TMP_TEMPLATE_SIZE;
+  unsigned int uiPatchHeight = uiBlkHeight + TMP_TEMPLATE_SIZE;
+
+  foundCandiNum = m_uiVaildCandiNum; // 0 or 1, as set by candidateSearchIntra()
+  if( foundCandiNum < 1 )
+  {
+    return false;
+  }
+
+  int pX = m_tempLibFast.getX(); // best offset stored by searchCandidateFromOnePicIntra()
+  int pY = m_tempLibFast.getY();
+  Pel* ref;
+  int picStride = getStride();   // stride stored by searchCandidateFromOnePicIntra()
+  int iOffsetY, iOffsetX;
+  Pel* refTarget;
+  unsigned int uiHeight = uiPatchHeight - TMP_TEMPLATE_SIZE; // equals uiBlkHeight
+  unsigned int uiWidth = uiPatchWidth - TMP_TEMPLATE_SIZE;   // equals uiBlkWidth
+
+  //the data center: we use the prediction block as the center now.
+  //collect the candidates
+  ref = getRefPicUsed(); // pointer stored by searchCandidateFromOnePicIntra()
+  {
+    iOffsetY = pY;
+    iOffsetX = pX;
+    refTarget = ref + iOffsetY * picStride + iOffsetX; // top-left sample of the chosen candidate block
+    for( unsigned int uiY = 0; uiY < uiHeight; uiY++ )
+    {
+      for( unsigned int uiX = 0; uiX < uiWidth; uiX++ )
+      {
+        piPred[uiX] = refTarget[uiX];
+      }
+      refTarget += picStride; // advance source and destination by their own strides
+      piPred += uiStride;
+    }
+  }
+  return bSucceedFlag;
+}
+
+#if JVET_W0069_TMP_BOUNDARY
+bool IntraPrediction::generateTmDcPrediction( Pel* piPred, unsigned int uiStride, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int DC_Val ) // Fills the uiBlkWidth x uiBlkHeight block at piPred (row pitch uiStride) with the constant DC_Val; always returns true.
+{
+  bool bSucceedFlag = true; // never changed below
+  {
+    for( unsigned int uiY = 0; uiY < uiBlkHeight; uiY++ )
+    {
+      for( unsigned int uiX = 0; uiX < uiBlkWidth; uiX++ )
+      {
+        piPred[uiX] = DC_Val; // int value is implicitly converted to Pel
+      }
+      piPred += uiStride; // next output row
+    }
+  }
+  return bSucceedFlag;
+}
+#endif
+
+#if JVET_W0069_TMP_BOUNDARY
+int IntraPrediction::calcTemplateDiff( Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax, RefTemplateType tempType )
+#else
+int IntraPrediction::calcTemplateDiff( Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax )
+#endif
+{ // Returns the sum of absolute differences between the template around ref and the one stored in tarPatch; returns early with a partial sum as soon as that sum exceeds iMax.
+  int diffSum = 0;
+#if JVET_W0069_TMP_BOUNDARY
+  Pel* refPatchRow; // assigned only for the three template types handled below; those are also the only types that read it
+  if( tempType == L_SHAPE_TEMPLATE )
+  {
+    refPatchRow = ref - TMP_TEMPLATE_SIZE * uiStride - TMP_TEMPLATE_SIZE; // TMP_TEMPLATE_SIZE rows above and columns left of ref
+  }
+  else if( tempType == LEFT_TEMPLATE )
+  {
+    refPatchRow = ref - TMP_TEMPLATE_SIZE; // TMP_TEMPLATE_SIZE columns left of ref, same row
+  }
+  else if( tempType == ABOVE_TEMPLATE )
+  {
+    refPatchRow = ref - TMP_TEMPLATE_SIZE * uiStride; // TMP_TEMPLATE_SIZE rows above ref, same column
+  }
+#else
+  Pel* refPatchRow = ref - TMP_TEMPLATE_SIZE * uiStride - TMP_TEMPLATE_SIZE; // TMP_TEMPLATE_SIZE rows above and columns left of ref
+#endif
+  Pel* tarPatchRow;
+
+#if JVET_W0069_TMP_BOUNDARY
+  if( tempType == L_SHAPE_TEMPLATE )
+  {
+#endif
+    // horizontal difference: the first TMP_TEMPLATE_SIZE rows over the full patch width
+    for( int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++ )
+    {
+      tarPatchRow = tarPatch[iY];
+      for( int iX = 0; iX < uiPatchWidth; iX++ )
+      {
+        diffSum += abs( refPatchRow[iX] - tarPatchRow[iX] );
+      }
+      if( diffSum > iMax ) //for speeding up: checked once per row, so the returned value is a partial sum
+      {
+        return diffSum;
+      }
+      refPatchRow += uiStride;
+    }
+
+    // vertical difference: the first TMP_TEMPLATE_SIZE columns of the remaining rows
+    for( int iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++ )
+    {
+      tarPatchRow = tarPatch[iY];
+      for( int iX = 0; iX < TMP_TEMPLATE_SIZE; iX++ )
+      {
+        diffSum += abs( refPatchRow[iX] - tarPatchRow[iX] );
+      }
+      if( diffSum > iMax ) //for speeding up
+      {
+        return diffSum;
+      }
+      refPatchRow += uiStride;
+    }
+#if JVET_W0069_TMP_BOUNDARY
+  }
+  else if( tempType == ABOVE_TEMPLATE )
+  {
+    // top  template difference: TMP_TEMPLATE_SIZE rows, block width only
+    for( int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++ )
+    {
+      tarPatchRow = tarPatch[iY];
+      for( int iX = 0; iX < uiPatchWidth - TMP_TEMPLATE_SIZE; iX++ ) // uiPatchWidth - TMP_TEMPLATE_SIZE is the block width
+      {
+        diffSum += abs( refPatchRow[iX] - tarPatchRow[iX] );
+      }
+      if( diffSum > iMax ) //for speeding up
+      {
+        return diffSum;
+      }
+      refPatchRow += uiStride;
+    }
+  }
+  else if( tempType == LEFT_TEMPLATE )
+  {
+    // left template difference: patch rows TMP_TEMPLATE_SIZE .. uiPatchHeight-1, first TMP_TEMPLATE_SIZE columns
+    for( int iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++ )
+    {
+      tarPatchRow = tarPatch[iY];
+      for( int iX = 0; iX < TMP_TEMPLATE_SIZE; iX++ )
+      {
+        diffSum += abs( refPatchRow[iX] - tarPatchRow[iX] );
+      }
+      if( diffSum > iMax ) //for speeding up
+      {
+        return diffSum;
+      }
+      refPatchRow += uiStride;
+    }
+  }
+#endif
+
+  return diffSum; // complete sum (0 if tempType matched none of the branches above)
+}
+#endif
+
 //! \}
diff --git a/source/Lib/CommonLib/IntraPrediction.h b/source/Lib/CommonLib/IntraPrediction.h
index d89f5d2e7..b97ad8975 100644
--- a/source/Lib/CommonLib/IntraPrediction.h
+++ b/source/Lib/CommonLib/IntraPrediction.h
@@ -66,6 +66,43 @@ enum PredBuf
 
 static const uint32_t MAX_INTRA_FILTER_DEPTHS=8;
 
+#if JVET_V0130_INTRA_TMP
+extern unsigned int g_uiDepth2Width[5];
+extern unsigned int g_uiDepth2MaxCandiNum[5];
+
+class TempLibFast // Holds one template-matching candidate (offset, cost, id) together with the initial cost threshold.
+{
+public:
+  int m_pX;    //offset X of the stored candidate
+  int m_pY;    //offset Y of the stored candidate
+  int m_pXInteger;    //offset X for integer pixel search
+  int m_pYInteger;    //offset Y for integer pixel search
+  int m_pDiffInteger; //cost belonging to the integer pixel search fields (presumably; verify against users)
+  int getXInteger() { return m_pXInteger; }
+  int getYInteger() { return m_pYInteger; }
+  int getDiffInteger() { return m_pDiffInteger; }
+  short m_pIdInteger; //frame id
+  short getIdInteger() { return m_pIdInteger; }
+  int m_pDiff; //cost of the stored candidate; IntraPrediction::calcTemplateDiff computes it as a sum of absolute differences
+  short m_pId; //frame id
+
+  TempLibFast();
+  ~TempLibFast();
+  //void init();
+  int getX() { return m_pX; }
+  int getY() { return m_pY; }
+  int getDiff() { return m_pDiff; }
+  short getId() { return m_pId; }
+  /*void initDiff(unsigned int uiPatchSize, int bitDepth);
+  void initDiff(unsigned int uiPatchSize, int bitDepth, int iCandiNumber);*/
+  void initTemplateDiff( unsigned int uiPatchWidth, unsigned int uiPatchHeight, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int bitDepth ); // sets m_pDiff and m_diffMax to the same starting value
+  int m_diffMax; //starting value of m_pDiff; a final m_pDiff below it means a candidate was accepted
+  int getDiffMax() { return m_diffMax; }
+};
+
+typedef short TrainDataType;
+#endif
+
 class IntraPrediction
 {
 #if MMLM
@@ -140,6 +177,16 @@ protected:
   bool         m_bestScanRotationMode;
   std::vector<PelStorage>   m_tempBuffer;
 
+#if JVET_V0130_INTRA_TMP
+  int          m_uiPartLibSize;
+  TempLibFast  m_tempLibFast;
+  Pel*         m_refPicUsed;
+  Picture*     m_refPicBuf;
+  unsigned int m_uiPicStride;
+  unsigned int m_uiVaildCandiNum;
+  Pel***       m_pppTarPatch;
+#endif
+
   // prediction
   void xPredIntraPlanar           ( const CPelBuf &pSrc, PelBuf &pDst );
   void xPredIntraDc               ( const CPelBuf &pSrc, PelBuf &pDst, const ChannelType channelType, const bool enableBoundaryFilter = true );
@@ -204,7 +251,7 @@ public:
   virtual ~IntraPrediction();
 
 #if JVET_W0069_TMP_BOUNDARY
-  RefTemplateType GetRefTemplateType(CodingUnit& cu, CompArea& area);
+  RefTemplateType getRefTemplateType(CodingUnit& cu, CompArea& area);
 #endif
 
   void init                       (ChromaFormat chromaFormatIDC, const unsigned bitDepthY);
@@ -259,6 +306,44 @@ public:
   Pel* getPredictorPtr2           (const ComponentID compID, uint32_t idx) { return m_yuvExt2[compID][idx]; }
   void switchBuffer               (const PredictionUnit &pu, ComponentID compID, PelBuf srcBuff, Pel *dst);
 #endif
+
+#if JVET_V0130_INTRA_TMP
+#if JVET_W0069_TMP_BOUNDARY
+  int( *m_calcTemplateDiff )(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax, RefTemplateType TempType);
+  static int calcTemplateDiff( Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax, RefTemplateType TempType );
+#else
+  int( *m_calcTemplateDiff )(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax);
+  static int calcTemplateDiff( Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax );
+#endif
+  Pel** getTargetPatch( unsigned int uiDepth ) { return m_pppTarPatch[uiDepth]; }
+  Pel* getRefPicUsed() { return m_refPicUsed; }
+  void setRefPicUsed( Pel* ref ) { m_refPicUsed = ref; }
+  unsigned int getStride() { return m_uiPicStride; }
+  void         setStride( unsigned int uiPicStride ) { m_uiPicStride = uiPicStride; }
+
+#if JVET_W0069_TMP_BOUNDARY
+  void searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, unsigned int setId, RefTemplateType tempType );
+  void candidateSearchIntra( CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight, RefTemplateType tempType );
+#else
+  void searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, unsigned int setId );
+  void candidateSearchIntra( CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight );
+#endif
+  bool generateTMPrediction( Pel* piPred, unsigned int uiStride, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int& foundCandiNum );
+#if JVET_W0069_TMP_BOUNDARY
+  bool generateTmDcPrediction( Pel* piPred, unsigned int uiStride, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int DC_Val );
+  void getTargetTemplate( CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight, RefTemplateType tempType );
+#else
+  void getTargetTemplate( CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight );
+#endif
+#endif
+
+#if ENABLE_SIMD_TMP
+#ifdef TARGET_SIMD_X86
+  void    initIntraX86();
+  template <X86_VEXT vext>
+  void    _initIntraX86();
+#endif
+#endif
 };
 #if ENABLE_DIMD
 int  buildHistogram(const Pel *pReco, int iStride, uint32_t uiHeight, uint32_t uiWidth, int* piHistogram, int direction, int bw, int bh);
diff --git a/source/Lib/CommonLib/TrQuant.cpp b/source/Lib/CommonLib/TrQuant.cpp
index ce6634820..349675100 100644
--- a/source/Lib/CommonLib/TrQuant.cpp
+++ b/source/Lib/CommonLib/TrQuant.cpp
@@ -55,10 +55,6 @@
 #include "CommonLib/CodingStatistics.h"
 #endif
 
-#if ENABLE_SIMD_TMP
-#include "CommonDefX86.h"
-#endif
-
 #if JVET_V0130_INTRA_TMP
 unsigned int g_uiDepth2Width[5] = { 4, 8, 16, 32, 64 };
 #endif
@@ -195,9 +191,6 @@ TrQuant::TrQuant() : m_quant( nullptr )
     m_fwdICT[-2]  = fwdTransformCbCr<-2>;
     m_fwdICT[ 3]  = fwdTransformCbCr< 3>;
     m_fwdICT[-3]  = fwdTransformCbCr<-3>;
-#if JVET_V0130_INTRA_TMP
-	  m_pppTarPatch = NULL;
-#endif
   }
 }
 
@@ -208,31 +201,6 @@ TrQuant::~TrQuant()
     delete m_quant;
     m_quant = nullptr;
   }
-
-#if JVET_V0130_INTRA_TMP
-  if (m_pppTarPatch != NULL)
-  {
-	  for (unsigned int uiDepth = 0; uiDepth < USE_MORE_BLOCKSIZE_DEPTH_MAX; uiDepth++)
-	  {
-		  unsigned int blkSize = g_uiDepth2Width[uiDepth];
-
-		  unsigned int patchSize = blkSize + TMP_TEMPLATE_SIZE;
-		  for (unsigned int uiRow = 0; uiRow < patchSize; uiRow++)
-		  {
-			  if (m_pppTarPatch[uiDepth][uiRow] != NULL)
-			  {
-				  delete[]m_pppTarPatch[uiDepth][uiRow]; m_pppTarPatch[uiDepth][uiRow] = NULL;
-			  }
-		  }
-		  if (m_pppTarPatch[uiDepth] != NULL)
-		  {
-			  delete[]m_pppTarPatch[uiDepth]; m_pppTarPatch[uiDepth] = NULL;
-		  }
-	  }
-	  delete[] m_pppTarPatch;
-	  m_pppTarPatch = NULL;
-  }
-#endif
 }
 
 #if ENABLE_SPLIT_PARALLELISM
@@ -270,27 +238,6 @@ void TrQuant::init( const Quant* otherQuant,
     m_quant->init( uiMaxTrSize, bUseRDOQ, bUseRDOQTS, useSelectiveRDOQ );
   }
 
-
-#if JVET_V0130_INTRA_TMP
-  unsigned int blkSize;
-  
-  if (m_pppTarPatch == NULL)
-  {
-	  m_pppTarPatch = new Pel * *[USE_MORE_BLOCKSIZE_DEPTH_MAX];
-	  for (unsigned int uiDepth = 0; uiDepth < USE_MORE_BLOCKSIZE_DEPTH_MAX; uiDepth++)
-	  {
-		  blkSize = g_uiDepth2Width[uiDepth];
-
-		  unsigned int patchSize = blkSize + TMP_TEMPLATE_SIZE;
-		  m_pppTarPatch[uiDepth] = new Pel * [patchSize];
-		  for (unsigned int uiRow = 0; uiRow < patchSize; uiRow++)
-		  {
-			  m_pppTarPatch[uiDepth][uiRow] = new Pel[patchSize];
-		  }
-	  }
-}
-#endif
-
 #if TU_256
   fastFwdTrans =
   { {
@@ -348,9 +295,6 @@ void TrQuant::init( const Quant* otherQuant,
 #if ENABLE_SIMD_SIGN_PREDICTION
   m_computeSAD = xComputeSAD;
 #endif
-#if JVET_V0130_INTRA_TMP
-  m_calcTemplateDiff = calcTemplateDiff;
-#endif
 
 #if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT
 #ifdef TARGET_SIMD_X86
@@ -468,493 +412,6 @@ void TrQuant::invLfnstNxN( int* src, int* dst, const uint32_t mode, const uint32
   }
 }
 
-#if JVET_V0130_INTRA_TMP
-void insertNode(int diff, int& iXOffset, int& iYOffset, int& pDiff, int& pX, int& pY, short& pId, unsigned int& setId)
-{
-	pDiff = diff;
-	pX = iXOffset;
-	pY = iYOffset;
-	pId = setId;
-}
-
-void clipMvIntraConstraint(CodingUnit* pcCU, int regionId, int& iHorMin, int& iHorMax, int& iVerMin, int& iVerMax, unsigned int uiTemplateSize, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int iCurrY, int iCurrX, int offsetLCUY, int offsetLCUX)
-{
-	int searchRangeWidth  = TMP_SEARCH_RANGE_MULT_FACTOR * uiBlkWidth;
-	int searchRangeHeight = TMP_SEARCH_RANGE_MULT_FACTOR * uiBlkHeight;
-	int iMvShift = 0;
-	int iTemplateSize = uiTemplateSize;
-	int iBlkWidth = uiBlkWidth;
-	int iBlkHeight = uiBlkHeight;
-	if (regionId == 0) //above outside LCU
-	{
-    iHorMax = std::min((iCurrX + searchRangeWidth) << iMvShift, (int)((pcCU->cs->pps->getPicWidthInLumaSamples() - iBlkWidth) << iMvShift) );
-		iHorMin = std::max((iTemplateSize) << iMvShift, (iCurrX - searchRangeWidth) << iMvShift);
-
-		iVerMax = (iCurrY - iBlkHeight - offsetLCUY) << iMvShift;
-		iVerMin = std::max(((iTemplateSize) << iMvShift), ((iCurrY - searchRangeHeight) << iMvShift));
-
-		iHorMin = iHorMin - iCurrX;
-		iHorMax = iHorMax - iCurrX;
-		iVerMax = iVerMax - iCurrY;
-		iVerMin = iVerMin - iCurrY;
-	}
-	else if (regionId == 1) //left outside LCU
-	{
-		iHorMax = (iCurrX - offsetLCUX - iBlkWidth) << iMvShift;
-		iHorMin = std::max((iTemplateSize) << iMvShift, (iCurrX - searchRangeWidth) << iMvShift);
-
-		iVerMin = std::max((iTemplateSize) << iMvShift, (iCurrY - iBlkHeight - offsetLCUY) << iMvShift);
-		iVerMax = (iCurrY) << iMvShift;
-
-		iHorMin = iHorMin - iCurrX;
-		iHorMax = iHorMax - iCurrX;
-		iVerMax = iVerMax - iCurrY;
-		iVerMin = iVerMin - iCurrY;
-	}
-	else if (regionId == 2) //left outside LCU (can reach the bottom row of LCU)
-	{
-		iHorMin = std::max((iTemplateSize) << iMvShift, (iCurrX - searchRangeWidth) << iMvShift);
-		iHorMax = (iCurrX - offsetLCUX - iBlkWidth) << iMvShift;
-		iVerMin = (iCurrY + 1) << iMvShift;
-    iVerMax = std::min(pcCU->cs->pps->getPicHeightInLumaSamples() - iBlkHeight, (iCurrY - offsetLCUY + pcCU->cs->sps->getCTUSize() - iBlkHeight) << iMvShift);
-
-		iHorMin = iHorMin - iCurrX;
-		iHorMax = iHorMax - iCurrX;
-		iVerMax = iVerMax - iCurrY;
-		iVerMin = iVerMin - iCurrY;
-	}
-}
-
-TempLibFast::TempLibFast()
-{
-}
-
-TempLibFast::~TempLibFast()
-{
-}
-
-void TempLibFast::initTemplateDiff(unsigned int uiPatchWidth, unsigned int uiPatchHeight, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int bitDepth)
-{
-	int maxValue = ((1 << bitDepth) >> (INIT_THRESHOULD_SHIFTBITS)) * (uiPatchHeight * uiPatchWidth - uiBlkHeight * uiBlkWidth);
-	m_diffMax = maxValue;
-	{
-		m_pDiff = maxValue;
-	}
-}
-
-#if JVET_W0069_TMP_BOUNDARY
-void TrQuant::getTargetTemplate(CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight, RefTemplateType TempType)
-#else
-void TrQuant::getTargetTemplate(CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight)
-#endif
-{
-	const ComponentID compID = COMPONENT_Y;
-	unsigned int uiPatchWidth = uiBlkWidth + TMP_TEMPLATE_SIZE;
-	unsigned int uiPatchHeight = uiBlkHeight + TMP_TEMPLATE_SIZE;
-	unsigned int uiTarDepth = floorLog2(std::max(uiBlkHeight, uiBlkWidth)) - 2;
-	Pel** tarPatch = m_pppTarPatch[uiTarDepth];
-	CompArea area = pcCU->blocks[compID];
-	Pel* pCurrStart = pcCU->cs->picture->getRecoBuf(area).buf;
-	unsigned int  uiPicStride = pcCU->cs->picture->getRecoBuf(compID).stride;
-	unsigned int uiY, uiX;
-
-	//fill template
-	//up-left & up 
-	Pel* tarTemp;
-#if JVET_W0069_TMP_BOUNDARY
-	if (TempType == L_SHAPE_TEMPLATE)
-	{
-#endif
-	Pel* pCurrTemp = pCurrStart - TMP_TEMPLATE_SIZE * uiPicStride - TMP_TEMPLATE_SIZE;
-	for (uiY = 0; uiY < TMP_TEMPLATE_SIZE; uiY++)
-	{
-		tarTemp = tarPatch[uiY]; 
-		for (uiX = 0; uiX < uiPatchWidth; uiX++)
-		{
-			tarTemp[uiX] = pCurrTemp[uiX];
-		}
-		pCurrTemp += uiPicStride;
-	}
-	//left
-	for (uiY = TMP_TEMPLATE_SIZE; uiY < uiPatchHeight; uiY++)
-	{
-		tarTemp = tarPatch[uiY];
-		for (uiX = 0; uiX < TMP_TEMPLATE_SIZE; uiX++)
-		{
-			tarTemp[uiX] = pCurrTemp[uiX];
-		}
-		pCurrTemp += uiPicStride;
-	}
-#if JVET_W0069_TMP_BOUNDARY
-	}
-  else if (TempType == ABOVE_TEMPLATE)
-  {
-    Pel* pCurrTemp = pCurrStart - TMP_TEMPLATE_SIZE * uiPicStride;
-    for (uiY = 0; uiY < TMP_TEMPLATE_SIZE; uiY++)
-    {
-      tarTemp = tarPatch[uiY];
-      for (uiX = 0; uiX < uiBlkWidth; uiX++)
-      {
-        tarTemp[uiX] = pCurrTemp[uiX];
-      }
-      pCurrTemp += uiPicStride;
-    }
-  }
-	else if (TempType == LEFT_TEMPLATE)
-	{
-		Pel* pCurrTemp = pCurrStart - TMP_TEMPLATE_SIZE;
-		for (uiY = TMP_TEMPLATE_SIZE; uiY < uiPatchHeight; uiY++)
-		{
-			tarTemp = tarPatch[uiY];
-			for (uiX = 0; uiX < TMP_TEMPLATE_SIZE; uiX++)
-			{
-				tarTemp[uiX] = pCurrTemp[uiX];
-			}
-			pCurrTemp += uiPicStride;
-		}
-	}
-#endif
-}
-
-#if JVET_W0069_TMP_BOUNDARY
-void TrQuant::candidateSearchIntra(CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight, RefTemplateType TempType)
-#else
-void TrQuant::candidateSearchIntra(CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight)
-#endif
-{
-	const ComponentID compID = COMPONENT_Y;
-	const int channelBitDepth = pcCU->cs->sps->getBitDepth(toChannelType(compID));
-	unsigned int uiPatchWidth = uiBlkWidth + TMP_TEMPLATE_SIZE;
-	unsigned int uiPatchHeight = uiBlkHeight + TMP_TEMPLATE_SIZE;
-	unsigned int uiTarDepth = floorLog2(std::max(uiBlkWidth, uiBlkHeight)) - 2;
-	Pel** tarPatch = getTargetPatch(uiTarDepth);
-	//Initialize the library for saving the best candidates
-	m_tempLibFast.initTemplateDiff(uiPatchWidth, uiPatchHeight, uiBlkWidth, uiBlkHeight, channelBitDepth);
-	short setId = 0; //record the reference picture.
-#if JVET_W0069_TMP_BOUNDARY
-	searchCandidateFromOnePicIntra(pcCU, tarPatch, uiPatchWidth, uiPatchHeight, setId, TempType);
-#else
-	searchCandidateFromOnePicIntra(pcCU, tarPatch, uiPatchWidth, uiPatchHeight, setId);
-#endif
-	//count collected candidate number
-	int pDiff = m_tempLibFast.getDiff();
-	int maxDiff = m_tempLibFast.getDiffMax();
-	
-
-  if( pDiff < maxDiff )
-  {
-    m_uiVaildCandiNum = 1;
-  }
-  else
-  {
-    m_uiVaildCandiNum = 0;
-  }
-}
-
-#if JVET_W0069_TMP_BOUNDARY
-void  TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, unsigned int setId, RefTemplateType TempType)
-#else
-void  TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, unsigned int setId)
-#endif
-{
-	const ComponentID compID = COMPONENT_Y;
-	unsigned int uiBlkWidth = uiPatchWidth - TMP_TEMPLATE_SIZE;
-	unsigned int uiBlkHeight = uiPatchHeight - TMP_TEMPLATE_SIZE;
-
-	int pX = m_tempLibFast.getX();
-	int pY = m_tempLibFast.getY();
-	int pDiff = m_tempLibFast.getDiff();
-	short pId = m_tempLibFast.getId();
-	CompArea area = pcCU->blocks[compID];
-	int  refStride = pcCU->cs->picture->getRecoBuf(compID).stride;
-	
-	Pel* ref = pcCU->cs->picture->getRecoBuf(area).buf;
-	
-	setRefPicUsed(ref); //facilitate the access of each candidate point 
-	setStride(refStride);
-	
-	Mv cTmpMvPred;
-	cTmpMvPred.setZero();
-
-	unsigned int uiCUPelY = area.pos().y;
-	unsigned int uiCUPelX = area.pos().x;
-	int blkX = 0;
-	int blkY = 0;
-	int iCurrY = uiCUPelY + blkY;
-	int iCurrX = uiCUPelX + blkX;
-
-	Position  ctuRsAddr = CU::getCtuXYAddr(*pcCU);
-	int offsetLCUY = iCurrY - ctuRsAddr.y;
-	int offsetLCUX = iCurrX - ctuRsAddr.x;
-
-	int iYOffset, iXOffset;
-	int diff;
-	Pel* refCurr;
-
-	const int regionNum = 3;
-	int mvYMins[regionNum];
-	int mvYMaxs[regionNum];
-	int mvXMins[regionNum];
-	int mvXMaxs[regionNum];
-	int regionId = 0;
-
-	//1. check the near pixels within LCU
-	//above pixels in LCU
-	int iTemplateSize = TMP_TEMPLATE_SIZE;
-	int iBlkWidth = uiBlkWidth;
-	int iBlkHeight = uiBlkHeight;
-	regionId = 0;
-	int iMvShift = 0;
-
-	int iVerMin = std::max(((iTemplateSize) << iMvShift), (iCurrY - offsetLCUY - iBlkHeight + 1) << iMvShift);
-	int iVerMax = (iCurrY - iBlkHeight) << iMvShift; 
-	int iHorMin = std::max((iTemplateSize) << iMvShift, (iCurrX - offsetLCUX - iBlkWidth + 1) << iMvShift);
-	int iHorMax = (iCurrX - iBlkWidth);
-
-	mvXMins[regionId] = iHorMin - iCurrX;
-	mvXMaxs[regionId] = iHorMax - iCurrX;
-	mvYMins[regionId] = iVerMin - iCurrY;
-	mvYMaxs[regionId] = iVerMax - iCurrY;
-
-	//check within CTU pixels
-	for (regionId = 0; regionId < 1; regionId++)
-	{
-		int mvYMin = mvYMins[regionId];
-		int mvYMax = mvYMaxs[regionId];
-		int mvXMin = mvXMins[regionId];
-		int mvXMax = mvXMaxs[regionId];
-		if (mvYMax < mvYMin || mvXMax < mvXMin)
-		{
-			continue;
-		}
-
-		for (iYOffset = mvYMax; iYOffset >= mvYMin; iYOffset--)
-		{
-			for (iXOffset = mvXMax; iXOffset >= mvXMin; iXOffset--)
-			{
-				refCurr = ref + iYOffset * refStride + iXOffset;
-#if JVET_W0069_TMP_BOUNDARY
-				diff = m_calcTemplateDiff(refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, pDiff, TempType);
-#else
-				diff = m_calcTemplateDiff(refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, pDiff);
-#endif
-				if (diff < (pDiff))
-				{
-					insertNode(diff, iXOffset, iYOffset, pDiff, pX, pY, pId, setId); 
-				}
-        if (pDiff == 0)
-        {
-          regionId++;
-        }
-			}
-		}
-	}
-
-	//2. check the pixels outside CTU
-	for (regionId = 0; regionId < regionNum; regionId++)
-	{// this function fills in the range the template matching for pixels outside the current CTU
-		clipMvIntraConstraint(pcCU, regionId, mvXMins[regionId], mvXMaxs[regionId], mvYMins[regionId], mvYMaxs[regionId], TMP_TEMPLATE_SIZE, uiBlkWidth, uiBlkHeight, iCurrY, iCurrX, offsetLCUY, offsetLCUX);
-	}
-
-	for (regionId = 0; regionId < regionNum; regionId++)
-	{
-		int mvYMin = mvYMins[regionId];
-		int mvYMax = mvYMaxs[regionId];
-		int mvXMin = mvXMins[regionId];
-		int mvXMax = mvXMaxs[regionId];
-		if ( mvYMax < mvYMin || mvXMax < mvXMin )
-		{
-			continue;
-		}
-		for (iYOffset = mvYMax; iYOffset >= mvYMin; iYOffset--)
-		{
-			for (iXOffset = mvXMax; iXOffset >= mvXMin; iXOffset--)
-			{
-				refCurr = ref + iYOffset * refStride + iXOffset;
-#if JVET_W0069_TMP_BOUNDARY
-				diff = m_calcTemplateDiff(refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, pDiff, TempType);
-#else
-				diff = m_calcTemplateDiff(refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, pDiff);
-#endif
-
-				if (diff < (pDiff))
-				{
-					insertNode(diff, iXOffset, iYOffset, pDiff, pX, pY, pId, setId);
-				}
-
-        if (pDiff == 0)
-        {
-          regionId = regionNum;
-        }
-			}
-		}
-	}
-
-	m_tempLibFast.m_pX = pX;
-	m_tempLibFast.m_pY = pY;
-	m_tempLibFast.m_pDiff = pDiff;
-	m_tempLibFast.m_pId = pId;
-}
-bool TrQuant::generateTMPrediction(Pel* piPred, unsigned int uiStride, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int& foundCandiNum)
-{
-	bool bSucceedFlag = true;
-	unsigned int uiPatchWidth = uiBlkWidth + TMP_TEMPLATE_SIZE;
-	unsigned int uiPatchHeight = uiBlkHeight + TMP_TEMPLATE_SIZE;
-
-	foundCandiNum = m_uiVaildCandiNum;
-	if (foundCandiNum < 1)
-	{
-		return false;
-	}
-
-	int pX = m_tempLibFast.getX();
-	int pY = m_tempLibFast.getY();
-	Pel* ref;
-	int picStride = getStride();
-	int iOffsetY, iOffsetX;
-	Pel* refTarget;
-	unsigned int uiHeight = uiPatchHeight - TMP_TEMPLATE_SIZE;
-	unsigned int uiWidth = uiPatchWidth - TMP_TEMPLATE_SIZE;
-
-	//the data center: we use the prediction block as the center now.
-	//collect the candidates
-	ref = getRefPicUsed();
-	{
-		iOffsetY = pY;
-		iOffsetX = pX;
-		refTarget = ref + iOffsetY * picStride + iOffsetX;
-		for (unsigned int uiY = 0; uiY < uiHeight; uiY++)
-		{
-			for (unsigned int uiX = 0; uiX < uiWidth; uiX++)
-			{
-				piPred[uiX] = refTarget[uiX];
-			}
-			refTarget += picStride;
-			piPred += uiStride;
-		}
-	}
-	return bSucceedFlag;
-}
-
-#if JVET_W0069_TMP_BOUNDARY
-bool TrQuant::generateTM_DC_Prediction(Pel* piPred, unsigned int uiStride, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int DC_Val)
-{
-  bool bSucceedFlag = true;
-  {
-    for (unsigned int uiY = 0; uiY < uiBlkHeight; uiY++)
-    {
-      for (unsigned int uiX = 0; uiX < uiBlkWidth; uiX++)
-      {
-        piPred[uiX] = DC_Val;
-      }
-      piPred += uiStride;
-    }
-  }
-	return bSucceedFlag;
-}
-#endif
-
-#if JVET_W0069_TMP_BOUNDARY
-int TrQuant::calcTemplateDiff(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax, RefTemplateType TempType)
-#else
-int TrQuant::calcTemplateDiff(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax)
-#endif
-{
-  int iDiffSum = 0;
-#if JVET_W0069_TMP_BOUNDARY
-  Pel* refPatchRow;
-  if( TempType == L_SHAPE_TEMPLATE )
-  {
-    refPatchRow = ref - TMP_TEMPLATE_SIZE * uiStride - TMP_TEMPLATE_SIZE;
-  }
-  else if( TempType == LEFT_TEMPLATE )
-  {
-    refPatchRow = ref - TMP_TEMPLATE_SIZE;
-  }
-  else if( TempType == ABOVE_TEMPLATE )
-  {
-    refPatchRow = ref - TMP_TEMPLATE_SIZE * uiStride;
-  }
-#else
-  Pel* refPatchRow = ref - TMP_TEMPLATE_SIZE * uiStride - TMP_TEMPLATE_SIZE;
-#endif
-  Pel* tarPatchRow;
-
-#if JVET_W0069_TMP_BOUNDARY
-  if( TempType == L_SHAPE_TEMPLATE )
-  {
-#endif
-  // horizontal difference
-  for( int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++ )
-  {
-    tarPatchRow = tarPatch[iY];
-    for( int iX = 0; iX < uiPatchWidth; iX++ )
-    {
-      iDiffSum += abs( refPatchRow[iX] - tarPatchRow[iX] );
-    }
-    if( iDiffSum > iMax ) //for speeding up
-    {
-      return iDiffSum;
-    }
-    refPatchRow += uiStride;
-  }
-
-  // vertical difference
-  for( int iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++ )
-  {
-    tarPatchRow = tarPatch[iY];
-    for( int iX = 0; iX < TMP_TEMPLATE_SIZE; iX++ )
-    {
-      iDiffSum += abs( refPatchRow[iX] - tarPatchRow[iX] );
-    }
-    if( iDiffSum > iMax ) //for speeding up
-    {
-      return iDiffSum;
-    }
-    refPatchRow += uiStride;
-  }
-#if JVET_W0069_TMP_BOUNDARY
-	}
-  else if (TempType == ABOVE_TEMPLATE)
-  {
-    // top  template difference
-    for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++)
-    {
-      tarPatchRow = tarPatch[iY];
-      for (int iX = 0; iX < uiPatchWidth - TMP_TEMPLATE_SIZE; iX++)
-      {
-        iDiffSum += abs(refPatchRow[iX] - tarPatchRow[iX]);
-      }
-      if (iDiffSum > iMax) //for speeding up
-      {
-        return iDiffSum;
-      }
-      refPatchRow += uiStride;
-    }
-  }
-  else if (TempType == LEFT_TEMPLATE)
-  {
-	  // left template difference
-	  for (int iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++)
-	  {
-		  tarPatchRow = tarPatch[iY];
-		  for (int iX = 0; iX < TMP_TEMPLATE_SIZE; iX++)
-		  {
-			  iDiffSum += abs(refPatchRow[iX] - tarPatchRow[iX]);
-		  }
-		  if (iDiffSum > iMax) //for speeding up
-		  {
-			  return iDiffSum;
-		  }
-		  refPatchRow += uiStride;
-	  }
-  }
-#endif
-
-  return iDiffSum;
-}
-#endif
-
 uint32_t TrQuant::getLFNSTIntraMode( int wideAngPredMode )
 {
   uint32_t intraMode;
diff --git a/source/Lib/CommonLib/TrQuant.h b/source/Lib/CommonLib/TrQuant.h
index ec9b42c35..3bc60d777 100644
--- a/source/Lib/CommonLib/TrQuant.h
+++ b/source/Lib/CommonLib/TrQuant.h
@@ -56,47 +56,6 @@ typedef void FwdTrans(const TCoeff*, TCoeff*, int, int, int, int);
 typedef void InvTrans(const TCoeff*, TCoeff*, int, int, int, int, const TCoeff, const TCoeff);
 
 
-
-#if JVET_V0130_INTRA_TMP
-extern unsigned int g_uiDepth2Width[5];
-extern unsigned int g_uiDepth2MaxCandiNum[5];
-
-class TempLibFast
-{
-public:
-	int m_pX;    //offset X
-	int m_pY;    //offset Y
-	int m_pXInteger;    //offset X for integer pixel search
-	int m_pYInteger;    //offset Y for integer pixel search
-	int m_pDiffInteger;
-	int getXInteger() { return m_pXInteger; }
-	int getYInteger() { return m_pYInteger; }
-	int getDiffInteger() { return m_pDiffInteger; }
-	short m_pIdInteger; //frame id
-	short getIdInteger() { return m_pIdInteger; }
-	int m_pDiff; //mse
-	short m_pId; //frame id
-	
-
-	TempLibFast();
-	~TempLibFast();
-	//void init();
-	int getX() { return m_pX; }
-	int getY() { return m_pY; }
-	int getDiff() { return m_pDiff; }
-	short getId() { return m_pId; }
-	/*void initDiff(unsigned int uiPatchSize, int bitDepth);
-	void initDiff(unsigned int uiPatchSize, int bitDepth, int iCandiNumber);*/
-	void initTemplateDiff(unsigned int uiPatchWidth, unsigned int uiPatchHeight, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int bitDepth);
-	int m_diffMax;
-	int getDiffMax() { return m_diffMax; }
-};
-
-
-typedef short TrainDataType;
-#endif
-
-
 // ====================================================================================================================
 // Class definition
 // ====================================================================================================================
@@ -128,37 +87,6 @@ public:
 #else
   void fwdLfnstNxN( int* src, int* dst, const uint32_t mode, const uint32_t index, const uint32_t size, int zeroOutSize );
   void invLfnstNxN( int* src, int* dst, const uint32_t mode, const uint32_t index, const uint32_t size, int zeroOutSize );
-#endif
-#if JVET_V0130_INTRA_TMP
-#if JVET_W0069_TMP_BOUNDARY
-  int (*m_calcTemplateDiff)(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax, RefTemplateType TempType);
-  static int calcTemplateDiff(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax, RefTemplateType TempType);
-#else
-  int ( *m_calcTemplateDiff )(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax);
-  static int calcTemplateDiff(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax);
-#endif
-  Pel** getTargetPatch(unsigned int uiDepth)       { return m_pppTarPatch[uiDepth]; }
-  Pel* getRefPicUsed()                             { return m_refPicUsed; }
-  void setRefPicUsed(Pel* ref)                     { m_refPicUsed = ref; }
-  unsigned int getStride()                         { return m_uiPicStride; }
-  void         setStride(unsigned int uiPicStride) { m_uiPicStride = uiPicStride; }
-
-#if JVET_W0069_TMP_BOUNDARY
-  void searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, unsigned int setId, RefTemplateType TempType);
-  void candidateSearchIntra(CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight, RefTemplateType TempType);
-#else
-  void searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, unsigned int setId);
-  void candidateSearchIntra(CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight);
-#endif
-  bool generateTMPrediction(Pel* piPred, unsigned int uiStride, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int& foundCandiNum);
-#if JVET_W0069_TMP_BOUNDARY
-  bool generateTM_DC_Prediction(Pel* piPred, unsigned int uiStride, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int DC_Val);
-#endif
-#if JVET_W0069_TMP_BOUNDARY
-  void getTargetTemplate(CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight, RefTemplateType TempType);
-#else
-  void getTargetTemplate(CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight);
-#endif
 #endif
 
   uint32_t getLFNSTIntraMode( int wideAngPredMode );
@@ -217,15 +145,6 @@ public:
 
 protected:
   TCoeff   m_tempCoeff[MAX_TB_SIZEY * MAX_TB_SIZEY];
-#if JVET_V0130_INTRA_TMP
-  int          m_uiPartLibSize;
-  TempLibFast  m_tempLibFast;
-  Pel*         m_refPicUsed;
-  Picture*     m_refPicBuf;
-  unsigned int m_uiPicStride;
-  unsigned int m_uiVaildCandiNum;
-  Pel***       m_pppTarPatch;
-#endif
 #if SIGN_PREDICTION
   Pel      m_tempSignPredResid[SIGN_PRED_MAX_BS * SIGN_PRED_MAX_BS * 2]{0};
   Pel      m_signPredTemplate[SIGN_PRED_FREQ_RANGE*SIGN_PRED_FREQ_RANGE*SIGN_PRED_MAX_BS*2];
@@ -305,7 +224,7 @@ private:
   static void fastInverseTransform_SIMD( const TCoeff *coeff, TCoeff *block, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum );
 #endif
 
-#if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT || ENABLE_SIMD_TMP
+#if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT
 #ifdef TARGET_SIMD_X86
   void    initTrQuantX86();
   template <X86_VEXT vext>
diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h
index 351e19286..38d9110f4 100644
--- a/source/Lib/CommonLib/TypeDef.h
+++ b/source/Lib/CommonLib/TypeDef.h
@@ -65,6 +65,8 @@
 
 #define REMOVE_PCM                                        1 // Remove PCM related code for memory reduction and speedup
 
+#define JVET_X0049_BDMVR_SW_OPT                           1 // JVET-X0049: software optimization for BDMVR (lossless)
+
 // SIMD optimizations
 #define MCIF_SIMD_NEW                                     1 // SIMD for interpolation
 #define DIST_SSE_ENABLE                                   1 // Enable SIMD for SSE
@@ -109,15 +111,19 @@
 #define INTRA_6TAP                                        1 // 6TapCubic + 6 TapGaussian + left side 4 tap weak filtering for intra.
 #define SECONDARY_MPM                                     1 // Primary MPM and Secondary MPM: Add neighbouring modes into MPMs from positions AR, BL, AL, derived modes
 #define ENABLE_DIMD                                       1 // Decoder side intra mode derivation
-#define JVET_V0087_DIMD_NO_ISP                  ENABLE_DIMD // JVET-V0087: disallow combination of DIMD and ISP
+#if ENABLE_DIMD
+#define JVET_V0087_DIMD_NO_ISP                            1 // JVET-V0087: disallow combination of DIMD and ISP
+#define JVET_X0124_TMP_SIGNAL                             1 // JVET-X0124: cleanup on signalling of intra template matching
+#endif
 #define JVET_V0130_INTRA_TMP                              1 // JVET-V0130: template matching prediction
 #define JVET_W0069_TMP_BOUNDARY								            1 // JVET-W0069: boundary handling for TMP
 #define JVET_W0123_TIMD_FUSION                            1 // JVET-W0123: Template based intra mode derivation and fusion
-#define JVET_X0148_TIMD_PDPC         JVET_W0123_TIMD_FUSION // JVET-X0148: PDPC handling for TIMD
+#if JVET_W0123_TIMD_FUSION
+#define JVET_X0148_TIMD_PDPC                              1 // JVET-X0148: PDPC handling for TIMD
+#endif
 #if ENABLE_DIMD || JVET_W0123_TIMD_FUSION
 #define JVET_X0149_TIMD_DIMD_LUT                          1 // JVET-X0149: LUT-based derivation of DIMD and TIMD
 #endif
-#define JVET_X0124_TMP_SIGNAL                   ENABLE_DIMD // JVET-X0124: cleanup on signalling of intra template matching
 
 // Inter
 #define CIIP_PDPC                                         1 // apply pdpc to megre prediction as a new CIIP mode (CIIP_PDPC) additional to CIIP mode
@@ -131,13 +137,11 @@
 #define IF_12TAP                                          1 // 12-tap IF
 #define ENABLE_OBMC                                       1 // Enable Overlapped Block Motion Compensation
 
-#define JVET_X0049_BDMVR_SW_OPT                           1 // JVET-X0049: software optimization for BDMVR (lossless)
 #if JVET_X0049_BDMVR_SW_OPT
 #define JVET_X0049_ADAPT_DMVR                             1 // JVET-X0049: Adaptive DMVR
 #endif
 #define JVET_X0056_DMVD_EARLY_TERMINATION                 1 // JVET-X0056: Early termination for DMVR and TM
-
-#define JVET_X0083_BM_AMVP_MERGE_MODE                     1
+#define JVET_X0083_BM_AMVP_MERGE_MODE                     1 // JVET-X0083: AMVP-merge mode
 
 // Inter template matching tools
 #define ENABLE_INTER_TEMPLATE_MATCHING                    1 // It controls whether template matching is enabled for inter prediction
diff --git a/source/Lib/CommonLib/x86/InitX86.cpp b/source/Lib/CommonLib/x86/InitX86.cpp
index e12809508..7766a4ce2 100644
--- a/source/Lib/CommonLib/x86/InitX86.cpp
+++ b/source/Lib/CommonLib/x86/InitX86.cpp
@@ -193,7 +193,7 @@ void IbcHashMap::initIbcHashMapX86()
 }
 #endif
 
-#if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT || ENABLE_SIMD_TMP
+#if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT
 void TrQuant::initTrQuantX86()
 {
   auto vext = read_x86_extension_flags();
@@ -216,6 +216,29 @@ void TrQuant::initTrQuantX86()
 }
 #endif
 
+#if ENABLE_SIMD_TMP
+void IntraPrediction::initIntraX86() // runs the _initIntraX86<> instantiation chosen from read_x86_extension_flags()
+{
+  auto vext = read_x86_extension_flags(); // value switched on below to pick the template argument
+  switch( vext )
+  {
+  case AVX512: // no separate AVX512 body: falls through to the AVX2 instantiation
+  case AVX2:
+    _initIntraX86<AVX2>();
+    break;
+  case AVX:
+    _initIntraX86<AVX>();
+    break;
+  case SSE42: // falls through: SSE42 shares the SSE41 instantiation
+  case SSE41:
+    _initIntraX86<SSE41>();
+    break;
+  default: // any other value: no _initIntraX86<> call is made
+    break;
+  }
+}
+#endif
+
 #if ENABLE_SIMD_BILATERAL_FILTER || JVET_X0071_CHROMA_BILATERAL_FILTER_ENABLE_SIMD
 void BilateralFilter::initBilateralFilterX86()
 {
diff --git a/source/Lib/CommonLib/x86/IntraX86.h b/source/Lib/CommonLib/x86/IntraX86.h
new file mode 100644
index 000000000..e4f5617d4
--- /dev/null
+++ b/source/Lib/CommonLib/x86/IntraX86.h
@@ -0,0 +1,321 @@
+/* The copyright in this software is being made available under the BSD
+ * License, included below. This software may be subject to other third party
+ * and contributor rights, including patent rights, and no such rights are
+ * granted under this license.
+ *
+ * Copyright (c) 2010-2021, ITU/ISO/IEC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *  * Neither the name of the ITU/ISO/IEC nor the names of its contributors may
+ *    be used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file
+ * \brief x86 SIMD implementation of IntraPrediction template matching functions
+ */
+
+#include "CommonDefX86.h"
+#include "../IntraPrediction.h"
+
+#ifdef TARGET_SIMD_X86
+
+#include <nmmintrin.h>
+
+#if ENABLE_SIMD_TMP
+template< X86_VEXT vext >
+#if JVET_W0069_TMP_BOUNDARY
+int calcTemplateDiffSIMD(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax, RefTemplateType tempType)
+#else
+int calcTemplateDiffSIMD( Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax )
+#endif
+{
+  int diffSum = 0;
+  int iY;
+#if JVET_W0069_TMP_BOUNDARY
+  Pel* refPatchRow;
+  if( tempType == L_SHAPE_TEMPLATE )
+  {
+    refPatchRow = ref - TMP_TEMPLATE_SIZE * uiStride - TMP_TEMPLATE_SIZE;
+  }
+  else if( tempType == LEFT_TEMPLATE )
+  {
+    refPatchRow = ref - TMP_TEMPLATE_SIZE;
+  }
+  else if( tempType == ABOVE_TEMPLATE )
+  {
+    refPatchRow = ref - TMP_TEMPLATE_SIZE * uiStride;
+  }
+#else
+  Pel* refPatchRow = ref - TMP_TEMPLATE_SIZE * uiStride - TMP_TEMPLATE_SIZE;
+#endif
+  Pel* tarPatchRow;
+  uint32_t uiSum;
+
+  // horizontal difference
+#if JVET_W0069_TMP_BOUNDARY
+  if( tempType == L_SHAPE_TEMPLATE )
+  {
+#endif
+  for( iY = 0; iY < TMP_TEMPLATE_SIZE; iY++ )
+  {
+    tarPatchRow = tarPatch[iY];
+    const short* pSrc1 = ( const short* ) tarPatchRow;
+    const short* pSrc2 = ( const short* ) refPatchRow;
+
+    // SIMD difference
+    //int  iRows = uiPatchHeight;
+    int  iCols = uiPatchWidth;
+    if( (iCols & 7) == 0 )
+    {
+      // Do with step of 8
+      __m128i vzero = _mm_setzero_si128();
+      __m128i vsum32 = vzero;
+      //for (int iY = 0; iY < iRows; iY += iSubStep)
+      {
+        __m128i vsum16 = vzero;
+        for( int iX = 0; iX < iCols; iX += 8 )
+        {
+          __m128i vsrc1 = _mm_loadu_si128( (const __m128i*)(&pSrc1[iX]) );
+          __m128i vsrc2 = _mm_lddqu_si128( (const __m128i*)(&pSrc2[iX]) );
+          vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
+        }
+        __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vzero ), _mm_unpackhi_epi16( vsum16, vzero ) );
+        vsum32 = _mm_add_epi32( vsum32, vsumtemp );
+        //pSrc1 += iStrideSrc1;
+        //pSrc2 += iStrideSrc2;
+      }
+      vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0x4e ) );   // 01001110
+      vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0xb1 ) );   // 10110001
+      uiSum = _mm_cvtsi128_si32( vsum32 );
+    }
+    else
+    {
+      // Do with step of 4
+      __m128i vzero = _mm_setzero_si128();
+      __m128i vsum32 = vzero;
+      //for (int iY = 0; iY < iRows; iY += iSubStep)
+      {
+        __m128i vsum16 = vzero;
+        for( int iX = 0; iX < iCols; iX += 4 )
+        {
+          __m128i vsrc1 = _mm_loadl_epi64( (const __m128i*) & pSrc1[iX] );
+          __m128i vsrc2 = _mm_loadl_epi64( (const __m128i*) & pSrc2[iX] );
+          vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
+        }
+        __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vzero ), _mm_unpackhi_epi16( vsum16, vzero ) );
+        vsum32 = _mm_add_epi32( vsum32, vsumtemp );
+        //pSrc1 += iStrideSrc1;
+        //pSrc2 += iStrideSrc2;
+      }
+      vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0x4e ) );   // 01001110
+      vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0xb1 ) );   // 10110001
+      uiSum = _mm_cvtsi128_si32( vsum32 );
+    }
+    diffSum += uiSum;
+
+    if( diffSum > iMax ) //for speeding up
+    {
+      return diffSum;
+    }
+    // update location
+    refPatchRow += uiStride;
+  }
+
+  // vertical difference
+  int  iCols = TMP_TEMPLATE_SIZE;
+
+  for( iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++ )
+  {
+    tarPatchRow = tarPatch[iY];
+    const short* pSrc1 = ( const short* ) tarPatchRow;
+    const short* pSrc2 = ( const short* ) refPatchRow;
+
+    // SIMD difference
+
+    // Do with step of 4
+    __m128i vzero = _mm_setzero_si128();
+    __m128i vsum32 = vzero;
+    //for (int iY = 0; iY < iRows; iY += iSubStep)
+    {
+      __m128i vsum16 = vzero;
+      for( int iX = 0; iX < iCols; iX += 4 )
+      {
+        __m128i vsrc1 = _mm_loadl_epi64( (const __m128i*) & pSrc1[iX] );
+        __m128i vsrc2 = _mm_loadl_epi64( (const __m128i*) & pSrc2[iX] );
+        vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
+      }
+      __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vzero ), _mm_unpackhi_epi16( vsum16, vzero ) );
+      vsum32 = _mm_add_epi32( vsum32, vsumtemp );
+      //pSrc1 += iStrideSrc1;
+      //pSrc2 += iStrideSrc2;
+    }
+    vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0x4e ) );   // 01001110
+    vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0xb1 ) );   // 10110001
+    uiSum = _mm_cvtsi128_si32( vsum32 );
+
+    diffSum += uiSum;
+
+    if( diffSum > iMax ) //for speeding up
+    {
+      return diffSum;
+    }
+    // update location
+    refPatchRow += uiStride;
+  }
+#if JVET_W0069_TMP_BOUNDARY
+  }
+  else if (tempType == ABOVE_TEMPLATE)
+  {
+  // horizontal difference
+  for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++)
+  {
+    tarPatchRow = tarPatch[iY];
+    const short* pSrc1 = (const short*)tarPatchRow;
+    const short* pSrc2 = (const short*)refPatchRow;
+
+    // SIMD difference
+    //int  iRows = uiPatchHeight;
+    int  iCols = uiPatchWidth - TMP_TEMPLATE_SIZE;
+    if ((iCols & 7) == 0)
+    {
+      // Do with step of 8
+      __m128i vzero = _mm_setzero_si128();
+      __m128i vsum32 = vzero;
+      //for (int iY = 0; iY < iRows; iY += iSubStep)
+      {
+        __m128i vsum16 = vzero;
+        for (int iX = 0; iX < iCols; iX += 8)
+        {
+          __m128i vsrc1 = _mm_loadu_si128((const __m128i*)(&pSrc1[iX]));
+          __m128i vsrc2 = _mm_lddqu_si128((const __m128i*)(&pSrc2[iX]));
+          vsum16 = _mm_add_epi16(vsum16, _mm_abs_epi16(_mm_sub_epi16(vsrc1, vsrc2)));
+        }
+        __m128i vsumtemp = _mm_add_epi32(_mm_unpacklo_epi16(vsum16, vzero), _mm_unpackhi_epi16(vsum16, vzero));
+        vsum32 = _mm_add_epi32(vsum32, vsumtemp);
+        //pSrc1 += iStrideSrc1;
+        //pSrc2 += iStrideSrc2;
+      }
+      vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e));   // 01001110
+      vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1));   // 10110001
+      uiSum = _mm_cvtsi128_si32(vsum32);
+    }
+    else
+    {
+      // Do with step of 4
+      __m128i vzero = _mm_setzero_si128();
+      __m128i vsum32 = vzero;
+      //for (int iY = 0; iY < iRows; iY += iSubStep)
+      {
+        __m128i vsum16 = vzero;
+        for (int iX = 0; iX < iCols; iX += 4)
+        {
+          __m128i vsrc1 = _mm_loadl_epi64((const __m128i*) & pSrc1[iX]);
+          __m128i vsrc2 = _mm_loadl_epi64((const __m128i*) & pSrc2[iX]);
+          vsum16 = _mm_add_epi16(vsum16, _mm_abs_epi16(_mm_sub_epi16(vsrc1, vsrc2)));
+        }
+        __m128i vsumtemp = _mm_add_epi32(_mm_unpacklo_epi16(vsum16, vzero), _mm_unpackhi_epi16(vsum16, vzero));
+        vsum32 = _mm_add_epi32(vsum32, vsumtemp);
+        //pSrc1 += iStrideSrc1;
+        //pSrc2 += iStrideSrc2;
+      }
+      vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e));   // 01001110
+      vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1));   // 10110001
+      uiSum = _mm_cvtsi128_si32(vsum32);
+    }
+    diffSum += uiSum;
+
+    if (diffSum > iMax) //for speeding up
+    {
+      return diffSum;
+    }
+    // update location
+    refPatchRow += uiStride;
+  }
+
+  
+  }
+  else if (tempType == LEFT_TEMPLATE)
+  {
+
+  // vertical difference
+  int  iCols = TMP_TEMPLATE_SIZE;
+
+  for (iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++)
+  {
+    tarPatchRow = tarPatch[iY];
+    const short* pSrc1 = (const short*)tarPatchRow;
+    const short* pSrc2 = (const short*)refPatchRow;
+
+    // SIMD difference
+
+    // Do with step of 4
+    __m128i vzero = _mm_setzero_si128();
+    __m128i vsum32 = vzero;
+    //for (int iY = 0; iY < iRows; iY += iSubStep)
+    {
+      __m128i vsum16 = vzero;
+      for (int iX = 0; iX < iCols; iX += 4)
+      {
+        __m128i vsrc1 = _mm_loadl_epi64((const __m128i*) & pSrc1[iX]);
+        __m128i vsrc2 = _mm_loadl_epi64((const __m128i*) & pSrc2[iX]);
+        vsum16 = _mm_add_epi16(vsum16, _mm_abs_epi16(_mm_sub_epi16(vsrc1, vsrc2)));
+      }
+      __m128i vsumtemp = _mm_add_epi32(_mm_unpacklo_epi16(vsum16, vzero), _mm_unpackhi_epi16(vsum16, vzero));
+      vsum32 = _mm_add_epi32(vsum32, vsumtemp);
+      //pSrc1 += iStrideSrc1;
+      //pSrc2 += iStrideSrc2;
+    }
+    vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e));   // 01001110
+    vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1));   // 10110001
+    uiSum = _mm_cvtsi128_si32(vsum32);
+
+    diffSum += uiSum;
+
+    if (diffSum > iMax) //for speeding up
+    {
+      return diffSum;
+    }
+    // update location
+    refPatchRow += uiStride;
+  }
+  }
+#endif
+
+  return diffSum;
+}
+
+template <X86_VEXT vext> // vext: SIMD extension level the installed routines are instantiated for
+void IntraPrediction::_initIntraX86() // hooks the x86 SIMD routine(s) into this IntraPrediction object
+{
+#if ENABLE_SIMD_TMP // body is empty (function is a no-op) when ENABLE_SIMD_TMP is off
+  m_calcTemplateDiff = calcTemplateDiffSIMD<vext>; // point the template-difference hook at the SIMD version defined above
+#endif // ENABLE_SIMD_TMP
+}
+
+template void IntraPrediction::_initIntraX86<SIMDX86>(); // explicit instantiation for SIMDX86 (presumably defined per translation unit by the build - confirm)
+#endif
+
+#endif //#ifdef TARGET_SIMD_X86
+//! \}
diff --git a/source/Lib/CommonLib/x86/TrQuantX86.h b/source/Lib/CommonLib/x86/TrQuantX86.h
index ac45c865e..99c806a31 100644
--- a/source/Lib/CommonLib/x86/TrQuantX86.h
+++ b/source/Lib/CommonLib/x86/TrQuantX86.h
@@ -410,271 +410,7 @@ uint32_t computeSAD_SIMD( const Pel* ref, const Pel* cur, const int size )
 }
 #endif
 
-#if ENABLE_SIMD_TMP
-template< X86_VEXT vext >
-#if JVET_W0069_TMP_BOUNDARY
-int calcTemplateDiffSIMD(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax, RefTemplateType TempType)
-#else
-int calcTemplateDiffSIMD( Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax )
-#endif
-{
-  int iDiffSum = 0;
-  int iY;
-#if JVET_W0069_TMP_BOUNDARY
-  Pel* refPatchRow;
-  if( TempType == L_SHAPE_TEMPLATE )
-  {
-    refPatchRow = ref - TMP_TEMPLATE_SIZE * uiStride - TMP_TEMPLATE_SIZE;
-  }
-  else if( TempType == LEFT_TEMPLATE )
-  {
-    refPatchRow = ref - TMP_TEMPLATE_SIZE;
-  }
-  else if( TempType == ABOVE_TEMPLATE )
-  {
-    refPatchRow = ref - TMP_TEMPLATE_SIZE * uiStride;
-  }
-#else
-  Pel* refPatchRow = ref - TMP_TEMPLATE_SIZE * uiStride - TMP_TEMPLATE_SIZE;
-#endif
-  Pel* tarPatchRow;
-  uint32_t uiSum;
-
-  // horizontal difference
-#if JVET_W0069_TMP_BOUNDARY
-  if (TempType == L_SHAPE_TEMPLATE)
-  {
-#endif
-  for( iY = 0; iY < TMP_TEMPLATE_SIZE; iY++ )
-  {
-    tarPatchRow = tarPatch[iY];
-    const short* pSrc1 = ( const short* ) tarPatchRow;
-    const short* pSrc2 = ( const short* ) refPatchRow;
-
-    // SIMD difference
-    //int  iRows = uiPatchHeight;
-    int  iCols = uiPatchWidth;
-    if( (iCols & 7) == 0 )
-    {
-      // Do with step of 8
-      __m128i vzero = _mm_setzero_si128();
-      __m128i vsum32 = vzero;
-      //for (int iY = 0; iY < iRows; iY += iSubStep)
-      {
-        __m128i vsum16 = vzero;
-        for( int iX = 0; iX < iCols; iX += 8 )
-        {
-          __m128i vsrc1 = _mm_loadu_si128( (const __m128i*)(&pSrc1[iX]) );
-          __m128i vsrc2 = _mm_lddqu_si128( (const __m128i*)(&pSrc2[iX]) );
-          vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
-        }
-        __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vzero ), _mm_unpackhi_epi16( vsum16, vzero ) );
-        vsum32 = _mm_add_epi32( vsum32, vsumtemp );
-        //pSrc1 += iStrideSrc1;
-        //pSrc2 += iStrideSrc2;
-      }
-      vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0x4e ) );   // 01001110
-      vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0xb1 ) );   // 10110001
-      uiSum = _mm_cvtsi128_si32( vsum32 );
-    }
-    else
-    {
-      // Do with step of 4
-      __m128i vzero = _mm_setzero_si128();
-      __m128i vsum32 = vzero;
-      //for (int iY = 0; iY < iRows; iY += iSubStep)
-      {
-        __m128i vsum16 = vzero;
-        for( int iX = 0; iX < iCols; iX += 4 )
-        {
-          __m128i vsrc1 = _mm_loadl_epi64( (const __m128i*) & pSrc1[iX] );
-          __m128i vsrc2 = _mm_loadl_epi64( (const __m128i*) & pSrc2[iX] );
-          vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
-        }
-        __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vzero ), _mm_unpackhi_epi16( vsum16, vzero ) );
-        vsum32 = _mm_add_epi32( vsum32, vsumtemp );
-        //pSrc1 += iStrideSrc1;
-        //pSrc2 += iStrideSrc2;
-      }
-      vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0x4e ) );   // 01001110
-      vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0xb1 ) );   // 10110001
-      uiSum = _mm_cvtsi128_si32( vsum32 );
-    }
-    iDiffSum += uiSum;
-
-    if( iDiffSum > iMax ) //for speeding up
-    {
-      return iDiffSum;
-    }
-    // update location
-    refPatchRow += uiStride;
-  }
-
-  // vertical difference
-  int  iCols = TMP_TEMPLATE_SIZE;
-
-  for( iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++ )
-  {
-    tarPatchRow = tarPatch[iY];
-    const short* pSrc1 = ( const short* ) tarPatchRow;
-    const short* pSrc2 = ( const short* ) refPatchRow;
-
-    // SIMD difference
-
-    // Do with step of 4
-    __m128i vzero = _mm_setzero_si128();
-    __m128i vsum32 = vzero;
-    //for (int iY = 0; iY < iRows; iY += iSubStep)
-    {
-      __m128i vsum16 = vzero;
-      for( int iX = 0; iX < iCols; iX += 4 )
-      {
-        __m128i vsrc1 = _mm_loadl_epi64( (const __m128i*) & pSrc1[iX] );
-        __m128i vsrc2 = _mm_loadl_epi64( (const __m128i*) & pSrc2[iX] );
-        vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
-      }
-      __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vzero ), _mm_unpackhi_epi16( vsum16, vzero ) );
-      vsum32 = _mm_add_epi32( vsum32, vsumtemp );
-      //pSrc1 += iStrideSrc1;
-      //pSrc2 += iStrideSrc2;
-    }
-    vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0x4e ) );   // 01001110
-    vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0xb1 ) );   // 10110001
-    uiSum = _mm_cvtsi128_si32( vsum32 );
-
-    iDiffSum += uiSum;
-
-    if( iDiffSum > iMax ) //for speeding up
-    {
-      return iDiffSum;
-    }
-    // update location
-    refPatchRow += uiStride;
-  }
-#if JVET_W0069_TMP_BOUNDARY
-  }
-  else if (TempType == ABOVE_TEMPLATE)
-  {
-  // horizontal difference
-  for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++)
-  {
-    tarPatchRow = tarPatch[iY];
-    const short* pSrc1 = (const short*)tarPatchRow;
-    const short* pSrc2 = (const short*)refPatchRow;
-
-    // SIMD difference
-    //int  iRows = uiPatchHeight;
-    int  iCols = uiPatchWidth - TMP_TEMPLATE_SIZE;
-    if ((iCols & 7) == 0)
-    {
-      // Do with step of 8
-      __m128i vzero = _mm_setzero_si128();
-      __m128i vsum32 = vzero;
-      //for (int iY = 0; iY < iRows; iY += iSubStep)
-      {
-        __m128i vsum16 = vzero;
-        for (int iX = 0; iX < iCols; iX += 8)
-        {
-          __m128i vsrc1 = _mm_loadu_si128((const __m128i*)(&pSrc1[iX]));
-          __m128i vsrc2 = _mm_lddqu_si128((const __m128i*)(&pSrc2[iX]));
-          vsum16 = _mm_add_epi16(vsum16, _mm_abs_epi16(_mm_sub_epi16(vsrc1, vsrc2)));
-        }
-        __m128i vsumtemp = _mm_add_epi32(_mm_unpacklo_epi16(vsum16, vzero), _mm_unpackhi_epi16(vsum16, vzero));
-        vsum32 = _mm_add_epi32(vsum32, vsumtemp);
-        //pSrc1 += iStrideSrc1;
-        //pSrc2 += iStrideSrc2;
-      }
-      vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e));   // 01001110
-      vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1));   // 10110001
-      uiSum = _mm_cvtsi128_si32(vsum32);
-    }
-    else
-    {
-      // Do with step of 4
-      __m128i vzero = _mm_setzero_si128();
-      __m128i vsum32 = vzero;
-      //for (int iY = 0; iY < iRows; iY += iSubStep)
-      {
-        __m128i vsum16 = vzero;
-        for (int iX = 0; iX < iCols; iX += 4)
-        {
-          __m128i vsrc1 = _mm_loadl_epi64((const __m128i*) & pSrc1[iX]);
-          __m128i vsrc2 = _mm_loadl_epi64((const __m128i*) & pSrc2[iX]);
-          vsum16 = _mm_add_epi16(vsum16, _mm_abs_epi16(_mm_sub_epi16(vsrc1, vsrc2)));
-        }
-        __m128i vsumtemp = _mm_add_epi32(_mm_unpacklo_epi16(vsum16, vzero), _mm_unpackhi_epi16(vsum16, vzero));
-        vsum32 = _mm_add_epi32(vsum32, vsumtemp);
-        //pSrc1 += iStrideSrc1;
-        //pSrc2 += iStrideSrc2;
-      }
-      vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e));   // 01001110
-      vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1));   // 10110001
-      uiSum = _mm_cvtsi128_si32(vsum32);
-    }
-    iDiffSum += uiSum;
-
-    if (iDiffSum > iMax) //for speeding up
-    {
-      return iDiffSum;
-    }
-    // update location
-    refPatchRow += uiStride;
-  }
-
-  
-  }
-  else if (TempType == LEFT_TEMPLATE)
-  {
-
-  // vertical difference
-  int  iCols = TMP_TEMPLATE_SIZE;
-
-  for (iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++)
-  {
-    tarPatchRow = tarPatch[iY];
-    const short* pSrc1 = (const short*)tarPatchRow;
-    const short* pSrc2 = (const short*)refPatchRow;
-
-    // SIMD difference
-
-    // Do with step of 4
-    __m128i vzero = _mm_setzero_si128();
-    __m128i vsum32 = vzero;
-    //for (int iY = 0; iY < iRows; iY += iSubStep)
-    {
-      __m128i vsum16 = vzero;
-      for (int iX = 0; iX < iCols; iX += 4)
-      {
-        __m128i vsrc1 = _mm_loadl_epi64((const __m128i*) & pSrc1[iX]);
-        __m128i vsrc2 = _mm_loadl_epi64((const __m128i*) & pSrc2[iX]);
-        vsum16 = _mm_add_epi16(vsum16, _mm_abs_epi16(_mm_sub_epi16(vsrc1, vsrc2)));
-      }
-      __m128i vsumtemp = _mm_add_epi32(_mm_unpacklo_epi16(vsum16, vzero), _mm_unpackhi_epi16(vsum16, vzero));
-      vsum32 = _mm_add_epi32(vsum32, vsumtemp);
-      //pSrc1 += iStrideSrc1;
-      //pSrc2 += iStrideSrc2;
-    }
-    vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e));   // 01001110
-    vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1));   // 10110001
-    uiSum = _mm_cvtsi128_si32(vsum32);
-
-    iDiffSum += uiSum;
-
-    if (iDiffSum > iMax) //for speeding up
-    {
-      return iDiffSum;
-    }
-    // update location
-    refPatchRow += uiStride;
-  }
-  }
-#endif
-
-  return iDiffSum;
-}
-#endif
-
-#if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT || ENABLE_SIMD_TMP
+#if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT
 template <X86_VEXT vext>
 void TrQuant::_initTrQuantX86()
 {
@@ -897,10 +633,6 @@ void TrQuant::_initTrQuantX86()
   fastInvTrans[2][5] = fastInverseTransform_SIMD<DST7, 64>;
 #endif
 #endif
-
-#if ENABLE_SIMD_TMP
-  m_calcTemplateDiff = calcTemplateDiffSIMD<vext>;
-#endif
 }
 
 template void TrQuant::_initTrQuantX86<SIMDX86>();
diff --git a/source/Lib/CommonLib/x86/avx/Intra_avx.cpp b/source/Lib/CommonLib/x86/avx/Intra_avx.cpp
new file mode 100644
index 000000000..59a692f72
--- /dev/null
+++ b/source/Lib/CommonLib/x86/avx/Intra_avx.cpp
@@ -0,0 +1 @@
+#include "../IntraX86.h"
diff --git a/source/Lib/CommonLib/x86/avx2/Intra_avx2.cpp b/source/Lib/CommonLib/x86/avx2/Intra_avx2.cpp
new file mode 100644
index 000000000..59a692f72
--- /dev/null
+++ b/source/Lib/CommonLib/x86/avx2/Intra_avx2.cpp
@@ -0,0 +1 @@
+#include "../IntraX86.h"
diff --git a/source/Lib/CommonLib/x86/sse41/Intra_sse41.cpp b/source/Lib/CommonLib/x86/sse41/Intra_sse41.cpp
new file mode 100644
index 000000000..59a692f72
--- /dev/null
+++ b/source/Lib/CommonLib/x86/sse41/Intra_sse41.cpp
@@ -0,0 +1 @@
+#include "../IntraX86.h"
diff --git a/source/Lib/CommonLib/x86/sse42/Intra_sse42.cpp b/source/Lib/CommonLib/x86/sse42/Intra_sse42.cpp
new file mode 100644
index 000000000..59a692f72
--- /dev/null
+++ b/source/Lib/CommonLib/x86/sse42/Intra_sse42.cpp
@@ -0,0 +1 @@
+#include "../IntraX86.h"
diff --git a/source/Lib/DecoderLib/DecCu.cpp b/source/Lib/DecoderLib/DecCu.cpp
index 0e196a07f..6d3fb02d4 100644
--- a/source/Lib/DecoderLib/DecCu.cpp
+++ b/source/Lib/DecoderLib/DecCu.cpp
@@ -385,22 +385,23 @@ void DecCu::xIntraRecBlk( TransformUnit& tu, const ComponentID compID )
 	  {
 		  int foundCandiNum;
 #if JVET_W0069_TMP_BOUNDARY
-		  RefTemplateType TempType = m_pcIntraPred->GetRefTemplateType(*(tu.cu), tu.cu->blocks[COMPONENT_Y]);
-		  if (TempType != NO_TEMPLATE)
+		  RefTemplateType tempType = m_pcIntraPred->getRefTemplateType(*(tu.cu), tu.cu->blocks[COMPONENT_Y]);
+
+      if( tempType != NO_TEMPLATE )
 		  {
-			  m_pcTrQuant->getTargetTemplate(tu.cu, pu.lwidth(), pu.lheight(), TempType);
-			  m_pcTrQuant->candidateSearchIntra(tu.cu, pu.lwidth(), pu.lheight(), TempType);
-			  m_pcTrQuant->generateTMPrediction(piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum);
+        m_pcIntraPred->getTargetTemplate(tu.cu, pu.lwidth(), pu.lheight(), tempType);
+        m_pcIntraPred->candidateSearchIntra(tu.cu, pu.lwidth(), pu.lheight(), tempType);
+        m_pcIntraPred->generateTMPrediction(piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum);
 		  }
 		  else
 		  {
 			  foundCandiNum = 1;
-			  m_pcTrQuant->generateTM_DC_Prediction(piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), 1 << (tu.cu->cs->sps->getBitDepth(CHANNEL_TYPE_LUMA) - 1));
+        m_pcIntraPred->generateTmDcPrediction(piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), 1 << (tu.cu->cs->sps->getBitDepth(CHANNEL_TYPE_LUMA) - 1));
 		  }
 #else
-		  m_pcTrQuant->getTargetTemplate(tu.cu, pu.lwidth(), pu.lheight());
-		  m_pcTrQuant->candidateSearchIntra(tu.cu, pu.lwidth(), pu.lheight());
-		  m_pcTrQuant->generateTMPrediction(piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum);
+      m_pcIntraPred->getTargetTemplate(tu.cu, pu.lwidth(), pu.lheight());
+      m_pcIntraPred->candidateSearchIntra(tu.cu, pu.lwidth(), pu.lheight());
+      m_pcIntraPred->generateTMPrediction(piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum);
 #endif
 		  assert(foundCandiNum >= 1);
 	  }
@@ -605,9 +606,9 @@ void DecCu::xIntraRecACTBlk(TransformUnit& tu)
 	{
 		int foundCandiNum;
 		const unsigned int uiStride = cs.picture->getRecoBuf(COMPONENT_Y).stride;
-		m_pcTrQuant->getTargetTemplate(tu.cu, pu.lwidth(), pu.lheight());
-		m_pcTrQuant->candidateSearchIntra(tu.cu, pu.lwidth(), pu.lheight());
-		m_pcTrQuant->generateTMPrediction(piPred.buf, uiStride, pu.lwidth(), pu.lheight(), foundCandiNum);
+    m_pcIntraPred->getTargetTemplate(tu.cu, pu.lwidth(), pu.lheight());
+    m_pcIntraPred->candidateSearchIntra(tu.cu, pu.lwidth(), pu.lheight());
+    m_pcIntraPred->generateTMPrediction(piPred.buf, uiStride, pu.lwidth(), pu.lheight(), foundCandiNum);
 	}
 	else if (PU::isMIP(pu, chType))
 #else
diff --git a/source/Lib/EncoderLib/IntraSearch.cpp b/source/Lib/EncoderLib/IntraSearch.cpp
index e97365623..c4edea054 100644
--- a/source/Lib/EncoderLib/IntraSearch.cpp
+++ b/source/Lib/EncoderLib/IntraSearch.cpp
@@ -892,27 +892,27 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c
               CodingUnit cu_cpy = cu;
 
 #if JVET_W0069_TMP_BOUNDARY
-              RefTemplateType TemplateType = GetRefTemplateType( cu_cpy, cu_cpy.blocks[COMPONENT_Y] );
-              if( TemplateType != NO_TEMPLATE )
+              RefTemplateType templateType = getRefTemplateType( cu_cpy, cu_cpy.blocks[COMPONENT_Y] );
+              if( templateType != NO_TEMPLATE )
 #else
               if( isRefTemplateAvailable( cu_cpy, cu_cpy.blocks[COMPONENT_Y] ) )
 #endif
               {
 #if JVET_W0069_TMP_BOUNDARY
-                m_pcTrQuant->getTargetTemplate( &cu_cpy, pu.lwidth(), pu.lheight(), TemplateType );
-                m_pcTrQuant->candidateSearchIntra( &cu_cpy, pu.lwidth(), pu.lheight(), TemplateType );
-                bsuccessfull = m_pcTrQuant->generateTMPrediction( piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum );
+                getTargetTemplate( &cu_cpy, pu.lwidth(), pu.lheight(), templateType );
+                candidateSearchIntra( &cu_cpy, pu.lwidth(), pu.lheight(), templateType );
+                bsuccessfull = generateTMPrediction( piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum );
 #else
-                m_pcTrQuant->getTargetTemplate( &cu_cpy, pu.lwidth(), pu.lheight() );
-                m_pcTrQuant->candidateSearchIntra( &cu_cpy, pu.lwidth(), pu.lheight() );
-                bsuccessfull = m_pcTrQuant->generateTMPrediction( piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum );
+                getTargetTemplate( &cu_cpy, pu.lwidth(), pu.lheight() );
+                candidateSearchIntra( &cu_cpy, pu.lwidth(), pu.lheight() );
+                bsuccessfull = generateTMPrediction( piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum );
 #endif
               }
 #if JVET_W0069_TMP_BOUNDARY
               else
               {
                 foundCandiNum = 1;
-                bsuccessfull = m_pcTrQuant->generateTM_DC_Prediction( piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), 1 << (cu_cpy.cs->sps->getBitDepth( CHANNEL_TYPE_LUMA ) - 1) );
+                bsuccessfull = generateTmDcPrediction( piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), 1 << (cu_cpy.cs->sps->getBitDepth( CHANNEL_TYPE_LUMA ) - 1) );
               }
 #endif
               if( bsuccessfull && foundCandiNum >= 1 )
@@ -3679,22 +3679,22 @@ void IntraSearch::xIntraCodingTUBlock(TransformUnit &tu, const ComponentID &comp
         {
           int foundCandiNum;
 #if JVET_W0069_TMP_BOUNDARY
-          RefTemplateType TempType = GetRefTemplateType( *(tu.cu), tu.cu->blocks[COMPONENT_Y] );
-          if( TempType != NO_TEMPLATE )
+          RefTemplateType tempType = getRefTemplateType( *(tu.cu), tu.cu->blocks[COMPONENT_Y] );
+          if( tempType != NO_TEMPLATE )
           {
-            m_pcTrQuant->getTargetTemplate( tu.cu, pu.lwidth(), pu.lheight(), TempType );
-            m_pcTrQuant->candidateSearchIntra( tu.cu, pu.lwidth(), pu.lheight(), TempType );
-            m_pcTrQuant->generateTMPrediction( piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum );
+            getTargetTemplate( tu.cu, pu.lwidth(), pu.lheight(), tempType );
+            candidateSearchIntra( tu.cu, pu.lwidth(), pu.lheight(), tempType );
+            generateTMPrediction( piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum );
           }
           else
           {
             foundCandiNum = 1;
-            m_pcTrQuant->generateTM_DC_Prediction( piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), 1 << (tu.cu->cs->sps->getBitDepth( CHANNEL_TYPE_LUMA ) - 1) );
+            generateTmDcPrediction( piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), 1 << (tu.cu->cs->sps->getBitDepth( CHANNEL_TYPE_LUMA ) - 1) );
           }
 #else
-          m_pcTrQuant->getTargetTemplate( tu.cu, pu.lwidth(), pu.lheight() );
-          m_pcTrQuant->candidateSearchIntra( tu.cu, pu.lwidth(), pu.lheight() );
-          m_pcTrQuant->generateTMPrediction( piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum );
+          getTargetTemplate( tu.cu, pu.lwidth(), pu.lheight() );
+          candidateSearchIntra( tu.cu, pu.lwidth(), pu.lheight() );
+          generateTMPrediction( piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum );
 #endif
           CHECK( foundCandiNum < 1, "" );
         }
@@ -5094,9 +5094,9 @@ bool IntraSearch::xRecurIntraCodingACTQT(CodingStructure &cs, Partitioner &parti
       if( PU::isTmp( pu, chType ) )
       {
         int foundCandiNum;
-        m_pcTrQuant->getTargetTemplate( pu.cu, pu.lwidth(), pu.lheight() );
-        m_pcTrQuant->candidateSearchIntra( pu.cu, pu.lwidth(), pu.lheight() );
-        m_pcTrQuant->generateTMPrediction( piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum );
+        getTargetTemplate( pu.cu, pu.lwidth(), pu.lheight() );
+        candidateSearchIntra( pu.cu, pu.lwidth(), pu.lheight() );
+        generateTMPrediction( piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum );
         CHECK( foundCandiNum < 1, "" );
 
       }
-- 
GitLab