diff --git a/source/Lib/CommonLib/CommonDef.h b/source/Lib/CommonLib/CommonDef.h
index b9d6834f68e7992af31dd789f55fbc4671115d9d..2ca0aceddd16e6a38500347da8a21b70951c1fcd 100644
--- a/source/Lib/CommonLib/CommonDef.h
+++ b/source/Lib/CommonLib/CommonDef.h
@@ -2178,6 +2178,13 @@ static const int TMP_NUM_MERGE_CANDS = 10;
 static const int NUM_TMP_ARBVP = 20;
 static const int EBVP_RANGE = 1;
 #endif
+
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+static const int NUM_TMP_ARBVP_S  = 5; // NOTE(review): presumably the number of ARBVP candidates derived from sparse-search results - confirm against its users
+static const int TMP_MRG_REG_ID   = 6; // region id given to neighbour-merge BV candidates in the intra TMP search; TMP_MRG_REG_ID + 1 is given to the candidates listed after them
+static const int INIT_TL_POS      = (MTMP_NUM - TL_NUM_SPARSE); // NOTE(review): users are outside this patch excerpt - meaning to be confirmed
+#endif
+
 #if JVET_AG0152_SGPM_ITMP_IBC
 static const int SGPM_NUM_BVS = 6; // maximum BVs to be considered into the list for Itmp-Sgpm
 static const int SGPM_BV_START_IDX = NUM_LUMA_MODE;
diff --git a/source/Lib/CommonLib/IntraPrediction.cpp b/source/Lib/CommonLib/IntraPrediction.cpp
index afef96be885964cd8904c73383dc96dd7de95398..99d07f921bf711e847dee201c660e57d1c6a9555 100644
--- a/source/Lib/CommonLib/IntraPrediction.cpp
+++ b/source/Lib/CommonLib/IntraPrediction.cpp
@@ -128,6 +128,9 @@ IntraPrediction::IntraPrediction()
 #if JVET_W0123_TIMD_FUSION
   m_timdSatdCost = nullptr;
 #endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  m_itmpSatdCost = nullptr;
+#endif
 #if JVET_AC0071_DBV
   m_dbvSadCost = nullptr;
 #endif
@@ -217,6 +220,9 @@ void IntraPrediction::destroy()
 #if JVET_W0123_TIMD_FUSION
   delete m_timdSatdCost;
 #endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  delete m_itmpSatdCost;
+#endif
 #if JVET_AC0071_DBV
   delete m_dbvSadCost;
 #endif
@@ -291,7 +297,7 @@ void IntraPrediction::destroy()
   }
   m_tempBuffer.clear();
 
-#if JVET_V0130_INTRA_TMP
+#if JVET_V0130_INTRA_TMP && !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
   if( m_pppTarPatch != NULL )
   {
     for( unsigned int uiDepth = 0; uiDepth < USE_MORE_BLOCKSIZE_DEPTH_MAX; uiDepth++ )
@@ -414,6 +420,12 @@ void IntraPrediction::init(ChromaFormat chromaFormatIDC, const unsigned bitDepth
     m_timdSatdCost = new RdCost;
   }
 #endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  if (m_itmpSatdCost == nullptr)
+  {
+    m_itmpSatdCost = new RdCost;
+  }
+#endif
 #if JVET_AC0071_DBV
   if (m_dbvSadCost == nullptr)
   {
@@ -574,6 +586,7 @@ void IntraPrediction::init(ChromaFormat chromaFormatIDC, const unsigned bitDepth
   m_calcAeipGroupSum = calcAeipGroupSum;
 #endif
 #if JVET_V0130_INTRA_TMP
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
   unsigned int blkSize;
   if( m_pppTarPatch == NULL )
   {
@@ -590,6 +603,7 @@ void IntraPrediction::init(ChromaFormat chromaFormatIDC, const unsigned bitDepth
       }
     }
   }
+#endif
 
   m_calcTemplateDiff = calcTemplateDiff;
 #if JVET_AG0136_INTRA_TMP_LIC
@@ -7941,6 +7955,13 @@ int IntraPrediction::getBestNonAnglularMode(const CPelBuf& recoBuf, const CompAr
     distParamSad[0].useMR = false;
     distParamSad[1].applyWeight = false;
     distParamSad[1].useMR = false;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    DistParam distParamSadItmp[2];
+    distParamSadItmp[0].applyWeight = false;
+    distParamSadItmp[0].useMR = false;
+    distParamSadItmp[1].applyWeight = false;
+    distParamSadItmp[1].useMR = false;
+#endif
     if (eTempType == LEFT_ABOVE_NEIGHBOR)
     {
       m_timdSatdCost->setTimdDistParam(distParamSad[0], piOrg + iTempWidth, piPred + iTempWidth, iOrgStride,
@@ -7949,16 +7970,26 @@ int IntraPrediction::getBestNonAnglularMode(const CPelBuf& recoBuf, const CompAr
       m_timdSatdCost->setTimdDistParam(distParamSad[1], piOrg + iTempHeight * iOrgStride,
         piPred + iTempHeight * uiPredStride, iOrgStride, uiPredStride, channelBitDepth,
         COMPONENT_Y, iTempWidth, uiHeight, 0, 1, true);   // Use HAD (SATD) cost
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      m_itmpSatdCost->setTimdDistParam(distParamSadItmp[0], piOrg + iTempWidth, piOrg + iTempWidth, iOrgStride, iOrgStride, channelBitDepth, COMPONENT_Y, uiWidth, iTempHeight, 0, 1, true);
+      m_itmpSatdCost->setTimdDistParam(distParamSadItmp[1], piOrg + iTempHeight * iOrgStride, piOrg + iTempHeight * uiPredStride, iOrgStride, iOrgStride, channelBitDepth, COMPONENT_Y, iTempWidth, uiHeight, 0, 1, true);
+#endif
     }
     else if (eTempType == LEFT_NEIGHBOR)
     {
       m_timdSatdCost->setTimdDistParam(distParamSad[1], piOrg, piPred, iOrgStride, uiPredStride, channelBitDepth,
         COMPONENT_Y, iTempWidth, uiHeight, 0, 1, true);
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      m_itmpSatdCost->setTimdDistParam(distParamSadItmp[1], piOrg, piPred, iOrgStride, iOrgStride, channelBitDepth, COMPONENT_Y, iTempWidth, uiHeight, 0, 1, true);
+#endif
     }
     else if (eTempType == ABOVE_NEIGHBOR)
     {
       m_timdSatdCost->setTimdDistParam(distParamSad[0], piOrg, piPred, iOrgStride, uiPredStride, channelBitDepth,
         COMPONENT_Y, uiWidth, iTempHeight, 0, 1, true);
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      m_itmpSatdCost->setTimdDistParam(distParamSadItmp[0], piOrg, piOrg, iOrgStride, iOrgStride, channelBitDepth, COMPONENT_Y, uiWidth, iTempHeight, 0, 1, true);
+#endif
     }
     initTimdIntraPatternLuma(cu, area, eTempType != ABOVE_NEIGHBOR ? iTempWidth : 0,
       eTempType != LEFT_NEIGHBOR ? iTempHeight : 0, uiRefWidth, uiRefHeight);
@@ -7987,19 +8018,34 @@ int IntraPrediction::getBestNonAnglularMode(const CPelBuf& recoBuf, const CompAr
         initPredTimdIntraParams(pu, area, i);
         predTimdIntraAng(COMPONENT_Y, pu, i, piPred, uiPredStride, uiRealW, uiRealH, eTempType,
           (eTempType == ABOVE_NEIGHBOR) ? 0 : iTempWidth, (eTempType == LEFT_NEIGHBOR) ? 0 : iTempHeight);
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        tmpCost0 = distParamSad[0].distFunc(distParamSad[0]);
+        tmpCost1 = distParamSad[1].distFunc(distParamSad[1]);
+        uiCost = tmpCost0 + tmpCost1;
+#endif
       }
       else
       {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        distParamSadItmp[0].cur.buf = piOrg + BVs[i - 1].hor + BVs[i - 1].ver * iOrgStride + iTempWidth;
+        tmpCost0 = distParamSadItmp[0].distFunc(distParamSadItmp[0]);
+        distParamSadItmp[1].cur.buf = piOrg + BVs[i - 1].hor + (BVs[i - 1].ver + iTempHeight) * iOrgStride;
+        tmpCost1 = distParamSadItmp[1].distFunc(distParamSadItmp[1]);
+        uiCost = tmpCost0 + tmpCost1;
+#else
         predTimdIbcItmp(COMPONENT_Y, pu, BVs[i - 1], piPred, uiPredStride, uiRealW, uiRealH, eTempType,
           (eTempType == ABOVE_NEIGHBOR) ? 0 : iTempWidth, (eTempType == LEFT_NEIGHBOR) ? 0 : iTempHeight, piOrg, iOrgStride);
+#endif
       }
 
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
       tmpCost0 = distParamSad[0].distFunc(distParamSad[0]);
       tmpCost1 = distParamSad[1].distFunc(distParamSad[1]);
 
 
 
       uiCost = tmpCost0 + tmpCost1;
+#endif
       if (uiCost < uiBestCost)
       {
         uiBestCost = uiCost;
@@ -14358,8 +14404,13 @@ void insertNode( int diff, int& iXOffset, int& iYOffset, int& pDiff, int& pX, in
 void clipMvIntraConstraint( CodingUnit* pcCU, int regionId, int& iHorMin, int& iHorMax, int& iVerMin, int& iVerMax, unsigned int uiTemplateSize, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int iCurrY, int iCurrX, int offsetLCUY, int offsetLCUX, RefTemplateType tempType )
 {
 #if JVET_AD0086_ENHANCED_INTRA_TMP
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  const int searchRangeWidth = std::max((uiBlkWidth  == 64 ? TMP_SEARCH_RANGE_MULT_FACTOR - 1 : TMP_SEARCH_RANGE_MULT_FACTOR) * static_cast<int>(uiBlkWidth), TMP_MINSR);
+  const int searchRangeHeight = std::max((uiBlkHeight == 64 ? TMP_SEARCH_RANGE_MULT_FACTOR - 1 : TMP_SEARCH_RANGE_MULT_FACTOR) * static_cast<int>(uiBlkHeight), TMP_MINSR);
+#else
   int searchRangeWidth  = std::max(TMP_SEARCH_RANGE_MULT_FACTOR * (int) uiBlkWidth, TMP_MINSR);
   int searchRangeHeight = std::max(TMP_SEARCH_RANGE_MULT_FACTOR * (int) uiBlkHeight, TMP_MINSR);
+#endif
 #else  
   int searchRangeWidth = TMP_SEARCH_RANGE_MULT_FACTOR * uiBlkWidth;
   int searchRangeHeight = TMP_SEARCH_RANGE_MULT_FACTOR * uiBlkHeight;
@@ -14462,10 +14513,25 @@ void clipMvIntraConstraint( CodingUnit* pcCU, int regionId, int& iHorMin, int& i
 void clipMvIntraConstraintRefine(int& iHorMin, int& iHorMax, int& iVerMin, int& iVerMax,int pX, int pY, int refinementRange
 #if JVET_AG0136_INTRA_TMP_LIC
                                  , const int tmpSampling
+#endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+                                 , const bool isTransferredLeft, const bool isTransferredTop, const int bestRegionId, const bool isExtLeft, const bool isExtTop, const bool isExceptionAllowed
 #endif
                                  )
 {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  if (isTransferredLeft)
+  {
+    iHorMin = isExceptionAllowed && (bestRegionId == 3 || bestRegionId == 5) && isExtLeft ? pX : std::max(iHorMin, pX);
+    iHorMax = isExceptionAllowed && bestRegionId == 3 ? pX + tmpSampling - 1 : std::min(iHorMax, pX + tmpSampling - 1);
+  }
+  else
+  {
+#endif
 #if JVET_AD0086_ENHANCED_INTRA_TMP
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  iHorMin = isExceptionAllowed && (bestRegionId == 3 || bestRegionId == 5) && isExtLeft ? pX - refinementRange : std::max(iHorMin, pX - refinementRange);
+#else
   iHorMin = std::max(iHorMin, pX - refinementRange + (
 #if JVET_AG0136_INTRA_TMP_LIC
                                                       tmpSampling
@@ -14473,7 +14539,36 @@ void clipMvIntraConstraintRefine(int& iHorMin, int& iHorMax, int& iVerMin, int&
                                                       TMP_SAMPLING
 #endif
                                                       % 2 ? 0 : 1));
+#endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  iHorMax = isExceptionAllowed && bestRegionId == 3 ? pX + refinementRange - (
+#if JVET_AG0136_INTRA_TMP_LIC
+                                                                              tmpSampling
+#else
+                                                                              TMP_SAMPLING
+#endif
+                                                                              % 2 ? 0 : 1)
+                                  : std::min(iHorMax, pX + refinementRange - (
+#if JVET_AG0136_INTRA_TMP_LIC
+                                                                              tmpSampling
+#else
+                                                                              TMP_SAMPLING
+#endif
+                                                                              % 2 ? 0 : 1));
+#else
   iHorMax = std::min(iHorMax, pX + refinementRange);
+#endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  }
+  if (isTransferredTop)
+  {
+    iVerMin = isExceptionAllowed && bestRegionId == 4 && isExtTop ? pY : std::max(iVerMin, pY);
+    iVerMax = isExceptionAllowed && bestRegionId == 1 ? pY + tmpSampling - 1 : std::min(iVerMax, pY + tmpSampling - 1);
+  }
+  else
+  {
+  iVerMin = isExceptionAllowed && bestRegionId == 4 && isExtTop ? pY - refinementRange : std::max(iVerMin, pY - refinementRange);
+#else
   iVerMin = std::max(iVerMin, pY - refinementRange + (
 #if JVET_AG0136_INTRA_TMP_LIC
                                                       tmpSampling
@@ -14481,7 +14576,28 @@ void clipMvIntraConstraintRefine(int& iHorMin, int& iHorMax, int& iVerMin, int&
                                                       TMP_SAMPLING
 #endif
                                                       % 2 ? 0 : 1));
+#endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  iVerMax = isExceptionAllowed && bestRegionId == 1 ? pY + refinementRange - (
+#if JVET_AG0136_INTRA_TMP_LIC
+                                                                              tmpSampling
+#else
+                                                                              TMP_SAMPLING
+#endif
+                                                                              % 2 ? 0 : 1)
+                                 : std::min(iVerMax, pY + refinementRange - (
+#if JVET_AG0136_INTRA_TMP_LIC
+                                                                             tmpSampling
+#else
+                                                                             TMP_SAMPLING
+#endif
+                                                                             % 2 ? 0 : 1));
+#else
   iVerMax = std::min(iVerMax, pY + refinementRange);
+#endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  }
+#endif
 #else
   iHorMin = std::max(iHorMin, pX - refinementRange);
   iHorMax = std::min(iHorMax, pX + refinementRange);
@@ -14517,22 +14633,31 @@ void IntraPrediction::getTargetTemplate( CodingUnit* pcCU, unsigned int uiBlkWid
 #endif
 {
   const ComponentID compID = COMPONENT_Y;
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
   unsigned int uiPatchWidth = uiBlkWidth + TMP_TEMPLATE_SIZE;
   unsigned int uiPatchHeight = uiBlkHeight + TMP_TEMPLATE_SIZE;
   unsigned int uiTarDepth = floorLog2( std::max( uiBlkHeight, uiBlkWidth ) ) - 2;
   Pel** tarPatch = m_pppTarPatch[uiTarDepth];
+#endif
   CompArea area = pcCU->blocks[compID];
   Pel* pCurrStart = pcCU->cs->picture->getRecoBuf( area ).buf;
   unsigned int  uiPicStride = pcCU->cs->picture->getRecoBuf( compID ).stride;
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
   unsigned int uiY, uiX;
+#endif
 
   //fill template
   //up-left & up 
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
   Pel* tarTemp;
+#endif
 #if JVET_W0069_TMP_BOUNDARY
   if( tempType == L_SHAPE_TEMPLATE )
   {
 #endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    m_pppTarPatch = pCurrStart - TMP_TEMPLATE_SIZE * uiPicStride - TMP_TEMPLATE_SIZE;
+#else
     Pel* pCurrTemp = pCurrStart - TMP_TEMPLATE_SIZE * uiPicStride - TMP_TEMPLATE_SIZE;
     for( uiY = 0; uiY < TMP_TEMPLATE_SIZE; uiY++ )
     {
@@ -14553,10 +14678,14 @@ void IntraPrediction::getTargetTemplate( CodingUnit* pcCU, unsigned int uiBlkWid
       }
       pCurrTemp += uiPicStride;
     }
+#endif
 #if JVET_W0069_TMP_BOUNDARY
   }
   else if( tempType == ABOVE_TEMPLATE )
   {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    m_pppTarPatch = pCurrStart - TMP_TEMPLATE_SIZE * uiPicStride;
+#else
     Pel* pCurrTemp = pCurrStart - TMP_TEMPLATE_SIZE * uiPicStride;
     for( uiY = 0; uiY < TMP_TEMPLATE_SIZE; uiY++ )
     {
@@ -14567,9 +14696,13 @@ void IntraPrediction::getTargetTemplate( CodingUnit* pcCU, unsigned int uiBlkWid
       }
       pCurrTemp += uiPicStride;
     }
+#endif
   }
   else if( tempType == LEFT_TEMPLATE )
   {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    m_pppTarPatch = pCurrStart - TMP_TEMPLATE_SIZE;
+#else
     Pel* pCurrTemp = pCurrStart - TMP_TEMPLATE_SIZE;
     for( uiY = TMP_TEMPLATE_SIZE; uiY < uiPatchHeight; uiY++ )
     {
@@ -14580,6 +14713,7 @@ void IntraPrediction::getTargetTemplate( CodingUnit* pcCU, unsigned int uiBlkWid
       }
       pCurrTemp += uiPicStride;
     }
+#endif
   }
 #endif
 }
@@ -14596,9 +14730,15 @@ void IntraPrediction::candidateSearchIntra( CodingUnit* pcCU, unsigned int uiBlk
 {
   unsigned int uiPatchWidth = uiBlkWidth + TMP_TEMPLATE_SIZE;
   unsigned int uiPatchHeight = uiBlkHeight + TMP_TEMPLATE_SIZE;
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
   unsigned int uiTarDepth = floorLog2( std::max( uiBlkWidth, uiBlkHeight ) ) - 2;
+#endif
 
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  Pel* tarPatch = getTargetPatch();
+#else
   Pel** tarPatch = getTargetPatch( uiTarDepth );
+#endif
   //Initialize the library for saving the best candidates
 #if !JVET_AD0086_ENHANCED_INTRA_TMP
   const ComponentID compID = COMPONENT_Y;
@@ -14638,7 +14778,11 @@ void IntraPrediction::candidateSearchIntra( CodingUnit* pcCU, unsigned int uiBlk
 }
 
 #if JVET_W0069_TMP_BOUNDARY
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+void IntraPrediction::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel* tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, RefTemplateType tempType
+#else
 void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, RefTemplateType tempType
+#endif
 #if JVET_AG0136_INTRA_TMP_LIC
                                                      , const bool useMR
 #endif
@@ -14718,6 +14862,7 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
   setRefPicUsed( ref ); //facilitate the access of each candidate point 
   setStride( refStride );
 
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
   Mv cTmpMvPred;
   cTmpMvPred.setZero();
 
@@ -14727,10 +14872,17 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
   int          blkY     = 0;
   int          iCurrY   = uiCUPelY + blkY;
   int          iCurrX   = uiCUPelX + blkX;
+#else
+  const int iCurrY = area.pos().y;
+  const int iCurrX = area.pos().x;
+#endif
 
   Position ctuRsAddr  = CU::getCtuXYAddr(*pcCU);
   int      offsetLCUY = iCurrY - ctuRsAddr.y;
   int      offsetLCUX = iCurrX - ctuRsAddr.x;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  const int offset = TMP_SAMPLING >> 1;
+#endif
 
   int iYOffset, iXOffset;
 #if JVET_AD0086_ENHANCED_INTRA_TMP
@@ -14756,6 +14908,18 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
 #endif
   Pel* refCurr;
 
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  const int iRefine          = 1;
+  const int iRefineRange     = TMP_SAMPLING >> 1; // half of the sparse-search sampling step
+  const int tmpMrgRefWind    = 5; // half-size of the window placed around each merge candidate (see bvOffMerge)
+  const Mv  bvOffSparseTL    = Mv(iRefineRange, iRefineRange); // subtracted from a sparse candidate position to get its window top-left
+  const Mv  bvOffSparseBR    = Mv(iRefineRange, iRefineRange); // added to a sparse candidate position to get its window bottom-right
+  const Mv  bvOffMerge       = Mv(tmpMrgRefWind, tmpMrgRefWind); // +/- offset forming the window of a neighbour-merge candidate
+  const Mv  bvOffArbvp       = Mv(EBVP_RANGE, EBVP_RANGE); // +/- offset forming the window of a candidate listed after the neighbour-merge ones
+  Mv        regTL, regBR, bvMrg; // scratch: corners of the current region's BV range and the merge candidate being tested
+  Mv        iMrgWindTL, iMrgWindBR; // scratch: corners of the window around bvMrg
+  bool      bRegOverlap = false; // set when the region's BV range falls inside a merge-candidate window
+#endif
 #if JVET_AD0086_ENHANCED_INTRA_TMP
 #if JVET_AE0077_EXT_INTRATMP
   const int regionNum = 6;
@@ -14801,7 +14965,11 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
     log2SizeTop = floorLog2(TMP_TEMPLATE_SIZE * uiBlkWidth);
     log2SizeLeft = floorLog2(uiBlkHeight * TMP_TEMPLATE_SIZE);
     sizeTopLeft = (uiBlkHeight + uiPatchWidth) * TMP_TEMPLATE_SIZE;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    m_calcTargetMean(tarPatch, m_uiPicStride, uiPatchWidth, uiPatchHeight, tempType, needTopLeft ? 3 : 0, log2SizeTop, log2SizeLeft, sizeTopLeft, topTargetMean, leftTargetMean);
+#else
     m_calcTargetMean(tarPatch, uiPatchWidth, uiPatchHeight, tempType, needTopLeft ? 3 : 0, log2SizeTop, log2SizeLeft, sizeTopLeft, topTargetMean, leftTargetMean);
+#endif
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
     m_log2SizeTop  = log2SizeTop ;
     m_log2SizeLeft = log2SizeLeft;
@@ -14854,19 +15022,95 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
       }
     }
 #endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    const int numInitMrg = static_cast<int>(m_bvBasedMergeCandidates.size());
+    if (uiBlkWidth <= 8 && uiBlkHeight <= 8) // the containment test below is only applied to blocks up to 8x8
+    {
+      regTL       = Mv(mvXMins[regionId], mvYMins[regionId]); // top-left corner of this region's BV range
+      regBR       = Mv(mvXMaxs[regionId], mvYMaxs[regionId]); // bottom-right corner of this region's BV range
+      bRegOverlap = false;
+      for (int mrgIdx = 0; mrgIdx < numInitMrg; mrgIdx++)
+      {
+        bvMrg      = m_bvBasedMergeCandidates[mrgIdx];
+        iMrgWindTL = bvMrg - bvOffMerge; // window of +/- bvOffMerge around the merge candidate
+        iMrgWindBR = bvMrg + bvOffMerge;
+
+        if ((regTL.hor >= iMrgWindTL.hor) && (regTL.ver >= iMrgWindTL.ver) && (regBR.hor <= iMrgWindBR.hor) && (regBR.ver <= iMrgWindBR.ver)) // region range lies entirely inside the merge window
+        {
+          bRegOverlap = true;
+          break;
+        }
+      }
+      if (bRegOverlap)
+      {
+        continue; // NOTE(review): presumably skips the sparse search of this region (the enclosing loop is outside this excerpt) - confirm
+      }
+    }
+#endif
 #if JVET_AB0130_ITMP_SAMPLING
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    int shiftX = 0;
+    for (iYOffset = mvYMax; iYOffset >= mvYMin; iYOffset -= TMP_SAMPLING)
+    {
+      shiftX = (mvXMax != mvXMin) ? (shiftX % TMP_SAMPLING) : 0;
+      bool isAvailablePairFound{false};
+      for (iXOffset = mvXMax - shiftX++; iXOffset >= mvXMin; iXOffset -= TMP_SAMPLING)
+      {
+        int iXOffsetMetric = iXOffset + offset;
+        int iYOffsetMetric = iYOffset + offset;
+        bool isTransferredLeft = false;
+        bool isTransferredTop = false;
+        if (iXOffsetMetric > mvXMax)
+        {
+          iXOffsetMetric = iXOffset;
+          isTransferredLeft = true;
+        }
+        if (iYOffsetMetric > mvYMax)
+        {
+          iYOffsetMetric = iYOffset;
+          isTransferredTop = true;
+        }
+#else
     for (iYOffset = mvYMax; iYOffset >= mvYMin; iYOffset -= TMP_SAMPLING)
     {
       for (iXOffset = mvXMax; iXOffset >= mvXMin; iXOffset -= TMP_SAMPLING)
       {
+#endif
 #if JVET_AE0077_EXT_INTRATMP
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        if ((regionId == 4 || regionId == 5) && !isAvailablePairFound)
+#else
         if (regionId == 4 || regionId == 5)
+#endif
         {
           Position bottomRight(iCurrX + iXOffset + uiBlkWidth - 1, iCurrY + iYOffset + uiBlkHeight - 1);
           if (!pcCU->cs->isDecomp(bottomRight, CHANNEL_TYPE_LUMA))
           {
             continue;
           }
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          else
+          {
+            if (bJointCalc || useMR)
+            {
+              if (!pcCU->cs->isDecomp(Position(iCurrX + iXOffsetMetric + uiBlkWidth - 1, iCurrY + iYOffsetMetric + uiBlkHeight - 1), CHANNEL_TYPE_LUMA))
+              {
+                iXOffsetMetric = iXOffset;
+                iYOffsetMetric = iYOffset;
+                isTransferredLeft = true;
+                isTransferredTop = true;
+              }
+              else
+              {
+                isAvailablePairFound = true;
+              }
+            }
+            else
+            {
+              isAvailablePairFound = true;
+            }
+          }
+#endif
         }
 #endif
 #else
@@ -14876,16 +15120,39 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
       {
 #endif
         refCurr = ref + iYOffset * refStride + iXOffset;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        int licShift = 0;
+        if (!isTransferredLeft)
+        {
+          licShift += offset;
+        }
+        if (!isTransferredTop)
+        {
+          licShift += offset * m_uiPicStride;
+        }
+#endif
 
 #if JVET_AG0136_INTRA_TMP_LIC
         if (bJointCalc)
         {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          m_calcTemplateDiffJointSadMrsad(refCurr, refStride, tarPatch, m_uiPicStride, uiPatchWidth, uiPatchHeight, diff, diffSupp, pDiff, pDiffSupp, tempType, log2SizeTop, log2SizeLeft, sizeTopLeft, topTargetMean, leftTargetMean, licShift);
+#else
           m_calcTemplateDiffJointSadMrsad(refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, diff, diffSupp, pDiff, pDiffSupp, tempType, log2SizeTop, log2SizeLeft, sizeTopLeft, topTargetMean, leftTargetMean);
+#endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          for (int temIdx = 0; temIdx < (needTopLeft ? 3 : 1); temIdx++)
+#else
           for (int temIdx = 0; temIdx < 3; temIdx++)
+#endif
           {
             if (diffSupp[temIdx] < pDiffSupp[temIdx])
             {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+              updateCandList(TempLibFast(iXOffsetMetric, iYOffsetMetric, Mv(iXOffsetMetric, iYOffsetMetric) - bvOffSparseTL, Mv(iXOffsetMetric, iYOffsetMetric) + bvOffSparseBR, isTransferredLeft, isTransferredTop, regionId), diffSupp[temIdx], sparseMtmpCandListSupp[temIdx], sparseMtmpCostListSupp[temIdx], mtmpNumSparseForLic[temIdx]);
+#else
               updateCandList(TempLibFast(iXOffset, iYOffset, regionId), diffSupp[temIdx], sparseMtmpCandListSupp[temIdx], sparseMtmpCostListSupp[temIdx], mtmpNumSparseForLic[temIdx]);
+#endif
               if (sparseMtmpCandListSupp[temIdx].size() == mtmpNumSparseForLic[temIdx])
               {
                 pDiffSupp[temIdx] = std::min((int) sparseMtmpCostListSupp[temIdx][mtmpNumSparseForLic[temIdx] - 1], pDiffSupp[temIdx]);
@@ -14895,18 +15162,32 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
         }
         else
         {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          m_calcTemplateDiff(pcCU->tmpLicFlag ? refCurr + licShift : refCurr, refStride, tarPatch, m_uiPicStride, uiPatchWidth, uiPatchHeight, diff, pDiff, tempType, needTopLeft ? 3 : 0, useMR, log2SizeTop, log2SizeLeft, sizeTopLeft, topTargetMean, leftTargetMean);
+#else
           m_calcTemplateDiff(refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, diff, pDiff, tempType, needTopLeft ? 3 : 0, useMR, log2SizeTop, log2SizeLeft, sizeTopLeft, topTargetMean, leftTargetMean);
+#endif
         }
 #else
         m_calcTemplateDiff(refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, diff, pDiff, tempType, needTopLeft ? 3 : 0);
 #endif
 
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        for (int temIdx = 0; temIdx < (needTopLeft ? 3 : 1); temIdx++)
+#else
         for (int temIdx = 0; temIdx < 3; temIdx++)
+#endif
         {
           if (diff[temIdx] < pDiff[temIdx])
           {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+            int curXoffset = (!bJointCalc && (useMR)) ? iXOffsetMetric : iXOffset;
+            int curYoffset = (!bJointCalc && (useMR)) ? iYOffsetMetric : iYOffset;
+            updateCandList(TempLibFast(curXoffset, curYoffset, Mv(curXoffset, curYoffset) - bvOffSparseTL, Mv(curXoffset, curYoffset) + bvOffSparseBR, !bJointCalc && (useMR) ? isTransferredLeft : false, !bJointCalc && (useMR) ? isTransferredTop : false, regionId), diff[temIdx], sparseMtmpCandList[temIdx], sparseMtmpCostList[temIdx], mtmpNumSparse[temIdx]);
+#else
             updateCandList(TempLibFast(iXOffset, iYOffset, regionId), diff[temIdx],
                            sparseMtmpCandList[temIdx], sparseMtmpCostList[temIdx], mtmpNumSparse[temIdx]);
+#endif
             if (sparseMtmpCandList[temIdx].size() == mtmpNumSparse[temIdx])
             {
               pDiff[temIdx] = std::min((int) sparseMtmpCostList[temIdx][mtmpNumSparse[temIdx] - 1], pDiff[temIdx]);
@@ -15024,6 +15305,24 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
 
 #if JVET_AG0151_INTRA_TMP_MERGE_MODE
   std::vector<Mv> bvBasedMergeCandidatesITMP;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  bvBasedMergeCandidatesITMP = m_bvBasedMergeCandidates; // NOTE(review): this copy is overwritten by the assignment at the end of this section without being read in between
+  std::vector<Mv> bvBasedMergeCandidatesOut; // candidates whose region id equals TMP_MRG_REG_ID; becomes the final ordered list
+  std::vector<Mv> bvBasedMergeCandidatesIn; // all remaining candidates
+  for (int iBv = 0; iBv < bvRegionIdList.size(); iBv++) // NOTE(review): m_bvBasedMergeCandidates is indexed up to bvRegionIdList.size() - assumes both lists have the same length; confirm
+  {
+    if (bvRegionIdList[iBv] == TMP_MRG_REG_ID)
+    {
+      bvBasedMergeCandidatesOut.push_back(m_bvBasedMergeCandidates[iBv]);
+    }
+    else
+    {
+      bvBasedMergeCandidatesIn.push_back(m_bvBasedMergeCandidates[iBv]);
+    }
+  }
+  bvBasedMergeCandidatesOut.insert(bvBasedMergeCandidatesOut.end(), bvBasedMergeCandidatesIn.begin(), bvBasedMergeCandidatesIn.end()); // TMP_MRG_REG_ID entries first, then the rest, each group keeping its original order
+  bvBasedMergeCandidatesITMP = bvBasedMergeCandidatesOut;
+#else
   if (m_bvBasedMergeCandidates.size() > TMP_NUM_MERGE_CANDS)
   {
     std::vector<Mv> bvBasedMergeCandidatesOut;
@@ -15059,6 +15358,7 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
   {
     bvBasedMergeCandidatesITMP = m_bvBasedMergeCandidates;
   }
+#endif
 #if JVET_AH0055_INTRA_TMP_ARBVP
   const int numNeighborMerge = (int) bvBasedMergeCandidatesITMP.size();
   const int totalNum = numNeighborMerge + NUM_TMP_ARBVP;
@@ -15157,6 +15457,164 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
     }
   } while (bvBasedMergeCandidatesITMP.size() > end && bvBasedMergeCandidatesITMP.size() < totalNum);
 #endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  // Add ARBVPs based on sparse candidates
+  std::vector<Mv> bvBasedMergeCandidatesITMPSupp;
+  bvBasedMergeCandidatesITMPSupp = bvBasedMergeCandidatesITMP;
+
+  PU::getSparseArBvMergeCandidate(pu, bvBasedMergeCandidatesITMP, sparseMtmpCandList[0]);
+  if (bJointCalc)
+  {
+    PU::getSparseArBvMergeCandidate(pu, bvBasedMergeCandidatesITMPSupp, sparseMtmpCandListSupp[0]);
+  }
+
+  // Clustering of the Merge and ARBVP candidates based on refinement window
+  bool clustMrgReg[3]  = { false, false, false };
+  bool clustMrgSupp[3] = { false, false, false };
+  Mv   iMergeWindTL, iMergeWindBR;
+  Mv   iSparseWindTL, iSparseWindBR;
+  Mv   mergeCand, sparseCand;
+  bool bOverlap = false;
+
+  for (int iM = 0; iM < bvBasedMergeCandidatesITMP.size(); iM++)
+  {
+    mergeCand    = bvBasedMergeCandidatesITMP[iM];
+    iMergeWindTL = (iM < numNeighborMerge) ? mergeCand - bvOffMerge : mergeCand - bvOffArbvp;
+    iMergeWindBR = (iM < numNeighborMerge) ? mergeCand + bvOffMerge : mergeCand + bvOffArbvp;
+    regionId     = (iM < numNeighborMerge) ? TMP_MRG_REG_ID : TMP_MRG_REG_ID + 1;
+    refCurr      = ref + mergeCand.ver * refStride + mergeCand.hor;
+
+    for (int temIdx = 0; temIdx < (needTopLeft ? 3 : 1); temIdx++)
+    {
+      m_calcTemplateDiff(refCurr, refStride, tarPatch, m_uiPicStride, uiPatchWidth, uiPatchHeight, diff, pDiff, tempType, needTopLeft ? 3 : 0, useMR, log2SizeTop, log2SizeLeft, sizeTopLeft, topTargetMean, leftTargetMean);
+      bOverlap = false;
+      for (int i = 0; i < sparseMtmpCandList[temIdx].size(); i++)
+      {
+        sparseCand = Mv(sparseMtmpCandList[temIdx][i].m_pX, sparseMtmpCandList[temIdx][i].m_pY);
+        if (mergeCand == sparseCand)
+        {
+          clustMrgReg[temIdx] = true;
+          bOverlap = true;
+          break;
+        }
+
+        iSparseWindTL = sparseMtmpCandList[temIdx][i].m_windTL;
+        iSparseWindBR = sparseMtmpCandList[temIdx][i].m_windBR;
+        if (!((iMergeWindBR.hor < iSparseWindTL.hor) || (iMergeWindTL.hor > iSparseWindBR.hor))
+          && !((iMergeWindBR.ver < iSparseWindTL.ver) || (iMergeWindTL.ver > iSparseWindBR.ver)))
+        {
+          iSparseWindTL =
+            Mv(std::min(iSparseWindTL.hor, iMergeWindTL.hor), std::min(iSparseWindTL.ver, iMergeWindTL.ver));
+          iSparseWindBR =
+            Mv(std::max(iSparseWindBR.hor, iMergeWindBR.hor), std::max(iSparseWindBR.ver, iMergeWindBR.ver));
+
+          if (diff[temIdx] < sparseMtmpCostList[temIdx][i])
+          {
+            sparseMtmpCandList[temIdx].erase(sparseMtmpCandList[temIdx].begin() + i);
+            sparseMtmpCostList[temIdx].erase(sparseMtmpCostList[temIdx].begin() + i);
+            updateCandList(TempLibFast(mergeCand.hor, mergeCand.ver, iSparseWindTL, iSparseWindBR, false, false, regionId),
+              diff[temIdx], sparseMtmpCandList[temIdx], sparseMtmpCostList[temIdx], mtmpNumSparse[temIdx]);
+            if (sparseMtmpCandList[temIdx].size() == mtmpNumSparse[temIdx])
+            {
+              pDiff[temIdx] = std::min(static_cast<int>(sparseMtmpCostList[temIdx][mtmpNumSparse[temIdx] - 1]), pDiff[temIdx]);
+            }
+          }
+          else
+          {
+            sparseMtmpCandList[temIdx][i].m_windTL = iSparseWindTL;
+            sparseMtmpCandList[temIdx][i].m_windBR = iSparseWindBR;
+            sparseMtmpCandList[temIdx][i].m_rId    = regionId;
+          }
+          clustMrgReg[temIdx] = true;
+          bOverlap            = true;
+          break;
+        }
+      }
+      if (!bOverlap)
+      {
+        if (diff[temIdx] < pDiff[temIdx])
+        {
+          updateCandList(TempLibFast(mergeCand.hor, mergeCand.ver, iMergeWindTL, iMergeWindBR, false, false, regionId), diff[temIdx],
+            sparseMtmpCandList[temIdx], sparseMtmpCostList[temIdx], mtmpNumSparse[temIdx]);
+          if (sparseMtmpCandList[temIdx].size() == mtmpNumSparse[temIdx])
+          {
+            pDiff[temIdx] = std::min(static_cast<int>(sparseMtmpCostList[temIdx][mtmpNumSparse[temIdx] - 1]), pDiff[temIdx]);
+          }
+          clustMrgReg[temIdx] = true;
+        }
+      }
+    }
+  }
+
+  if (bJointCalc)
+  {
+    for (int iM = 0; iM < bvBasedMergeCandidatesITMPSupp.size(); iM++)
+    {
+      mergeCand    = bvBasedMergeCandidatesITMPSupp[iM];
+      iMergeWindTL = (iM < numNeighborMerge) ? mergeCand - bvOffMerge : mergeCand - bvOffArbvp;
+      iMergeWindBR = (iM < numNeighborMerge) ? mergeCand + bvOffMerge : mergeCand + bvOffArbvp;
+      regionId     = (iM < numNeighborMerge) ? TMP_MRG_REG_ID : TMP_MRG_REG_ID + 1;
+      refCurr      = ref + mergeCand.ver * refStride + mergeCand.hor;
+
+      for (int temIdx = 0; temIdx < (needTopLeft ? 3 : 1); temIdx++)
+      {
+        m_calcTemplateDiff(refCurr, refStride, tarPatch, m_uiPicStride, uiPatchWidth, uiPatchHeight, diffSupp, pDiffSupp, tempType, needTopLeft ? 3 : 0, true, log2SizeTop, log2SizeLeft, sizeTopLeft, topTargetMean, leftTargetMean);
+        bOverlap = false;
+        for (int i = 0; i < sparseMtmpCandListSupp[temIdx].size(); i++)
+        {
+          sparseCand = Mv(sparseMtmpCandListSupp[temIdx][i].m_pX, sparseMtmpCandListSupp[temIdx][i].m_pY);
+          if (mergeCand == sparseCand)
+          {
+            clustMrgSupp[temIdx] = true;
+            bOverlap = true;
+            break;
+          }
+
+          iSparseWindTL = sparseMtmpCandListSupp[temIdx][i].m_windTL;
+          iSparseWindBR = sparseMtmpCandListSupp[temIdx][i].m_windBR;
+          if (!((iMergeWindBR.hor < iSparseWindTL.hor) || (iMergeWindTL.hor > iSparseWindBR.hor)) && !((iMergeWindBR.ver < iSparseWindTL.ver) || (iMergeWindTL.ver > iSparseWindBR.ver)))
+          {
+            iSparseWindTL = Mv(std::min(iSparseWindTL.hor, iMergeWindTL.hor), std::min(iSparseWindTL.ver, iMergeWindTL.ver));
+            iSparseWindBR = Mv(std::max(iSparseWindBR.hor, iMergeWindBR.hor), std::max(iSparseWindBR.ver, iMergeWindBR.ver));
+
+            if (diffSupp[temIdx] < sparseMtmpCostListSupp[temIdx][i])
+            {
+              sparseMtmpCandListSupp[temIdx].erase(sparseMtmpCandListSupp[temIdx].begin() + i);
+              sparseMtmpCostListSupp[temIdx].erase(sparseMtmpCostListSupp[temIdx].begin() + i);
+              updateCandList(TempLibFast(mergeCand.hor, mergeCand.ver, iSparseWindTL, iSparseWindBR, false, false, regionId), diffSupp[temIdx], sparseMtmpCandListSupp[temIdx], sparseMtmpCostListSupp[temIdx], mtmpNumSparseForLic[temIdx]);
+              if (sparseMtmpCandListSupp[temIdx].size() == mtmpNumSparseForLic[temIdx])
+              {
+                pDiffSupp[temIdx] = std::min(static_cast<int>(sparseMtmpCostListSupp[temIdx][mtmpNumSparseForLic[temIdx] - 1]), pDiffSupp[temIdx]);
+              }
+            }
+            else
+            {
+              sparseMtmpCandListSupp[temIdx][i].m_windTL = iSparseWindTL;
+              sparseMtmpCandListSupp[temIdx][i].m_windBR = iSparseWindBR;
+              sparseMtmpCandListSupp[temIdx][i].m_rId    = regionId;
+            }
+            bOverlap             = true;
+            clustMrgSupp[temIdx] = true;
+            break;
+          }
+        }
+        if (!bOverlap)
+        {
+          if (diffSupp[temIdx] < pDiffSupp[temIdx])
+          {
+            updateCandList(TempLibFast(mergeCand.hor, mergeCand.ver, iMergeWindTL, iMergeWindBR, false, false, regionId),
+              diffSupp[temIdx], sparseMtmpCandListSupp[temIdx], sparseMtmpCostListSupp[temIdx], mtmpNumSparseForLic[temIdx]);
+            if (sparseMtmpCandListSupp[temIdx].size() == mtmpNumSparseForLic[temIdx])
+            {
+              pDiffSupp[temIdx] = std::min(static_cast<int>(sparseMtmpCostListSupp[temIdx][mtmpNumSparseForLic[temIdx] - 1]), pDiffSupp[temIdx]);
+            }
+            clustMrgSupp[temIdx] = true;
+          }
+        }
+      }
+    }
+  }
+#else
 #if JVET_AG0136_INTRA_TMP_LIC
   bool isBvAddedReg = false;
   bool isBvAddedSupp = false;
@@ -15239,6 +15697,7 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
     }
   }
 #endif
+#endif
 
 #if JVET_AD0086_ENHANCED_INTRA_TMP
   static_vector<TempLibFast, MTMP_NUM> refineMtmpCandList[3];
@@ -15270,22 +15729,47 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
         pDiff[i] = 0;
       }
     }
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    bool bRedundant = false;
+    Mv refineWindTL, refineWindBR;
+    int mvYMin = 0;
+    int mvYMax = 0;
+    int mvXMin = 0;
+    int mvXMax = 0;
+#endif
     for (int candIdx = 0; candIdx < sparseMtmpCandList[temIdx].size(); candIdx++)
     {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      refineWindTL = sparseMtmpCandList[temIdx][candIdx].m_windTL;
+      refineWindBR = sparseMtmpCandList[temIdx][candIdx].m_windBR;
+#else
       int iRefine      = 1;
       int iRefineRange = TMP_SAMPLING >> 1;
+#endif
       bestRegionId     = sparseMtmpCandList[temIdx][candIdx].m_rId;
 
 #if JVET_AG0151_INTRA_TMP_MERGE_MODE
-#if JVET_AG0136_INTRA_TMP_LIC
+#if JVET_AG0136_INTRA_TMP_LIC && !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
       const int RefineSizeForLic = pcCU->slice->getSPS()->getItmpLicMode() ? 5 : 2;
 #endif
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
       int mvYMin = 0;
       int mvYMax = 0;
       int mvXMin = 0;
       int mvXMax = 0;
+#endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      if (bestRegionId > 5)
+#else
       if (bestRegionId == 6)
+#endif
       {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        mvYMin = refineWindTL.ver;
+        mvYMax = refineWindBR.ver;
+        mvXMin = refineWindTL.hor;
+        mvXMax = refineWindBR.hor; 
+#else
 #if JVET_AG0136_INTRA_TMP_LIC
         if ((!pcCU->cs->pcv->isEncoder || !bJointCalc) && pcCU->tmpFlag && (pcCU->tmpLicFlag || pcCU->ibcLicFlag))
         {
@@ -15306,9 +15790,10 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
         mvYMax = sparseMtmpCandList[temIdx][candIdx].m_pY + 5;
         mvXMin = sparseMtmpCandList[temIdx][candIdx].m_pX - 5;
         mvXMax = sparseMtmpCandList[temIdx][candIdx].m_pX + 5;
+#endif
 #endif
       }
-#if JVET_AH0055_INTRA_TMP_ARBVP
+#if JVET_AH0055_INTRA_TMP_ARBVP && !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
       else if (bestRegionId == 7)
       {
         mvYMin = sparseMtmpCandList[temIdx][candIdx].m_pY - EBVP_RANGE;
@@ -15326,6 +15811,9 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
         clipMvIntraConstraintRefine(mvXMin, mvXMax, mvYMin, mvYMax, sparseMtmpCandList[temIdx][candIdx].m_pX, sparseMtmpCandList[temIdx][candIdx].m_pY, iRefineRange
 #if JVET_AG0136_INTRA_TMP_LIC
                                     , TMP_SAMPLING
+#endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+                                    , sparseMtmpCandList[temIdx][candIdx].m_isTransferredLeft, sparseMtmpCandList[temIdx][candIdx].m_isTransferredTop, bestRegionId, iCurrX + mvXMin >= TMP_TEMPLATE_SIZE + iRefineRange, iCurrY + mvYMin >= TMP_TEMPLATE_SIZE + iRefineRange, !pcCU->slice->getSPS()->getItmpLicMode()
 #endif
                                     );
       }
@@ -15340,28 +15828,51 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
 #endif
       );
 #endif
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
       if (!(mvYMax < mvYMin || mvXMax < mvXMin))
       {
+#endif
         for (iYOffset = mvYMax; iYOffset >= mvYMin; iYOffset -= iRefine)
         {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          bool isAvailablePairFound{false};
+#endif
           for (iXOffset = mvXMax; iXOffset >= mvXMin; iXOffset -= iRefine)
           {
 #if JVET_AE0077_EXT_INTRATMP
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+            if ((bestRegionId == 4 || bestRegionId == 5 || (!pcCU->slice->getSPS()->getItmpLicMode() && ((bestRegionId == 3 && iXOffset > 0) || (bestRegionId == 1 && iYOffset > 0)))) && !isAvailablePairFound)
+#else
             if (bestRegionId == 4 || bestRegionId == 5)
+#endif
             {
               Position bottomRight(iCurrX + iXOffset + uiBlkWidth - 1, iCurrY + iYOffset + uiBlkHeight - 1);
               if (!pcCU->cs->isDecomp(bottomRight, CHANNEL_TYPE_LUMA))
               {
                 continue;
               }
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+              else
+              {
+                isAvailablePairFound = true;
+              }
+#endif
             }
 #endif
 #if JVET_AG0151_INTRA_TMP_MERGE_MODE
 #if JVET_AG0136_INTRA_TMP_LIC
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+            if (clustMrgReg[temIdx])
+#else
             if (isBvAddedReg)
+#endif
             {
 #endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+              bRedundant = false;
+#else
               bool bRedundant = false;
+#endif
               for (int i = 0; i < refineMtmpCandList[temIdx].size(); i++)
               {
                 if (iYOffset == refineMtmpCandList[temIdx][i].m_pY && iXOffset == refineMtmpCandList[temIdx][i].m_pX)
@@ -15397,8 +15908,12 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
             else
             {
               refCurr = ref + iYOffset * refStride + iXOffset;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+              m_calcTemplateDiff(refCurr, refStride, tarPatch, m_uiPicStride, uiPatchWidth, uiPatchHeight, diff, pDiff, tempType, temIdx
+#else
               m_calcTemplateDiff(refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, diff, pDiff, tempType,
                                  temIdx
+#endif
 #if JVET_AG0136_INTRA_TMP_LIC
                                  , useMR, log2SizeTop, log2SizeLeft, sizeTopLeft, topTargetMean, leftTargetMean
 #endif
@@ -15406,8 +15921,12 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
             }
             if (diff[temIdx] < pDiff[temIdx])
             {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+              updateCandList(TempLibFast(iXOffset, iYOffset, refineWindTL, refineWindBR, false, false, bestRegionId), diff[temIdx], refineMtmpCandList[temIdx], refineMtmpCostList[temIdx], mtmpNumRefine[temIdx]);
+#else
               updateCandList(TempLibFast(iXOffset, iYOffset, bestRegionId), diff[temIdx],
                              refineMtmpCandList[temIdx], refineMtmpCostList[temIdx], mtmpNumRefine[temIdx]);
+#endif
               if (refineMtmpCandList[temIdx].size() == mtmpNumRefine[temIdx])
               {
                 pDiff[temIdx] = std::min((int) refineMtmpCostList[temIdx][mtmpNumRefine[temIdx] - 1], pDiff[temIdx]);
@@ -15415,7 +15934,9 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
             }
           }
         }
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
       }
+#endif
     }
   }
 
@@ -15438,8 +15959,17 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
 
   if (tempType == L_SHAPE_TEMPLATE && needTopLeft)
   {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    // If the list size is less than INIT_TL_POS, the TL-only candidates are skipped
+    if (refMtmpCandListTemp.size() >= INIT_TL_POS)
+    {
+      int  cnt        = 0;
+      bool bRedundant = false;
+      int  mvXCur, mvYCur, pos;
+#endif
     for (int temIdx = 2; temIdx >0; temIdx--)
     {
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
       int cnt = 0;
       for (int candIdx = 0; candIdx < refineMtmpCostList[temIdx].size() && cnt < TL_NUM; candIdx++)
       {
@@ -15448,6 +15978,13 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
 
         int  mvXCur     = refineMtmpCandList[temIdx][candIdx].m_pX;
         int  mvYCur     = refineMtmpCandList[temIdx][candIdx].m_pY;
+#else
+      for (int candIdx = 0; candIdx < refineMtmpCandList[temIdx].size(); candIdx++)
+      {
+        bRedundant = false;
+        mvXCur     = refineMtmpCandList[temIdx][candIdx].m_pX;
+        mvYCur     = refineMtmpCandList[temIdx][candIdx].m_pY;
+#endif
 #if JVET_AG0136_INTRA_TMP_LIC
         for (int crIdx = 0; crIdx < refMtmpCandListTemp.size(); crIdx++)
         {
@@ -15467,8 +16004,12 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
 
         if (!bRedundant)
         {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          pos = (MTMP_NUM - static_cast<int>(refineMtmpCandList[1].size()) - static_cast<int>(refineMtmpCandList[2].size())) + cnt++;
+#else
           cnt++;
           int pos = MTMP_NUM - 1 - TL_NUM * temIdx + cnt;
+#endif
 #if JVET_AG0136_INTRA_TMP_LIC
           if (pos < refMtmpCandListTemp.size())
           {
@@ -15487,9 +16028,19 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
             m_mtmpCandList[pos] = refineMtmpCandList[temIdx][candIdx];
 #endif
           }
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          else
+          {
+            // TL candidates are appended sequentially if the list size is less than MTMP_NUM
+            refMtmpCandListTemp.push_back(refineMtmpCandList[temIdx][candIdx]);
+          }
+#endif
         }
       }
     }
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    }   
+#endif
   }
 #else
 #if JVET_AB0130_ITMP_SAMPLING
@@ -15624,8 +16175,16 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
       pDiffSparse[i] = pDiffSupp[i] + (sparseMtmpCandListSupp[i].size() < mtmpNumSparseForLic[i] ? 0 : 1);
     }
 
-#if JVET_AG0151_INTRA_TMP_MERGE_MODE
+#if JVET_AG0151_INTRA_TMP_MERGE_MODE && (!JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT)
     const int RefineSizeForLic = pcCU->slice->getSPS()->getItmpLicMode() ? 5 : 2;
+#endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    bool bRedundant = false;
+    Mv refineWindTL, refineWindBR;
+    int mvYMin = 0;
+    int mvYMax = 0;
+    int mvXMin = 0;
+    int mvXMax = 0;
 #endif
     for (int temIdx = 0; temIdx < 3; temIdx++)
     {
@@ -15646,12 +16205,27 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
       }
       for (int candIdx = 0; candIdx < sparseMtmpCandListSupp[temIdx].size(); candIdx++)
       {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        refineWindTL = sparseMtmpCandListSupp[temIdx][candIdx].m_windTL;
+        refineWindBR = sparseMtmpCandListSupp[temIdx][candIdx].m_windBR;
+#endif
         bestRegionId = sparseMtmpCandListSupp[temIdx][candIdx].m_rId;
 #if JVET_AG0151_INTRA_TMP_MERGE_MODE
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
         int mvYMin = 0;
         int mvYMax = 0;
         int mvXMin = 0;
         int mvXMax = 0;
+#endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        if (bestRegionId >= TMP_MRG_REG_ID)
+        {
+          mvYMin = refineWindTL.ver;
+          mvYMax = refineWindBR.ver;
+          mvXMin = refineWindTL.hor;
+          mvXMax = refineWindBR.hor;
+        }
+#else
         if (bestRegionId == 6)
         {
           mvYMin = sparseMtmpCandListSupp[temIdx][candIdx].m_pY - RefineSizeForLic;
@@ -15667,6 +16241,7 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
           mvXMin = sparseMtmpCandListSupp[temIdx][candIdx].m_pX - EBVP_RANGE;
           mvXMax = sparseMtmpCandListSupp[temIdx][candIdx].m_pX + EBVP_RANGE;
         }
+#endif 
 #endif 
         else
         {
@@ -15674,7 +16249,11 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
           mvYMax = mvYMaxs[bestRegionId];
           mvXMin = mvXMins[bestRegionId];
           mvXMax = mvXMaxs[bestRegionId];
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          clipMvIntraConstraintRefine(mvXMin, mvXMax, mvYMin, mvYMax, sparseMtmpCandListSupp[temIdx][candIdx].m_pX, sparseMtmpCandListSupp[temIdx][candIdx].m_pY, TMP_SAMPLING >> 1, TMP_SAMPLING, sparseMtmpCandListSupp[temIdx][candIdx].m_isTransferredLeft, sparseMtmpCandListSupp[temIdx][candIdx].m_isTransferredTop, bestRegionId, iCurrX + mvXMin >= TMP_TEMPLATE_SIZE + (TMP_SAMPLING >> 1), iCurrY + mvYMin >= TMP_TEMPLATE_SIZE + (TMP_SAMPLING >> 1), !pcCU->slice->getSPS()->getItmpLicMode());
+#else
           clipMvIntraConstraintRefine(mvXMin, mvXMax, mvYMin, mvYMax, sparseMtmpCandListSupp[temIdx][candIdx].m_pX, sparseMtmpCandListSupp[temIdx][candIdx].m_pY, TMP_SAMPLING >> 1, TMP_SAMPLING);
+#endif
         }
 #else
         int mvYMin       = mvYMins[bestRegionId];
@@ -15687,24 +16266,47 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
 #endif
         );
 #endif
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT  
         if (!(mvYMax < mvYMin || mvXMax < mvXMin))
         {
+#endif
           for (iYOffset = mvYMax; iYOffset >= mvYMin; iYOffset -= 1)
           {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+            bool isAvailablePairFound{false};
+#endif
             for (iXOffset = mvXMax; iXOffset >= mvXMin; iXOffset -= 1)
             {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+              if ((bestRegionId == 4 || bestRegionId == 5 || (!pcCU->slice->getSPS()->getItmpLicMode() && ((bestRegionId == 3 && iXOffset > 0) || (bestRegionId == 1 && iYOffset > 0)))) && !isAvailablePairFound)
+#else
               if (bestRegionId == 4 || bestRegionId == 5)
+#endif
               {
                 Position bottomRight(iCurrX + iXOffset + uiBlkWidth - 1, iCurrY + iYOffset + uiBlkHeight - 1);
                 if (!pcCU->cs->isDecomp(bottomRight, CHANNEL_TYPE_LUMA))
                 {
                   continue;
                 }
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+                else
+                {
+                  isAvailablePairFound = true;
+                }
+#endif
               }
 #if JVET_AG0151_INTRA_TMP_MERGE_MODE
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+              if (clustMrgSupp[temIdx])
+#else
               if (isBvAddedSupp)
+#endif
               {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+                bRedundant = false;
+#else
                 bool bRedundant = false;
+#endif
                 for (int i = 0; i < refineMtmpCandList[temIdx].size(); i++)
                 {
                   if (iYOffset == refineMtmpCandList[temIdx][i].m_pY && iXOffset == refineMtmpCandList[temIdx][i].m_pX)
@@ -15737,11 +16339,19 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
               else
               {
                 refCurr = ref + iYOffset * refStride + iXOffset;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+                m_calcTemplateDiff(refCurr, refStride, tarPatch, m_uiPicStride, uiPatchWidth, uiPatchHeight, diffSupp, pDiffSupp, tempType, temIdx, true, log2SizeTop, log2SizeLeft, sizeTopLeft, topTargetMean, leftTargetMean);
+#else
                 m_calcTemplateDiff(refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, diffSupp, pDiffSupp, tempType, temIdx, true, log2SizeTop, log2SizeLeft, sizeTopLeft, topTargetMean, leftTargetMean);
+#endif
               }
               if (diffSupp[temIdx] < pDiffSupp[temIdx])
               {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+                updateCandList(TempLibFast(iXOffset, iYOffset, refineWindTL, refineWindBR, false, false, bestRegionId), diffSupp[temIdx], refineMtmpCandList[temIdx], refineMtmpCostList[temIdx], mtmpNumRefine[temIdx]);
+#else
                 updateCandList(TempLibFast(iXOffset, iYOffset, bestRegionId), diffSupp[temIdx], refineMtmpCandList[temIdx], refineMtmpCostList[temIdx], mtmpNumRefine[temIdx]);
+#endif
                 if (refineMtmpCandList[temIdx].size() == mtmpNumRefine[temIdx])
                 {
                   pDiffSupp[temIdx] = std::min((int) refineMtmpCostList[temIdx][mtmpNumRefine[temIdx] - 1], pDiffSupp[temIdx]);
@@ -15749,7 +16359,9 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
               }
             }
           }
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT  
         }
+#endif
       }
     }
 
@@ -15758,14 +16370,30 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
     static_vector<TempLibFast, MTMP_NUM>& refMtmpCandListTemp = m_mtmpCandListUseMR;
     if (tempType == L_SHAPE_TEMPLATE && needTopLeft)
     {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      // If the list size is less than INIT_TL_POS, the TL-only candidates are skipped
+      if (refMtmpCandListTemp.size() >= INIT_TL_POS)
+      {
+        int  cnt        = 0;
+        bool bRedundant = false;
+        int  mvXCur, mvYCur, pos;
+#endif
       for (int temIdx = 2; temIdx > 0; temIdx--)
       {
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
         int cnt = 0;
         for (int candIdx = 0; candIdx < refineMtmpCostList[temIdx].size() && cnt < TL_NUM; candIdx++)
         {
           bool bRedundant = false;
           const int mvXCur = refineMtmpCandList[temIdx][candIdx].m_pX;
           const int mvYCur = refineMtmpCandList[temIdx][candIdx].m_pY;
+#else
+        for (int candIdx = 0; candIdx < refineMtmpCostList[temIdx].size(); candIdx++)
+        {
+          bRedundant = false;
+          mvXCur = refineMtmpCandList[temIdx][candIdx].m_pX;
+          mvYCur = refineMtmpCandList[temIdx][candIdx].m_pY;
+#endif
           for (int crIdx = 0; crIdx < refMtmpCandListTemp.size(); crIdx++)
           {
             if (mvXCur == refMtmpCandListTemp[crIdx].m_pX && mvYCur == refMtmpCandListTemp[crIdx].m_pY)
@@ -15776,8 +16404,12 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
           }
           if (!bRedundant)
           {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+            pos = (MTMP_NUM - static_cast<int>(refineMtmpCandList[1].size()) - static_cast<int>(refineMtmpCandList[2].size())) + cnt++;
+#else
             cnt++;
             int pos = MTMP_NUM - 1 - TL_NUM * temIdx + cnt;
+#endif
             if (pos < refMtmpCandListTemp.size())
             {
               for (int updatePos = (int) refMtmpCandListTemp.size() - 1; updatePos > pos; updatePos--)
@@ -15786,9 +16418,19 @@ void IntraPrediction::searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** ta
               }
               refMtmpCandListTemp[pos] = refineMtmpCandList[temIdx][candIdx];
             }
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+            else
+            {
+              // TL candidates are appended sequentially if the list size is less than MTMP_NUM
+              refMtmpCandListTemp.push_back(refineMtmpCandList[temIdx][candIdx]);
+            }
+#endif
           }
         }
       }
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      }
+#endif
     }
     m_tmpNumCandUseMR = static_cast<int>(m_mtmpCandListUseMR.size());
     for (int i = 0; i < refMtmpCandListTemp.size(); i++)
@@ -16359,7 +17001,11 @@ void IntraPrediction::xCalcTmpFlmRefArea(CodingUnit *cu, unsigned int uiBlkWidth
   int iBlkHeight = uiBlkHeight;
   int bestPosX = iCurrX + pX;
   int bestPosY = iCurrY + pY;
-  if (regionId == 4 || regionId == 5)
+  if (regionId == 4 || regionId == 5
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      || regionId == 1 || regionId == 3
+#endif
+      )
   {
     if (!cu->cs->isDecomp(Position(bestPosX + iBlkWidth - 1, bestPosY + iBlkHeight), CHANNEL_TYPE_LUMA))
     {
@@ -17517,7 +18163,11 @@ TempLibFracFast::~TempLibFracFast()
 {
 }
 
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+void IntraPrediction::searchFracCandidate(CodingUnit* pcCU, Pel* tarPatch, RefTemplateType tempType)
+#else
 void IntraPrediction::searchFracCandidate( CodingUnit* pcCU, Pel** tarPatch, RefTemplateType tempType)
+#endif
 {
   const int tmpIdx = pcCU->tmpIdx;
   m_mtmpFracCandList[tmpIdx].clear();
@@ -17573,7 +18223,11 @@ void IntraPrediction::searchFracCandidate( CodingUnit* pcCU, Pel** tarPatch, Ref
 
   if(pcCU->tmpLicFlag)
   {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    m_calcTemplateDiff(refCurr, predTempStride, tarPatch, m_uiPicStride, pcCU->lwidth() + TMP_TEMPLATE_SIZE, pcCU->lheight() + TMP_TEMPLATE_SIZE, diff, pDiff, tempType, 0, true
+#else
     m_calcTemplateDiff(refCurr, predTempStride, tarPatch, pcCU->lwidth() + TMP_TEMPLATE_SIZE, pcCU->lheight() + TMP_TEMPLATE_SIZE, diff, pDiff, tempType, 0, true
+#endif
             , m_log2SizeTop
             , m_log2SizeLeft
             , m_sizeTopLeft
@@ -17582,7 +18236,11 @@ void IntraPrediction::searchFracCandidate( CodingUnit* pcCU, Pel** tarPatch, Ref
   }
   else
   {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    m_calcTemplateDiff(refCurr, predTempStride, tarPatch, m_uiPicStride, pcCU->lwidth() + TMP_TEMPLATE_SIZE, pcCU->lheight() + TMP_TEMPLATE_SIZE, diff, pDiff, tempType, 0, false, 0, 0, 0, 0, 0);
+#else
     m_calcTemplateDiff(refCurr, predTempStride, tarPatch, pcCU->lwidth() + TMP_TEMPLATE_SIZE, pcCU->lheight() + TMP_TEMPLATE_SIZE, diff, pDiff, tempType, 0, false, 0, 0, 0, 0, 0);
+#endif
   }
 
   diff[0] = (int) (diff[0] * TMP_INT_BV_COST_SCALE);
@@ -17611,7 +18269,11 @@ void IntraPrediction::searchFracCandidate( CodingUnit* pcCU, Pel** tarPatch, Ref
 
       if(pcCU->tmpLicFlag)
       {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        m_calcTemplateDiff(refCurr, predTempStride, tarPatch, m_uiPicStride, pcCU->lwidth() + TMP_TEMPLATE_SIZE, pcCU->lheight() + TMP_TEMPLATE_SIZE, diff, pDiff, tempType, 0, true
+#else
         m_calcTemplateDiff(refCurr, predTempStride, tarPatch, pcCU->lwidth() + TMP_TEMPLATE_SIZE, pcCU->lheight() + TMP_TEMPLATE_SIZE, diff, pDiff, tempType, 0, true
+#endif
                 , m_log2SizeTop
                 , m_log2SizeLeft
                 , m_sizeTopLeft
@@ -17620,7 +18282,11 @@ void IntraPrediction::searchFracCandidate( CodingUnit* pcCU, Pel** tarPatch, Ref
       }
       else
       {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        m_calcTemplateDiff(refCurr, predTempStride, tarPatch, m_uiPicStride, pcCU->lwidth() + TMP_TEMPLATE_SIZE, pcCU->lheight() + TMP_TEMPLATE_SIZE, diff, pDiff, tempType, 0, false, 0, 0, 0, 0, 0);
+#else
         m_calcTemplateDiff(refCurr, predTempStride, tarPatch, pcCU->lwidth() + TMP_TEMPLATE_SIZE, pcCU->lheight() + TMP_TEMPLATE_SIZE, diff, pDiff, tempType, 0, false, 0, 0, 0, 0, 0);
+#endif
       }
       if (diff[0] < pDiff[0])
       {
@@ -17677,22 +18343,35 @@ bool IntraPrediction::generateTmDcPrediction( Pel* piPred, unsigned int uiStride
 #endif
 
 #if JVET_AG0136_INTRA_TMP_LIC 
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+void IntraPrediction::calcTargetMean(Pel* tarPatch, int tarStride, const unsigned int uiPatchWidth, const unsigned int uiPatchHeight, const RefTemplateType tempType, const int requiredTemplate, const int log2SizeTop, const int log2SizeLeft, const int sizeTopLeft, int& topTargetMean, int& leftTargetMean)
+#else
 void IntraPrediction::calcTargetMean(Pel** tarPatch, const unsigned int uiPatchWidth, const unsigned int uiPatchHeight, const RefTemplateType tempType, const int requiredTemplate, const int log2SizeTop, const int log2SizeLeft, const int sizeTopLeft, int& topTargetMean, int& leftTargetMean)
+#endif
 {
   topTargetMean = 0;
   leftTargetMean = 0;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  const Pel* tarPatchRow = tarPatch;
+#else
   const Pel* tarPatchRow = nullptr;
+#endif
   if (tempType == L_SHAPE_TEMPLATE)
   {
     if (requiredTemplate == 3 || requiredTemplate == 0 || requiredTemplate == 1)
     {
       for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++)
       {
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
         tarPatchRow = tarPatch[iY];
+#endif
         for (int iX = TMP_TEMPLATE_SIZE; iX < uiPatchWidth; iX++)
         {
           topTargetMean += tarPatchRow[iX];
         }
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        tarPatchRow += tarStride;
+#endif
       }
       topTargetMean >>= log2SizeTop;
     }
@@ -17700,11 +18379,16 @@ void IntraPrediction::calcTargetMean(Pel** tarPatch, const unsigned int uiPatchW
     {
       for (int iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++)
       {
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
         tarPatchRow = tarPatch[iY];
+#endif
         for (int iX = 0; iX < TMP_TEMPLATE_SIZE; iX++)
         {
           leftTargetMean += tarPatchRow[iX];
         }
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        tarPatchRow += tarStride;
+#endif
       }
       leftTargetMean >>= log2SizeLeft;
     }
@@ -17713,11 +18397,16 @@ void IntraPrediction::calcTargetMean(Pel** tarPatch, const unsigned int uiPatchW
   {
     for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++)
     {
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
       tarPatchRow = tarPatch[iY];
+#endif
       for (int iX = 0; iX < uiPatchWidth - TMP_TEMPLATE_SIZE; iX++)
       {
         topTargetMean += tarPatchRow[iX];
       }
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      tarPatchRow += tarStride;
+#endif
     }
     topTargetMean >>= log2SizeTop;
   }
@@ -17725,32 +18414,55 @@ void IntraPrediction::calcTargetMean(Pel** tarPatch, const unsigned int uiPatchW
   {
     for (int iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++)
     {
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
       tarPatchRow = tarPatch[iY];
+#endif
       for (int iX = 0; iX < TMP_TEMPLATE_SIZE; iX++)
       {
         leftTargetMean += tarPatchRow[iX];
       }
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      tarPatchRow += tarStride;
+#endif
     }
     leftTargetMean >>= log2SizeLeft;
   }
 }
 
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+void IntraPrediction::calcTemplateDiffJointSadMrsad(const Pel* const ref, const unsigned int uiStride, Pel* tarPatch, int tarStride, const unsigned int uiPatchWidth, const unsigned int uiPatchHeight, int* diffSad, int* diffMrsad, int* iMaxSad, int* iMaxMrsad, const RefTemplateType tempType, const int log2SizeTop, const int log2SizeLeft, const int sizeTopLeft, const int topTargetMean, const int leftTargetMean, const int licShift)
+#else
 void IntraPrediction::calcTemplateDiffJointSadMrsad(const Pel* const ref, const unsigned int uiStride, Pel** tarPatch, const unsigned int uiPatchWidth, const unsigned int uiPatchHeight, int* diffSad, int* diffMrsad, int* iMaxSad, int* iMaxMrsad, const RefTemplateType tempType, const int log2SizeTop, const int log2SizeLeft, const int sizeTopLeft, const int topTargetMean, const int leftTargetMean)
+#endif
 {
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
   Pel intermediate = 0;
+#endif
   int diffSumSad = 0;
   int diffSumMrsad = 0;
   int topDiffSad = MAX_INT;
   int topDiffMrsad = MAX_INT;
   int leftDiffSad = MAX_INT;
   int leftDiffMrsad = MAX_INT;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  const Pel* const refLic = ref + licShift;
+  const Pel* tarPatchRow = tarPatch;
+#else
   const Pel* tarPatchRow = nullptr;
+#endif
   const Pel* refPatchRow = tempType == L_SHAPE_TEMPLATE ? ref - TMP_TEMPLATE_SIZE * uiStride - TMP_TEMPLATE_SIZE : (tempType == ABOVE_TEMPLATE ? ref - TMP_TEMPLATE_SIZE * uiStride : ref - TMP_TEMPLATE_SIZE);
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  const Pel* refPatchRowLic = tempType == L_SHAPE_TEMPLATE ? refLic - TMP_TEMPLATE_SIZE * uiStride - TMP_TEMPLATE_SIZE : (tempType == ABOVE_TEMPLATE ? refLic - TMP_TEMPLATE_SIZE * uiStride : refLic - TMP_TEMPLATE_SIZE);
+#endif
   int topMeanRef = 0;
   int leftMeanRef = 0;
   if (tempType == L_SHAPE_TEMPLATE)
   {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    const Pel* refPatchRowTemp = refPatchRowLic;
+#else
     const Pel* refPatchRowTemp = refPatchRow;
+#endif
     for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRowTemp += uiStride)
     {
       for (int iX = TMP_TEMPLATE_SIZE; iX < uiPatchWidth; iX++)
@@ -17759,7 +18471,11 @@ void IntraPrediction::calcTemplateDiffJointSadMrsad(const Pel* const ref, const
       }
     }
     topMeanRef >>= log2SizeTop;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    refPatchRowTemp = refLic - TMP_TEMPLATE_SIZE;
+#else
     refPatchRowTemp = ref - TMP_TEMPLATE_SIZE;
+#endif
     for (int iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++, refPatchRowTemp += uiStride)
     {
       for (int iX = 0; iX < TMP_TEMPLATE_SIZE; iX++)
@@ -17771,7 +18487,11 @@ void IntraPrediction::calcTemplateDiffJointSadMrsad(const Pel* const ref, const
   }
   else if (tempType == ABOVE_TEMPLATE)
   {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    const Pel* refPatchRowTemp = refPatchRowLic;
+#else
     const Pel* refPatchRowTemp = refPatchRow;
+#endif
     for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRowTemp += uiStride)
     {
       for (int iX = 0; iX < uiPatchWidth - TMP_TEMPLATE_SIZE; iX++)
@@ -17783,7 +18503,11 @@ void IntraPrediction::calcTemplateDiffJointSadMrsad(const Pel* const ref, const
   }
   else if (tempType == LEFT_TEMPLATE)
   {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    const Pel* refPatchRowTemp = refPatchRowLic;
+#else
     const Pel* refPatchRowTemp = refPatchRow;
+#endif
     for (int iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++, refPatchRowTemp += uiStride)
     {
       for (int iX = 0; iX < TMP_TEMPLATE_SIZE; iX++)
@@ -17798,8 +18522,10 @@ void IntraPrediction::calcTemplateDiffJointSadMrsad(const Pel* const ref, const
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
   int tempDiff1 = 0;
   int tempDiff2 = 0;
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
   int tempDiff3 = 0;
   int tempDiff4 = 0;
+#endif
 #endif
   if (tempType == L_SHAPE_TEMPLATE)
   {
@@ -17809,20 +18535,42 @@ void IntraPrediction::calcTemplateDiffJointSadMrsad(const Pel* const ref, const
     topDiffMrsad = 0;
     leftDiffSad = 0;
     leftDiffMrsad = 0;
-    for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
+    for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+         , refPatchRowLic += uiStride
+#endif
+         )
     {
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
       tarPatchRow = tarPatch[iY];
+#endif
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
       tempDiff1 = 0;
       tempDiff2 = 0;
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
       tempDiff3 = 0;
       tempDiff4 = 0;
 #endif
+#endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      for (int iX = 0; iX < TMP_TEMPLATE_SIZE; iX++)
+      {
+        diffSumSad += abs(refPatchRow[iX] - tarPatchRow[iX]);
+        diffSumMrsad += abs(refPatchRowLic[iX] - tarPatchRow[iX] - topMeanDiff);
+      }
+      for (int iX = TMP_TEMPLATE_SIZE; iX < uiPatchWidth; iX++)
+#else
       for (int iX = 0; iX < uiPatchWidth; iX++)
+#endif
       {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        iSumSad = abs(refPatchRow[iX] - tarPatchRow[iX]);
+        iSumMrsad = abs(refPatchRowLic[iX] - tarPatchRow[iX] - topMeanDiff);
+#else
         intermediate = refPatchRow[iX] - tarPatchRow[iX];
         iSumSad = abs(intermediate);
         iSumMrsad = abs(intermediate - topMeanDiff);
+#endif
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
         tempDiff1 += iSumSad;
         tempDiff2 += iSumMrsad;
@@ -17830,6 +18578,7 @@ void IntraPrediction::calcTemplateDiffJointSadMrsad(const Pel* const ref, const
         diffSumSad += iSumSad;
         diffSumMrsad += iSumMrsad;
 #endif
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
         if (iX >= TMP_TEMPLATE_SIZE)
         {
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
@@ -17840,35 +18589,61 @@ void IntraPrediction::calcTemplateDiffJointSadMrsad(const Pel* const ref, const
           topDiffMrsad += iSumMrsad;
 #endif
         }
+#endif
       }
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
         if (iY == (TMP_TEMPLATE_SIZE - 1))
         {
           tempDiff1 <<= TMP_TEMPLATE_COST_SHIFT;
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
           tempDiff3 <<= TMP_TEMPLATE_COST_SHIFT;
+#endif
           tempDiff2 <<= TMP_TEMPLATE_COST_SHIFT;
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
           tempDiff4 <<= TMP_TEMPLATE_COST_SHIFT;
+#endif
         }
 
         diffSumSad += tempDiff1;
         diffSumMrsad += tempDiff2;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        topDiffSad += tempDiff1;
+        topDiffMrsad += tempDiff2;
+#else
         topDiffSad += tempDiff3;
         topDiffMrsad += tempDiff4;
+#endif
 #endif
       if (diffSumSad > iMaxSad[0] && topDiffSad > iMaxSad[1] && diffSumMrsad > iMaxMrsad[0] && topDiffMrsad > iMaxMrsad[1])
       {
         break;
       }
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      tarPatchRow += tarStride;
+#endif
     }
     refPatchRow = ref - TMP_TEMPLATE_SIZE;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    refPatchRowLic = refLic - TMP_TEMPLATE_SIZE;
+    tarPatchRow = tarPatch + TMP_TEMPLATE_SIZE * tarStride;
+    for (int iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++, refPatchRow += uiStride, refPatchRowLic += uiStride)
+#else
     for (int iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++, refPatchRow += uiStride)
+#endif
     {
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
       tarPatchRow = tarPatch[iY];
+#endif
       for (int iX = 0; iX < TMP_TEMPLATE_SIZE; iX++)
       {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        iSumSad = abs(refPatchRow[iX] - tarPatchRow[iX]);
+        iSumMrsad = abs(refPatchRowLic[iX] - tarPatchRow[iX] - leftMeanDiff);
+#else
         intermediate = refPatchRow[iX] - tarPatchRow[iX];
         iSumSad = abs(intermediate);
         iSumMrsad = abs(intermediate - leftMeanDiff);
+#endif
         diffSumSad += iSumSad;
         diffSumMrsad += iSumMrsad;
         leftDiffSad += iSumSad;
@@ -17878,7 +18653,11 @@ void IntraPrediction::calcTemplateDiffJointSadMrsad(const Pel* const ref, const
       tempDiff1 = (abs(refPatchRow[TMP_TEMPLATE_SIZE - 1] - tarPatchRow[TMP_TEMPLATE_SIZE - 1]))*((1<<TMP_TEMPLATE_COST_SHIFT) - 1);
       diffSumSad += tempDiff1;
       leftDiffSad += tempDiff1;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      tempDiff1 = (abs(refPatchRowLic[TMP_TEMPLATE_SIZE - 1] - tarPatchRow[TMP_TEMPLATE_SIZE - 1] - leftMeanDiff))*((1<<TMP_TEMPLATE_COST_SHIFT) - 1);
+#else
       tempDiff1 = (abs(refPatchRow[TMP_TEMPLATE_SIZE - 1] - tarPatchRow[TMP_TEMPLATE_SIZE - 1] - leftMeanDiff))*((1<<TMP_TEMPLATE_COST_SHIFT) - 1);
+#endif
       diffSumMrsad += tempDiff1;  
       leftDiffMrsad += tempDiff1;
 #endif
@@ -17886,23 +18665,39 @@ void IntraPrediction::calcTemplateDiffJointSadMrsad(const Pel* const ref, const
       {
         break;
       }
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      tarPatchRow += tarStride;
+#endif
     }
   }
   else if (tempType == ABOVE_TEMPLATE)
   {
-    for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
+    for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+         , refPatchRowLic += uiStride
+#endif
+         )
     {
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
       tarPatchRow = tarPatch[iY];
+#endif
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
       tempDiff1 = 0;
       tempDiff2 = 0;
 #endif
       for (int iX = 0; iX < uiPatchWidth - TMP_TEMPLATE_SIZE; iX++)
       {
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
         intermediate = refPatchRow[iX] - tarPatchRow[iX];
+#endif
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        tempDiff1 += abs(refPatchRow[iX] - tarPatchRow[iX]);
+        tempDiff2 += abs(refPatchRowLic[iX] - tarPatchRow[iX] - topMeanDiff);
+#else
         tempDiff1 += abs(intermediate);
         tempDiff2 += abs(intermediate - topMeanDiff);
+#endif
 #else
         diffSumSad += abs(intermediate);
         diffSumMrsad += abs(intermediate - topMeanDiff);
@@ -17921,29 +18716,50 @@ void IntraPrediction::calcTemplateDiffJointSadMrsad(const Pel* const ref, const
       {
         break;
       }
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      tarPatchRow += tarStride;
+#endif
     }
   }
   else if (tempType == LEFT_TEMPLATE)
   {
-    for (int iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++, refPatchRow += uiStride)
+    for (int iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++, refPatchRow += uiStride
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+         , refPatchRowLic += uiStride
+#endif
+         )
     {
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
       tarPatchRow = tarPatch[iY];
+#endif
       for (int iX = 0; iX < TMP_TEMPLATE_SIZE; iX++)
       {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        diffSumSad += abs(refPatchRow[iX] - tarPatchRow[iX]);
+        diffSumMrsad += abs(refPatchRowLic[iX] - tarPatchRow[iX] - leftMeanDiff);
+#else
         intermediate = refPatchRow[iX] - tarPatchRow[iX];
         diffSumSad += abs(intermediate);
         diffSumMrsad += abs(intermediate - leftMeanDiff);
+#endif
       }
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
       tempDiff1 = (abs(refPatchRow[TMP_TEMPLATE_SIZE - 1] - tarPatchRow[TMP_TEMPLATE_SIZE - 1]))*((1<<TMP_TEMPLATE_COST_SHIFT) - 1);
       diffSumSad += tempDiff1;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      tempDiff1 = (abs(refPatchRowLic[TMP_TEMPLATE_SIZE - 1] - tarPatchRow[TMP_TEMPLATE_SIZE - 1] - leftMeanDiff))*((1<<TMP_TEMPLATE_COST_SHIFT) - 1);
+#else
       tempDiff1 = (abs(refPatchRow[TMP_TEMPLATE_SIZE - 1] - tarPatchRow[TMP_TEMPLATE_SIZE - 1] - leftMeanDiff))*((1<<TMP_TEMPLATE_COST_SHIFT) - 1);
+#endif
       diffSumMrsad += tempDiff1;  
 #endif
       if (diffSumSad > iMaxSad[0] && diffSumMrsad > iMaxMrsad[0])
       {
         break;
       }
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      tarPatchRow += tarStride;
+#endif
     }
   }
   diffSad[0] = diffSumSad;
@@ -17956,12 +18772,21 @@ void IntraPrediction::calcTemplateDiffJointSadMrsad(const Pel* const ref, const
 #endif
 
 #if JVET_AD0086_ENHANCED_INTRA_TMP
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+void IntraPrediction::calcTemplateDiff(Pel* ref, unsigned int uiStride, Pel* tarPatch, int tarStride, unsigned int uiPatchWidth,
+  unsigned int uiPatchHeight, int* diff, int* iMax, RefTemplateType tempType, int requiredTemplate
+#if JVET_AG0136_INTRA_TMP_LIC
+  , const bool isMrSad, const int log2SizeTop, const int log2SizeLeft, const int sizeTopLeft, const int topTargetMean, const int leftTargetMean
+#endif
+)
+#else
 void IntraPrediction::calcTemplateDiff(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsigned int uiPatchWidth,
                                        unsigned int uiPatchHeight, int *diff, int *iMax, RefTemplateType tempType, int requiredTemplate
 #if JVET_AG0136_INTRA_TMP_LIC
                                        , const bool isMrSad, const int log2SizeTop, const int log2SizeLeft, const int sizeTopLeft, const int topTargetMean, const int leftTargetMean
 #endif
                                        )
+#endif
 {
   int diffSum = 0;
   int topDiff  = MAX_INT;
@@ -17983,7 +18808,11 @@ void IntraPrediction::calcTemplateDiff(Pel *ref, unsigned int uiStride, Pel **ta
 #else
   Pel *refPatchRow = ref - TMP_TEMPLATE_SIZE * uiStride - TMP_TEMPLATE_SIZE;
 #endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  const Pel* tarPatchRow = tarPatch;
+#else
   Pel *tarPatchRow;
+#endif
 
 #if JVET_AG0136_INTRA_TMP_LIC
   int topMeanDiff = 0;
@@ -18055,8 +18884,10 @@ void IntraPrediction::calcTemplateDiff(Pel *ref, unsigned int uiStride, Pel **ta
 #endif
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
   int tempDiff1 = 0;
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
   int tempDiff2 = 0;
 #endif
+#endif
 #if JVET_W0069_TMP_BOUNDARY
   if (tempType == L_SHAPE_TEMPLATE)
   {
@@ -18070,12 +18901,28 @@ void IntraPrediction::calcTemplateDiff(Pel *ref, unsigned int uiStride, Pel **ta
     {
       for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++)
       {
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
         tarPatchRow = tarPatch[iY];
+#endif
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
         tempDiff1 = 0;
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
         tempDiff2 = 0;
 #endif
+#endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        for (int iX = 0; iX < TMP_TEMPLATE_SIZE; iX++)
+        {
+          diffSum += abs(refPatchRow[iX] - tarPatchRow[iX]
+#if JVET_AG0136_INTRA_TMP_LIC
+                         - topMeanDiff
+#endif
+                         );
+        }
+        for (int iX = TMP_TEMPLATE_SIZE; iX < uiPatchWidth; iX++)
+#else
         for (int iX = 0; iX < uiPatchWidth; iX++)
+#endif
         {
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
           tempDiff1 += abs(refPatchRow[iX] - tarPatchRow[iX]
@@ -18087,6 +18934,7 @@ void IntraPrediction::calcTemplateDiff(Pel *ref, unsigned int uiStride, Pel **ta
 #endif
                          );
 
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
           if (iX >= TMP_TEMPLATE_SIZE)
           {
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
@@ -18099,15 +18947,22 @@ void IntraPrediction::calcTemplateDiff(Pel *ref, unsigned int uiStride, Pel **ta
 #endif
                            );
           }
+#endif
         }
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
         if (iY == (TMP_TEMPLATE_SIZE - 1))
         {
           tempDiff1 <<= TMP_TEMPLATE_COST_SHIFT;
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
           tempDiff2 <<= TMP_TEMPLATE_COST_SHIFT;
+#endif
         }
         diffSum += tempDiff1;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        topDiff += tempDiff1;
+#else
         topDiff += tempDiff2;
+#endif
 #endif
 
         if (diffSum > iMax[0] && topDiff > iMax[1])
@@ -18116,14 +18971,22 @@ void IntraPrediction::calcTemplateDiff(Pel *ref, unsigned int uiStride, Pel **ta
         }
 
         refPatchRow += uiStride;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        tarPatchRow += tarStride;
+#endif
       }
 
       refPatchRow = ref - TMP_TEMPLATE_SIZE;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      tarPatchRow = tarPatch + TMP_TEMPLATE_SIZE * tarStride;
+#endif
 
       // vertical difference
       for (int iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++)
       {
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
         tarPatchRow = tarPatch[iY];
+#endif
         for (int iX = 0; iX < TMP_TEMPLATE_SIZE; iX++)
         {
           diffSum += abs(refPatchRow[iX] - tarPatchRow[iX]
@@ -18153,17 +19016,34 @@ void IntraPrediction::calcTemplateDiff(Pel *ref, unsigned int uiStride, Pel **ta
         }
 
         refPatchRow += uiStride;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        tarPatchRow += tarStride;
+#endif
       }
     }
     else if (requiredTemplate == 0)//TL
     {
       for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++)
       {
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
         tarPatchRow = tarPatch[iY];
+#endif
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
         tempDiff1 = 0;
 #endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        for (int iX = 0; iX < TMP_TEMPLATE_SIZE; iX++)
+        {
+          diffSum += abs(refPatchRow[iX] - tarPatchRow[iX]
+#if JVET_AG0136_INTRA_TMP_LIC
+                         - topMeanDiff
+#endif
+                         );
+        }
+        for (int iX = TMP_TEMPLATE_SIZE; iX < uiPatchWidth; iX++)
+#else
         for (int iX = 0; iX < uiPatchWidth; iX++)
+#endif
         {
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
           tempDiff1 += abs(refPatchRow[iX] - tarPatchRow[iX]
@@ -18189,14 +19069,22 @@ void IntraPrediction::calcTemplateDiff(Pel *ref, unsigned int uiStride, Pel **ta
         }
 
         refPatchRow += uiStride;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        tarPatchRow += tarStride;
+#endif
       }
 
       refPatchRow = ref - TMP_TEMPLATE_SIZE;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      tarPatchRow = tarPatch + TMP_TEMPLATE_SIZE * tarStride;
+#endif
 
       // vertical difference
       for (int iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++)
       {
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
         tarPatchRow = tarPatch[iY];
+#endif
         for (int iX = 0; iX < TMP_TEMPLATE_SIZE; iX++)
         {
           diffSum += abs(refPatchRow[iX] - tarPatchRow[iX]
@@ -18220,13 +19108,18 @@ void IntraPrediction::calcTemplateDiff(Pel *ref, unsigned int uiStride, Pel **ta
         }
 
         refPatchRow += uiStride;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        tarPatchRow += tarStride;
+#endif
       }
     }
     else if(requiredTemplate == 1) //T  
     {
       for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++)
       {
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
         tarPatchRow = tarPatch[iY];
+#endif
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
         tempDiff1 = 0;
 #endif
@@ -18256,16 +19149,24 @@ void IntraPrediction::calcTemplateDiff(Pel *ref, unsigned int uiStride, Pel **ta
         }
 
         refPatchRow += uiStride;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        tarPatchRow += tarStride;
+#endif
       }
     }
     else // L
     {
       refPatchRow = ref - TMP_TEMPLATE_SIZE;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      tarPatchRow = tarPatch + TMP_TEMPLATE_SIZE * tarStride;
+#endif
 
       // vertical difference
       for (int iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++)
       {
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
         tarPatchRow = tarPatch[iY];
+#endif
         for (int iX = 0; iX < TMP_TEMPLATE_SIZE; iX++)
         {
           leftDiff += abs(refPatchRow[iX] - tarPatchRow[iX]
@@ -18289,6 +19190,9 @@ void IntraPrediction::calcTemplateDiff(Pel *ref, unsigned int uiStride, Pel **ta
         }
 
         refPatchRow += uiStride;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        tarPatchRow += tarStride;
+#endif
       }
     }
 #if JVET_W0069_TMP_BOUNDARY
@@ -18298,7 +19202,9 @@ void IntraPrediction::calcTemplateDiff(Pel *ref, unsigned int uiStride, Pel **ta
     // top  template difference
     for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++)
     {
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
       tarPatchRow = tarPatch[iY];
+#endif
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
       tempDiff1 = 0;
 #endif
@@ -18326,6 +19232,9 @@ void IntraPrediction::calcTemplateDiff(Pel *ref, unsigned int uiStride, Pel **ta
         break;
       }
       refPatchRow += uiStride;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      tarPatchRow += tarStride;
+#endif
     }
   }
   else if (tempType == LEFT_TEMPLATE)
@@ -18333,7 +19242,9 @@ void IntraPrediction::calcTemplateDiff(Pel *ref, unsigned int uiStride, Pel **ta
     // left template difference
     for (int iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++)
     {
+#if !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
       tarPatchRow = tarPatch[iY];
+#endif
       for (int iX = 0; iX < TMP_TEMPLATE_SIZE; iX++)
       {
         diffSum += abs(refPatchRow[iX] - tarPatchRow[iX]
@@ -18355,6 +19266,9 @@ void IntraPrediction::calcTemplateDiff(Pel *ref, unsigned int uiStride, Pel **ta
         break;
       }
       refPatchRow += uiStride;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      tarPatchRow += tarStride;
+#endif
     }
   }
 #endif
diff --git a/source/Lib/CommonLib/IntraPrediction.h b/source/Lib/CommonLib/IntraPrediction.h
index faa92c10712db077ca31014e55cde2223211bf93..3f76cef5acd76781b55ba31c2626e22e455c5dd1 100644
--- a/source/Lib/CommonLib/IntraPrediction.h
+++ b/source/Lib/CommonLib/IntraPrediction.h
@@ -78,6 +78,12 @@ class TempLibFast
 public:
   int   m_pX;           //offset X
   int   m_pY;           //offset Y
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  Mv    m_windTL;       // TL offset of refine window
+  Mv    m_windBR;       // BR offset of refine window
+  bool m_isTransferredLeft;
+  bool m_isTransferredTop;
+#endif
 #if JVET_AD0086_ENHANCED_INTRA_TMP
   int   m_rId;
 #else
@@ -88,10 +94,20 @@ public:
   TempLibFast();
   ~TempLibFast();
 #if JVET_AD0086_ENHANCED_INTRA_TMP
-  TempLibFast(const int pX, const int pY, const int rId)
+  TempLibFast(const int pX, const int pY
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    , const Mv windTL, const Mv windBR, const bool isTransferredLeft, const bool isTransferredTop
+#endif
+    , const int rId)
   {
     m_pX = pX;
     m_pY = pY;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    m_windTL = windTL;
+    m_windBR = windBR;
+    m_isTransferredLeft = isTransferredLeft;
+    m_isTransferredTop = isTransferredTop;
+#endif
     m_rId = rId;
   };
 #endif
@@ -341,6 +357,9 @@ private:
   static const uint8_t m_aucIntraFilterExt[MAX_INTRA_FILTER_DEPTHS];
   RdCost* m_timdSatdCost;
 #endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  RdCost* m_itmpSatdCost;
+#endif
 #if JVET_AC0071_DBV
   RdCost *m_dbvSadCost;
 #endif
@@ -474,8 +493,12 @@ protected:
   Picture*     m_refPicBuf;
   unsigned int m_uiPicStride;
   unsigned int m_uiVaildCandiNum;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  Pel* m_pppTarPatch;
+#else
   Pel***       m_pppTarPatch;
 #endif
+#endif
 
 #if TMP_FAST_ENC
 #if JVET_AD0086_ENHANCED_INTRA_TMP
@@ -1073,16 +1096,34 @@ public:
 #if JVET_W0069_TMP_BOUNDARY
 #if JVET_AD0086_ENHANCED_INTRA_TMP
 #if JVET_AG0136_INTRA_TMP_LIC
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  void (*m_calcTemplateDiffJointSadMrsad) (const Pel* const ref, const unsigned int uiStride, Pel* tarPatch, int tarStride, const unsigned int uiPatchWidth, const unsigned int uiPatchHeight, int* diffSad, int* diffMrsad, int* iMaxSad, int* iMaxMrsad, const RefTemplateType tempType, const int log2SizeTop, const int log2SizeLeft, const int sizeTopLeft, const int topTargetMean, const int leftTargetMean, const int licShift);
+  void(*m_calcTargetMean)           (Pel* tarPatch, int tarStride, const unsigned int uiPatchWidth, const unsigned int uiPatchHeight, const RefTemplateType tempType, const int requiredTemplate, const int log2SizeTop, const int log2SizeLeft, const int sizeTopLeft, int& topTargetMean, int& leftTargetMean);
+  static void calcTemplateDiffJointSadMrsad(const Pel* const ref, const unsigned int uiStride, Pel* tarPatch, int tarStride, const unsigned int uiPatchWidth, const unsigned int uiPatchHeight, int* diffSad, int* diffMrsad, int* iMaxSad, int* iMaxMrsad, const RefTemplateType tempType, const int log2SizeTop, const int log2SizeLeft, const int sizeTopLeft, const int topTargetMean, const int leftTargetMean, const int licShift);
+  static void calcTargetMean(Pel* tarPatch, int tarStride, const unsigned int uiPatchWidth, const unsigned int uiPatchHeight, const RefTemplateType tempType, const int requiredTemplate, const int log2SizeTop, const int log2SizeLeft, const int sizeTopLeft, int& topTargetMean, int& leftTargetMean);
+#else
   void (*m_calcTemplateDiffJointSadMrsad) (const Pel* const ref, const unsigned int uiStride, Pel** tarPatch, const unsigned int uiPatchWidth, const unsigned int uiPatchHeight, int* diffSad, int* diffMrsad, int* iMaxSad, int* iMaxMrsad, const RefTemplateType tempType, const int log2SizeTop, const int log2SizeLeft, const int sizeTopLeft, const int topTargetMean, const int leftTargetMean);
   void(*m_calcTargetMean)           (Pel** tarPatch, const unsigned int uiPatchWidth, const unsigned int uiPatchHeight, const RefTemplateType tempType, const int requiredTemplate, const int log2SizeTop, const int log2SizeLeft, const int sizeTopLeft, int& topTargetMean, int& leftTargetMean);
   static void calcTemplateDiffJointSadMrsad(const Pel* const ref, const unsigned int uiStride, Pel** tarPatch, const unsigned int uiPatchWidth, const unsigned int uiPatchHeight, int* diffSad, int* diffMrsad, int* iMaxSad, int* iMaxMrsad, const RefTemplateType tempType, const int log2SizeTop, const int log2SizeLeft, const int sizeTopLeft, const int topTargetMean, const int leftTargetMean);
   static void calcTargetMean(Pel** tarPatch, const unsigned int uiPatchWidth, const unsigned int uiPatchHeight, const RefTemplateType tempType, const int requiredTemplate, const int log2SizeTop, const int log2SizeLeft, const int sizeTopLeft, int& topTargetMean, int& leftTargetMean);
+#endif // JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
 #endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  void(*m_calcTemplateDiff)      (Pel* ref, unsigned int uiStride, Pel* tarPatch, int tarStride, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int* diff, int* iMax, RefTemplateType TempType, int requiredTemplate
+#else
   void(*m_calcTemplateDiff)      (Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int *diff, int *iMax, RefTemplateType TempType, int requiredTemplate
+#endif
 #if JVET_AG0136_INTRA_TMP_LIC
                                   , const bool isMrSad, const int log2SizeTop, const int log2SizeLeft, const int sizeTopLeft, const int topTargetMean, const int leftTargetMean
 #endif
                                   );
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  static void calcTemplateDiff(Pel* ref, unsigned int uiStride, Pel* tarPatch, int tarStride, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int* diff, int* iMax, RefTemplateType TempType, int requiredTemplate
+#if JVET_AG0136_INTRA_TMP_LIC
+                               , const bool isMrSad, const int log2SizeTop, const int log2SizeLeft, const int sizeTopLeft, const int topTargetMean, const int leftTargetMean
+#endif
+                               );
+#else
   static void calcTemplateDiff(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsigned int uiPatchWidth,
                                unsigned int uiPatchHeight, int *diff, int *iMax, RefTemplateType TempType,
                                int requiredTemplate
@@ -1090,6 +1131,7 @@ public:
                                , const bool isMrSad, const int log2SizeTop, const int log2SizeLeft, const int sizeTopLeft, const int topTargetMean, const int leftTargetMean
 #endif
                                );
+#endif
 #else
   int( *m_calcTemplateDiff )      ( Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax, RefTemplateType TempType );
   static int calcTemplateDiff     ( Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax, RefTemplateType TempType );
@@ -1098,7 +1140,11 @@ public:
   int( *m_calcTemplateDiff )      (Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax);
   static int calcTemplateDiff     ( Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax );
 #endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  Pel* getTargetPatch() { return m_pppTarPatch; }
+#else
   Pel** getTargetPatch            ( unsigned int uiDepth )      { return m_pppTarPatch[uiDepth]; }
+#endif
   Pel* getRefPicUsed              ()                            { return m_refPicUsed;           }
   void setRefPicUsed              ( Pel* ref )                  { m_refPicUsed = ref;            }
   unsigned int getStride          ()                            { return m_uiPicStride;          }
@@ -1106,7 +1152,11 @@ public:
 
 #if JVET_W0069_TMP_BOUNDARY
   RefTemplateType getRefTemplateType ( CodingUnit& cu, CompArea& area );
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  void searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel* tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, RefTemplateType tempType
+#else
   void searchCandidateFromOnePicIntra( CodingUnit* pcCU, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, RefTemplateType tempType
+#endif
 #if JVET_AG0136_INTRA_TMP_LIC
                                       , const bool useMR
 #endif
@@ -1122,7 +1172,11 @@ public:
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
   void xPadForFracSearchInterpolation (CodingUnit* pcCU, RefTemplateType tempType);
   void xTmpFracSearchIF(PredictionUnit& pu, Pel* padbf0, unsigned int padStride, Pel* preTmpbf0, unsigned int predTempStride, Pel* tmp0, unsigned int tmpStride, int extUiWidth, int extUiHeight, int fracPrec, int fracDir);
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  void searchFracCandidate(CodingUnit* pcCU, Pel* tarPatch, RefTemplateType tempType);
+#else
   void searchFracCandidate( CodingUnit* pcCU, Pel** tarPatch, RefTemplateType tempType);
+#endif
   InterPrediction *m_pcInterPred;
   void setInterPrediction( InterPrediction *inter);
 #endif
diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h
index 50cfa23e055a7ba75701f011ebe7a9694608724f..85355e9b760013d6533410ba49a4e2f309497651 100644
--- a/source/Lib/CommonLib/TypeDef.h
+++ b/source/Lib/CommonLib/TypeDef.h
@@ -143,6 +143,7 @@
 #define JVET_AG0146_DIMD_ITMP_IBC                         1 // JVET-AG0146: DIMD with Intra TMP and IBC
 #define JVET_AH0055_INTRA_TMP_ARBVP                       1 // JVET-AH0055: AR-BVP for intra TMP merge candidates
 #define JVET_AH0200_INTRA_TMP_BV_REORDER                  1 // JVET-AH0200: Intra TMP BV reordering
+#define JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT      1 // JVET-AI0129: Enhanced intra TMP candidates with overlapping refinement windows
 #endif
 
 #define JVET_W0123_TIMD_FUSION                            1 // JVET-W0123: Template based intra mode derivation and fusion
diff --git a/source/Lib/CommonLib/UnitTools.cpp b/source/Lib/CommonLib/UnitTools.cpp
index 417e2bd8d461f0b0919c8ba6eca93e63d60db0ae..85a8389bba83137c858eaae75ff315caa742bcbf 100644
--- a/source/Lib/CommonLib/UnitTools.cpp
+++ b/source/Lib/CommonLib/UnitTools.cpp
@@ -1454,6 +1454,99 @@ void getNeighBv(const PredictionUnit& puOrg, const PredictionUnit* pu, std::vect
   }
 }
 
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+/** Extends the BV list with vectors cascaded through the sparse intra-TMP candidates.
+ *
+ *  Each sparse candidate contributes an offset (m_pX, m_pY). The PUs covering five probe positions of the
+ *  current luma block (centre and the four corners), shifted by that offset, are looked up; PUs that are
+ *  neither IBC-coded nor TMP-coded are ignored. For a usable PU the cascaded vector (offset + PU vector)
+ *  and then the PU vector itself are offered to the list. A second PU vector is offered the same way when
+ *  the PU is IBC with interDir == 3, or TMP with tmpIdx > 0 (and, under JVET_AG0136_INTRA_TMP_LIC, without
+ *  tmpLicFlag).
+ *
+ *  \param pu                  current prediction unit
+ *  \param pBvs                in/out BV list; at most NUM_TMP_ARBVP_S entries are appended
+ *  \param sparseMtmpCandList  sparse TMP candidates supplying the offsets (only read here)
+ */
+void PU::getSparseArBvMergeCandidate(const PredictionUnit& pu, std::vector<Mv>& pBvs, static_vector<TempLibFast, MTMP_NUM_SPARSE> &sparseMtmpCandList)
+{
+  // Unsigned limit: compared against std::vector::size() without a signed/unsigned conversion.
+  const size_t   maxNumBvs = pBvs.size() + NUM_TMP_ARBVP_S;
+  const int      numSparse = static_cast<int>(sparseMtmpCandList.size());
+  const Position posCand[] = { pu.Y().center(), pu.Y().topLeft(), pu.Y().topRight(), pu.Y().bottomLeft(), pu.Y().bottomRight() };
+  const int      numPos    = static_cast<int>(sizeof(posCand) / sizeof(posCand[0]));
+
+  // Offers one vector: appended only when PU::validItmpBv() accepts it and PU::CheckBvAvailable()
+  // returns false for it (presumably "not listed yet" - confirm). Returns true once the list is full.
+  auto offerBv = [&pu, &pBvs, maxNumBvs](const Mv& cand) -> bool
+  {
+    if (PU::validItmpBv(pu, cand.hor, cand.ver) && !PU::CheckBvAvailable(pBvs, cand))
+    {
+      pBvs.push_back(cand);
+    }
+    return pBvs.size() >= maxNumBvs;
+  };
+
+  for (int mergeIndex = 0; mergeIndex < numSparse && pBvs.size() < maxNumBvs; mergeIndex++)
+  {
+    const int offsetX = sparseMtmpCandList[mergeIndex].m_pX;
+    const int offsetY = sparseMtmpCandList[mergeIndex].m_pY;
+    const Mv  cMv(offsetX, offsetY);
+
+    for (int n = 0; n < numPos && pBvs.size() < maxNumBvs; n++)
+    {
+      const PredictionUnit* puCascaded = pu.cs->getPURestricted(posCand[n].offset(offsetX, offsetY), pu, pu.chType);
+      if (!puCascaded || ((puCascaded->cu->predMode != MODE_IBC) && (!puCascaded->cu->tmpFlag)))
+      {
+        continue;
+      }
+
+      // First vector of the cascaded PU: cascaded form first, then the vector on its own.
+      if (offerBv(cMv + puCascaded->bv))
+      {
+        break;
+      }
+      if (offerBv(puCascaded->bv))
+      {
+        break;
+      }
+
+      const bool hasSecondBv = (puCascaded->cu->predMode == MODE_IBC && puCascaded->interDir == 3)
+                               || (puCascaded->cu->tmpFlag && puCascaded->cu->tmpIdx > 0
+#if JVET_AG0136_INTRA_TMP_LIC
+                                   && !puCascaded->cu->tmpLicFlag
+#endif
+                                   );
+      if (!hasSecondBv)
+      {
+        continue;
+      }
+
+      // Second vector: list-1 MV (converted to integer precision) for IBC, TMP displacement otherwise.
+      Mv bv;
+      if (puCascaded->cu->predMode == MODE_IBC)
+      {
+        bv = puCascaded->mv[REF_PIC_LIST_1];
+        bv.changePrecision(MV_PRECISION_INTERNAL, MV_PRECISION_INT);
+      }
+      else
+      {
+        bv = Mv(puCascaded->cu->tmpXdisp, puCascaded->cu->tmpYdisp);
+      }
+
+      if (offerBv(cMv + bv))
+      {
+        break;
+      }
+      if (offerBv(bv))
+      {
+        break;
+      }
+    }
+  }
+}
+#endif
+
 int PU::getItmpMergeCandidate(const PredictionUnit& pu, std::vector<Mv>& pBvs
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
     , std::vector<Mv>& pSgpmMvs
diff --git a/source/Lib/CommonLib/UnitTools.h b/source/Lib/CommonLib/UnitTools.h
index 1ec48889760a33f17d6c6e610a2af99e19edee3f..2bbce580a714faa0232172417ae4245e5e48ccc7 100644
--- a/source/Lib/CommonLib/UnitTools.h
+++ b/source/Lib/CommonLib/UnitTools.h
@@ -213,6 +213,9 @@ namespace PU
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
   bool validIBCItmpMv(const PredictionUnit& pu, Mv curMv, int templateSize);
 #endif
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  void  getSparseArBvMergeCandidate(const PredictionUnit& pu, std::vector<Mv>& pBvs, static_vector<TempLibFast, MTMP_NUM_SPARSE>& sparseMtmpCandList);
+#endif
 #endif
 #if JVET_AD0184_REMOVAL_OF_DIVISION_OPERATIONS
   int getMeanValue(int sum, int div);
diff --git a/source/Lib/CommonLib/x86/IntraX86.h b/source/Lib/CommonLib/x86/IntraX86.h
index b3302bbdbe474a69d383d421b5b5a3b81d871c5a..96cf4777fabc9498e0d140178804744a00a5ef90 100644
--- a/source/Lib/CommonLib/x86/IntraX86.h
+++ b/source/Lib/CommonLib/x86/IntraX86.h
@@ -164,11 +164,25 @@ inline int summation16(const short* pSrc, const int start, const int end)
 #endif
 
 template<X86_VEXT vext>
-inline void calcDiffDelta4Joint(const short* const pSrc1, const short* const pSrc2, const __m128i delta, const int start, const int end, int& sad, int& mrsad)
+inline void calcDiffDelta4Joint(const short* const pSrc1, const short* const pSrc2, const __m128i delta, const int start, const int end, int& sad, int& mrsad
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+                                , const int licShift
+#endif
+                                )
 {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  const __m128i referenceVec = _mm_loadl_epi64((const __m128i*) &pSrc2[start]);
+  const __m128i difference = _mm_sub_epi16(_mm_loadl_epi64((const __m128i*) &pSrc1[start]), referenceVec);
+  const __m128i differenceLic = _mm_sub_epi16(_mm_loadl_epi64((const __m128i*) &pSrc1[start + licShift]), referenceVec);
+#else
   const __m128i difference = _mm_sub_epi16(_mm_loadl_epi64((const __m128i*) &pSrc1[start]), _mm_loadl_epi64((const __m128i*) &pSrc2[start]));
+#endif
   const __m128i vsumSad16 = _mm_abs_epi16(difference);
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  const __m128i vsumMrsad16 = _mm_abs_epi16(_mm_sub_epi16(differenceLic, delta));
+#else
   const __m128i vsumMrsad16 = _mm_abs_epi16(_mm_sub_epi16(difference, delta));
+#endif
   const __m128i vzero = _mm_setzero_si128();
   __m128i vsumSad32 = _mm_unpacklo_epi16(vsumSad16, vzero);
   vsumSad32 = _mm_add_epi32(vsumSad32, _mm_shuffle_epi32(vsumSad32, 0x4e));
@@ -192,12 +206,26 @@ inline int calcDiffDelta4(const short* const pSrc1, const short* const pSrc2, co
 }
 
 template<X86_VEXT vext>
-inline void calcDiffDelta8Joint(const short* const pSrc1, const short* const pSrc2, const __m128i delta, const int start, const int end, int& sad, int& mrsad)
+inline void calcDiffDelta8Joint(const short* const pSrc1, const short* const pSrc2, const __m128i delta, const int start, const int end, int& sad, int& mrsad
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+                                , const int licShift
+#endif
+                                )
 {
 #if USE_AVX2
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  const __m128i referenceVec = _mm_lddqu_si128((const __m128i*) &pSrc2[start]);
+  const __m128i difference = _mm_sub_epi16(_mm_loadu_si128((const __m128i*) &pSrc1[start]), referenceVec);
+  const __m128i differenceLic = _mm_sub_epi16(_mm_loadu_si128((const __m128i*) &pSrc1[start + licShift]), referenceVec);
+#else
   const __m128i difference = _mm_sub_epi16(_mm_loadu_si128((const __m128i*) &pSrc1[start]), _mm_lddqu_si128((const __m128i*) &pSrc2[start]));
+#endif
   const __m128i vsumSad16 = _mm_abs_epi16(difference);
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  const __m128i vsumMrsad16 = _mm_abs_epi16(_mm_sub_epi16(differenceLic, delta));
+#else
   const __m128i vsumMrsad16 = _mm_abs_epi16(_mm_sub_epi16(difference, delta));
+#endif
   const __m128i vzero = _mm_setzero_si128();
 #else
   const __m128i vzero = _mm_setzero_si128();
@@ -207,9 +235,17 @@ inline void calcDiffDelta8Joint(const short* const pSrc1, const short* const pSr
   {
     const __m128i vsrc1 = _mm_loadu_si128((const __m128i*) &pSrc1[iX]);
     const __m128i vsrc2 = _mm_lddqu_si128((const __m128i*) &pSrc2[iX]);
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    const __m128i vsrcLic1 = _mm_loadu_si128((const __m128i*) & pSrc1[iX + licShift]);
+    const __m128i differenceLic = _mm_sub_epi16(vsrcLic1, vsrc2);
+#endif
     const __m128i difference = _mm_sub_epi16(vsrc1, vsrc2);
     vsumSad16 = _mm_add_epi16(vsumSad16, _mm_abs_epi16(difference));
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    vsumMrsad16 = _mm_add_epi16(vsumMrsad16, _mm_abs_epi16(_mm_sub_epi16(differenceLic, delta)));
+#else
     vsumMrsad16 = _mm_add_epi16(vsumMrsad16, _mm_abs_epi16(_mm_sub_epi16(difference, delta)));
+#endif
   }
 #endif
   __m128i vsumSad32 = _mm_add_epi32(_mm_unpacklo_epi16(vsumSad16, vzero), _mm_unpackhi_epi16(vsumSad16, vzero));
@@ -246,7 +282,11 @@ inline int calcDiffDelta8(const short* const pSrc1, const short* const pSrc2, co
 
 #if USE_AVX2
 template<X86_VEXT vext>
-inline void calcDiffDelta16Joint(const short* pSrc1, const short* pSrc2, const __m256i delta, const int start, const int end, int& sad, int& mrsad)
+inline void calcDiffDelta16Joint(const short* pSrc1, const short* pSrc2, const __m256i delta, const int start, const int end, int& sad, int& mrsad
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+                                 , const int licShift
+#endif
+                                 )
 {
   const __m256i vzero = _mm256_setzero_si256();
   __m256i vsumSad16 = vzero;
@@ -254,10 +294,20 @@ inline void calcDiffDelta16Joint(const short* pSrc1, const short* pSrc2, const _
   for (int iX = start; iX < end; iX += 16)
   {
     const __m256i vsrc1 = _mm256_loadu_si256((const __m256i*) &pSrc1[iX]);
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    const __m256i vsrcLic1 = _mm256_loadu_si256((const __m256i*) & pSrc1[iX + licShift]);
+#endif
     const __m256i vsrc2 = _mm256_lddqu_si256((const __m256i*) &pSrc2[iX]);
     const __m256i difference = _mm256_sub_epi16(vsrc1, vsrc2);
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    const __m256i differenceLic = _mm256_sub_epi16(vsrcLic1, vsrc2);
+#endif
     vsumSad16 = _mm256_add_epi16(vsumSad16, _mm256_abs_epi16(difference));
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    vsumMrsad16 = _mm256_add_epi16(vsumMrsad16, _mm256_abs_epi16(_mm256_sub_epi16(differenceLic, delta)));
+#else
     vsumMrsad16 = _mm256_add_epi16(vsumMrsad16, _mm256_abs_epi16(_mm256_sub_epi16(difference, delta)));
+#endif
   }
   __m256i vsumSad32 = _mm256_add_epi32(_mm256_unpacklo_epi16(vsumSad16, vzero), _mm256_unpackhi_epi16(vsumSad16, vzero));
   vsumSad32 = _mm256_add_epi32(vsumSad32, _mm256_shuffle_epi32(vsumSad32, 0x4e));
@@ -288,10 +338,17 @@ inline int calcDiffDelta16(const short* pSrc1, const short* pSrc2, const __m256i
 #endif
 
 template<X86_VEXT vext>
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+void calcTargetMeanSIMD(Pel* tarPatch, int tarStride, const unsigned int uiPatchWidth, const unsigned int uiPatchHeight, const RefTemplateType tempType, const int requiredTemplate, const int log2SizeTop, const int log2SizeLeft, const int sizeTopLeft, int& topTargetMean, int& leftTargetMean)
+#else
 void calcTargetMeanSIMD(Pel** tarPatch, const unsigned int uiPatchWidth, const unsigned int uiPatchHeight, const RefTemplateType tempType, const int requiredTemplate, const int log2SizeTop, const int log2SizeLeft, const int sizeTopLeft, int& topTargetMean, int& leftTargetMean)
+#endif
 {
   topTargetMean = 0;
   leftTargetMean = 0;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  const Pel* tmpBuf = tarPatch;
+#endif
   if (tempType == L_SHAPE_TEMPLATE)
   {
     if (requiredTemplate == 3 || requiredTemplate == 0 || requiredTemplate == 1)
@@ -301,7 +358,12 @@ void calcTargetMeanSIMD(Pel** tarPatch, const unsigned int uiPatchWidth, const u
       {
         for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++)
         {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          topTargetMean += summation16<vext>((const short*)tmpBuf, TMP_TEMPLATE_SIZE, uiPatchWidth);
+          tmpBuf += tarStride;
+#else
           topTargetMean += summation16<vext>((const short*) tarPatch[iY], TMP_TEMPLATE_SIZE, uiPatchWidth);
+#endif
         }
       }
       else if (((uiPatchWidth - TMP_TEMPLATE_SIZE) & 7) == 0)
@@ -311,14 +373,24 @@ void calcTargetMeanSIMD(Pel** tarPatch, const unsigned int uiPatchWidth, const u
       {
         for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++)
         {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          topTargetMean += summation8<vext>((const short*)tmpBuf, TMP_TEMPLATE_SIZE, uiPatchWidth);
+          tmpBuf += tarStride;
+#else
           topTargetMean += summation8<vext>((const short*) tarPatch[iY], TMP_TEMPLATE_SIZE, uiPatchWidth);
+#endif
         }
       }
       else if (uiPatchWidth == 8)
       {
         for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++)
         {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          topTargetMean += summation4<vext>((const short*)tmpBuf, TMP_TEMPLATE_SIZE, uiPatchWidth);
+          tmpBuf += tarStride;
+#else
           topTargetMean += summation4<vext>((const short*) tarPatch[iY], TMP_TEMPLATE_SIZE, uiPatchWidth);
+#endif
         }
       }
       topTargetMean >>= log2SizeTop;
@@ -327,7 +399,12 @@ void calcTargetMeanSIMD(Pel** tarPatch, const unsigned int uiPatchWidth, const u
     {
       for (int iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++)
       {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        leftTargetMean += summation4<vext>((const short*)tmpBuf, 0, TMP_TEMPLATE_SIZE);
+        tmpBuf += tarStride;
+#else
         leftTargetMean += summation4<vext>((const short*) tarPatch[iY], 0, TMP_TEMPLATE_SIZE);
+#endif
       }
       leftTargetMean >>= log2SizeLeft;
     }
@@ -339,7 +416,12 @@ void calcTargetMeanSIMD(Pel** tarPatch, const unsigned int uiPatchWidth, const u
     {
       for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++)
       {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        topTargetMean += summation16<vext>((const short*)tmpBuf, 0, uiPatchWidth - TMP_TEMPLATE_SIZE);
+        tmpBuf += tarStride;
+#else
         topTargetMean += summation16<vext>((const short*) tarPatch[iY], 0, uiPatchWidth - TMP_TEMPLATE_SIZE);
+#endif
       }
     }
     else if (((uiPatchWidth - TMP_TEMPLATE_SIZE) & 7) == 0)
@@ -349,14 +431,24 @@ void calcTargetMeanSIMD(Pel** tarPatch, const unsigned int uiPatchWidth, const u
     {
       for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++)
       {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        topTargetMean += summation8<vext>((const short*)tmpBuf, 0, uiPatchWidth - TMP_TEMPLATE_SIZE);
+        tmpBuf += tarStride;
+#else
         topTargetMean += summation8<vext>((const short*) tarPatch[iY], 0, uiPatchWidth - TMP_TEMPLATE_SIZE);
+#endif
       }
     }
     else if (uiPatchWidth == 8)
     {
       for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++)
       {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        topTargetMean += summation4<vext>((const short*)tmpBuf, 0, uiPatchWidth - TMP_TEMPLATE_SIZE);
+        tmpBuf += tarStride;
+#else
         topTargetMean += summation4<vext>((const short*) tarPatch[iY], 0, uiPatchWidth - TMP_TEMPLATE_SIZE);
+#endif
       }
     }
     topTargetMean >>= log2SizeTop;
@@ -365,7 +457,12 @@ void calcTargetMeanSIMD(Pel** tarPatch, const unsigned int uiPatchWidth, const u
   {
     for (int iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++)
     {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      leftTargetMean += summation4<vext>((const short*)tmpBuf, 0, TMP_TEMPLATE_SIZE);
+      tmpBuf += tarStride;
+#else
       leftTargetMean += summation4<vext>((const short*) tarPatch[iY], 0, TMP_TEMPLATE_SIZE);
+#endif
     }
     leftTargetMean >>= log2SizeLeft;
   }
@@ -385,7 +482,11 @@ inline __m128i calcMeanRefLeftSIMD(const Pel* const ref, const unsigned int uiPa
 }
 
 template<X86_VEXT vext>
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+void calcTemplateDiffJointSadMrsadSIMD(const Pel* const ref, const unsigned int uiStride, Pel* tarPatch, int tarStride, const unsigned int uiPatchWidth, const unsigned int uiPatchHeight, int* diffSad, int* diffMrsad, int* iMaxSad, int* iMaxMrsad, const RefTemplateType tempType, const int log2SizeTop, const int log2SizeLeft, const int sizeTopLeft, const int topTargetMean, const int leftTargetMean, const int licShift)
+#else
 void calcTemplateDiffJointSadMrsadSIMD(const Pel* const ref, const unsigned int uiStride, Pel** tarPatch, const unsigned int uiPatchWidth, const unsigned int uiPatchHeight, int* diffSad, int* diffMrsad, int* iMaxSad, int* iMaxMrsad, const RefTemplateType tempType, const int log2SizeTop, const int log2SizeLeft, const int sizeTopLeft, const int topTargetMean, const int leftTargetMean)
+#endif
 {
   int diffSumSad = 0;
   int diffSumMrsad = 0;
@@ -393,7 +494,14 @@ void calcTemplateDiffJointSadMrsadSIMD(const Pel* const ref, const unsigned int
   int topDiffMrsad = MAX_INT;
   int leftDiffSad = MAX_INT;
   int leftDiffMrsad = MAX_INT;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  const Pel* const refLic = ref + licShift;
+  const Pel* tmpBuf = tarPatch;
+#endif
   const Pel* refPatchRow = tempType == L_SHAPE_TEMPLATE ? ref - TMP_TEMPLATE_SIZE * uiStride - TMP_TEMPLATE_SIZE : (tempType == ABOVE_TEMPLATE ? ref - TMP_TEMPLATE_SIZE * uiStride : ref - TMP_TEMPLATE_SIZE);
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  const Pel* refPatchRowLic = tempType == L_SHAPE_TEMPLATE ? refLic - TMP_TEMPLATE_SIZE * uiStride - TMP_TEMPLATE_SIZE : (tempType == ABOVE_TEMPLATE ? refLic - TMP_TEMPLATE_SIZE * uiStride : refLic - TMP_TEMPLATE_SIZE);
+#endif
   int topMeanRef = 0;
   __m128i topMeanDelta = _mm_setzero_si128();
 #if USE_AVX2
@@ -401,7 +509,11 @@ void calcTemplateDiffJointSadMrsadSIMD(const Pel* const ref, const unsigned int
 #endif
   if (tempType == L_SHAPE_TEMPLATE)
   {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    const Pel* refPatchRowTemp = refPatchRowLic;
+#else
     const Pel* refPatchRowTemp = refPatchRow;
+#endif
 #if USE_AVX2
     if (vext >= AVX2 && ((uiPatchWidth - TMP_TEMPLATE_SIZE) & 15) == 0)
     {
@@ -431,7 +543,11 @@ void calcTemplateDiffJointSadMrsadSIMD(const Pel* const ref, const unsigned int
   }
   else if (tempType == ABOVE_TEMPLATE)
   {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    const Pel* refPatchRowTemp = refPatchRowLic;
+#else
     const Pel* refPatchRowTemp = refPatchRow;
+#endif
 #if USE_AVX2
     if (vext >= AVX2 && ((uiPatchWidth - TMP_TEMPLATE_SIZE) & 15) == 0)
     {
@@ -481,9 +597,18 @@ void calcTemplateDiffJointSadMrsadSIMD(const Pel* const ref, const unsigned int
       for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRowTemp += uiStride)
       {
         const short* const pSrc1 = (const short*) refPatchRowTemp;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        const short* const pSrc2 = (const short*)tmpBuf;
+        tmpBuf += tarStride;
+#else
         const short* const pSrc2 = (const short*) tarPatch[iY];
-        calcDiffDelta4Joint<vext>(pSrc1, pSrc2, topMeanDelta, 0, TMP_TEMPLATE_SIZE, iSumSad, iSumMrsad);
-#if JVET_AH0200_INTRA_TMP_BV_REORDER
+#endif
+        calcDiffDelta4Joint<vext>(pSrc1, pSrc2, topMeanDelta, 0, TMP_TEMPLATE_SIZE, iSumSad, iSumMrsad
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+                                  , licShift
+#endif
+                                  );
+#if JVET_AH0200_INTRA_TMP_BV_REORDER && !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
         if (iY == (TMP_TEMPLATE_SIZE - 1))
         {
           iSumSad <<= TMP_TEMPLATE_COST_SHIFT;
@@ -492,7 +617,11 @@ void calcTemplateDiffJointSadMrsadSIMD(const Pel* const ref, const unsigned int
 #endif
         diffSumSad += iSumSad;
         diffSumMrsad += iSumMrsad;
-        calcDiffDelta16Joint<vext>(pSrc1, pSrc2, topMeanDelta256, TMP_TEMPLATE_SIZE, uiPatchWidth, iSumSad, iSumMrsad);
+        calcDiffDelta16Joint<vext>(pSrc1, pSrc2, topMeanDelta256, TMP_TEMPLATE_SIZE, uiPatchWidth, iSumSad, iSumMrsad
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+                                   , licShift
+#endif
+                                   );
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
         if (iY == (TMP_TEMPLATE_SIZE - 1))
         {
@@ -519,9 +648,18 @@ void calcTemplateDiffJointSadMrsadSIMD(const Pel* const ref, const unsigned int
       for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRowTemp += uiStride)
       {
         const short* const pSrc1 = (const short*) refPatchRowTemp;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        const short* const pSrc2 = (const short*)tmpBuf;
+        tmpBuf += tarStride;
+#else
         const short* const pSrc2 = (const short*) tarPatch[iY];
-        calcDiffDelta4Joint<vext>(pSrc1, pSrc2, topMeanDelta, 0, TMP_TEMPLATE_SIZE, iSumSad, iSumMrsad);
-#if JVET_AH0200_INTRA_TMP_BV_REORDER
+#endif
+        calcDiffDelta4Joint<vext>(pSrc1, pSrc2, topMeanDelta, 0, TMP_TEMPLATE_SIZE, iSumSad, iSumMrsad
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+                                  , licShift
+#endif
+                                  );
+#if JVET_AH0200_INTRA_TMP_BV_REORDER && !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
         if (iY == (TMP_TEMPLATE_SIZE - 1))
         {
           iSumSad <<= TMP_TEMPLATE_COST_SHIFT;
@@ -530,7 +668,11 @@ void calcTemplateDiffJointSadMrsadSIMD(const Pel* const ref, const unsigned int
 #endif
         diffSumSad += iSumSad;
         diffSumMrsad += iSumMrsad;
-        calcDiffDelta8Joint<vext>(pSrc1, pSrc2, topMeanDelta, TMP_TEMPLATE_SIZE, uiPatchWidth, iSumSad, iSumMrsad);
+        calcDiffDelta8Joint<vext>(pSrc1, pSrc2, topMeanDelta, TMP_TEMPLATE_SIZE, uiPatchWidth, iSumSad, iSumMrsad
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+                                  , licShift
+#endif
+                                  );
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
         if (iY == (TMP_TEMPLATE_SIZE - 1))
         {
@@ -554,9 +696,18 @@ void calcTemplateDiffJointSadMrsadSIMD(const Pel* const ref, const unsigned int
       for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRowTemp += uiStride)
       {
         const short* const pSrc1 = (const short*) refPatchRowTemp;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        const short* const pSrc2 = (const short*)tmpBuf;
+        tmpBuf += tarStride;
+#else
         const short* const pSrc2 = (const short*) tarPatch[iY];
-        calcDiffDelta4Joint<vext>(pSrc1, pSrc2, topMeanDelta, 0, TMP_TEMPLATE_SIZE, iSumSad, iSumMrsad);
-#if JVET_AH0200_INTRA_TMP_BV_REORDER
+#endif
+        calcDiffDelta4Joint<vext>(pSrc1, pSrc2, topMeanDelta, 0, TMP_TEMPLATE_SIZE, iSumSad, iSumMrsad
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+                                  , licShift
+#endif
+                                  );
+#if JVET_AH0200_INTRA_TMP_BV_REORDER && !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
         if (iY == (TMP_TEMPLATE_SIZE - 1))
         {
           iSumSad <<= TMP_TEMPLATE_COST_SHIFT;
@@ -565,7 +716,11 @@ void calcTemplateDiffJointSadMrsadSIMD(const Pel* const ref, const unsigned int
 #endif
         diffSumSad += iSumSad;
         diffSumMrsad += iSumMrsad;
-        calcDiffDelta4Joint<vext>(pSrc1, pSrc2, topMeanDelta, TMP_TEMPLATE_SIZE, uiPatchWidth, iSumSad, iSumMrsad);
+        calcDiffDelta4Joint<vext>(pSrc1, pSrc2, topMeanDelta, TMP_TEMPLATE_SIZE, uiPatchWidth, iSumSad, iSumMrsad
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+                                  , licShift
+#endif
+                                  );
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
         if (iY == (TMP_TEMPLATE_SIZE - 1))
         {
@@ -583,21 +738,41 @@ void calcTemplateDiffJointSadMrsadSIMD(const Pel* const ref, const unsigned int
         }
       }
     }
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    const __m128i leftMeanDelta = calcMeanRefLeftSIMD<vext>(refLic, uiPatchHeight, uiStride, log2SizeLeft, leftTargetMean);
+#else
     const __m128i leftMeanDelta = calcMeanRefLeftSIMD<vext>(ref, uiPatchHeight, uiStride, log2SizeLeft, leftTargetMean);
+#endif
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
     const short leftMeanDeltaVal = (short)_mm_cvtsi128_si32(leftMeanDelta);
 #endif	
     refPatchRow = ref - TMP_TEMPLATE_SIZE;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    tmpBuf = tarPatch + TMP_TEMPLATE_SIZE * tarStride;
+#endif
     for (int iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++, refPatchRow += uiStride)
     {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      calcDiffDelta4Joint<vext>((const short*)refPatchRow, (const short*)tmpBuf, leftMeanDelta, 0, TMP_TEMPLATE_SIZE, iSumSad, iSumMrsad, licShift);
+#else
       calcDiffDelta4Joint<vext>((const short*) refPatchRow, (const short*) tarPatch[iY], leftMeanDelta, 0, TMP_TEMPLATE_SIZE, iSumSad, iSumMrsad);
+#endif
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
       const short *pSrc1 = (const short *) refPatchRow;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      const short* pSrc2 = tmpBuf;
+      tmpBuf += tarStride;
+#else
       const short *pSrc2 = (const short *) tarPatch[iY];
+#endif
       iSumSad +=
         (abs(pSrc1[TMP_TEMPLATE_SIZE - 1] - pSrc2[TMP_TEMPLATE_SIZE - 1]) * ((1 <<TMP_TEMPLATE_COST_SHIFT)-1));
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      iSumMrsad += (abs(pSrc1[TMP_TEMPLATE_SIZE - 1 + licShift] - pSrc2[TMP_TEMPLATE_SIZE - 1] - leftMeanDeltaVal) * ((1 << TMP_TEMPLATE_COST_SHIFT) - 1));
+#else
       iSumMrsad += (abs(pSrc1[TMP_TEMPLATE_SIZE - 1] - pSrc2[TMP_TEMPLATE_SIZE - 1] - leftMeanDeltaVal)
                     * ((1 << TMP_TEMPLATE_COST_SHIFT) - 1));
+#endif
 #endif
       diffSumSad += iSumSad;
       diffSumMrsad += iSumMrsad;
@@ -617,7 +792,12 @@ void calcTemplateDiffJointSadMrsadSIMD(const Pel* const ref, const unsigned int
     {
       for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
       {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        calcDiffDelta16Joint<vext>((const short*)refPatchRow, tmpBuf, topMeanDelta256, 0, iCols, iSumSad, iSumMrsad, licShift);
+        tmpBuf += tarStride;
+#else
         calcDiffDelta16Joint<vext>((const short*) refPatchRow, (const short*) tarPatch[iY], topMeanDelta256, 0, iCols, iSumSad, iSumMrsad);
+#endif
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
         if (iY == (TMP_TEMPLATE_SIZE - 1))
         {
@@ -640,7 +820,12 @@ void calcTemplateDiffJointSadMrsadSIMD(const Pel* const ref, const unsigned int
     {
       for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
       {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        calcDiffDelta8Joint<vext>((const short*)refPatchRow, (const short*)tmpBuf, topMeanDelta, 0, iCols, iSumSad, iSumMrsad, licShift);
+        tmpBuf += tarStride;
+#else
         calcDiffDelta8Joint<vext>((const short*) refPatchRow, (const short*) tarPatch[iY], topMeanDelta, 0, iCols, iSumSad, iSumMrsad);
+#endif
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
         if (iY == (TMP_TEMPLATE_SIZE - 1))
         {
@@ -660,7 +845,12 @@ void calcTemplateDiffJointSadMrsadSIMD(const Pel* const ref, const unsigned int
     {
       for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
       {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        calcDiffDelta4Joint<vext>((const short*)refPatchRow, tmpBuf, topMeanDelta, 0, iCols, iSumSad, iSumMrsad, licShift);
+        tmpBuf += tarStride;
+#else
         calcDiffDelta4Joint<vext>((const short*) refPatchRow, (const short*) tarPatch[iY], topMeanDelta, 0, iCols, iSumSad, iSumMrsad);
+#endif
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
         if (iY == (TMP_TEMPLATE_SIZE - 1))
         {
@@ -679,20 +869,37 @@ void calcTemplateDiffJointSadMrsadSIMD(const Pel* const ref, const unsigned int
   }
   else if (tempType == LEFT_TEMPLATE)
   {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+    const __m128i leftMeanDelta = calcMeanRefLeftSIMD<vext>(refLic, uiPatchHeight, uiStride, log2SizeLeft, leftTargetMean);
+#else
     const __m128i leftMeanDelta = calcMeanRefLeftSIMD<vext>(ref, uiPatchHeight, uiStride, log2SizeLeft, leftTargetMean);
+#endif
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
     const short leftMeanDeltaVal = (short)_mm_cvtsi128_si32(leftMeanDelta);
 #endif	
     for (int iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++, refPatchRow += uiStride)
     {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      calcDiffDelta4Joint<vext>((const short*)refPatchRow, (const short*)tmpBuf, leftMeanDelta, 0, TMP_TEMPLATE_SIZE, iSumSad, iSumMrsad, licShift);
+#else
       calcDiffDelta4Joint<vext>((const short*) refPatchRow, (const short*) tarPatch[iY], leftMeanDelta, 0, TMP_TEMPLATE_SIZE, iSumSad, iSumMrsad);
+#endif
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
       const short *pSrc1 = (const short *) refPatchRow;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      const short* pSrc2 = tmpBuf;
+      tmpBuf += tarStride;
+#else
       const short *pSrc2 = (const short *) tarPatch[iY];
+#endif
       iSumSad += (abs(pSrc1[TMP_TEMPLATE_SIZE - 1] - pSrc2[TMP_TEMPLATE_SIZE - 1])
                   * ((1 << TMP_TEMPLATE_COST_SHIFT) - 1));
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      iSumMrsad += (abs(pSrc1[TMP_TEMPLATE_SIZE - 1 + licShift] - pSrc2[TMP_TEMPLATE_SIZE - 1] - leftMeanDeltaVal) * ((1 << TMP_TEMPLATE_COST_SHIFT) - 1));
+#else
       iSumMrsad += (abs(pSrc1[TMP_TEMPLATE_SIZE - 1] - pSrc2[TMP_TEMPLATE_SIZE - 1] - leftMeanDeltaVal)
                     * ((1 << TMP_TEMPLATE_COST_SHIFT) - 1));
+#endif
 #endif
       diffSumSad += iSumSad;
       diffSumMrsad += iSumMrsad;
@@ -711,6 +918,15 @@ void calcTemplateDiffJointSadMrsadSIMD(const Pel* const ref, const unsigned int
 }
 #endif
 template<X86_VEXT vext>
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+void calcTemplateDiffSIMD(Pel* ref, unsigned int uiStride, Pel* tarPatch, int tarStride, unsigned int uiPatchWidth,
+  unsigned int uiPatchHeight, int* diff, int* iMax, RefTemplateType tempType,
+  int requiredTemplate
+#if JVET_AG0136_INTRA_TMP_LIC
+  , const bool isMrSad, const int log2SizeTop, const int log2SizeLeft, const int sizeTopLeft, const int topTargetMean, const int leftTargetMean
+#endif
+)
+#else
 void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsigned int uiPatchWidth,
                           unsigned int uiPatchHeight, int *diff, int *iMax, RefTemplateType tempType,
                           int requiredTemplate
@@ -718,11 +934,15 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
                           , const bool isMrSad, const int log2SizeTop, const int log2SizeLeft, const int sizeTopLeft, const int topTargetMean, const int leftTargetMean
 #endif
                           )
+#endif
 {
   int diffSum  = 0;
   int topDiff  = MAX_INT;
   int leftDiff = MAX_INT;
   int iY;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+  Pel* tmpBuf = tarPatch;
+#endif
 #if JVET_W0069_TMP_BOUNDARY
   Pel *refPatchRow;
   if (tempType == L_SHAPE_TEMPLATE)
@@ -833,8 +1053,13 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
           for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRowTemp += uiStride)
           {
             const short* const pSrc1 = (const short*) refPatchRowTemp;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+            const short* const pSrc2 = (const short*)tmpBuf;
+            tmpBuf += tarStride;
+#else
             const short* const pSrc2 = (const short*) tarPatch[iY];
-#if JVET_AH0200_INTRA_TMP_BV_REORDER
+#endif
+#if JVET_AH0200_INTRA_TMP_BV_REORDER && !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
             uiSum = calcDiffDelta4<vext>(pSrc1, pSrc2, topMeanDelta, 0, TMP_TEMPLATE_SIZE);
             if (iY == (TMP_TEMPLATE_SIZE-1)) 
             {
@@ -868,8 +1093,13 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
           for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRowTemp += uiStride)
           {
             const short* const pSrc1 = (const short*) refPatchRowTemp;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+            const short* const pSrc2 = (const short*)tmpBuf;
+            tmpBuf += tarStride;
+#else
             const short* const pSrc2 = (const short*) tarPatch[iY];
-#if JVET_AH0200_INTRA_TMP_BV_REORDER
+#endif
+#if JVET_AH0200_INTRA_TMP_BV_REORDER && !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
             uiSum = calcDiffDelta4<vext>(pSrc1, pSrc2, topMeanDelta, 0, TMP_TEMPLATE_SIZE);
             if (iY == (TMP_TEMPLATE_SIZE-1)) 
             {
@@ -900,8 +1130,13 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
           for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRowTemp += uiStride)
           {
             const short* const pSrc1 = (const short*) refPatchRowTemp;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+            const short* const pSrc2 = (const short*)tmpBuf;
+            tmpBuf += tarStride;
+#else
             const short* const pSrc2 = (const short*) tarPatch[iY];
-#if JVET_AH0200_INTRA_TMP_BV_REORDER
+#endif
+#if JVET_AH0200_INTRA_TMP_BV_REORDER && !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
             uiSum = calcDiffDelta4<vext>(pSrc1, pSrc2, topMeanDelta, 0, TMP_TEMPLATE_SIZE);
             if (iY == (TMP_TEMPLATE_SIZE-1)) 
             {
@@ -931,12 +1166,24 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
         const short leftMeanDeltaVal = (short)_mm_cvtsi128_si32(leftMeanDelta);
 #endif
         refPatchRow = ref - TMP_TEMPLATE_SIZE;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        tmpBuf = tarPatch + TMP_TEMPLATE_SIZE * tarStride;
+#endif
         for (iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++, refPatchRow += uiStride)
         {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          uiSum = calcDiffDelta4<vext>((const short*)refPatchRow, (const short*)tmpBuf, leftMeanDelta, 0, TMP_TEMPLATE_SIZE);
+#else
           uiSum = calcDiffDelta4<vext>((const short*) refPatchRow, (const short*) tarPatch[iY], leftMeanDelta, 0, TMP_TEMPLATE_SIZE);
+#endif
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
           const short *pSrc1 = (const short *) refPatchRow;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          const short* pSrc2 = tmpBuf;
+          tmpBuf += tarStride;
+#else
           const short *pSrc2 = (const short *) tarPatch[iY];
+#endif
           uiSum += (abs(pSrc1[TMP_TEMPLATE_SIZE - 1] - pSrc2[TMP_TEMPLATE_SIZE - 1] - leftMeanDeltaVal)
                     * ((1 << TMP_TEMPLATE_COST_SHIFT) - 1));
 #endif
@@ -957,14 +1204,23 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
           for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
           {
             const short* const pSrc1 = (const short*) refPatchRow;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+            const short* const pSrc2 = tmpBuf;
+            tmpBuf += tarStride;
+#else
             const short* const pSrc2 = (const short*) tarPatch[iY];
+#endif
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+            diffSum += calcDiffDelta4<vext>(pSrc1, pSrc2, topMeanDelta, 0, TMP_TEMPLATE_SIZE);
+#else
             uiSum = calcDiffDelta4<vext>(pSrc1, pSrc2, topMeanDelta, 0, TMP_TEMPLATE_SIZE);
             if (iY == (TMP_TEMPLATE_SIZE-1)) 
             {
               uiSum <<= TMP_TEMPLATE_COST_SHIFT;
             }
             diffSum += uiSum;
+#endif
             uiSum = calcDiffDelta16<vext>(pSrc1, pSrc2, topMeanDelta256, TMP_TEMPLATE_SIZE, uiPatchWidth);
             if (iY == (TMP_TEMPLATE_SIZE-1)) 
             {
@@ -990,14 +1246,23 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
           for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
           {
             const short* const pSrc1 = (const short*) refPatchRow;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+            const short* const pSrc2 = tmpBuf;
+            tmpBuf += tarStride;
+#else
             const short* const pSrc2 = (const short*) tarPatch[iY];
+#endif
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+            diffSum += calcDiffDelta4<vext>(pSrc1, pSrc2, topMeanDelta, 0, TMP_TEMPLATE_SIZE);
+#else
             uiSum = calcDiffDelta4<vext>(pSrc1, pSrc2, topMeanDelta, 0, TMP_TEMPLATE_SIZE);
             if (iY == (TMP_TEMPLATE_SIZE-1)) 
             {
               uiSum <<= TMP_TEMPLATE_COST_SHIFT;
             }
             diffSum += uiSum;
+#endif
             uiSum = calcDiffDelta8<vext>(pSrc1, pSrc2, topMeanDelta, TMP_TEMPLATE_SIZE, uiPatchWidth);
             if (iY == (TMP_TEMPLATE_SIZE-1)) 
             {
@@ -1020,7 +1285,13 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
           for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
           {
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+            diffSum += calcDiffDelta4<vext>((const short*) refPatchRow, tmpBuf, topMeanDelta, 0, TMP_TEMPLATE_SIZE);
+            uiSum = calcDiffDelta4<vext>((const short*) refPatchRow, tmpBuf, topMeanDelta, TMP_TEMPLATE_SIZE, uiPatchWidth);
+            tmpBuf += tarStride;
+#else
             uiSum = calcDiffDelta8<vext>((const short*) refPatchRow, (const short*) tarPatch[iY], topMeanDelta, 0, uiPatchWidth);
+#endif
             if (iY == (TMP_TEMPLATE_SIZE-1)) 
             {
               uiSum <<= TMP_TEMPLATE_COST_SHIFT;
@@ -1043,12 +1314,24 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
           const short leftMeanDeltaVal = (short)_mm_cvtsi128_si32(leftMeanDelta);
 #endif
           refPatchRow = ref - TMP_TEMPLATE_SIZE;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          tmpBuf = tarPatch + TMP_TEMPLATE_SIZE * tarStride;
+#endif
           for (iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++, refPatchRow += uiStride)
           {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+            diffSum += calcDiffDelta4<vext>((const short*)refPatchRow, tmpBuf, leftMeanDelta, 0, TMP_TEMPLATE_SIZE);
+#else
             diffSum += calcDiffDelta4<vext>((const short*) refPatchRow, (const short*) tarPatch[iY], leftMeanDelta, 0, TMP_TEMPLATE_SIZE);
+#endif
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
             const short *pSrc1 = (const short *) refPatchRow;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+            const short* pSrc2 = tmpBuf;
+            tmpBuf += tarStride;
+#else
             const short *pSrc2 = (const short *) tarPatch[iY];
+#endif
             diffSum += (abs(pSrc1[TMP_TEMPLATE_SIZE - 1] - pSrc2[TMP_TEMPLATE_SIZE - 1] - leftMeanDeltaVal)
                         * ((1 << TMP_TEMPLATE_COST_SHIFT) - 1));
 #endif
@@ -1067,7 +1350,12 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
           for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
           {
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+            uiSum = calcDiffDelta16<vext>((const short*)refPatchRow, tmpBuf, topMeanDelta256, TMP_TEMPLATE_SIZE, uiPatchWidth);
+            tmpBuf += tarStride;
+#else
             uiSum = calcDiffDelta16<vext>((const short*) refPatchRow, (const short*) tarPatch[iY], topMeanDelta256, TMP_TEMPLATE_SIZE, uiPatchWidth);
+#endif
             if (iY == (TMP_TEMPLATE_SIZE-1)) 
             {
               uiSum <<= TMP_TEMPLATE_COST_SHIFT;
@@ -1090,7 +1378,12 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
           for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
           {
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+            uiSum = calcDiffDelta8<vext>((const short*)refPatchRow, tmpBuf, topMeanDelta, TMP_TEMPLATE_SIZE, uiPatchWidth);
+            tmpBuf += tarStride;
+#else
             uiSum = calcDiffDelta8<vext>((const short*) refPatchRow, (const short*) tarPatch[iY], topMeanDelta, TMP_TEMPLATE_SIZE, uiPatchWidth);
+#endif
             if (iY == (TMP_TEMPLATE_SIZE-1)) 
             {
               uiSum <<= TMP_TEMPLATE_COST_SHIFT;
@@ -1110,7 +1403,12 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
           for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
           {
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+            uiSum = calcDiffDelta4<vext>((const short*)refPatchRow, tmpBuf, topMeanDelta, TMP_TEMPLATE_SIZE, uiPatchWidth);
+            tmpBuf += tarStride;
+#else
             uiSum = calcDiffDelta4<vext>((const short*) refPatchRow, (const short*) tarPatch[iY], topMeanDelta, TMP_TEMPLATE_SIZE, uiPatchWidth);
+#endif
             if (iY == (TMP_TEMPLATE_SIZE-1)) 
             {
               uiSum <<= TMP_TEMPLATE_COST_SHIFT;
@@ -1133,12 +1431,24 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
         const short leftMeanDeltaVal = (short)_mm_cvtsi128_si32(leftMeanDelta);
 #endif
         refPatchRow = ref - TMP_TEMPLATE_SIZE;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        tmpBuf = tarPatch + TMP_TEMPLATE_SIZE * tarStride;
+#endif
         for (iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++, refPatchRow += uiStride)
         {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          leftDiff += calcDiffDelta4<vext>((const short*)refPatchRow, tmpBuf, leftMeanDelta, 0, TMP_TEMPLATE_SIZE);
+#else
           leftDiff += calcDiffDelta4<vext>((const short*) refPatchRow, (const short*) tarPatch[iY], leftMeanDelta, 0, TMP_TEMPLATE_SIZE);
+#endif
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
           const short *pSrc1 = (const short *) refPatchRow;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          const short* pSrc2 = tmpBuf;
+          tmpBuf += tarStride;
+#else
           const short *pSrc2 = (const short *) tarPatch[iY];
+#endif
           leftDiff += (abs(pSrc1[TMP_TEMPLATE_SIZE - 1] - pSrc2[TMP_TEMPLATE_SIZE - 1] - leftMeanDeltaVal)
                        * ((1 << TMP_TEMPLATE_COST_SHIFT) - 1));
 #endif
@@ -1158,7 +1468,12 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
         for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
         {
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          uiSum = calcDiffDelta16<vext>((const short*)refPatchRow, tmpBuf, topMeanDelta256, 0, iCols);
+          tmpBuf += tarStride;
+#else
           uiSum = calcDiffDelta16<vext>((const short*) refPatchRow, (const short*) tarPatch[iY], topMeanDelta256, 0, iCols);
+#endif
           if (iY == (TMP_TEMPLATE_SIZE-1)) 
           {
             uiSum <<= TMP_TEMPLATE_COST_SHIFT;
@@ -1181,7 +1496,12 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
         for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
         {
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          uiSum = calcDiffDelta8<vext>((const short*)refPatchRow, tmpBuf, topMeanDelta, 0, iCols);
+          tmpBuf += tarStride;
+#else
           uiSum = calcDiffDelta8<vext>((const short*) refPatchRow, (const short*) tarPatch[iY], topMeanDelta, 0, iCols);
+#endif
           if (iY == (TMP_TEMPLATE_SIZE-1)) 
           {
             uiSum <<= TMP_TEMPLATE_COST_SHIFT;
@@ -1201,7 +1521,12 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
         for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
         {
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          uiSum = calcDiffDelta4<vext>((const short*)refPatchRow, tmpBuf, topMeanDelta, 0, iCols);
+          tmpBuf += tarStride;
+#else
           uiSum = calcDiffDelta4<vext>((const short*) refPatchRow, (const short*) tarPatch[iY], topMeanDelta, 0, iCols);
+#endif
           if (iY == (TMP_TEMPLATE_SIZE-1)) 
           {
             uiSum <<= TMP_TEMPLATE_COST_SHIFT;
@@ -1225,10 +1550,19 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
 #endif
       for (iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++, refPatchRow += uiStride)
       {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        diffSum += calcDiffDelta4<vext>((const short*)refPatchRow, tmpBuf, leftMeanDelta, 0, TMP_TEMPLATE_SIZE);
+#else
         diffSum += calcDiffDelta4<vext>((const short*) refPatchRow, (const short*) tarPatch[iY], leftMeanDelta, 0, TMP_TEMPLATE_SIZE);
+#endif
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
         const short *pSrc1 = (const short *) refPatchRow;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        const short* pSrc2 = tmpBuf;
+        tmpBuf += tarStride;
+#else
         const short *pSrc2 = (const short *) tarPatch[iY];
+#endif
         diffSum += (abs(pSrc1[TMP_TEMPLATE_SIZE - 1] - pSrc2[TMP_TEMPLATE_SIZE - 1] - leftMeanDeltaVal)
                     * ((1 << TMP_TEMPLATE_COST_SHIFT) - 1));
 #endif
@@ -1258,9 +1592,14 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
       {
         for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
         {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          const short* pSrc1 = tmpBuf;
+          tmpBuf += tarStride;
+#else
           const short* pSrc1 = (const short*) tarPatch[iY];
+#endif
           const short* pSrc2 = (const short*) refPatchRow;
-#if JVET_AH0200_INTRA_TMP_BV_REORDER
+#if JVET_AH0200_INTRA_TMP_BV_REORDER && !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
           uiSum = calcDiff4<vext>(pSrc1, pSrc2, 0, TMP_TEMPLATE_SIZE);
           if (iY == (TMP_TEMPLATE_SIZE-1)) 
           {
@@ -1292,9 +1631,14 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
       {
         for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
         {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          const short* pSrc1 = tmpBuf;
+          tmpBuf += tarStride;
+#else
           const short* pSrc1 = (const short*) tarPatch[iY];
+#endif
           const short* pSrc2 = (const short*) refPatchRow;
-#if JVET_AH0200_INTRA_TMP_BV_REORDER
+#if JVET_AH0200_INTRA_TMP_BV_REORDER && !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
           uiSum = calcDiff4<vext>(pSrc1, pSrc2, 0, TMP_TEMPLATE_SIZE);
           if (iY == (TMP_TEMPLATE_SIZE-1)) 
           {
@@ -1323,9 +1667,14 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
       {
         for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
         {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          const short* pSrc1 = tmpBuf;
+          tmpBuf += tarStride;
+#else
           const short* pSrc1 = (const short*) tarPatch[iY];
+#endif
           const short* pSrc2 = (const short*) refPatchRow;
-#if JVET_AH0200_INTRA_TMP_BV_REORDER
+#if JVET_AH0200_INTRA_TMP_BV_REORDER && !JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
           uiSum = calcDiff4<vext>(pSrc1, pSrc2, 0, TMP_TEMPLATE_SIZE);
           if (iY == (TMP_TEMPLATE_SIZE-1)) 
           {
@@ -1401,10 +1750,18 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
 
       // vertical difference
       int iCols = TMP_TEMPLATE_SIZE;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      tmpBuf = tarPatch + TMP_TEMPLATE_SIZE * tarStride;
+#endif
 
       for (iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++)
       {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        tarPatchRow = tmpBuf;
+        tmpBuf += tarStride;
+#else
         tarPatchRow        = tarPatch[iY];
+#endif
         const short *pSrc1 = (const short *) tarPatchRow;
         const short *pSrc2 = (const short *) refPatchRow;
 
@@ -1439,15 +1796,24 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
       {
         for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
         {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          const short* const pSrc1 = (const short*)tmpBuf;
+          tmpBuf += tarStride;
+#else
           const short* pSrc1 = (const short*) tarPatch[iY];
+#endif
           const short* pSrc2 = (const short*) refPatchRow;
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          diffSum += calcDiff4<vext>(pSrc1, pSrc2, 0, TMP_TEMPLATE_SIZE);
+#else
           uiSum = calcDiff4<vext>(pSrc1, pSrc2, 0, TMP_TEMPLATE_SIZE);
           if (iY == (TMP_TEMPLATE_SIZE-1)) 
           {
             uiSum <<= TMP_TEMPLATE_COST_SHIFT;
           }
           diffSum += uiSum;
+#endif
           uiSum = calcDiff16<vext>(pSrc1, pSrc2, TMP_TEMPLATE_SIZE, uiPatchWidth);
           if (iY == (TMP_TEMPLATE_SIZE-1)) 
           {
@@ -1472,15 +1838,24 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
       {
         for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
         {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          const short* const pSrc1 = (const short*)tmpBuf;
+          tmpBuf += tarStride;
+#else
           const short* pSrc1 = (const short*) tarPatch[iY];
+#endif
           const short* pSrc2 = (const short*) refPatchRow;
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          diffSum += calcDiff4<vext>(pSrc1, pSrc2, 0, TMP_TEMPLATE_SIZE);
+#else
           uiSum = calcDiff4<vext>(pSrc1, pSrc2, 0, TMP_TEMPLATE_SIZE);
           if (iY == (TMP_TEMPLATE_SIZE-1)) 
           {
             uiSum <<= TMP_TEMPLATE_COST_SHIFT;
           }
           diffSum += uiSum;
+#endif
           uiSum = calcDiff8<vext>(pSrc1, pSrc2, TMP_TEMPLATE_SIZE, uiPatchWidth);
           if (iY == (TMP_TEMPLATE_SIZE-1)) 
           {
@@ -1503,7 +1878,13 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
         for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
         {
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          diffSum += calcDiff4<vext>(tmpBuf, (const short*)refPatchRow, 0, TMP_TEMPLATE_SIZE);
+          uiSum = calcDiff4<vext>(tmpBuf, (const short*)refPatchRow, TMP_TEMPLATE_SIZE, uiPatchWidth);
+          tmpBuf += tarStride;
+#else
           uiSum = calcDiff8<vext>((const short*) tarPatch[iY], (const short*) refPatchRow, 0, uiPatchWidth);
+#endif
           if (iY == (TMP_TEMPLATE_SIZE-1)) 
           {
             uiSum <<= TMP_TEMPLATE_COST_SHIFT;
@@ -1558,10 +1939,18 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
 
       // vertical difference
       int iCols = TMP_TEMPLATE_SIZE;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      tmpBuf = tarPatch + TMP_TEMPLATE_SIZE * tarStride;
+#endif
 
       for (iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++)
       {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        tarPatchRow = tmpBuf;
+        tmpBuf += tarStride;
+#else
         tarPatchRow        = tarPatch[iY];
+#endif
         const short *pSrc1 = (const short *) tarPatchRow;
         const short *pSrc2 = (const short *) refPatchRow;
 
@@ -1598,7 +1987,12 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
         for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
         {
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          uiSum = calcDiff16<vext>(tmpBuf, (const short*)refPatchRow, TMP_TEMPLATE_SIZE, uiPatchWidth);
+          tmpBuf += tarStride;
+#else
           uiSum = calcDiff16<vext>((const short*) tarPatch[iY], (const short*) refPatchRow, TMP_TEMPLATE_SIZE, uiPatchWidth);
+#endif
           if (iY == (TMP_TEMPLATE_SIZE-1)) 
           {
             uiSum <<= TMP_TEMPLATE_COST_SHIFT;
@@ -1621,7 +2015,12 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
         for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
         {
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          uiSum = calcDiff8<vext>(tmpBuf, (const short*)refPatchRow, TMP_TEMPLATE_SIZE, uiPatchWidth);
+          tmpBuf += tarStride;
+#else
           uiSum = calcDiff8<vext>((const short*) tarPatch[iY], (const short*) refPatchRow, TMP_TEMPLATE_SIZE, uiPatchWidth);
+#endif
           if (iY == (TMP_TEMPLATE_SIZE-1)) 
           {
             uiSum <<= TMP_TEMPLATE_COST_SHIFT;
@@ -1641,7 +2040,12 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
         for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
         {
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+          uiSum = calcDiff4<vext>(tmpBuf, (const short*)refPatchRow, TMP_TEMPLATE_SIZE, uiPatchWidth);
+          tmpBuf += tarStride;
+#else
           uiSum = calcDiff4<vext>((const short*) tarPatch[iY], (const short*) refPatchRow, TMP_TEMPLATE_SIZE, uiPatchWidth);
+#endif
           if (iY == (TMP_TEMPLATE_SIZE-1)) 
           {
             uiSum <<= TMP_TEMPLATE_COST_SHIFT;
@@ -1694,13 +2098,21 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
     else   // L
     {
       refPatchRow = ref - TMP_TEMPLATE_SIZE;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      tmpBuf = tarPatch + TMP_TEMPLATE_SIZE * tarStride;
+#endif
 
       // vertical difference
       int iCols = TMP_TEMPLATE_SIZE;
 
       for (iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++)
       {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        tarPatchRow = tmpBuf;
+        tmpBuf += tarStride;
+#else
         tarPatchRow        = tarPatch[iY];
+#endif
         const short *pSrc1 = (const short *) tarPatchRow;
         const short *pSrc2 = (const short *) refPatchRow;
 
@@ -1739,7 +2151,12 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
       for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
       {
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        uiSum = calcDiff16<vext>(tmpBuf, (const short*)refPatchRow, 0, iCols);
+        tmpBuf += tarStride;
+#else
         uiSum = calcDiff16<vext>((const short*) tarPatch[iY], (const short*) refPatchRow, 0, iCols);
+#endif
         if (iY == (TMP_TEMPLATE_SIZE-1)) 
         {
           uiSum <<= TMP_TEMPLATE_COST_SHIFT;
@@ -1762,7 +2179,12 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
       for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
       {
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        uiSum = calcDiff8<vext>(tmpBuf, (const short*)refPatchRow, 0, iCols);
+        tmpBuf += tarStride;
+#else
         uiSum = calcDiff8<vext>((const short*) tarPatch[iY], (const short*) refPatchRow, 0, iCols);
+#endif
         if (iY == (TMP_TEMPLATE_SIZE-1)) 
         {
           uiSum <<= TMP_TEMPLATE_COST_SHIFT;
@@ -1782,7 +2204,12 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
       for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++, refPatchRow += uiStride)
       {
 #if JVET_AH0200_INTRA_TMP_BV_REORDER
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+        uiSum = calcDiff4<vext>(tmpBuf, (const short*)refPatchRow, 0, iCols);
+        tmpBuf += tarStride;
+#else
         uiSum = calcDiff4<vext>((const short*) tarPatch[iY], (const short*) refPatchRow, 0, iCols);
+#endif
         if (iY == (TMP_TEMPLATE_SIZE-1)) 
         {
           uiSum <<= TMP_TEMPLATE_COST_SHIFT;
@@ -1841,7 +2268,12 @@ void calcTemplateDiffSIMD(Pel *ref, unsigned int uiStride, Pel **tarPatch, unsig
 
     for (iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++)
     {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+      tarPatchRow = tmpBuf;
+      tmpBuf += tarStride;
+#else
       tarPatchRow        = tarPatch[iY];
+#endif
       const short *pSrc1 = (const short *) tarPatchRow;
       const short *pSrc2 = (const short *) refPatchRow;
 
diff --git a/source/Lib/DecoderLib/DecCu.cpp b/source/Lib/DecoderLib/DecCu.cpp
index 90fb33183987e425d81355287c701a618f54c642..6259742b3c67493ffe8da5671e2d1156519bf227 100644
--- a/source/Lib/DecoderLib/DecCu.cpp
+++ b/source/Lib/DecoderLib/DecCu.cpp
@@ -1026,7 +1026,11 @@ void DecCu::xIntraRecBlk( TransformUnit& tu, const ComponentID compID )
           } 
           else
           {
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+            m_pcIntraPred->searchFracCandidate(tu.cu, m_pcIntraPred->getTargetPatch(), tempType);
+#else
             m_pcIntraPred->searchFracCandidate(tu.cu, m_pcIntraPred->getTargetPatch(floorLog2(std::max(pu.lwidth(), pu.lheight())) - 2), tempType);
+#endif
           }
         }
 #endif		
diff --git a/source/Lib/EncoderLib/IntraSearch.cpp b/source/Lib/EncoderLib/IntraSearch.cpp
index 309f917a4cfe0f762941959c150d9f3837829df4..162b0dcdef03dc1cf10e35e96b1d237c12b357e2 100644
--- a/source/Lib/EncoderLib/IntraSearch.cpp
+++ b/source/Lib/EncoderLib/IntraSearch.cpp
@@ -1964,7 +1964,11 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c
                   CHECK(cu.tmpLicFlag, "cu.tmpLicFlag == 1");
                   cu.ibcLicFlag = cu.tmpLicFlag;
                   cu.ibcLicIdx = uiRdModeListTmp[idxInList].tmpLicIdc;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+                  searchFracCandidate(&cu, getTargetPatch(), templateType);
+#else
                   searchFracCandidate(&cu, getTargetPatch(floorLog2(std::max(cu.lwidth(), cu.lheight())) - 2), templateType);
+#endif
                   for (int spIdx = 0; spIdx < std::min(2, (int) m_mtmpFracCandList[cu.tmpIdx].size()); spIdx++)
                   {
                     cu.tmpIsSubPel = m_mtmpFracCandList[cu.tmpIdx][spIdx].m_subpel;
@@ -2031,7 +2035,11 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c
                   CHECK(!cu.tmpLicFlag, "cu.tmpLicFlag != 0");
                   cu.ibcLicFlag = cu.tmpLicFlag;
                   cu.ibcLicIdx = uiRdModeListTmpLic[idxInList].tmpLicIdc;
+#if JVET_AI0129_INTRA_TMP_OVERLAPPING_REFINEMENT
+                  searchFracCandidate(&cu, getTargetPatch(), templateType);
+#else
                   searchFracCandidate(&cu, getTargetPatch(floorLog2(std::max(cu.lwidth(), cu.lheight())) - 2), templateType);
+#endif
                   for (int spIdx = 0; spIdx < std::min(2, (int) m_mtmpFracCandList[cu.tmpIdx].size()); spIdx++)
                   {
                     cu.tmpIsSubPel = m_mtmpFracCandList[cu.tmpIdx][spIdx].m_subpel;