diff --git a/source/Lib/CommonLib/IntraPrediction.cpp b/source/Lib/CommonLib/IntraPrediction.cpp
index 0c0781095685db5f1f4236e5a35ab56be99df3ba..1812e6e72b9b4de7934b9af20e43e9e91bcea1e7 100644
--- a/source/Lib/CommonLib/IntraPrediction.cpp
+++ b/source/Lib/CommonLib/IntraPrediction.cpp
@@ -1773,7 +1773,11 @@ void IntraPrediction::initIntraPatternChTypeISP(const CodingUnit& cu, const Comp
 }
 
 #if JVET_V0130_INTRA_TMP
+#if JVET_W0069_TMP_BOUNDARY
+RefTemplateType IntraPrediction::GetRefTemplateType(CodingUnit& cu, CompArea& area)
+#else
 bool IntraPrediction::isRefTemplateAvailable(CodingUnit& cu, CompArea& area)
+#endif
 {
 	const ChannelType      chType = toChannelType(area.compID);
 	const CodingStructure& cs = *cu.cs;
@@ -1799,7 +1803,11 @@ bool IntraPrediction::isRefTemplateAvailable(CodingUnit& cu, CompArea& area)
 
   if( numAboveUnits <= 0 || numLeftUnits <= 0 || numAboveRightUnits <= 0 || numLeftBelowUnits <= 0 )
   {
+#if JVET_W0069_TMP_BOUNDARY
+	  return No_Template;
+#else
     return false;
+#endif
   }
 
 	// ----- Step 1: analyze neighborhood -----
@@ -1814,7 +1822,19 @@ bool IntraPrediction::isRefTemplateAvailable(CodingUnit& cu, CompArea& area)
 
 	//bool retVal = 1;
 
+#if JVET_W0069_TMP_BOUNDARY
+	// Choose the template shape from neighbour availability: L-shape first, then left-only, then above-only, else none.
+	if (isAboveLeftAvailable(cu, chType, posLT) && isAboveAvailable(cu, chType, posLT, numAboveUnits, unitWidth, (neighborFlags + totalLeftUnits + 1)) && isLeftAvailable(cu, chType, posLT, numLeftUnits, unitHeight, (neighborFlags + totalLeftUnits - 1)))
+		return L_Shape_Template;
+	else if (isLeftAvailable(cu, chType, posLT, numLeftUnits, unitHeight, (neighborFlags + totalLeftUnits - 1))) // the left-only template reads samples left of the block, so test the left neighbours rather than the above-left corner
+		return Left_Template;
+	else if (isAboveAvailable(cu, chType, posLT, numAboveUnits, unitWidth, (neighborFlags + totalLeftUnits + 1)))
+		return Up_Template;
+	else
+		return No_Template; // every path above returns, so no trailing check is needed
+#else
 	return isAboveLeftAvailable(cu, chType, posLT) && isAboveAvailable(cu, chType, posLT, numAboveUnits, unitWidth, (neighborFlags + totalLeftUnits + 1)) && isLeftAvailable(cu, chType, posLT, numLeftUnits, unitHeight, (neighborFlags + totalLeftUnits - 1));
+#endif
 
 	//return retVal;
 }
diff --git a/source/Lib/CommonLib/IntraPrediction.h b/source/Lib/CommonLib/IntraPrediction.h
index 9cf085f4b8859084f5d38f22a552f6e4dca7d033..b132622a284f45e968aa649af8ee7486ec988b9a 100644
--- a/source/Lib/CommonLib/IntraPrediction.h
+++ b/source/Lib/CommonLib/IntraPrediction.h
@@ -158,7 +158,7 @@ protected:
 
   void xPredIntraBDPCM            ( const CPelBuf &pSrc, PelBuf &pDst, const uint32_t dirMode, const ClpRng& clpRng );
   Pel  xGetPredValDc              ( const CPelBuf &pSrc, const Size &dstSize );
-#if JVET_V0130_INTRA_TMP
+#if JVET_V0130_INTRA_TMP && !JVET_W0069_TMP_BOUNDARY
   bool isRefTemplateAvailable(CodingUnit& cu, CompArea& area);
 #endif
 
@@ -203,6 +203,10 @@ public:
   IntraPrediction();
   virtual ~IntraPrediction();
 
+#if JVET_W0069_TMP_BOUNDARY
+  RefTemplateType GetRefTemplateType(CodingUnit& cu, CompArea& area);
+#endif
+
   void init                       (ChromaFormat chromaFormatIDC, const unsigned bitDepthY);
 #if ENABLE_DIMD
   static void deriveDimdMode      (const CPelBuf &recoBuf, const CompArea &area, CodingUnit &cu);
diff --git a/source/Lib/CommonLib/TrQuant.cpp b/source/Lib/CommonLib/TrQuant.cpp
index 40d9dea86175ead1382cc38b0383f967be0025a9..548f8cc34819fedee563a6957f02a97ac21e7059 100644
--- a/source/Lib/CommonLib/TrQuant.cpp
+++ b/source/Lib/CommonLib/TrQuant.cpp
@@ -542,7 +542,11 @@ void TempLibFast::initTemplateDiff(unsigned int uiPatchWidth, unsigned int uiPat
 	}
 }
 
+#if JVET_W0069_TMP_BOUNDARY
+void TrQuant::getTargetTemplate(CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight, RefTemplateType TempType)
+#else
 void TrQuant::getTargetTemplate(CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight)
+#endif
 {
 	const ComponentID compID = COMPONENT_Y;
 	unsigned int uiPatchWidth = uiBlkWidth + TMP_TEMPLATE_SIZE;
@@ -557,6 +561,10 @@ void TrQuant::getTargetTemplate(CodingUnit* pcCU, unsigned int uiBlkWidth, unsig
 	//fill template
 	//up-left & up 
 	Pel* tarTemp;
+#if JVET_W0069_TMP_BOUNDARY
+	if (TempType == L_Shape_Template)
+	{
+#endif
 	Pel* pCurrTemp = pCurrStart - TMP_TEMPLATE_SIZE * uiPicStride - TMP_TEMPLATE_SIZE;
 	for (uiY = 0; uiY < TMP_TEMPLATE_SIZE; uiY++)
 	{
@@ -577,9 +585,42 @@ void TrQuant::getTargetTemplate(CodingUnit* pcCU, unsigned int uiBlkWidth, unsig
 		}
 		pCurrTemp += uiPicStride;
 	}
+#if JVET_W0069_TMP_BOUNDARY
+	}
+  else if (TempType == Up_Template)
+  {
+    Pel* pCurrTemp = pCurrStart - TMP_TEMPLATE_SIZE * uiPicStride;
+    for (uiY = 0; uiY < TMP_TEMPLATE_SIZE; uiY++)
+    {
+      tarTemp = tarPatch[uiY];
+      for (uiX = 0; uiX < uiBlkWidth; uiX++)
+      {
+        tarTemp[uiX] = pCurrTemp[uiX];
+      }
+      pCurrTemp += uiPicStride;
+    }
+  }
+	else if (TempType == Left_Template)
+	{
+		Pel* pCurrTemp = pCurrStart - TMP_TEMPLATE_SIZE;
+		for (uiY = TMP_TEMPLATE_SIZE; uiY < uiPatchHeight; uiY++)
+		{
+			tarTemp = tarPatch[uiY];
+			for (uiX = 0; uiX < TMP_TEMPLATE_SIZE; uiX++)
+			{
+				tarTemp[uiX] = pCurrTemp[uiX];
+			}
+			pCurrTemp += uiPicStride;
+		}
+	}
+#endif
 }
 
+#if JVET_W0069_TMP_BOUNDARY
+void TrQuant::candidateSearchIntra(CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight, RefTemplateType TempType)
+#else
 void TrQuant::candidateSearchIntra(CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight)
+#endif
 {
 	const ComponentID compID = COMPONENT_Y;
 	const int channelBitDepth = pcCU->cs->sps->getBitDepth(toChannelType(compID));
@@ -590,7 +631,11 @@ void TrQuant::candidateSearchIntra(CodingUnit* pcCU, unsigned int uiBlkWidth, un
 	//Initialize the library for saving the best candidates
 	m_tempLibFast.initTemplateDiff(uiPatchWidth, uiPatchHeight, uiBlkWidth, uiBlkHeight, channelBitDepth);
 	short setId = 0; //record the reference picture.
+#if JVET_W0069_TMP_BOUNDARY
+	searchCandidateFromOnePicIntra(pcCU, tarPatch, uiPatchWidth, uiPatchHeight, setId, TempType);
+#else
 	searchCandidateFromOnePicIntra(pcCU, tarPatch, uiPatchWidth, uiPatchHeight, setId);
+#endif
 	//count collected candidate number
 	int pDiff = m_tempLibFast.getDiff();
 	int maxDiff = m_tempLibFast.getDiffMax();
@@ -606,7 +651,11 @@ void TrQuant::candidateSearchIntra(CodingUnit* pcCU, unsigned int uiBlkWidth, un
   }
 }
 
+#if JVET_W0069_TMP_BOUNDARY
+void  TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, unsigned int setId, RefTemplateType TempType)
+#else
 void  TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, unsigned int setId)
+#endif
 {
 	const ComponentID compID = COMPONENT_Y;
 	unsigned int uiBlkWidth = uiPatchWidth - TMP_TEMPLATE_SIZE;
@@ -685,7 +734,11 @@ void  TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch,
 			for (iXOffset = mvXMax; iXOffset >= mvXMin; iXOffset--)
 			{
 				refCurr = ref + iYOffset * refStride + iXOffset;
+#if JVET_W0069_TMP_BOUNDARY
+				diff = m_calcTemplateDiff(refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, pDiff, TempType);
+#else
 				diff = m_calcTemplateDiff(refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, pDiff);
+#endif
 				if (diff < (pDiff))
 				{
 					insertNode(diff, iXOffset, iYOffset, pDiff, pX, pY, pId, setId); 
@@ -719,7 +772,11 @@ void  TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch,
 			for (iXOffset = mvXMax; iXOffset >= mvXMin; iXOffset--)
 			{
 				refCurr = ref + iYOffset * refStride + iXOffset;
+#if JVET_W0069_TMP_BOUNDARY
+				diff = m_calcTemplateDiff(refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, pDiff, TempType);
+#else
 				diff = m_calcTemplateDiff(refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, pDiff);
+#endif
 
 				if (diff < (pDiff))
 				{
@@ -780,12 +837,48 @@ bool TrQuant::generateTMPrediction(Pel* piPred, unsigned int uiStride, unsigned
 	return bSucceedFlag;
 }
 
+#if JVET_W0069_TMP_BOUNDARY
+// Fills the uiBlkWidth x uiBlkHeight prediction block starting at piPred (row pitch uiStride) with the constant DC_Val.
+// Used as the fallback predictor when no reference template is available; always reports success.
+bool TrQuant::generateTM_DC_Prediction(Pel* piPred, unsigned int uiStride, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int DC_Val)
+{
+  const Pel fillVal = DC_Val; // callers pass 1 << (bitDepth - 1); converted to Pel once instead of per sample
+  Pel* rowPtr = piPred;
+  for (unsigned int row = 0; row < uiBlkHeight; ++row, rowPtr += uiStride)
+  {
+    for (unsigned int col = 0; col < uiBlkWidth; ++col)
+    {
+      rowPtr[col] = fillVal;
+    }
+  }
+  return true; // there is no failure path for a constant fill
+}
+#endif
+
+#if JVET_W0069_TMP_BOUNDARY
+int TrQuant::calcTemplateDiff(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax, RefTemplateType TempType)
+#else
 int TrQuant::calcTemplateDiff(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax)
+#endif
 {
   int iDiffSum = 0;
+#if JVET_W0069_TMP_BOUNDARY
+  Pel* refPatchRow;
+  if (TempType == L_Shape_Template)
+	  refPatchRow = ref - TMP_TEMPLATE_SIZE * uiStride - TMP_TEMPLATE_SIZE;
+  else if (TempType == Left_Template)
+	  refPatchRow = ref - TMP_TEMPLATE_SIZE;
+  else if (TempType == Up_Template)
+	  refPatchRow = ref - TMP_TEMPLATE_SIZE * uiStride;
+#else
   Pel* refPatchRow = ref - TMP_TEMPLATE_SIZE * uiStride - TMP_TEMPLATE_SIZE;
+#endif
   Pel* tarPatchRow;
 
+#if JVET_W0069_TMP_BOUNDARY
+  if (TempType == L_Shape_Template)
+  {
+#endif
   // horizontal difference
   for( int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++ )
   {
@@ -815,6 +908,43 @@ int TrQuant::calcTemplateDiff(Pel* ref, unsigned int uiStride, Pel** tarPatch, u
     }
     refPatchRow += uiStride;
   }
+#if JVET_W0069_TMP_BOUNDARY
+	}
+  else if (TempType == Up_Template)
+  {
+    // top  template difference
+    for (int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++)
+    {
+      tarPatchRow = tarPatch[iY];
+      for (int iX = 0; iX < uiPatchWidth - TMP_TEMPLATE_SIZE; iX++)
+      {
+        iDiffSum += abs(refPatchRow[iX] - tarPatchRow[iX]);
+      }
+      if (iDiffSum > iMax) //for speeding up
+      {
+        return iDiffSum;
+      }
+      refPatchRow += uiStride;
+    }
+  }
+  else if (TempType == Left_Template)
+  {
+	  // left template difference
+	  for (int iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++)
+	  {
+		  tarPatchRow = tarPatch[iY];
+		  for (int iX = 0; iX < TMP_TEMPLATE_SIZE; iX++)
+		  {
+			  iDiffSum += abs(refPatchRow[iX] - tarPatchRow[iX]);
+		  }
+		  if (iDiffSum > iMax) //for speeding up
+		  {
+			  return iDiffSum;
+		  }
+		  refPatchRow += uiStride;
+	  }
+  }
+#endif
 
   return iDiffSum;
 }
diff --git a/source/Lib/CommonLib/TrQuant.h b/source/Lib/CommonLib/TrQuant.h
index 4145f64047924ac5623726cdac87aa9c71b32a10..ec9b42c35f72db6814722650e3c3749e222ec066 100644
--- a/source/Lib/CommonLib/TrQuant.h
+++ b/source/Lib/CommonLib/TrQuant.h
@@ -130,18 +130,35 @@ public:
   void invLfnstNxN( int* src, int* dst, const uint32_t mode, const uint32_t index, const uint32_t size, int zeroOutSize );
 #endif
 #if JVET_V0130_INTRA_TMP
+#if JVET_W0069_TMP_BOUNDARY
+  int (*m_calcTemplateDiff)(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax, RefTemplateType TempType);
+  static int calcTemplateDiff(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax, RefTemplateType TempType);
+#else
   int ( *m_calcTemplateDiff )(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax);
   static int calcTemplateDiff(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax);
+#endif
   Pel** getTargetPatch(unsigned int uiDepth)       { return m_pppTarPatch[uiDepth]; }
   Pel* getRefPicUsed()                             { return m_refPicUsed; }
   void setRefPicUsed(Pel* ref)                     { m_refPicUsed = ref; }
   unsigned int getStride()                         { return m_uiPicStride; }
   void         setStride(unsigned int uiPicStride) { m_uiPicStride = uiPicStride; }
 
+#if JVET_W0069_TMP_BOUNDARY
+  void searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, unsigned int setId, RefTemplateType TempType);
+  void candidateSearchIntra(CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight, RefTemplateType TempType);
+#else
   void searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, unsigned int setId);
   void candidateSearchIntra(CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight);
+#endif
   bool generateTMPrediction(Pel* piPred, unsigned int uiStride, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int& foundCandiNum);
+#if JVET_W0069_TMP_BOUNDARY
+  bool generateTM_DC_Prediction(Pel* piPred, unsigned int uiStride, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int DC_Val);
+#endif
+#if JVET_W0069_TMP_BOUNDARY
+  void getTargetTemplate(CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight, RefTemplateType TempType);
+#else
   void getTargetTemplate(CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight);
+#endif
 #endif
 
   uint32_t getLFNSTIntraMode( int wideAngPredMode );
diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h
index 9762eb814ab022abc8a4301133a8cde2e110baa6..8f49a782e0cc37885e23a1267b5f4e7236210fbf 100644
--- a/source/Lib/CommonLib/TypeDef.h
+++ b/source/Lib/CommonLib/TypeDef.h
@@ -111,6 +111,7 @@
 #define ENABLE_DIMD                                       1 // Decoder side intra mode derivation
 #define JVET_V0087_DIMD_NO_ISP                            ENABLE_DIMD // disallow combination of DIMD and ISP
 #define JVET_V0130_INTRA_TMP                              1 // JVET-V0130: template matching prediction
+#define JVET_W0069_TMP_BOUNDARY								  1 // JVET-W0069: intra TMP with above-only / left-only templates and a constant fallback when no template is available
 
 #define JVET_W0123_TIMD_FUSION                            1 // Template based intra mode derivation and fusion
 
@@ -656,6 +657,15 @@ enum ChannelType
   CHANNEL_TYPE_CHROMA  = 1,
   MAX_NUM_CHANNEL_TYPE = 2
 };
+#if JVET_W0069_TMP_BOUNDARY
+enum RefTemplateType // shape of the reconstructed neighbourhood used for intra template matching
+{
+  L_Shape_Template = 1, // above-left, above and left neighbours are all available
+  Left_Template = 2,    // only the TMP_TEMPLATE_SIZE columns to the left of the block are matched
+  Up_Template = 3,      // only the TMP_TEMPLATE_SIZE rows above the block are matched
+  No_Template = 4,      // no usable template; callers fill the block via generateTM_DC_Prediction instead of searching
+};
+#endif
 #if !INTRA_RM_SMALL_BLOCK_SIZE_CONSTRAINTS
 enum TreeType
 {
diff --git a/source/Lib/CommonLib/x86/TrQuantX86.h b/source/Lib/CommonLib/x86/TrQuantX86.h
index 7404f8fdc78506b73bbcaca8c743dda9447e090e..c560f9fbb46da8ee56874d6a3705e54df53f7b35 100644
--- a/source/Lib/CommonLib/x86/TrQuantX86.h
+++ b/source/Lib/CommonLib/x86/TrQuantX86.h
@@ -412,15 +412,33 @@ uint32_t computeSAD_SIMD( const Pel* ref, const Pel* cur, const int size )
 
 #if ENABLE_SIMD_TMP
 template< X86_VEXT vext >
+#if JVET_W0069_TMP_BOUNDARY
+int calcTemplateDiffSIMD(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax, RefTemplateType TempType)
+#else
 int calcTemplateDiffSIMD( Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax )
+#endif
 {
   int iDiffSum = 0;
   int iY;
+#if JVET_W0069_TMP_BOUNDARY
+  Pel* refPatchRow;
+  if (TempType == L_Shape_Template)
+    refPatchRow = ref - TMP_TEMPLATE_SIZE * uiStride - TMP_TEMPLATE_SIZE;
+  else if (TempType == Left_Template)
+    refPatchRow = ref - TMP_TEMPLATE_SIZE;
+  else if (TempType == Up_Template)
+    refPatchRow = ref - TMP_TEMPLATE_SIZE * uiStride;
+#else
   Pel* refPatchRow = ref - TMP_TEMPLATE_SIZE * uiStride - TMP_TEMPLATE_SIZE;
+#endif
   Pel* tarPatchRow;
   uint32_t uiSum;
 
   // horizontal difference
+#if JVET_W0069_TMP_BOUNDARY
+  if (TempType == L_Shape_Template)
+  {
+#endif
   for( iY = 0; iY < TMP_TEMPLATE_SIZE; iY++ )
   {
     tarPatchRow = tarPatch[iY];
@@ -527,6 +545,124 @@ int calcTemplateDiffSIMD( Pel* ref, unsigned int uiStride, Pel** tarPatch, unsig
     // update location
     refPatchRow += uiStride;
   }
+#if JVET_W0069_TMP_BOUNDARY
+  }
+  else if (TempType == Up_Template)
+  {
+  // horizontal difference
+  for (iY = 0; iY < TMP_TEMPLATE_SIZE; iY++)
+  {
+    tarPatchRow = tarPatch[iY];
+    const short* pSrc1 = (const short*)tarPatchRow;
+    const short* pSrc2 = (const short*)refPatchRow;
+
+    // SIMD difference
+    //int  iRows = uiPatchHeight;
+    int  iCols = uiPatchWidth - TMP_TEMPLATE_SIZE;
+    if ((iCols & 7) == 0)
+    {
+      // Do with step of 8
+      __m128i vzero = _mm_setzero_si128();
+      __m128i vsum32 = vzero;
+      //for (int iY = 0; iY < iRows; iY += iSubStep)
+      {
+        __m128i vsum16 = vzero;
+        for (int iX = 0; iX < iCols; iX += 8)
+        {
+          __m128i vsrc1 = _mm_loadu_si128((const __m128i*)(&pSrc1[iX]));
+          __m128i vsrc2 = _mm_lddqu_si128((const __m128i*)(&pSrc2[iX]));
+          vsum16 = _mm_add_epi16(vsum16, _mm_abs_epi16(_mm_sub_epi16(vsrc1, vsrc2)));
+        }
+        __m128i vsumtemp = _mm_add_epi32(_mm_unpacklo_epi16(vsum16, vzero), _mm_unpackhi_epi16(vsum16, vzero));
+        vsum32 = _mm_add_epi32(vsum32, vsumtemp);
+        //pSrc1 += iStrideSrc1;
+        //pSrc2 += iStrideSrc2;
+      }
+      vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e));   // 01001110
+      vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1));   // 10110001
+      uiSum = _mm_cvtsi128_si32(vsum32);
+    }
+    else
+    {
+      // Do with step of 4
+      __m128i vzero = _mm_setzero_si128();
+      __m128i vsum32 = vzero;
+      //for (int iY = 0; iY < iRows; iY += iSubStep)
+      {
+        __m128i vsum16 = vzero;
+        for (int iX = 0; iX < iCols; iX += 4)
+        {
+          __m128i vsrc1 = _mm_loadl_epi64((const __m128i*) & pSrc1[iX]);
+          __m128i vsrc2 = _mm_loadl_epi64((const __m128i*) & pSrc2[iX]);
+          vsum16 = _mm_add_epi16(vsum16, _mm_abs_epi16(_mm_sub_epi16(vsrc1, vsrc2)));
+        }
+        __m128i vsumtemp = _mm_add_epi32(_mm_unpacklo_epi16(vsum16, vzero), _mm_unpackhi_epi16(vsum16, vzero));
+        vsum32 = _mm_add_epi32(vsum32, vsumtemp);
+        //pSrc1 += iStrideSrc1;
+        //pSrc2 += iStrideSrc2;
+      }
+      vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e));   // 01001110
+      vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1));   // 10110001
+      uiSum = _mm_cvtsi128_si32(vsum32);
+    }
+    iDiffSum += uiSum;
+
+    if (iDiffSum > iMax) //for speeding up
+    {
+      return iDiffSum;
+    }
+    // update location
+    refPatchRow += uiStride;
+  }
+
+  
+  }
+  else if (TempType == Left_Template)
+  {
+
+  // vertical difference
+  int  iCols = TMP_TEMPLATE_SIZE;
+
+  for (iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++)
+  {
+    tarPatchRow = tarPatch[iY];
+    const short* pSrc1 = (const short*)tarPatchRow;
+    const short* pSrc2 = (const short*)refPatchRow;
+
+    // SIMD difference
+
+    // Do with step of 4
+    __m128i vzero = _mm_setzero_si128();
+    __m128i vsum32 = vzero;
+    //for (int iY = 0; iY < iRows; iY += iSubStep)
+    {
+      __m128i vsum16 = vzero;
+      for (int iX = 0; iX < iCols; iX += 4)
+      {
+        __m128i vsrc1 = _mm_loadl_epi64((const __m128i*) & pSrc1[iX]);
+        __m128i vsrc2 = _mm_loadl_epi64((const __m128i*) & pSrc2[iX]);
+        vsum16 = _mm_add_epi16(vsum16, _mm_abs_epi16(_mm_sub_epi16(vsrc1, vsrc2)));
+      }
+      __m128i vsumtemp = _mm_add_epi32(_mm_unpacklo_epi16(vsum16, vzero), _mm_unpackhi_epi16(vsum16, vzero));
+      vsum32 = _mm_add_epi32(vsum32, vsumtemp);
+      //pSrc1 += iStrideSrc1;
+      //pSrc2 += iStrideSrc2;
+    }
+    vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e));   // 01001110
+    vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1));   // 10110001
+    uiSum = _mm_cvtsi128_si32(vsum32);
+
+    iDiffSum += uiSum;
+
+    if (iDiffSum > iMax) //for speeding up
+    {
+      return iDiffSum;
+    }
+    // update location
+    refPatchRow += uiStride;
+  }
+  }
+#endif
 
   return iDiffSum;
 }
diff --git a/source/Lib/DecoderLib/DecCu.cpp b/source/Lib/DecoderLib/DecCu.cpp
index 59f5ca00eae3380566c8f06d1968edd7244ea708..27aa5ae24fb9cdd8b59ab5c8f01dd590b2266ef3 100644
--- a/source/Lib/DecoderLib/DecCu.cpp
+++ b/source/Lib/DecoderLib/DecCu.cpp
@@ -384,9 +384,24 @@ void DecCu::xIntraRecBlk( TransformUnit& tu, const ComponentID compID )
 	  if (PU::isTmp(pu, chType))
 	  {
 		  int foundCandiNum;
+#if JVET_W0069_TMP_BOUNDARY
+		  RefTemplateType TempType = m_pcIntraPred->GetRefTemplateType(*(tu.cu), tu.cu->blocks[COMPONENT_Y]);
+		  if (TempType != No_Template)
+		  {
+			  m_pcTrQuant->getTargetTemplate(tu.cu, pu.lwidth(), pu.lheight(), TempType);
+			  m_pcTrQuant->candidateSearchIntra(tu.cu, pu.lwidth(), pu.lheight(), TempType);
+			  m_pcTrQuant->generateTMPrediction(piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum);
+		  }
+		  else
+		  {
+			  foundCandiNum = 1;
+			  m_pcTrQuant->generateTM_DC_Prediction(piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), 1 << (tu.cu->cs->sps->getBitDepth(CHANNEL_TYPE_LUMA) - 1));
+		  }
+#else
 		  m_pcTrQuant->getTargetTemplate(tu.cu, pu.lwidth(), pu.lheight());
 		  m_pcTrQuant->candidateSearchIntra(tu.cu, pu.lwidth(), pu.lheight());
 		  m_pcTrQuant->generateTMPrediction(piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum);
+#endif
 		  assert(foundCandiNum >= 1);
 	  }
 	  else if (PU::isMIP(pu, chType))
@@ -585,7 +600,7 @@ void DecCu::xIntraRecACTBlk(TransformUnit& tu)
 
     PelBuf piPred = cs.getPredBuf(area);
     m_pcIntraPred->initIntraPatternChType(*tu.cu, area);
-#if JVET_V0130_INTRA_TMP
+#if JVET_V0130_INTRA_TMP && ! JVET_W0069_TMP_BOUNDARY
 	if (PU::isTmp(pu, chType))
 	{
 		int foundCandiNum;
diff --git a/source/Lib/EncoderLib/IntraSearch.cpp b/source/Lib/EncoderLib/IntraSearch.cpp
index 821eca75970bf218bee449080e33d97af1dbdc84..ac3df4bf2bcbfc44388c7ae22ecab14969c5c18b 100644
--- a/source/Lib/EncoderLib/IntraSearch.cpp
+++ b/source/Lib/EncoderLib/IntraSearch.cpp
@@ -891,12 +891,30 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c
               bool bsuccessfull = 0;
               CodingUnit cu_cpy = cu;
 
+#if JVET_W0069_TMP_BOUNDARY
+			  RefTemplateType TemplateType = GetRefTemplateType(cu_cpy, cu_cpy.blocks[COMPONENT_Y]);
+			  if (TemplateType != No_Template)
+#else
               if( isRefTemplateAvailable( cu_cpy, cu_cpy.blocks[COMPONENT_Y] ) )
+#endif
               {
+#if JVET_W0069_TMP_BOUNDARY
+				  m_pcTrQuant->getTargetTemplate(&cu_cpy, pu.lwidth(), pu.lheight(), TemplateType);
+				  m_pcTrQuant->candidateSearchIntra(&cu_cpy, pu.lwidth(), pu.lheight(), TemplateType);
+				  bsuccessfull = m_pcTrQuant->generateTMPrediction(piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum);
+#else
                 m_pcTrQuant->getTargetTemplate( &cu_cpy, pu.lwidth(), pu.lheight() );
                 m_pcTrQuant->candidateSearchIntra( &cu_cpy, pu.lwidth(), pu.lheight() );
                 bsuccessfull = m_pcTrQuant->generateTMPrediction( piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum );
+#endif
               }
+#if JVET_W0069_TMP_BOUNDARY
+			  else
+			  {
+				  foundCandiNum = 1;
+				  bsuccessfull = m_pcTrQuant->generateTM_DC_Prediction(piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), 1 << (cu_cpy.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA) - 1));
+			  }
+#endif
               if( bsuccessfull && foundCandiNum >= 1 )
               {
 
@@ -3660,9 +3678,24 @@ void IntraSearch::xIntraCodingTUBlock(TransformUnit &tu, const ComponentID &comp
         if( PU::isTmp( pu, chType ) )
         {
           int foundCandiNum;
+#if JVET_W0069_TMP_BOUNDARY
+		  RefTemplateType TempType = GetRefTemplateType(*(tu.cu), tu.cu->blocks[COMPONENT_Y]);
+		  if (TempType != No_Template)
+		  {
+			  m_pcTrQuant->getTargetTemplate(tu.cu, pu.lwidth(), pu.lheight(), TempType);
+			  m_pcTrQuant->candidateSearchIntra(tu.cu, pu.lwidth(), pu.lheight(), TempType);
+			  m_pcTrQuant->generateTMPrediction(piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum);
+		  }
+		  else
+		  {
+			  foundCandiNum = 1;
+			  m_pcTrQuant->generateTM_DC_Prediction(piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), 1 << (tu.cu->cs->sps->getBitDepth(CHANNEL_TYPE_LUMA) - 1));
+		  }
+#else
           m_pcTrQuant->getTargetTemplate( tu.cu, pu.lwidth(), pu.lheight() );
           m_pcTrQuant->candidateSearchIntra( tu.cu, pu.lwidth(), pu.lheight() );
           m_pcTrQuant->generateTMPrediction( piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum );
+#endif
           CHECK( foundCandiNum < 1, "" );
         }
         else if( PU::isMIP( pu, chType ) )
@@ -4922,7 +4955,7 @@ bool IntraSearch::xRecurIntraCodingACTQT(CodingStructure &cs, Partitioner &parti
       PelBuf         piResi = resiBuf.bufs[compID];
 
       initIntraPatternChType(*tu.cu, area);
-#if JVET_V0130_INTRA_TMP
+#if JVET_V0130_INTRA_TMP && !JVET_W0069_TMP_BOUNDARY
       if( PU::isTmp( pu, chType ) )
       {
         int foundCandiNum;