diff --git a/source/Lib/CommonLib/InterPrediction.cpp b/source/Lib/CommonLib/InterPrediction.cpp
index 115a5dcb6044553dfc85c36bf24c81889b080d3b..5b73a47645f03f637f56b6429f249eeb2ef6dc4b 100644
--- a/source/Lib/CommonLib/InterPrediction.cpp
+++ b/source/Lib/CommonLib/InterPrediction.cpp
@@ -11437,13 +11437,21 @@ void  InterPrediction::sortIbcAdaptiveMergeMbvdCandidates(PredictionUnit &pu, Me
 
       if (m_bAMLTemplateAvailabe[0])
       {
+#if JVET_AJ0096_SATD_REORDER_INTRA
+        m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
         m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
         uiCost += cDistParam.distFunc(cDistParam);
       }
 
       if (uiCost < m_mbvdCandCostList[endEncIdx - 1] && m_bAMLTemplateAvailabe[1])
       {
+#if JVET_AJ0096_SATD_REORDER_INTRA
+        m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
         m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
         uiCost += cDistParam.distFunc(cDistParam);
       }
 
@@ -11591,14 +11599,22 @@ void  InterPrediction::sortIbcMergeMbvdCandidates(PredictionUnit &pu, MergeCtx&
 
     if (m_bAMLTemplateAvailabe[0])
     {
+#if JVET_AJ0096_SATD_REORDER_INTRA
+      m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
       m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
 
       uiCost += cDistParam.distFunc(cDistParam);
     }
 
     if (m_bAMLTemplateAvailabe[1])
     {
+#if JVET_AJ0096_SATD_REORDER_INTRA
+      m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
       m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
 
       uiCost += cDistParam.distFunc(cDistParam);
     }
@@ -12519,14 +12535,22 @@ void  InterPrediction::sortInterMergeMMVDCandidates(PredictionUnit &pu, MergeCtx
       }
       if (m_bAMLTemplateAvailabe[0])
       {
+#if JVET_AJ0096_SATD_REORDER_INTER
+        m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
         m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
         uiCost += cDistParam.distFunc(cDistParam);
         
       }
 
       if (m_bAMLTemplateAvailabe[1])
       {
+#if JVET_AJ0096_SATD_REORDER_INTER
+        m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
         m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
         uiCost += cDistParam.distFunc(cDistParam);
       }
       
@@ -17191,14 +17215,22 @@ void  InterPrediction::adjustAffineMergeCandidates(PredictionUnit &pu, AffineMer
 
       if (m_bAMLTemplateAvailabe[0])
       {
+#if JVET_AJ0096_SATD_REORDER_INTER
+        m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
         m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
 
         uiCost += cDistParam.distFunc(cDistParam);
       }
 
       if (m_bAMLTemplateAvailabe[1])
       {
+#if JVET_AJ0096_SATD_REORDER_INTER
+        m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
         m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
 
         uiCost += cDistParam.distFunc(cDistParam);
       }
@@ -17673,13 +17705,21 @@ void InterPrediction::adjustAffineMergeCandidates(PredictionUnit &pu, AffineMerg
 
           if (m_bAMLTemplateAvailabe[0])
           {
+#if JVET_AJ0096_SATD_REORDER_INTER
+            m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
             m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
 
             uiCost += cDistParam.distFunc(cDistParam);
           }
           if (m_bAMLTemplateAvailabe[1])
           {
+#if JVET_AJ0096_SATD_REORDER_INTER
+            m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
             m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
 
             uiCost += cDistParam.distFunc(cDistParam);
           }
@@ -17780,13 +17820,21 @@ void InterPrediction::adjustAffineMergeCandidates(PredictionUnit &pu, AffineMerg
           );
           if (m_bAMLTemplateAvailabe[0])
           {
+#if JVET_AJ0096_SATD_REORDER_INTER
+            m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
             m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
 
             uiCost += cDistParam.distFunc(cDistParam);
           }
           if (m_bAMLTemplateAvailabe[1])
           {
+#if JVET_AJ0096_SATD_REORDER_INTER
+            m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
             m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
 
             uiCost += cDistParam.distFunc(cDistParam);
           }
@@ -17885,13 +17933,21 @@ void InterPrediction::adjustAffineMergeCandidates(PredictionUnit &pu, AffineMerg
           );
           if (m_bAMLTemplateAvailabe[0])
           {
+#if JVET_AJ0096_SATD_REORDER_INTER
+            m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
             m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
 
             uiCost += cDistParam.distFunc(cDistParam);
           }
           if (m_bAMLTemplateAvailabe[1])
           {
+#if JVET_AJ0096_SATD_REORDER_INTER
+            m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
             m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
 
             uiCost += cDistParam.distFunc(cDistParam);
           }
@@ -19641,14 +19697,22 @@ void  InterPrediction::adjustIBCMergeCandidates(PredictionUnit &pu, MergeCtx& mr
 
     if (m_bAMLTemplateAvailabe[0])
     {
+#if JVET_AJ0096_SATD_REORDER_INTRA
+      m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop, pcBufPredRefTop, pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
       m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop, pcBufPredRefTop, pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
 
       uiCost += cDistParam.distFunc(cDistParam);
     }
 
     if (m_bAMLTemplateAvailabe[1])
     {
+#if JVET_AJ0096_SATD_REORDER_INTRA
+      m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft, pcBufPredRefLeft, pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
       m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft, pcBufPredRefLeft, pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
 
       uiCost += cDistParam.distFunc(cDistParam);
     }
@@ -19825,14 +19889,22 @@ void  InterPrediction::adjustIBCMergeCandidates(PredictionUnit &pu, MergeCtx& mr
 
     if (m_bAMLTemplateAvailabe[0])
     {
+#if JVET_AJ0096_SATD_REORDER_INTRA
+      m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop, pcBufPredRefTop, pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
       m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop, pcBufPredRefTop, pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
 
       uiCost += cDistParam.distFunc(cDistParam);
     }
 
     if (m_bAMLTemplateAvailabe[1])
     {
+#if JVET_AJ0096_SATD_REORDER_INTRA
+      m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft, pcBufPredRefLeft, pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
       m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft, pcBufPredRefLeft, pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
 
       uiCost += cDistParam.distFunc(cDistParam);
     }
@@ -19950,14 +20022,22 @@ void  InterPrediction::adjustAffineMergeCandidatesOneGroup(PredictionUnit &pu, A
         );
         if (m_bAMLTemplateAvailabe[0])
         {
+#if JVET_AJ0096_SATD_REORDER_INTER
+          m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
           m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
 
           uiCost += cDistParam.distFunc(cDistParam);
         }
 
         if (m_bAMLTemplateAvailabe[1])
         {
+#if JVET_AJ0096_SATD_REORDER_INTER
+          m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
           m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
 
           uiCost += cDistParam.distFunc(cDistParam);
         }
@@ -20140,7 +20220,11 @@ Distortion InterPrediction::getTempCost(const PredictionUnit &pu, const PelBuf &
   Distortion uiCost;
   DistParam  cDistParam;
   cDistParam.applyWeight = false;
+#if JVET_AJ0096_SATD_REORDER_INTRA
+  m_pcRdCost->setDistParam(cDistParam, cur, org, pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
   m_pcRdCost->setDistParam(cDistParam, cur, org, pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
   uiCost = cDistParam.distFunc(cDistParam);
   return uiCost;
 }
@@ -31955,7 +32039,11 @@ void InterPrediction::deriveAffineMVDCandVecFromMotionInforPred(const Prediction
               const bool res = getAffAMLRefTemplateMvdPredUni<1>(tmpPU, pcBufPredRefTop, pcBufPredRefLeft, pu.cs->sps->getUseFastSubTmvp(), tmp);
               if (res)
               {
+#if JVET_AJ0096_SATD_REORDER_INTER
+                m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
                 m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
                 uiCost += cDistParam.distFunc(cDistParam);
               }
               else
@@ -31994,7 +32082,11 @@ void InterPrediction::deriveAffineMVDCandVecFromMotionInforPred(const Prediction
               const bool res = getAffAMLRefTemplateMvdPredUni<2>(tmpPU, pcBufPredRefTop, pcBufPredRefLeft, pu.cs->sps->getUseFastSubTmvp(), tmp);
               if (res)
               {
+#if JVET_AJ0096_SATD_REORDER_INTER
+                m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
                 m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
                 uiCost += cDistParam.distFunc(cDistParam);
               }
               else
@@ -32270,7 +32362,11 @@ void InterPrediction::reorderRefCombList(PredictionUnit &pu, std::vector<RefList
               }
               if (res)
               {
+#if JVET_AJ0096_SATD_REORDER_INTER
+                m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
                 m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
                 uiCost += cDistParam.distFunc(cDistParam);
               }
               else
@@ -32289,7 +32385,11 @@ void InterPrediction::reorderRefCombList(PredictionUnit &pu, std::vector<RefList
               }
               if (res)
               {
+#if JVET_AJ0096_SATD_REORDER_INTER
+                m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
                 m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
                 uiCost += cDistParam.distFunc(cDistParam);
               }
               else
@@ -32426,14 +32526,22 @@ void InterPrediction::reorderRefCombList(PredictionUnit &pu, std::vector<RefList
           {
             if (m_bAMLTemplateAvailabe[0])
             {
+#if JVET_AJ0096_SATD_REORDER_INTER
+              m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
               m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
 
               uiCost += cDistParam.distFunc(cDistParam);
             }
 
             if (m_bAMLTemplateAvailabe[1])
             {
+#if JVET_AJ0096_SATD_REORDER_INTER
+              m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
               m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
 
               uiCost += cDistParam.distFunc(cDistParam);
             }
@@ -33351,9 +33459,17 @@ void InterPrediction::reorderRefPairList(PredictionUnit &pu, std::vector<RefPicP
           if (m_bAMLTemplateAvailabe[0])
           {
 #if JVET_AD0140_MVD_PREDICTION
+#if JVET_AJ0096_SATD_REORDER_INTER
+            m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), ((identicalMotion && !tmpPU.cu->licFlag)? pcBufPredRefTopIdMotion:pcBufPredRefTop).Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
             m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), ((identicalMotion && !tmpPU.cu->licFlag)? pcBufPredRefTopIdMotion:pcBufPredRefTop).Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
+#else
+#if JVET_AJ0096_SATD_REORDER_INTER
+            m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
 #else
             m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
 #endif
 
             uiCost += cDistParam.distFunc(cDistParam);
@@ -33362,9 +33478,17 @@ void InterPrediction::reorderRefPairList(PredictionUnit &pu, std::vector<RefPicP
           if (m_bAMLTemplateAvailabe[1])
           {
 #if JVET_AD0140_MVD_PREDICTION
+#if JVET_AJ0096_SATD_REORDER_INTER
+            m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), ((identicalMotion && !tmpPU.cu->licFlag) ? pcBufPredRefLeftIdMotion : pcBufPredRefLeft).Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
             m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), ((identicalMotion && !tmpPU.cu->licFlag) ? pcBufPredRefLeftIdMotion : pcBufPredRefLeft).Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
+#else
+#if JVET_AJ0096_SATD_REORDER_INTER
+            m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
 #else
             m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
 #endif
 
             uiCost += cDistParam.distFunc(cDistParam);
@@ -33721,7 +33845,11 @@ void InterPrediction::reorderRefPairList(PredictionUnit &pu, std::vector<RefPicP
           }
 #endif
 
+#if JVET_AJ0096_SATD_REORDER_INTER
+          m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
           m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
 
           uiCost += cDistParam.distFunc(cDistParam);
         }
@@ -33774,7 +33902,11 @@ void InterPrediction::reorderRefPairList(PredictionUnit &pu, std::vector<RefPicP
           }
 #endif
 
+#if JVET_AJ0096_SATD_REORDER_INTER
+          m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
           m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
           uiCost += cDistParam.distFunc(cDistParam);
         }
 #if JVET_AD0140_MVD_PREDICTION
@@ -35666,14 +35798,22 @@ void InterPrediction::defineSignHypMatchAffine(PredictionUnit& pu, const RefPicL
       uiCost = 0;
       if (m_bAMLTemplateAvailabe[0])
       {
+#if JVET_AJ0096_SATD_REORDER_INTER
+        m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
         m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
 
         uiCost += cDistParam.distFunc(cDistParam);
       }
 
       if (m_bAMLTemplateAvailabe[1])
       {
+#if JVET_AJ0096_SATD_REORDER_INTER
+        m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
         m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
 
         uiCost += cDistParam.distFunc(cDistParam);
       }
@@ -36463,12 +36603,20 @@ void InterPrediction::defineSignHypMatchAffine(PredictionUnit& pu, const RefPicL
 
       if (numTemplate[0])
       {
+#if JVET_AJ0096_SATD_REORDER_INTER
+        m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
         m_pcRdCost->setDistParam(cDistParam, pcBufPredCurTop.Y(), pcBufPredRefTop.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
         uiCost += cDistParam.distFunc(cDistParam);
       }
       if (numTemplate[1])
       {
+#if JVET_AJ0096_SATD_REORDER_INTER
+        m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, true);
+#else
         m_pcRdCost->setDistParam(cDistParam, pcBufPredCurLeft.Y(), pcBufPredRefLeft.Y(), pu.cs->sps->getBitDepth(CHANNEL_TYPE_LUMA), COMPONENT_Y, false);
+#endif
         uiCost += cDistParam.distFunc(cDistParam);
       }
 #endif
diff --git a/source/Lib/CommonLib/IntraPrediction.cpp b/source/Lib/CommonLib/IntraPrediction.cpp
index f800a2da54e3c2fed3f3427f0e4fdfa19a5f0b62..e899340f1f517f4ec0eb17863dae9c613be2fbc6 100644
--- a/source/Lib/CommonLib/IntraPrediction.cpp
+++ b/source/Lib/CommonLib/IntraPrediction.cpp
@@ -3025,13 +3025,21 @@ Mv IntraPrediction::refineChromaBv(const ComponentID compId, const PredictionUni
     if (topCanUse)
     {
       PelBuf tempRef = PelBuf(refPix + 1, uiWidth, Size(uiWidth, DBV_TEMPLATE_SIZE));
+#if JVET_AJ0096_SATD_REORDER_INTRA
+      m_dbvSadCost->setDistParam(cDistParam, tempCurTop, tempRef, pu.cs->sps->getBitDepth(CHANNEL_TYPE_CHROMA), compId, uiWidth >= 4 && uiHeight >= 4 ? true : false);
+#else
       m_dbvSadCost->setDistParam(cDistParam, tempCurTop, tempRef, pu.cs->sps->getBitDepth(CHANNEL_TYPE_CHROMA), compId, false);
+#endif
       uiCost += cDistParam.distFunc(cDistParam);
     }
     if (leftCanUse)
     {
       PelBuf tempRef = PelBuf(refPix + 1 + stride, uiHeight, Size(uiHeight, DBV_TEMPLATE_SIZE));
+#if JVET_AJ0096_SATD_REORDER_INTRA
+      m_dbvSadCost->setDistParam(cDistParam, tempCurLeft, tempRef, pu.cs->sps->getBitDepth(CHANNEL_TYPE_CHROMA), compId, uiWidth >= 4 && uiHeight >= 4 ? true : false);
+#else
       m_dbvSadCost->setDistParam(cDistParam, tempCurLeft, tempRef, pu.cs->sps->getBitDepth(CHANNEL_TYPE_CHROMA), compId, false);
+#endif
       uiCost += cDistParam.distFunc(cDistParam);
     }
     aBvCostVec.push_back(std::pair<Mv, Distortion>(*it, uiCost));
@@ -4793,7 +4801,11 @@ void IntraPrediction::geneChromaFusionPred(const ComponentID compId, PelBuf &piP
     if (numSample > 0)
     {
       int bestSAD = cccmSAD < cclmSAD ? cccmSAD : cclmSAD;
+#if JVET_AJ0096_SATD_REORDER_INTRA
+      if (bestSAD > 144 * numSample)
+#else
       if (bestSAD > 64 * numSample)
+#endif
       {
         w0 = 3;
         w1 = 1;
@@ -7906,21 +7918,39 @@ void IntraPrediction::deriveMPMSorted(const PredictionUnit& pu, uint8_t* mpm, in
   distParamSad[1].useMR = false;
   if (eTempType == LEFT_ABOVE_NEIGHBOR)
   {
+#if JVET_AJ0096_SATD_REORDER_INTRA
+    m_timdSatdCost->setTimdDistParam(distParamSad[0], piOrg + iTempWidth, piPred + iTempWidth, iOrgStride,
+      uiPredStride, channelBitDepth, COMPONENT_Y, uiWidth, iTempHeight, 0, 1, true);
+    m_timdSatdCost->setTimdDistParam(distParamSad[1], piOrg + iTempHeight * iOrgStride,
+      piPred + iTempHeight * uiPredStride, iOrgStride, uiPredStride, channelBitDepth,
+      COMPONENT_Y, iTempWidth, uiHeight, 0, 1, true);
+#else
     m_timdSatdCost->setTimdDistParam(distParamSad[0], piOrg + iTempWidth, piPred + iTempWidth, iOrgStride,
       uiPredStride, channelBitDepth, COMPONENT_Y, uiWidth, iTempHeight, 0, 1, false);   // Use HAD (SATD) cost
     m_timdSatdCost->setTimdDistParam(distParamSad[1], piOrg + iTempHeight * iOrgStride,
       piPred + iTempHeight * uiPredStride, iOrgStride, uiPredStride, channelBitDepth,
       COMPONENT_Y, iTempWidth, uiHeight, 0, 1, false);
+#endif
   }
   else if (eTempType == LEFT_NEIGHBOR)
   {
+#if JVET_AJ0096_SATD_REORDER_INTRA
+    m_timdSatdCost->setTimdDistParam(distParamSad[1], piOrg, piPred, iOrgStride, uiPredStride, channelBitDepth,
+      COMPONENT_Y, iTempWidth, uiHeight, 0, 1, true);
+#else
     m_timdSatdCost->setTimdDistParam(distParamSad[1], piOrg, piPred, iOrgStride, uiPredStride, channelBitDepth,
       COMPONENT_Y, iTempWidth, uiHeight, 0, 1, false);
+#endif
   }
   else if (eTempType == ABOVE_NEIGHBOR)
   {
+#if JVET_AJ0096_SATD_REORDER_INTRA
+    m_timdSatdCost->setTimdDistParam(distParamSad[0], piOrg, piPred, iOrgStride, uiPredStride, channelBitDepth,
+      COMPONENT_Y, uiWidth, iTempHeight, 0, 1, true);
+#else
     m_timdSatdCost->setTimdDistParam(distParamSad[0], piOrg, piPred, iOrgStride, uiPredStride, channelBitDepth,
       COMPONENT_Y, uiWidth, iTempHeight, 0, 1, false);
+#endif
   }
   initTimdIntraPatternLuma(*pu.cu, area, eTempType != ABOVE_NEIGHBOR ? iTempWidth : 0,
     eTempType != LEFT_NEIGHBOR ? iTempHeight : 0, uiRefWidth, uiRefHeight);
@@ -21746,6 +21776,19 @@ uint32_t IntraPrediction::xCalculateCCLMcost(const PredictionUnit &pu, const Com
 
   const SizeType cWidth  = chromaArea.width;
   const SizeType cHeight = chromaArea.height;
+#if JVET_AJ0096_SATD_REORDER_INTRA
+  const ChannelType chType = toChannelType(compID);
+  DistParam cDistParam;
+  cDistParam.applyWeight = false;
+  static Pel predChromaA[MAX_CU_SIZE];
+  static Pel predChromaL[MAX_CU_SIZE];
+  PelBuf predTop(predChromaA, cWidth, 1);
+  PelBuf predLeft(predChromaL, 1, cHeight);
+  static Pel reconChromaA[MAX_CU_SIZE];
+  static Pel reconChromaL[MAX_CU_SIZE];
+  PelBuf reconTop(reconChromaA, cWidth, 1);
+  PelBuf reconLeft(reconChromaL, 1, cHeight);
+#endif
 
   CodingStructure  &cs = *(pu.cs);
   const CodingUnit &cu = *(pu.cu);
@@ -21780,8 +21823,17 @@ uint32_t IntraPrediction::xCalculateCCLMcost(const PredictionUnit &pu, const Com
         {
           predChroma = ClipPel(rightShift(cclmModel.a2 * src[pos], cclmModel.shift2) + cclmModel.b2, pu.cs->slice->clpRng(compID));
         }
+#if JVET_AJ0096_SATD_REORDER_INTRA
+        predTop.at(pos, 0) = predChroma;
+        reconTop.at(pos, 0) = curChroma0[pos];
+#else
         totalSAD += abs(predChroma - curChroma0[pos]);
+#endif
       }
+#if JVET_AJ0096_SATD_REORDER_INTRA
+      m_dbvSadCost->setDistParam(cDistParam, predTop, reconTop, pu.cs->sps->getBitDepth(chType), compID, cWidth >= 4 && cHeight >= 4 ? true : false);
+      totalSAD += (int)cDistParam.distFunc(cDistParam);
+#endif
     }
     else
 #endif
@@ -21789,8 +21841,17 @@ uint32_t IntraPrediction::xCalculateCCLMcost(const PredictionUnit &pu, const Com
       for (int pos = 0; pos < cWidth; pos++)
       {
         Pel predChroma = ClipPel(rightShift(cclmModel.a * src[pos], cclmModel.shift) + cclmModel.b, pu.cs->slice->clpRng(compID));
+#if JVET_AJ0096_SATD_REORDER_INTRA
+        predTop.at(pos, 0) = predChroma;
+        reconTop.at(pos, 0) = curChroma0[pos];
+#else
         totalSAD += abs(predChroma - curChroma0[pos]);
+#endif
       }
+#if JVET_AJ0096_SATD_REORDER_INTRA
+      m_dbvSadCost->setDistParam(cDistParam, predTop, reconTop, pu.cs->sps->getBitDepth(chType), compID, cWidth >= 4 && cHeight >= 4 ? true : false);
+      totalSAD += (int)cDistParam.distFunc(cDistParam);
+#endif
     }
   }
 
@@ -21814,8 +21875,17 @@ uint32_t IntraPrediction::xCalculateCCLMcost(const PredictionUnit &pu, const Com
         {
           predChroma = ClipPel(rightShift(cclmModel.a2 * src[pos * srcStride], cclmModel.shift2) + cclmModel.b2, pu.cs->slice->clpRng(compID));
         }
+#if JVET_AJ0096_SATD_REORDER_INTRA
+        predLeft.at(0, pos) = predChroma;
+        reconLeft.at(0, pos) = curChroma0[pos * curStride];
+#else
         totalSAD += abs(predChroma - curChroma0[pos * curStride]);
+#endif
       }
+#if JVET_AJ0096_SATD_REORDER_INTRA
+      m_dbvSadCost->setDistParam(cDistParam, predLeft, reconLeft, pu.cs->sps->getBitDepth(chType), compID, cWidth >= 4 && cHeight >= 4 ? true : false);
+      totalSAD += (int)cDistParam.distFunc(cDistParam);
+#endif
     }
     else
 #endif
@@ -21823,8 +21893,17 @@ uint32_t IntraPrediction::xCalculateCCLMcost(const PredictionUnit &pu, const Com
       for (int pos = 0; pos < cHeight; pos++)
       {
         Pel predChroma = ClipPel(rightShift(cclmModel.a * src[pos * srcStride], cclmModel.shift) + cclmModel.b, pu.cs->slice->clpRng(compID));
+#if JVET_AJ0096_SATD_REORDER_INTRA
+        predLeft.at(0, pos) = predChroma;
+        reconLeft.at(0, pos) = curChroma0[pos * curStride];
+#else
         totalSAD += abs(predChroma - curChroma0[pos * curStride]);
+#endif
       }
+#if JVET_AJ0096_SATD_REORDER_INTRA
+      m_dbvSadCost->setDistParam(cDistParam, predLeft, reconLeft, pu.cs->sps->getBitDepth(chType), compID, cWidth >= 4 && cHeight >= 4 ? true : false);
+      totalSAD += (int)cDistParam.distFunc(cDistParam);
+#endif
     }
   }
 
@@ -21902,6 +21981,19 @@ uint32_t IntraPrediction::xCalculateCCCMcost(const PredictionUnit &pu, const Com
 
   const SizeType cWidth  = chromaArea.width;
   const SizeType cHeight = chromaArea.height;
+#if JVET_AJ0096_SATD_REORDER_INTRA
+  const ChannelType chType = toChannelType(compID);
+  DistParam cDistParam;
+  cDistParam.applyWeight = false;
+  static Pel predChromaA[MAX_CU_SIZE];
+  static Pel predChromaL[MAX_CU_SIZE];
+  PelBuf predTop(predChromaA, cWidth, 1);
+  PelBuf predLeft(predChromaL, 1, cHeight);
+  static Pel reconChromaA[MAX_CU_SIZE];
+  static Pel reconChromaL[MAX_CU_SIZE];
+  PelBuf reconTop(reconChromaA, cWidth, 1);
+  PelBuf reconLeft(reconChromaL, 1, cHeight);
+#endif
 
   CodingStructure  &cs = *(pu.cs);
   const CodingUnit &cu = *(pu.cu);
@@ -21953,8 +22045,17 @@ uint32_t IntraPrediction::xCalculateCCCMcost(const PredictionUnit &pu, const Com
 #endif
         predChroma = ClipPel<Pel>(cccmModel[0].convolve(samples), clpRng);
 
+#if JVET_AJ0096_SATD_REORDER_INTRA
+      predTop.at(pos, 0) = predChroma;
+      reconTop.at(pos, 0) = curChroma0[pos];
+#else
       totalSAD += abs(predChroma - curChroma0[pos]);
+#endif
     }
+#if JVET_AJ0096_SATD_REORDER_INTRA
+    m_dbvSadCost->setDistParam(cDistParam, predTop, reconTop, pu.cs->sps->getBitDepth(chType), compID, cWidth >= 4 && cHeight >= 4 ? true : false);
+    totalSAD += (int)cDistParam.distFunc(cDistParam);
+#endif
   }
 
   if (checkLeft)
@@ -21989,8 +22090,17 @@ uint32_t IntraPrediction::xCalculateCCCMcost(const PredictionUnit &pu, const Com
       else
 #endif
         predChroma = ClipPel<Pel>(cccmModel[0].convolve(samples), clpRng);
+#if JVET_AJ0096_SATD_REORDER_INTRA
+      predLeft.at(0, pos) = predChroma;
+      reconLeft.at(0, pos) = curChroma0[pos * curStride];
+#else
       totalSAD += abs(predChroma - curChroma0[pos * curStride]);
+#endif
     }
+#if JVET_AJ0096_SATD_REORDER_INTRA
+    m_dbvSadCost->setDistParam(cDistParam, predLeft, reconLeft, pu.cs->sps->getBitDepth(chType), compID, cWidth >= 4 && cHeight >= 4 ? true : false);
+    totalSAD += (int)cDistParam.distFunc(cDistParam);
+#endif
   }
 
   return totalSAD;
@@ -27274,12 +27384,22 @@ void IntraPrediction::getTmrlList(CodingUnit& cu)
   distParamSad[1].applyWeight = false;
   distParamSad[1].useMR = false;
 
+#if JVET_AJ0096_SATD_REORDER_INTRA
+  m_timdSatdCost->setTimdDistParam(distParamSad[0], piOrg + tmrlInfo.uiTemplateLeft, piPred + tmrlInfo.uiTemplateLeft, iOrgStride, uiPredStride,
+    channelBitDepth, COMPONENT_Y, uiWidth, tmrlInfo.uiTemplateAbove, 0, 1, true);
+#else
   m_timdSatdCost->setTimdDistParam(distParamSad[0], piOrg + tmrlInfo.uiTemplateLeft, piPred + tmrlInfo.uiTemplateLeft, iOrgStride, uiPredStride,
     channelBitDepth, COMPONENT_Y, uiWidth, tmrlInfo.uiTemplateAbove, 0, 1, false);
+#endif
   if (cu.lx())
   {
+#if JVET_AJ0096_SATD_REORDER_INTRA
+    m_timdSatdCost->setTimdDistParam(distParamSad[1], piOrg + tmrlInfo.uiTemplateAbove * iOrgStride, piPred + tmrlInfo.uiTemplateAbove * uiPredStride,
+      iOrgStride, uiPredStride, channelBitDepth, COMPONENT_Y, tmrlInfo.uiTemplateLeft, uiHeight, 0, 1, true);
+#else
     m_timdSatdCost->setTimdDistParam(distParamSad[1], piOrg + tmrlInfo.uiTemplateAbove * iOrgStride, piPred + tmrlInfo.uiTemplateAbove * uiPredStride,
       iOrgStride, uiPredStride, channelBitDepth, COMPONENT_Y, tmrlInfo.uiTemplateLeft, uiHeight, 0, 1, false);
+#endif
   }
 
   // step-2. define search range.
@@ -28253,7 +28373,11 @@ void IntraPrediction::reorderEipCands(const PredictionUnit& pu, static_vector<Ei
         predTop.at(w, h) = ClipPel(cand.convolve(inputs), clipRng);
       }
     }
+#if JVET_AJ0096_SATD_REORDER_INTRA
+    m_dbvSadCost->setDistParam(cDistParam, predTop, recoTop, pu.cs->sps->getBitDepth(chType), compId, true);
+#else
     m_dbvSadCost->setDistParam(cDistParam, predTop, recoTop, pu.cs->sps->getBitDepth(chType), compId, false);
+#endif
     uiCost += cDistParam.distFunc(cDistParam);
 
     for (int h = 0; h < blockHeight; h++)
@@ -28264,7 +28388,11 @@ void IntraPrediction::reorderEipCands(const PredictionUnit& pu, static_vector<Ei
         predLeft.at(w, h) = ClipPel(cand.convolve(inputs), clipRng);
       }
     }
+#if JVET_AJ0096_SATD_REORDER_INTRA
+    m_dbvSadCost->setDistParam(cDistParam, predLeft, recoLeft, pu.cs->sps->getBitDepth(chType), compId, true);
+#else
     m_dbvSadCost->setDistParam(cDistParam, predLeft, recoLeft, pu.cs->sps->getBitDepth(chType), compId, false);
+#endif
     uiCost += cDistParam.distFunc(cDistParam);
 
     updateCandList(model, uiCost, tmpCandList, candCostList, NUM_EIP_MERGE_SIGNAL);
diff --git a/source/Lib/CommonLib/RdCost.cpp b/source/Lib/CommonLib/RdCost.cpp
index 608a946e414b0145e39ffa646fef1c5a23cffe6a..da0d801cd2ab7c16a936ad2e34b4385ffb9fceb4 100644
--- a/source/Lib/CommonLib/RdCost.cpp
+++ b/source/Lib/CommonLib/RdCost.cpp
@@ -2346,6 +2346,196 @@ Distortion RdCost::xCalcHADs1xN(const Pel* piOrg, const Pel* piCur, int iStrideO
   return satd;
 }
 #endif
+
+#if JVET_AJ0096_SATD_REORDER_INTRA || JVET_AJ0096_SATD_REORDER_INTER
+Distortion RdCost::xCalcHADs1x16( const Pel *piOrg, const Pel *piCur, int iStrideOrg, int iStrideCur, int iRows, int iCols)
+{ // Cost of one 16-sample line (a row when iRows == 1, else a column when iCols == 1): sum of |butterfly outputs| of org - cur.
+  int j, sad = 0;
+  int diff[16], m1[16], m2[16]; // diff = org - cur; m1/m2 alternate as input/output of the butterfly stages
+  if (iRows == 1) // row: the 16 samples are contiguous, the strides are not used
+  {
+    diff[0] = piOrg[0] - piCur[0];
+    diff[1] = piOrg[1] - piCur[1];
+    diff[2] = piOrg[2] - piCur[2];
+    diff[3] = piOrg[3] - piCur[3];
+    diff[4] = piOrg[4] - piCur[4];
+    diff[5] = piOrg[5] - piCur[5];
+    diff[6] = piOrg[6] - piCur[6];
+    diff[7] = piOrg[7] - piCur[7];
+
+    diff[8] = piOrg[8] - piCur[8];
+    diff[9] = piOrg[9] - piCur[9];
+    diff[10] = piOrg[10] - piCur[10];
+    diff[11] = piOrg[11] - piCur[11];
+    diff[12] = piOrg[12] - piCur[12];
+    diff[13] = piOrg[13] - piCur[13];
+    diff[14] = piOrg[14] - piCur[14];
+    diff[15] = piOrg[15] - piCur[15];
+  }
+  else if (iCols == 1) // column: sample k sits k strides below the first one in each buffer
+  {
+    diff[0] = piOrg[0] - piCur[0];
+    diff[1] = piOrg[1 * iStrideOrg] - piCur[1 * iStrideCur];
+    diff[2] = piOrg[2 * iStrideOrg] - piCur[2 * iStrideCur];
+    diff[3] = piOrg[3 * iStrideOrg] - piCur[3 * iStrideCur];
+    diff[4] = piOrg[4 * iStrideOrg] - piCur[4 * iStrideCur];
+    diff[5] = piOrg[5 * iStrideOrg] - piCur[5 * iStrideCur];
+    diff[6] = piOrg[6 * iStrideOrg] - piCur[6 * iStrideCur];
+    diff[7] = piOrg[7 * iStrideOrg] - piCur[7 * iStrideCur];
+
+    diff[8] = piOrg[8 * iStrideOrg] - piCur[8 * iStrideCur];
+    diff[9] = piOrg[9 * iStrideOrg] - piCur[9 * iStrideCur];
+    diff[10] = piOrg[10 * iStrideOrg] - piCur[10 * iStrideCur];
+    diff[11] = piOrg[11 * iStrideOrg] - piCur[11 * iStrideCur];
+    diff[12] = piOrg[12 * iStrideOrg] - piCur[12 * iStrideCur];
+    diff[13] = piOrg[13 * iStrideOrg] - piCur[13 * iStrideCur];
+    diff[14] = piOrg[14 * iStrideOrg] - piCur[14 * iStrideCur];
+    diff[15] = piOrg[15 * iStrideOrg] - piCur[15 * iStrideCur];
+  }
+  else // neither a single row nor a single column was requested
+  {
+    CHECK(1, "shall not be here");
+  }
+  // Stage 1: sums (m2[0..7]) and differences (m2[8..15]) of samples 8 apart.
+  m2[0] = diff[0] + diff[8];
+  m2[1] = diff[1] + diff[9];
+  m2[2] = diff[2] + diff[10];
+  m2[3] = diff[3] + diff[11];
+  m2[4] = diff[4] + diff[12];
+  m2[5] = diff[5] + diff[13];
+  m2[6] = diff[6] + diff[14];
+  m2[7] = diff[7] + diff[15];
+  m2[8] = diff[0] - diff[8];
+  m2[9] = diff[1] - diff[9];
+  m2[10] = diff[2] - diff[10];
+  m2[11] = diff[3] - diff[11];
+  m2[12] = diff[4] - diff[12];
+  m2[13] = diff[5] - diff[13];
+  m2[14] = diff[6] - diff[14];
+  m2[15] = diff[7] - diff[15];
+  // Stage 2: sums and differences of entries 4 apart, separately within each group of eight.
+  m1[0] = m2[0] + m2[4];
+  m1[1] = m2[1] + m2[5];
+  m1[2] = m2[2] + m2[6];
+  m1[3] = m2[3] + m2[7];
+  m1[4] = m2[0] - m2[4];
+  m1[5] = m2[1] - m2[5];
+  m1[6] = m2[2] - m2[6];
+  m1[7] = m2[3] - m2[7];
+  m1[8] = m2[8] + m2[12];
+  m1[9] = m2[9] + m2[13];
+  m1[10] = m2[10] + m2[14];
+  m1[11] = m2[11] + m2[15];
+  m1[12] = m2[8] - m2[12];
+  m1[13] = m2[9] - m2[13];
+  m1[14] = m2[10] - m2[14];
+  m1[15] = m2[11] - m2[15];
+  // Stage 3: entries 2 apart within each group of four. NOTE(review): no span-1 stage follows (xCalcHADs1x16_SSE stops here too) - confirm intended.
+  m2[0] = m1[0] + m1[2];
+  m2[1] = m1[1] + m1[3];
+  m2[2] = m1[0] - m1[2];
+  m2[3] = m1[1] - m1[3];
+  m2[4] = m1[4] + m1[6];
+  m2[5] = m1[5] + m1[7];
+  m2[6] = m1[4] - m1[6];
+  m2[7] = m1[5] - m1[7];
+  m2[8] = m1[8] + m1[10];
+  m2[9] = m1[9] + m1[11];
+  m2[10] = m1[8] - m1[10];
+  m2[11] = m1[9] - m1[11];
+  m2[12] = m1[12] + m1[14];
+  m2[13] = m1[13] + m1[15];
+  m2[14] = m1[12] - m1[14];
+  m2[15] = m1[13] - m1[15];
+  // Accumulate the magnitudes of the 16 stage-3 outputs.
+  for( j = 0; j < 16; j++ )
+  {
+    sad += abs( m2[j] );
+  }
+  // With JVET_R0164_MEAN_SCALED_SATD, m2[0] (here the sum of the even-indexed differences) counts at a quarter of its magnitude.
+#if JVET_R0164_MEAN_SCALED_SATD
+  sad -= abs(m2[0]);
+  sad += abs(m2[0]) >> 2;
+#endif
+  sad  = ( int ) ( sad / sqrt( 16.0 * 1 ) * 2 ); // sqrt(16) is 4, so this halves the sum and truncates to int
+
+  return sad;
+}
+
+Distortion RdCost::xCalcHADs1x8( const Pel *piOrg, const Pel *piCur, int iStrideOrg, int iStrideCur, int iRows, int iCols)
+{ // Cost of one 8-sample line (a row when iRows == 1, else a column when iCols == 1): sum of |butterfly outputs| of org - cur.
+  int j;
+  Distortion sad = 0;
+  int diff[8], m1[8], m2[8]; // diff = org - cur; m1/m2 alternate as input/output of the butterfly stages
+  if (iRows == 1) // row: the 8 samples are contiguous, the strides are not used
+  {
+    diff[0] = piOrg[0] - piCur[0];
+    diff[1] = piOrg[1] - piCur[1];
+    diff[2] = piOrg[2] - piCur[2];
+    diff[3] = piOrg[3] - piCur[3];
+    diff[4] = piOrg[4] - piCur[4];
+    diff[5] = piOrg[5] - piCur[5];
+    diff[6] = piOrg[6] - piCur[6];
+    diff[7] = piOrg[7] - piCur[7];
+  }
+  else if (iCols == 1) // column: sample k sits k strides below the first one in each buffer
+  {
+    diff[0] = piOrg[0] - piCur[0];
+    diff[1] = piOrg[1 * iStrideOrg] - piCur[1 * iStrideCur];
+    diff[2] = piOrg[2 * iStrideOrg] - piCur[2 * iStrideCur];
+    diff[3] = piOrg[3 * iStrideOrg] - piCur[3 * iStrideCur];
+    diff[4] = piOrg[4 * iStrideOrg] - piCur[4 * iStrideCur];
+    diff[5] = piOrg[5 * iStrideOrg] - piCur[5 * iStrideCur];
+    diff[6] = piOrg[6 * iStrideOrg] - piCur[6 * iStrideCur];
+    diff[7] = piOrg[7 * iStrideOrg] - piCur[7 * iStrideCur];
+  }
+  else // neither a single row nor a single column was requested
+  {
+    CHECK(1, "shall not be here");
+  }
+  // Stage 1: sums (m2[0..3]) and differences (m2[4..7]) of samples 4 apart.
+  m2[0] = diff[0] + diff[4];
+  m2[1] = diff[1] + diff[5];
+  m2[2] = diff[2] + diff[6];
+  m2[3] = diff[3] + diff[7];
+  m2[4] = diff[0] - diff[4];
+  m2[5] = diff[1] - diff[5];
+  m2[6] = diff[2] - diff[6];
+  m2[7] = diff[3] - diff[7];
+  // Stage 2: sums and differences of entries 2 apart, separately within each group of four.
+  m1[0] = m2[0] + m2[2];
+  m1[1] = m2[1] + m2[3];
+  m1[2] = m2[0] - m2[2];
+  m1[3] = m2[1] - m2[3];
+  m1[4] = m2[4] + m2[6];
+  m1[5] = m2[5] + m2[7];
+  m1[6] = m2[4] - m2[6];
+  m1[7] = m2[5] - m2[7];
+  // Stage 3: sums and differences of adjacent entries.
+  m2[0] = m1[0] + m1[1];
+  m2[1] = m1[0] - m1[1];
+  m2[2] = m1[2] + m1[3];
+  m2[3] = m1[2] - m1[3];
+  m2[4] = m1[4] + m1[5];
+  m2[5] = m1[4] - m1[5];
+  m2[6] = m1[6] + m1[7];
+  m2[7] = m1[6] - m1[7];
+  // Accumulate the magnitudes of the 8 stage-3 outputs.
+  for (j = 0; j < 8; j++)
+  {
+    sad += abs(m2[j]);
+  }
+  // With JVET_R0164_MEAN_SCALED_SATD, m2[0] (the sum of all eight differences) counts at a quarter of its magnitude.
+#if JVET_R0164_MEAN_SCALED_SATD
+  sad -= abs(m2[0]);
+  sad += abs(m2[0]) >> 2;
+#endif
+  sad  = ( int )( sad / sqrt( 8.0 * 1 ) * 2 ); // scale by 2 / sqrt(8) in floating point, then truncate to int
+
+  return sad;
+}
+#endif
+
 Distortion RdCost::xCalcHADs2x2( const Pel *piOrg, const Pel *piCur, int iStrideOrg, int iStrideCur, int iStep )
 {
   Distortion satd = 0;
@@ -3134,6 +3324,40 @@ Distortion RdCost::xGetHADs( const DistParam &rcDtParam )
       piCur += iOffsetCur;
     }
   }
+#if JVET_AJ0096_SATD_REORDER_INTRA || JVET_AJ0096_SATD_REORDER_INTER
+  else if (iRows == 1 && iCols % 16 == 0)
+  {
+    for( x = 0; x < iCols; x += 16 )
+    {
+      uiSum += xCalcHADs1x16(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iRows, 16);
+    }
+  }
+  else if (iCols == 1 && iRows % 16 == 0)
+  {
+    for( y = 0; y < iRows; y += 16 )
+    {
+      uiSum += xCalcHADs1x16( &piOrg[0], &piCur[0], iStrideOrg, iStrideCur, 16, iCols );
+      piOrg += (iStrideOrg << 4);
+      piCur += (iStrideCur << 4);
+    }
+  }
+  else if (iRows == 1 && iCols % 8 == 0)
+  {
+    for( x = 0; x < iCols; x += 8 )
+    {
+      uiSum += xCalcHADs1x8(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iRows, 8);
+    }
+  }
+  else if (iCols == 1 && iRows % 8 == 0)
+  {
+    for( y = 0; y < iRows; y += 8 )
+    {
+      uiSum += xCalcHADs1x8(&piOrg[0], &piCur[0], iStrideOrg, iStrideCur, 8, iCols);
+      piOrg += (iStrideOrg << 3);
+      piCur += (iStrideCur << 3);
+    }
+  }
+#endif
 #if JVET_AI0185_ADAPTIVE_COST_IN_MERGE_MODE
   else if (iRows == 1 || iCols == 1)
   {
diff --git a/source/Lib/CommonLib/RdCost.h b/source/Lib/CommonLib/RdCost.h
index 57dce9fb2a65c31e779ce35f9b03005c6c5e6df7..b767de0b2e8bed135ddba8100eed678d26274151 100644
--- a/source/Lib/CommonLib/RdCost.h
+++ b/source/Lib/CommonLib/RdCost.h
@@ -1097,6 +1097,10 @@ private:
   static Distortion xGetHADs          ( const DistParam& pcDtParam );
 #if JVET_AI0185_ADAPTIVE_COST_IN_MERGE_MODE
   static Distortion xCalcHADs1xN      ( const Pel* piOrg, const Pel* piCurr, int iStrideOrg, int iStrideCur, int iRows, int iCols);
+#endif
+#if JVET_AJ0096_SATD_REORDER_INTRA || JVET_AJ0096_SATD_REORDER_INTER
+  static Distortion xCalcHADs1x16     ( const Pel *piOrg, const Pel *piCur, int iStrideOrg, int iStrideCur, int iRows, int iCols); // 16-sample line: row if iRows == 1, else column if iCols == 1
+  static Distortion xCalcHADs1x8      ( const Pel *piOrg, const Pel *piCur, int iStrideOrg, int iStrideCur, int iRows, int iCols); // 8-sample line: row if iRows == 1, else column if iCols == 1
 #endif
   static Distortion xCalcHADs2x2      ( const Pel *piOrg, const Pel *piCurr, int iStrideOrg, int iStrideCur, int iStep );
   static Distortion xCalcHADs4x4      ( const Pel *piOrg, const Pel *piCurr, int iStrideOrg, int iStrideCur, int iStep );
diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h
index 8d1d8515f3e526193f87fb87f5295c1cfa3bf573..58801c1d722b0b5c3cbf1d812f221799831d32b6 100644
--- a/source/Lib/CommonLib/TypeDef.h
+++ b/source/Lib/CommonLib/TypeDef.h
@@ -51,8 +51,6 @@
 #include <cassert>
 #include <cstdint>
 
-
-
 #define BASE_ENCODER                                      1
 #define BASE_NORMATIVE                                    1
 #define TOOLS                                             1
@@ -258,6 +256,7 @@
 #define JVET_AE0094_IBC_NONADJACENT_SPATIAL_CANDIDATES    1 // JVET-AE0094: IBC with non-adjacent spatial candidates
 #define JVET_AG0091_ARBVP                                 1 // JVET-AG0091: Auto-relocated block vector prediction
 #define JVET_AI0082_TEMPORAL_BV                           1 // JVET-AI0081: Temporal BV for IBC merge list construction
+#define JVET_AJ0096_SATD_REORDER_INTRA                    1 // JVET-AJ0096: SATD-based reordering for intra coding
 
 #if JVET_AC0071_DBV && JVET_V0130_INTRA_TMP
 #define JVET_AF0066_ENABLE_DBV_4_SINGLE_TREE              1 // JVET-AF0066: Enable DBV mode in single tree configuration
@@ -363,6 +362,7 @@
 #define JVET_AI0185_ADAPTIVE_COST_IN_MERGE_MODE           1 // JVET-AI0185 adaptive cost function selection in merge mode
 #define JVET_AI0183_MVP_EXTENSION                         1 // JVET-AI0183 MVP extension
 // Inter template matching tools
+#define JVET_AJ0096_SATD_REORDER_INTER                    1 // JVET-AJ0096: SATD-based reordering for inter coding
 #define ENABLE_INTER_TEMPLATE_MATCHING                    1 // It controls whether template matching is enabled for inter prediction
 #if ENABLE_INTER_TEMPLATE_MATCHING
 #define TM_AMVP                                           1 // Add template matching to non-subblock inter to refine regular AMVP candidates
diff --git a/source/Lib/CommonLib/x86/RdCostX86.h b/source/Lib/CommonLib/x86/RdCostX86.h
index 8cac6fc5ca594bb3612633e03ad7f705135a879f..5a877a07ed810536e120974461d856b25f70f7a3 100644
--- a/source/Lib/CommonLib/x86/RdCostX86.h
+++ b/source/Lib/CommonLib/x86/RdCostX86.h
@@ -2102,6 +2102,146 @@ static uint32_t xCalcHAD16x8_AVX2( const Torg *piOrg, const Tcur *piCur, const i
   return (sad);
 }
 
+#if JVET_AJ0096_SATD_REORDER_INTRA || JVET_AJ0096_SATD_REORDER_INTER
+static uint32_t xCalcHADs1x16_SSE(const Torg* piOrg, const Tcur* piCur, int iStrideOrg, int iStrideCur, int iRows, int iCols)
+{ // SSE variant of the 16-sample line cost: a row when iRows == 1, else a column when iCols == 1.
+  __m128i diff[4], m1[4], m2[4]; // each register carries four 32-bit lanes once the differences are widened
+
+  if (iRows == 1) // row: contiguous loads; assumes Torg/Tcur are 16-bit, as _mm_sub_epi16 implies - TODO confirm
+  {
+    __m128i org0 = _mm_loadu_si128((__m128i*)&piOrg[0]);
+    __m128i cur0 = _mm_loadu_si128((__m128i*)&piCur[0]);
+    diff[0] = _mm_sub_epi16(org0, cur0);
+    diff[1] = _mm_cvtepi16_epi32( _mm_srli_si128( diff[0], 8 ) ); // differences 4..7, sign-extended to 32 bits
+    diff[0] = _mm_cvtepi16_epi32( diff[0] ); // differences 0..3, sign-extended to 32 bits
+
+    org0 = _mm_loadu_si128((__m128i*)&piOrg[8]);
+    cur0 = _mm_loadu_si128((__m128i*)&piCur[8]);
+    diff[2] = _mm_sub_epi16(org0, cur0);
+    diff[3] = _mm_cvtepi16_epi32( _mm_srli_si128( diff[2], 8 ) ); // differences 12..15
+    diff[2] = _mm_cvtepi16_epi32( diff[2] ); // differences 8..11
+  }
+  else if (iCols == 1) // column: gather the 16 differences into a temporary, then widen them as in the row case
+  {
+    Pel diffI[16];
+    for (int i = 0; i < 16; i++)
+    {
+      diffI[i] = piOrg[0] - piCur[0]; // result is stored back as Pel
+      piOrg += iStrideOrg;
+      piCur += iStrideCur;
+    }
+    diff[0] = _mm_loadu_si128((__m128i*)&diffI[0]);
+    diff[1] = _mm_cvtepi16_epi32( _mm_srli_si128( diff[0], 8 ) );
+    diff[0] = _mm_cvtepi16_epi32( diff[0] );
+    diff[2] = _mm_loadu_si128((__m128i*)&diffI[8]);
+    diff[3] = _mm_cvtepi16_epi32( _mm_srli_si128( diff[2], 8 ) );
+    diff[2] = _mm_cvtepi16_epi32( diff[2] );
+  }
+  else // NOTE(review): the scalar RdCost::xCalcHADs1x16 uses CHECK on this path; here an error is logged instead
+  {
+    std::cerr << "shall not be here" << std::endl;
+    return -1; // converted to uint32_t by the return type
+  }
+  // Stage 1: sums and differences of samples 8 apart.
+  m2[0] = _mm_add_epi32(diff[0], diff[2]);
+  m2[1] = _mm_add_epi32(diff[1], diff[3]);
+  m2[2] = _mm_sub_epi32(diff[0], diff[2]);
+  m2[3] = _mm_sub_epi32(diff[1], diff[3]);
+  // Stage 2: sums and differences of entries 4 apart.
+  m1[0] = _mm_add_epi32(m2[0], m2[1]);
+  m1[1] = _mm_sub_epi32(m2[0], m2[1]);
+  m1[2] = _mm_add_epi32(m2[2], m2[3]);
+  m1[3] = _mm_sub_epi32(m2[2], m2[3]);
+  // Stage 3: interleave so that lanes 2 apart line up, then add/subtract and take magnitudes; no further stage follows.
+  m2[0] = _mm_unpacklo_epi32(m1[0], m1[1]);
+  m2[1] = _mm_unpackhi_epi32(m1[0], m1[1]);
+  m2[2] = _mm_abs_epi32(_mm_add_epi32(m2[0], m2[1]));
+#if JVET_R0164_MEAN_SCALED_SATD
+  uint32_t absDc = _mm_cvtsi128_si32(m2[2]); // lane 0: magnitude of the sum of the even-indexed differences
+#endif
+  m2[3] = _mm_abs_epi32(_mm_sub_epi32(m2[0], m2[1]));
+  __m128i iSum = _mm_add_epi32(m2[2], m2[3]);
+  m2[0] = _mm_unpacklo_epi32(m1[2], m1[3]);
+  m2[1] = _mm_unpackhi_epi32(m1[2], m1[3]);
+  m2[2] = _mm_abs_epi32(_mm_add_epi32(m2[0], m2[1]));
+  m2[3] = _mm_abs_epi32(_mm_sub_epi32(m2[0], m2[1]));
+  iSum = _mm_add_epi32(iSum, m2[2]);
+  iSum = _mm_add_epi32(iSum, m2[3]);
+  iSum = _mm_add_epi32(iSum, _mm_shuffle_epi32(iSum, 0x4e));   // 01001110: add the swapped 64-bit halves
+  iSum = _mm_add_epi32(iSum, _mm_shuffle_epi32(iSum, 0xb1));   // 10110001: add the swapped adjacent lanes
+  uint32_t sad = _mm_cvtsi128_si32( iSum ); // lane 0 now holds the total over all four lanes
+  // With JVET_R0164_MEAN_SCALED_SATD, the absDc term counts at a quarter of its magnitude.
+#if JVET_R0164_MEAN_SCALED_SATD
+  sad -= absDc;
+  sad += absDc >> 2;
+#endif
+  sad = sad >> 1; // halve the sum
+
+  return sad;
+}
+
+static uint32_t xCalcHADs1x8_SSE(const Torg* piOrg, const Tcur* piCur, int iStrideOrg, int iStrideCur, int iRows, int iCols)
+{ // SSE variant of the 8-sample line cost: a row when iRows == 1, else a column when iCols == 1.
+  __m128i diff[2], m1[2], m2[2]; // each register carries four 32-bit lanes once the differences are widened
+
+  if (iRows == 1) // row: one contiguous load; assumes Torg/Tcur are 16-bit, as _mm_sub_epi16 implies - TODO confirm
+  {
+    __m128i org0 = _mm_loadu_si128((__m128i*)&piOrg[0]);
+    __m128i cur0 = _mm_loadu_si128((__m128i*)&piCur[0]);
+    diff[0] = _mm_sub_epi16(org0, cur0);
+    diff[1] = _mm_cvtepi16_epi32( _mm_srli_si128( diff[0], 8 ) ); // differences 4..7, sign-extended to 32 bits
+    diff[0] = _mm_cvtepi16_epi32( diff[0] ); // differences 0..3, sign-extended to 32 bits
+  }
+  else if (iCols == 1) // column: gather the 8 differences into a temporary, then widen them as in the row case
+  {
+    Pel diffI[8];
+    for (int i = 0; i < 8; i++)
+    {
+      diffI[i] = piOrg[0] - piCur[0]; // result is stored back as Pel
+      piOrg += iStrideOrg;
+      piCur += iStrideCur;
+    }
+    diff[0] = _mm_loadu_si128((__m128i*)&diffI[0]);
+    diff[1] = _mm_cvtepi16_epi32( _mm_srli_si128( diff[0], 8 ) );
+    diff[0] = _mm_cvtepi16_epi32( diff[0] );
+  }
+  else // NOTE(review): the scalar RdCost::xCalcHADs1x8 uses CHECK on this path; here an error is logged instead
+  {
+    std::cerr << "shall not be here" << std::endl;
+    return -1; // converted to uint32_t by the return type
+  }
+  // Stage 1: sums and differences of samples 4 apart.
+  m2[0] = _mm_add_epi32(diff[0], diff[1]);
+  m2[1] = _mm_sub_epi32(diff[0], diff[1]);
+  // Stage 2: interleave so that lanes 2 apart line up, then add/subtract.
+  m1[0] = _mm_unpacklo_epi32(m2[0], m2[1]);
+  m1[1] = _mm_unpackhi_epi32(m2[0], m2[1]);
+  m2[0] = _mm_add_epi32(m1[0], m1[1]);
+  m2[1] = _mm_sub_epi32(m1[0], m1[1]);
+  // Stage 3: interleave again so that adjacent entries line up, then add/subtract and take magnitudes.
+  m1[0] = _mm_unpacklo_epi32(m2[0], m2[1]);
+  m1[1] = _mm_unpackhi_epi32(m2[0], m2[1]);
+  m2[0] = _mm_abs_epi32(_mm_add_epi32(m1[0], m1[1]));
+#if JVET_R0164_MEAN_SCALED_SATD
+  uint32_t absDc = _mm_cvtsi128_si32(m2[0]); // lane 0: magnitude of the sum of all eight differences
+#endif
+  m2[1] = _mm_abs_epi32(_mm_sub_epi32(m1[0], m1[1]));
+
+  __m128i iSum = _mm_add_epi32(m2[0], m2[1]);
+  iSum = _mm_add_epi32(iSum, _mm_shuffle_epi32(iSum, 0x4e));   // 01001110: add the swapped 64-bit halves
+  iSum = _mm_add_epi32(iSum, _mm_shuffle_epi32(iSum, 0xb1));   // 10110001: add the swapped adjacent lanes
+  uint32_t sad = _mm_cvtsi128_si32( iSum ); // lane 0 now holds the total over all four lanes
+  // With JVET_R0164_MEAN_SCALED_SATD, the absDc term counts at a quarter of its magnitude.
+#if JVET_R0164_MEAN_SCALED_SATD
+  sad -= absDc;
+  sad += absDc >> 2;
+#endif
+  sad  = ( int ) ( sad / sqrt( 8.0 * 1 ) * 2 ); // scale by 2 / sqrt(8) in floating point, then truncate
+
+  return sad;
+}
+#endif
+
 
 static uint32_t xCalcHAD8x16_AVX2( const Pel* piOrg, const Pel* piCur, const int iStrideOrg, const int iStrideCur, const int iBitDepth )
 {
@@ -2570,6 +2710,40 @@ Distortion RdCost::xGetHADs_SIMD( const DistParam &rcDtParam )
       piCur += iOffsetCur;
     }
   }
+#if JVET_AJ0096_SATD_REORDER_INTRA || JVET_AJ0096_SATD_REORDER_INTER
+  else if (iRows == 1 && iCols % 16 == 0)
+  {
+    for( x = 0; x < iCols; x += 16 )
+    {
+      uiSum += xCalcHADs1x16_SSE(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iRows, 16);
+    }
+  }
+  else if (iCols == 1 && iRows % 16 == 0)
+  {
+    for( y = 0; y < iRows; y += 16 )
+    {
+      uiSum += xCalcHADs1x16_SSE( &piOrg[0], &piCur[0], iStrideOrg, iStrideCur, 16, iCols );
+      piOrg += (iStrideOrg << 4);
+      piCur += (iStrideCur << 4);
+    }
+  }
+  else if (iRows == 1 && iCols % 8 == 0)
+  {
+    for( x = 0; x < iCols; x += 8 )
+    {
+      uiSum += xCalcHADs1x8_SSE(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iRows, 8);
+    }
+  }
+  else if (iCols == 1 && iRows % 8 == 0)
+  {
+    for( y = 0; y < iRows; y += 8 )
+    {
+      uiSum += xCalcHADs1x8_SSE(&piOrg[0], &piCur[0], iStrideOrg, iStrideCur, 8, iCols);
+      piOrg += (iStrideOrg << 3);
+      piCur += (iStrideCur << 3);
+    }
+  }
+#endif
 #if JVET_AI0185_ADAPTIVE_COST_IN_MERGE_MODE
   else if (iRows == 1 || iCols == 1)
   {