diff --git a/source/App/EncoderApp/EncAppCfg.cpp b/source/App/EncoderApp/EncAppCfg.cpp
index ead92686fb3847607b77b1843b0a9526e57bb8df..d5fff470e0c3445ffd2155c40bb63ef4b479b522 100644
--- a/source/App/EncoderApp/EncAppCfg.cpp
+++ b/source/App/EncoderApp/EncAppCfg.cpp
@@ -1830,14 +1830,22 @@ bool EncAppCfg::parseCfg( int argc, char* argv[] )
     msg( WARNING, "*************************************************************************\n" );
   }
 
+#if ENABLE_QPA_SUB_CTU
+ #if QP_SWITCHING_FOR_PARALLEL
+  if (m_LargeCTU && (m_iQP < 38) && m_bUsePerceptQPA && !m_bUseAdaptiveQP && (m_iSourceWidth <= 2048) && (m_iSourceHeight <= 1280)
+ #else
+  if (m_LargeCTU && ((int)m_fQP < 38) && m_bUsePerceptQPA && !m_bUseAdaptiveQP && (m_iSourceWidth <= 2048) && (m_iSourceHeight <= 1280)
+ #endif
+      && ((1 << (std::max (m_quadtreeTULog2MaxSize, m_tuLog2MaxSize) + 1)) == m_uiCTUSize) && (m_iSourceWidth > 512 || m_iSourceHeight > 320))
+  {
+    m_iMaxCuDQPDepth = 1;
+  }
+#else
  #if QP_SWITCHING_FOR_PARALLEL
   if( m_LargeCTU && ( m_iQP < 38 ) && ( m_iGOPSize > 4 ) && m_bUsePerceptQPA && !m_bUseAdaptiveQP && ( m_iSourceHeight <= 1280 ) && ( m_iSourceWidth <= 2048 ) )
  #else
   if( m_LargeCTU && ( ( int ) m_fQP < 38 ) && ( m_iGOPSize > 4 ) && m_bUsePerceptQPA && !m_bUseAdaptiveQP && ( m_iSourceHeight <= 1280 ) && ( m_iSourceWidth <= 2048 ) )
  #endif
-#else
-  if( false )
-#endif
   {
     msg( WARNING, "*************************************************************************\n" );
     msg( WARNING, "* WARNING: QPA on with large CTU for <=HD sequences, limiting CTU size! *\n" );
@@ -1847,6 +1855,8 @@ bool EncAppCfg::parseCfg( int argc, char* argv[] )
     if( ( 1u << m_quadtreeTULog2MaxSize ) > m_uiCTUSize ) m_quadtreeTULog2MaxSize--;
     if( ( 1u << m_tuLog2MaxSize         ) > m_uiCTUSize ) m_tuLog2MaxSize--;
   }
+#endif
+#endif // ENABLE_QPA
 
   const int minCuSize = 1 << MIN_CU_LOG2;
   m_uiMaxCodingDepth = 0;
diff --git a/source/Lib/CommonLib/Buffer.h b/source/Lib/CommonLib/Buffer.h
index e16ab2d3f4a9b163cbbbe0f2b33bdb1c70ef416d..116f5c69fac4469eb69e6ec8ba755d133060ecdd 100644
--- a/source/Lib/CommonLib/Buffer.h
+++ b/source/Lib/CommonLib/Buffer.h
@@ -118,6 +118,7 @@ struct AreaBuf : public Size
   void removeHighFreq       ( const AreaBuf<T>& other, const bool bClip, const ClpRng& clpRng);
   void updateHistogram      ( std::vector<int32_t>& hist ) const;
 
+  T    mean                 () const;
   T    meanDiff             ( const AreaBuf<const T> &other ) const;
   void subtract             ( const T val );
 
@@ -546,6 +547,27 @@ void AreaBuf<T>::extendBorderPel( unsigned margin )
     ::memcpy( p - ( y + 1 ) * s, p, sizeof( T ) * ( w + ( margin << 1 ) ) );
   }
 }
+
+template<typename T>
+T AreaBuf<T>::mean() const // average of all samples in this buffer, returned as T
+{
+  int64_t  sum = 0; // 64-bit accumulator for the sample sum
+
+  CHECK (area() == 0, "size of area is zero"); // an empty area would make the final division divide by zero
+
+  const T* src = buf;
+  // MEAN_OP adds one sample to sum, MEAN_INC advances src by one stride; both are handed to SIZE_AWARE_PER_EL_OP
+#define MEAN_INC      src += stride
+#define MEAN_OP(ADDR) sum += src[ADDR]
+  // NOTE(review): presumably the macro below visits every sample row by row - confirm against its definition
+  SIZE_AWARE_PER_EL_OP(MEAN_OP, MEAN_INC);
+
+#undef MEAN_INC
+#undef MEAN_OP
+  // add half the area before the integer division (rounds to nearest when sum is non-negative)
+  return T ((sum + (area() >> 1)) / area());
+}
+
 template<typename T>
 T AreaBuf<T>::meanDiff( const AreaBuf<const T> &other ) const
 {
diff --git a/source/Lib/CommonLib/Picture.h b/source/Lib/CommonLib/Picture.h
index 2e90cc5382746f8a7675372eceb892748202fa98..55756cdbed31462ec6fae7c98e966efb9c0b34b1 100644
--- a/source/Lib/CommonLib/Picture.h
+++ b/source/Lib/CommonLib/Picture.h
@@ -297,6 +297,9 @@ public:
 #if ENABLE_QPA
   std::vector<double>     m_uEnerHpCtu;                         ///< CTU-wise L2 or squared L1 norm of high-passed luma input
   std::vector<Pel>        m_iOffsetCtu;                         ///< CTU-wise DC offset (later QP index offset) of luma input
+ #if ENABLE_QPA_SUB_CTU
+  std::vector<int8_t>     m_subCtuQP;                           ///< sub-CTU-wise adapted QPs for delta-QP depth of 1 or more
+ #endif
 #endif
 
   std::vector<SAOBlkParam> m_sao[2];
diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h
index b60bfdd4e99d78688d73f37a552e30744313f8de..1d0452b39117b53d0fda1c2196cd0bb4e042e83a 100644
--- a/source/Lib/CommonLib/TypeDef.h
+++ b/source/Lib/CommonLib/TypeDef.h
@@ -224,7 +224,7 @@
 #define SHARP_LUMA_DELTA_QP                               1 ///< include non-normative LCU deltaQP and normative chromaQP change
 #define ER_CHROMA_QP_WCG_PPS                              1 ///< Chroma QP model for WCG used in Anchor 3.2
 #define ENABLE_QPA                                        1 ///< Non-normative perceptual QP adaptation according to JVET-H0047 and JVET-K0206. Deactivated by default, activated using encoder arguments --PerceptQPA=1 --SliceChromaQPOffsetPeriodicity=1
-
+#define ENABLE_QPA_SUB_CTU                              ( 1 && ENABLE_QPA ) ///< when maximum delta-QP depth is greater than zero, use sub-CTU QPA
 
 
 #define RDOQ_CHROMA                                       1 ///< use of RDOQ in chroma
diff --git a/source/Lib/EncoderLib/EncCu.cpp b/source/Lib/EncoderLib/EncCu.cpp
index d34cdde6d78d357ad6ff06f4bf98053a5433ebe4..f74add71740e260dd27ea94d3353de02ddd5e820 100644
--- a/source/Lib/EncoderLib/EncCu.cpp
+++ b/source/Lib/EncoderLib/EncCu.cpp
@@ -363,7 +363,7 @@ void EncCu::compressCtu( CodingStructure& cs, const UnitArea& area, const unsign
   cs.useSubStructure( *bestCS, partitioner->chType, CS::getArea( *bestCS, area, partitioner->chType ), copyUnsplitCTUSignals, false, false, copyUnsplitCTUSignals );
   cs.slice->copyMotionLUTs(bestMotCandLUTs, cs.slice->getMotionLUTs());
 
-  if( !cs.pcv->ISingleTree && cs.slice->isIRAP() && cs.pcv->chrFormat != CHROMA_400 )
+  if (CS::isDualITree (cs) && isChromaEnabled (cs.pcv->chrFormat))
   {
     m_CABACEstimator->getCtx() = m_CurrCtx->start;
 
@@ -639,7 +639,7 @@ void EncCu::xCompressCU( CodingStructure *&tempCS, CodingStructure *&bestCS, Par
   {
     EncTestMode currTestMode = m_modeCtrl->currTestMode();
 
-    if (tempCS->pps->getUseDQP() && CS::isDualITree(*tempCS) && isChroma(partitioner.chType))
+    if (pps.getUseDQP() && CS::isDualITree(*tempCS) && isChroma(partitioner.chType))
     {
       const Position chromaCentral(tempCS->area.Cb().chromaPos().offset(tempCS->area.Cb().chromaSize().width >> 1, tempCS->area.Cb().chromaSize().height >> 1));
       const Position lumaRefPos(chromaCentral.x << getComponentScaleX(COMPONENT_Cb, tempCS->area.chromaFormat), chromaCentral.y << getComponentScaleY(COMPONENT_Cb, tempCS->area.chromaFormat));
@@ -653,15 +653,24 @@ void EncCu::xCompressCU( CodingStructure *&tempCS, CodingStructure *&bestCS, Par
       }
     }
 
+#if SHARP_LUMA_DELTA_QP || ENABLE_QPA_SUB_CTU
+    if (partitioner.currDepth <= pps.getMaxCuDQPDepth() && (
 #if SHARP_LUMA_DELTA_QP
-    if( m_pcEncCfg->getLumaLevelToDeltaQPMapping().isEnabled() && partitioner.currDepth <= pps.getMaxCuDQPDepth() )
+        (m_pcEncCfg->getLumaLevelToDeltaQPMapping().isEnabled()) ||
+#endif
+#if ENABLE_QPA_SUB_CTU
+        (m_pcEncCfg->getUsePerceptQPA() && !m_pcEncCfg->getUseRateCtrl() && pps.getUseDQP())
+#else
+        false
+#endif
+      ))
     {
 #if ENABLE_SPLIT_PARALLELISM
       CHECK( tempCS->picture->scheduler.getSplitJobId() > 0, "Changing lambda is only allowed in the master thread!" );
 #endif
       if (currTestMode.qp >= 0)
       {
-        updateLambda(&slice, currTestMode.qp);
+        updateLambda (&slice, currTestMode.qp, CS::isDualITree (*tempCS) || (partitioner.currDepth == 0));
       }
     }
 #endif
@@ -773,8 +782,8 @@ void EncCu::xCompressCU( CodingStructure *&tempCS, CodingStructure *&bestCS, Par
   CHECK( bestCS->cost             == MAX_DOUBLE                , "No possible encoding found" );
 }
 
-#if SHARP_LUMA_DELTA_QP
-void EncCu::updateLambda( Slice* slice, double dQP )
+#if SHARP_LUMA_DELTA_QP || ENABLE_QPA_SUB_CTU
+void EncCu::updateLambda (Slice* slice, const int dQP, const bool updateRdCostLambda)
 {
 #if WCG_EXT
   int    NumberBFrames = ( m_pcEncCfg->getGOPSize() - 1 );
@@ -838,14 +847,19 @@ void EncCu::updateLambda( Slice* slice, double dQP )
   dLambda *= lambdaModifier;
 
   int qpBDoffset = slice->getSPS()->getQpBDOffset(CHANNEL_TYPE_LUMA);
-  int iQP = Clip3(-qpBDoffset, MAX_QP, (int)floor(dQP + 0.5));
+  int iQP = Clip3(-qpBDoffset, MAX_QP, (int)floor((double)dQP + 0.5));
   m_pcSliceEncoder->setUpLambda(slice, dLambda, iQP);
 
 #else
-  int iQP = (int)dQP;
+  int iQP = dQP;
   const double oldQP     = (double)slice->getSliceQpBase();
+#if ENABLE_QPA_SUB_CTU
+  const double oldLambda = (m_pcEncCfg->getUsePerceptQPA() && !m_pcEncCfg->getUseRateCtrl() && slice->getPPS()->getUseDQP()) ? slice->getLambdas()[0] :
+                           m_pcSliceEncoder->calculateLambda (slice, m_pcSliceEncoder->getGopId(), slice->getDepth(), oldQP, oldQP, iQP);
+#else
   const double oldLambda = m_pcSliceEncoder->calculateLambda (slice, m_pcSliceEncoder->getGopId(), slice->getDepth(), oldQP, oldQP, iQP);
-  const double newLambda = oldLambda * pow (2.0, (dQP - oldQP) / 3.0);
+#endif
+  const double newLambda = oldLambda * pow (2.0, ((double)dQP - oldQP) / 3.0);
 #if RDOQ_CHROMA_LAMBDA
   const double chromaLambda = newLambda / m_pcRdCost->getChromaWeight();
   const double lambdaArray[MAX_NUM_COMPONENT] = {newLambda, chromaLambda, chromaLambda};
@@ -853,7 +867,7 @@ void EncCu::updateLambda( Slice* slice, double dQP )
 #else
   m_pcTrQuant->setLambda (newLambda);
 #endif
-  m_pcRdCost->setLambda( newLambda, slice->getSPS()->getBitDepths() );
+  if (updateRdCostLambda) m_pcRdCost->setLambda (newLambda, slice->getSPS()->getBitDepths());
 #endif
 }
 #endif
@@ -1051,7 +1065,12 @@ void EncCu::xCheckModeSplit(CodingStructure *&tempCS, CodingStructure *&bestCS,
   m_CABACEstimator->getCtx() = SubCtx( Ctx::SplitFlag,   ctxStartSP );
   m_CABACEstimator->getCtx() = SubCtx( Ctx::BTSplitFlag, ctxStartBT );
 
-  if( cost > bestCS->cost )
+  if( cost > bestCS->cost
+#if ENABLE_QPA_SUB_CTU
+    || (m_pcEncCfg->getUsePerceptQPA() && !m_pcEncCfg->getUseRateCtrl() && pps.getUseDQP() && (pps.getMaxCuDQPDepth() > 0) && (split == CU_HORZ_SPLIT || split == CU_VERT_SPLIT) &&
+        (partitioner.currArea().lwidth() == tempCS->pcv->maxCUWidth) && (partitioner.currArea().lheight() == tempCS->pcv->maxCUHeight)) // force quad-split or no split at CTU level
+#endif
+    )
   {
     xCheckBestMode( tempCS, bestCS, partitioner, encTestMode );
     return;
@@ -2833,10 +2852,15 @@ void EncCu::xCheckRDCostCPRModeMerge2Nx2N(CodingStructure *&tempCS, CodingStruct
             m_pcInterSearch->encodeResAndCalcRdInterCU(*tempCS, partitioner, (numResidualPass != 0), true, chroma);
             xEncodeDontSplit(*tempCS, partitioner);
 
+#if ENABLE_QPA_SUB_CTU
+            xCheckDQP (*tempCS, partitioner);
+#else
+            // this if-check is redundant
             if (tempCS->pps->getUseDQP() && (partitioner.currDepth) <= tempCS->pps->getMaxCuDQPDepth())
             {
               xCheckDQP(*tempCS, partitioner);
             }
+#endif
 
             hasResidual[emtCuFlag] = cu.rootCbf;
             emtCost[emtCuFlag] = tempCS->cost;
@@ -2934,10 +2958,15 @@ void EncCu::xCheckRDCostCPRMode(CodingStructure *&tempCS, CodingStructure *&best
           }
           xEncodeDontSplit(*tempCS, partitioner);
 
+#if ENABLE_QPA_SUB_CTU
+          xCheckDQP (*tempCS, partitioner);
+#else
+          // this if-check is redundant
           if (tempCS->pps->getUseDQP() && (partitioner.currDepth) <= tempCS->pps->getMaxCuDQPDepth())
           {
             xCheckDQP(*tempCS, partitioner);
           }
+#endif
 
           DTRACE_MODE_COST(*tempCS, m_pcRdCost->getLambda());
           xCheckBestMode(tempCS, bestCS, partitioner, encTestMode);
diff --git a/source/Lib/EncoderLib/EncCu.h b/source/Lib/EncoderLib/EncCu.h
index 4adbc26dce805e3710a1d1673de0025bd1f97415..22e50e7a8aaa90999ca4bfb34be4e85c34369aaa 100644
--- a/source/Lib/EncoderLib/EncCu.h
+++ b/source/Lib/EncoderLib/EncCu.h
@@ -127,8 +127,8 @@ private:
 #endif
   int                   m_bestGbiIdx[2];
   double                m_bestGbiCost[2];
-#if SHARP_LUMA_DELTA_QP
-  void    updateLambda      ( Slice* slice, double dQP );
+#if SHARP_LUMA_DELTA_QP || ENABLE_QPA_SUB_CTU
+  void    updateLambda      ( Slice* slice, const int dQP, const bool updateRdCostLambda );
 #endif
 
 public:
diff --git a/source/Lib/EncoderLib/EncGOP.cpp b/source/Lib/EncoderLib/EncGOP.cpp
index 7377a0f2dd3c785ffcf78c8b7c153d4fe85f6d87..56da75e88d36164dc175e84f936577ccd3bdad2b 100644
--- a/source/Lib/EncoderLib/EncGOP.cpp
+++ b/source/Lib/EncoderLib/EncGOP.cpp
@@ -1980,8 +1980,16 @@ void EncGOP::compressGOP( int iPOCLast, int iNumPicRcvd, PicList& rcListPic,
     std::vector<OutputBitstream> substreamsOut(numSubstreams);
 
 #if ENABLE_QPA
-    pcPic->m_uEnerHpCtu.resize( numberOfCtusInFrame );
-    pcPic->m_iOffsetCtu.resize( numberOfCtusInFrame );
+    pcPic->m_uEnerHpCtu.resize (numberOfCtusInFrame);
+    pcPic->m_iOffsetCtu.resize (numberOfCtusInFrame);
+ #if ENABLE_QPA_SUB_CTU
+    if (pcSlice->getPPS()->getUseDQP() && pcSlice->getPPS()->getMaxCuDQPDepth() > 0)
+    {
+      const PreCalcValues &pcv = *pcPic->cs->pcv;
+      const unsigned   mtsLog2 = (unsigned)g_aucLog2[std::min (pcPic->cs->sps->getMaxTrSize(), pcv.maxCUWidth)];
+      pcPic->m_subCtuQP.resize ((pcv.maxCUWidth >> mtsLog2) * (pcv.maxCUHeight >> mtsLog2));
+    }
+ #endif
 #endif
     if (pcSlice->getSPS()->getUseSAO())
     {
diff --git a/source/Lib/EncoderLib/EncModeCtrl.cpp b/source/Lib/EncoderLib/EncModeCtrl.cpp
index eeee01f51e3c8c2262e37cb09b968b09afba6ebc..f5032831e5bdf4c74c8e47dabfe943fe56c5e7e4 100644
--- a/source/Lib/EncoderLib/EncModeCtrl.cpp
+++ b/source/Lib/EncoderLib/EncModeCtrl.cpp
@@ -151,19 +151,18 @@ void EncModeCtrl::xGetMinMaxQP( int& minQP, int& maxQP, const CodingStructure& c
       minQP = Clip3( -sps.getQpBDOffset( CHANNEL_TYPE_LUMA ), MAX_QP, baseQP - deltaQP );
       maxQP = Clip3( -sps.getQpBDOffset( CHANNEL_TYPE_LUMA ), MAX_QP, baseQP + deltaQP );
     }
+#if ENABLE_QPA_SUB_CTU
+    else if (pps.getUseDQP() && pps.getMaxCuDQPDepth() > 0 && (!CS::isDualITree (cs) || isLuma (partitioner.chType)))
+    {
+      minQP = baseQP;
+      maxQP = baseQP;
+    }
+#endif
     else
     {
       minQP = cs.currQP[partitioner.chType];
       maxQP = cs.currQP[partitioner.chType];
     }
-
-#if SHARP_LUMA_DELTA_QP
-    if( m_pcEncCfg->getLumaLevelToDeltaQPMapping().isEnabled() )
-    {
-      minQP = Clip3( -sps.getQpBDOffset( CHANNEL_TYPE_LUMA ), MAX_QP, baseQP - m_lumaQPOffset );
-      maxQP = minQP; // force encode choose the modified QO
-    }
-#endif
   }
   else
   {
@@ -173,7 +172,11 @@ void EncModeCtrl::xGetMinMaxQP( int& minQP, int& maxQP, const CodingStructure& c
       minQP = Clip3( -sps.getQpBDOffset( CHANNEL_TYPE_LUMA ), MAX_QP, baseQP - deltaQP );
       maxQP = Clip3( -sps.getQpBDOffset( CHANNEL_TYPE_LUMA ), MAX_QP, baseQP + deltaQP );
     }
-    else if( currDepth < pps.getMaxCuDQPDepth() )
+    else if (currDepth < pps.getMaxCuDQPDepth()
+#if ENABLE_QPA_SUB_CTU
+         || (pps.getUseDQP() && pps.getMaxCuDQPDepth() > 0 && (!CS::isDualITree (cs) || isLuma (partitioner.chType)))
+#endif
+            )
     {
       minQP = baseQP;
       maxQP = baseQP;
@@ -183,15 +186,15 @@ void EncModeCtrl::xGetMinMaxQP( int& minQP, int& maxQP, const CodingStructure& c
       minQP = cs.currQP[partitioner.chType];
       maxQP = cs.currQP[partitioner.chType];
     }
-
+  }
 #if SHARP_LUMA_DELTA_QP
-    if( m_pcEncCfg->getLumaLevelToDeltaQPMapping().isEnabled() )
-    {
-      minQP = Clip3( -sps.getQpBDOffset( CHANNEL_TYPE_LUMA ), MAX_QP, baseQP - m_lumaQPOffset );
-      maxQP = minQP;
-    }
-#endif
+
+  if (m_pcEncCfg->getLumaLevelToDeltaQPMapping().isEnabled() && (!CS::isDualITree (cs) || isLuma (partitioner.chType)))
+  {
+    minQP = Clip3 (-sps.getQpBDOffset (CHANNEL_TYPE_LUMA), MAX_QP, baseQP - m_lumaQPOffset);
+    maxQP = minQP;
   }
+#endif
 }
 
 
@@ -247,16 +250,8 @@ int EncModeCtrl::calculateLumaDQP( const CPelBuf& rcOrg )
   CHECK( m_pcEncCfg->getLumaLevelToDeltaQPMapping().mode != LUMALVL_TO_DQP_AVG_METHOD, "invalid delta qp mode" );
 #endif
   {
-    // Use avg method
-    int sum = 0;
-    for( uint32_t y = 0; y < rcOrg.height; y++ )
-    {
-      for( uint32_t x = 0; x < rcOrg.width; x++ )
-      {
-        sum += rcOrg.at( x, y );
-      }
-    }
-    avg = ( double ) sum / rcOrg.area();
+    // Use average luma value
+    avg = (double) rcOrg.mean();
   }
 #if !WCG_EXT
   else
@@ -870,6 +865,18 @@ void EncModeCtrlMTnoRQT::initCTUEncoding( const Slice &slice )
   }
 }
 
+#if ENABLE_QPA_SUB_CTU
+static Position getMaxLumaDQPDepthPos (const CodingStructure &cs, const Partitioner &partitioner)
+{
+  // deeper than the maximum delta-QP depth: use the part selected at that level of the partitioning stack
+  if (partitioner.currDepth > cs.pps->getMaxCuDQPDepth())
+  {
+    const PartLevel dqpLevel = partitioner.getPartStack().at (cs.pps->getMaxCuDQPDepth());
+    return dqpLevel.parts[dqpLevel.idx].lumaPos();
+  }
+  return partitioner.currArea().lumaPos(); // otherwise the current area itself provides the position
+}
+#endif
 
 void EncModeCtrlMTnoRQT::initCULevel( Partitioner &partitioner, const CodingStructure& cs )
 {
@@ -927,28 +934,38 @@ void EncModeCtrlMTnoRQT::initCULevel( Partitioner &partitioner, const CodingStru
 
   // QP
   int baseQP = cs.baseQP;
-  if( m_pcEncCfg->getUseAdaptiveQP() )
+  if (!CS::isDualITree (cs) || isLuma (partitioner.chType))
   {
-    if (!CS::isDualITree(cs) || isLuma(partitioner.chType))
+    if (m_pcEncCfg->getUseAdaptiveQP())
     {
-      baseQP = Clip3(-cs.sps->getQpBDOffset(CHANNEL_TYPE_LUMA), MAX_QP, baseQP + xComputeDQP(cs, partitioner));
+      baseQP = Clip3 (-cs.sps->getQpBDOffset (CHANNEL_TYPE_LUMA), MAX_QP, baseQP + xComputeDQP (cs, partitioner));
     }
-  }
+#if ENABLE_QPA_SUB_CTU
+    else if (m_pcEncCfg->getUsePerceptQPA() && !m_pcEncCfg->getUseRateCtrl() && cs.pps->getUseDQP() && cs.pps->getMaxCuDQPDepth() > 0)
+    {
+      const PreCalcValues &pcv = *cs.pcv;
 
-  int minQP = baseQP;
-  int maxQP = baseQP;
+      if ((partitioner.currArea().lwidth() < pcv.maxCUWidth) && (partitioner.currArea().lheight() < pcv.maxCUHeight) && cs.picture)
+      {
+        const Position    &pos = getMaxLumaDQPDepthPos (cs, partitioner);
+        const unsigned mtsLog2 = (unsigned)g_aucLog2[std::min (cs.sps->getMaxTrSize(), pcv.maxCUWidth)];
+        const unsigned  stride = pcv.maxCUWidth >> mtsLog2;
 
+        baseQP = cs.picture->m_subCtuQP[((pos.x & pcv.maxCUWidthMask) >> mtsLog2) + stride * ((pos.y & pcv.maxCUHeightMask) >> mtsLog2)];
+      }
+    }
+#endif
 #if SHARP_LUMA_DELTA_QP
-  if( m_pcEncCfg->getLumaLevelToDeltaQPMapping().isEnabled() )
-  {
-    if( partitioner.currDepth <= cs.pps->getMaxCuDQPDepth() )
+    if (m_pcEncCfg->getLumaLevelToDeltaQPMapping().isEnabled() && partitioner.currDepth <= cs.pps->getMaxCuDQPDepth())
     {
       CompArea clipedArea = clipArea( cs.area.Y(), cs.picture->Y() );
       // keep using the same m_QP_LUMA_OFFSET in the same CTU
       m_lumaQPOffset = calculateLumaDQP( cs.getOrgBuf( clipedArea ) );
     }
-  }
 #endif
+  }
+  int minQP = baseQP;
+  int maxQP = baseQP;
 
   xGetMinMaxQP( minQP, maxQP, cs, partitioner, baseQP, *cs.sps, *cs.pps, true );
   bool checkCpr = true;
diff --git a/source/Lib/EncoderLib/EncSlice.cpp b/source/Lib/EncoderLib/EncSlice.cpp
index acdebc0583c458926d47e4c70da4314e93864085..1fa2f008aaaf0a19e12e575e3be2ea5909869a9e 100644
--- a/source/Lib/EncoderLib/EncSlice.cpp
+++ b/source/Lib/EncoderLib/EncSlice.cpp
@@ -151,6 +151,11 @@ static inline int apprI3Log2 (const double d) // rounded 3*log2(d)
   return d < 1.5e-13 ? -128 : int (floor (3.0 * log (d) / log (2.0) + 0.5));
 }
 
+static inline int lumaDQPOffset (const uint32_t avgLumaValue, const int bitDepth) // 1 - ((3 * avgLumaValue^2) >> (2 * bitDepth - 1))
+{
+  return (1 - int ((3 * uint64_t (avgLumaValue) * uint64_t (avgLumaValue)) >> uint64_t (2 * bitDepth - 1))); // widen before squaring so the product cannot wrap in 32 bits
+}
+
 static void filterAndCalculateAverageEnergies (const Pel* pSrc, const int  iSrcStride,
                                                double &hpEner,  const int  iHeight,    const int iWidth,
                                                const uint32_t uBitDepth /* luma bit-depth (4-16) */)
@@ -203,52 +208,111 @@ static double getAveragePictureEnergy (const CPelBuf picOrig, const uint32_t uBi
 }
 #endif
 
+// QP offset from the gap between mean luma and the smaller chroma mean: apprI3Log2 of the squared gap-to-mid-level ratio if the gap exceeds mid level, else 0; avgLumaValue receives the mean luma used
+static int getGlaringColorQPOffset (Picture* const pcPic, const int ctuAddr, const uint32_t startAddr, const uint32_t boundingAddr, const int bitDepth, uint32_t &avgLumaValue)
+{
+  const PreCalcValues& pcv  = *pcPic->cs->pcv;
+  const ChromaFormat chrFmt = pcPic->chromaFormat;
+  const uint32_t chrWidth   = pcv.maxCUWidth  >> getChannelTypeScaleX (CH_C, chrFmt);
+  const uint32_t chrHeight  = pcv.maxCUHeight >> getChannelTypeScaleY (CH_C, chrFmt);
+  const int      midLevel   = 1 << (bitDepth - 1);
+  int chrValue = MAX_INT; // running minimum of the chroma mean values
+
+  if (ctuAddr >= 0) // luma: value stored for this CTU
+  {
+    avgLumaValue = (uint32_t)pcPic->m_iOffsetCtu[ctuAddr];
+  }
+  else if (startAddr < boundingAddr) // luma: rounded mean of the stored CTU values in [startAddr, boundingAddr)
+  {
+    avgLumaValue = 0;
+    for (uint32_t ctuTsAddr = startAddr; ctuTsAddr < boundingAddr; ctuTsAddr++)
+    {
+#if HEVC_TILES_WPP
+      const uint32_t ctuRsAddr = pcPic->tileMap->getCtuTsToRsAddrMap (ctuTsAddr);
+#else
+      const uint32_t ctuRsAddr = ctuTsAddr;
+#endif
+      avgLumaValue += pcPic->m_iOffsetCtu[ctuRsAddr];
+    }
+    avgLumaValue = (avgLumaValue + ((boundingAddr - startAddr) >> 1)) / (boundingAddr - startAddr);
+  }
+  else avgLumaValue = (uint32_t)pcPic->getOrigBuf().Y().mean(); // luma: mean of the whole picture
+
+  for (uint32_t comp = COMPONENT_Cb; comp < MAX_NUM_COMPONENT; comp++)
+  {
+    const ComponentID compID = (ComponentID)comp;
+    int avgCompValue;
+
+    if (ctuAddr >= 0) // chroma
+    {
+      const CompArea chrArea = clipArea (CompArea (compID, chrFmt, Area ((ctuAddr % pcv.widthInCtus) * chrWidth, (ctuAddr / pcv.widthInCtus) * chrHeight, chrWidth, chrHeight)), pcPic->block (compID));
+
+      avgCompValue = pcPic->getOrigBuf (chrArea).mean();
+    }
+    else avgCompValue = pcPic->getOrigBuf (pcPic->block (compID)).mean();
+
+    if (chrValue > avgCompValue) chrValue = avgCompValue; // minimum of the DC offsets
+  }
+  CHECK (chrValue < 0, "DC offset cannot be negative!");
+
+  chrValue = (int)avgLumaValue - chrValue;
+
+  // square in double precision: chrValue * chrValue as int can exceed INT_MAX at high bit depths
+  if (chrValue > midLevel) return apprI3Log2 ((double (chrValue) * double (chrValue)) / (double (midLevel) * double (midLevel)));
+  return 0;
+}
+
 static int applyQPAdaptationChroma (Picture* const pcPic, Slice* const pcSlice, EncCfg* const pcEncCfg, const int sliceQP)
 {
+  const int iBitDepth              = pcSlice->getSPS()->getBitDepth (CHANNEL_TYPE_LUMA); // overall image bit-depth
   double hpEner[MAX_NUM_COMPONENT] = {0.0, 0.0, 0.0};
   int    optSliceChromaQpOffset[2] = {0, 0};
   int    savedLumaQP               = -1;
+  uint32_t meanLuma                = MAX_UINT;
 
   for (uint32_t comp = 0; comp < getNumberValidComponents (pcPic->chromaFormat); comp++)
   {
     const ComponentID compID = (ComponentID)comp;
     const CPelBuf    picOrig = pcPic->getOrigBuf (pcPic->block (compID));
 
-    filterAndCalculateAverageEnergies (picOrig.buf, picOrig.stride, hpEner[comp], picOrig.height, picOrig.width,
-                                       pcSlice->getSPS()->getBitDepth (toChannelType (compID)) - (isChroma (compID) ? 1 : 0));
+    filterAndCalculateAverageEnergies (picOrig.buf,    picOrig.stride, hpEner[comp],
+                                       picOrig.height, picOrig.width,  iBitDepth - (isChroma (compID) ? 1 : 0));
     if (isChroma (compID))
     {
       const int  adaptChromaQPOffset = 2.0 * hpEner[comp] <= hpEner[0] ? 0 : apprI3Log2 (2.0 * hpEner[comp] / hpEner[0]);
-   #if GLOBAL_AVERAGING
-      int       averageAdaptedLumaQP = Clip3 (0, MAX_QP, sliceQP + apprI3Log2 (hpEner[0] / getAveragePictureEnergy (pcPic->getOrigBuf().Y(), pcSlice->getSPS()->getBitDepth (CH_L))));
-   #else
-      int       averageAdaptedLumaQP = Clip3 (0, MAX_QP, sliceQP); // mean slice QP
-   #endif
-   #if SHARP_LUMA_DELTA_QP
-
-      // change mean picture QP index based on picture's average luma value (Sharp)
-      if (pcEncCfg->getLumaLevelToDeltaQPMapping().mode == LUMALVL_TO_DQP_NUM_MODES)
+
+      if (savedLumaQP < 0)
       {
-        const CPelBuf picLuma = pcPic->getOrigBuf().Y();
-        uint64_t uAvgLuma = 0;
+#if GLOBAL_AVERAGING
+        int     averageAdaptedLumaQP = Clip3 (0, MAX_QP, sliceQP + apprI3Log2 (hpEner[0] / getAveragePictureEnergy (pcPic->getOrigBuf().Y(), iBitDepth)));
+#else
+        int     averageAdaptedLumaQP = Clip3 (0, MAX_QP, sliceQP); // mean slice QP
+#endif
 
-        for (SizeType y = 0; y < picLuma.height; y++)
+        averageAdaptedLumaQP += getGlaringColorQPOffset (pcPic, -1 /*ctuRsAddr*/, 0 /*startAddr*/, 0 /*boundingAddr*/, iBitDepth, meanLuma);
+
+        if (averageAdaptedLumaQP > MAX_QP
+#if SHARP_LUMA_DELTA_QP
+            && (pcEncCfg->getLumaLevelToDeltaQPMapping().mode != LUMALVL_TO_DQP_NUM_MODES)
+#endif
+            ) averageAdaptedLumaQP = MAX_QP;
+#if SHARP_LUMA_DELTA_QP
+
+        // change mean picture QP index based on picture's average luma value (Sharp)
+        if (pcEncCfg->getLumaLevelToDeltaQPMapping().mode == LUMALVL_TO_DQP_NUM_MODES)
         {
-          for (SizeType x = 0; x < picLuma.width; x++)
-          {
-            uAvgLuma += (uint64_t)picLuma.at (x, y);
-          }
+          if (meanLuma == MAX_UINT) meanLuma = pcPic->getOrigBuf().Y().mean();
+
+          averageAdaptedLumaQP = Clip3 (0, MAX_QP, averageAdaptedLumaQP + lumaDQPOffset (meanLuma, iBitDepth));
         }
-        uAvgLuma = (uAvgLuma + (picLuma.area() >> 1)) / picLuma.area();
+#endif
 
-        averageAdaptedLumaQP = Clip3 (0, MAX_QP, averageAdaptedLumaQP + 1 - int((3 * uAvgLuma * uAvgLuma) >> uint64_t (2 * pcSlice->getSPS()->getBitDepth (CH_L) - 1)));
-      }
-   #endif
-      const int lumaChromaMappingDQP = averageAdaptedLumaQP - getScaledChromaQP (averageAdaptedLumaQP, pcEncCfg->getChromaFormatIdc());
+        savedLumaQP = averageAdaptedLumaQP;
+      } // savedLumaQP < 0
 
-      optSliceChromaQpOffset[comp-1] = std::min (3 + lumaChromaMappingDQP, adaptChromaQPOffset + lumaChromaMappingDQP);
+      const int lumaChromaMappingDQP = savedLumaQP - getScaledChromaQP (savedLumaQP, pcEncCfg->getChromaFormatIdc());
 
-      if (savedLumaQP < 0) savedLumaQP = averageAdaptedLumaQP; // save it for later
+      optSliceChromaQpOffset[comp-1] = std::min (3 + lumaChromaMappingDQP, adaptChromaQPOffset + lumaChromaMappingDQP);
     }
   }
 
@@ -794,9 +858,9 @@ void EncSlice::resetQP( Picture* pic, int sliceQP, double lambda )
 }
 
 #if ENABLE_QPA
-static bool applyQPAdaptation (Picture* const pcPic,     Slice* const pcSlice,        const PreCalcValues& pcv,
-                               const uint32_t startAddr, const uint32_t boundingAddr, const bool useSharpLumaDQP,
-                               const double hpEnerAvg,   const double hpEnerMax,      const bool useFrameWiseQPA, const int previouslyAdaptedLumaQP = -1)
+static bool applyQPAdaptation (Picture* const pcPic,       Slice* const pcSlice,        const PreCalcValues& pcv,
+                               const uint32_t startAddr,   const uint32_t boundingAddr, const bool useSharpLumaDQP,
+                               const bool useFrameWiseQPA, const int previouslyAdaptedLumaQP = -1)
 {
   const int  iBitDepth   = pcSlice->getSPS()->getBitDepth (CHANNEL_TYPE_LUMA);
   const int  iQPIndex    = pcSlice->getSliceQp(); // initial QP index for current slice, used in following loops
@@ -804,6 +868,35 @@ static bool applyQPAdaptation (Picture* const pcPic,     Slice* const pcSlice,
   const TileMap& tileMap = *pcPic->tileMap;
 #endif
   bool   sliceQPModified = false;
+  uint32_t   meanLuma    = MAX_UINT;
+  double     hpEnerAvg   = 0.0;
+
+#if GLOBAL_AVERAGING
+  if (!useFrameWiseQPA || previouslyAdaptedLumaQP < 0)  // mean visual activity value and luma value in each CTU
+#endif
+  {
+    for (uint32_t ctuTsAddr = startAddr; ctuTsAddr < boundingAddr; ctuTsAddr++)
+    {
+#if HEVC_TILES_WPP
+      const uint32_t ctuRsAddr  = tileMap.getCtuTsToRsAddrMap (ctuTsAddr);
+#else
+      const uint32_t ctuRsAddr  = ctuTsAddr;
+#endif
+      const Position pos ((ctuRsAddr % pcv.widthInCtus) * pcv.maxCUWidth, (ctuRsAddr / pcv.widthInCtus) * pcv.maxCUHeight);
+      const CompArea ctuArea    = clipArea (CompArea (COMPONENT_Y, pcPic->chromaFormat, Area (pos.x, pos.y, pcv.maxCUWidth, pcv.maxCUHeight)), pcPic->Y());
+      const CompArea fltArea    = clipArea (CompArea (COMPONENT_Y, pcPic->chromaFormat, Area (pos.x > 0 ? pos.x - 1 : 0, pos.y > 0 ? pos.y - 1 : 0, pcv.maxCUWidth + (pos.x > 0 ? 2 : 1), pcv.maxCUHeight + (pos.y > 0 ? 2 : 1))), pcPic->Y());
+      const CPelBuf  picOrig    = pcPic->getOrigBuf (fltArea);
+      double hpEner = 0.0;
+
+      filterAndCalculateAverageEnergies (picOrig.buf,    picOrig.stride, hpEner,
+                                         picOrig.height, picOrig.width,  iBitDepth);
+      hpEnerAvg += hpEner;
+      pcPic->m_uEnerHpCtu[ctuRsAddr] = hpEner;
+      pcPic->m_iOffsetCtu[ctuRsAddr] = pcPic->getOrigBuf (ctuArea).mean();
+    }
+
+    hpEnerAvg /= double (boundingAddr - startAddr);
+  }
 #if GLOBAL_AVERAGING
   const double hpEnerPic = 1.0 / getAveragePictureEnergy (pcPic->getOrigBuf().Y(), iBitDepth); // inverse, speed
 #else
@@ -812,36 +905,40 @@ static bool applyQPAdaptation (Picture* const pcPic,     Slice* const pcSlice,
 
   if (useFrameWiseQPA || (iQPIndex >= MAX_QP))
   {
-    int iQPFixed;
+    int iQPFixed = (previouslyAdaptedLumaQP < 0) ? Clip3 (0, MAX_QP, iQPIndex + apprI3Log2 (hpEnerAvg * hpEnerPic)) : previouslyAdaptedLumaQP;
 
-    if (useFrameWiseQPA)
+    if (isChromaEnabled (pcPic->chromaFormat) && (iQPIndex < MAX_QP) && (previouslyAdaptedLumaQP < 0))
     {
-      iQPFixed = (previouslyAdaptedLumaQP < 0) ? Clip3 (0, MAX_QP, iQPIndex + apprI3Log2 (hpEnerAvg * hpEnerPic)) : previouslyAdaptedLumaQP; // average-activity slice QP
-    }
-    else
-    {
-      iQPFixed = Clip3 (0, MAX_QP, iQPIndex + ((apprI3Log2 (hpEnerAvg * hpEnerPic) + apprI3Log2 (hpEnerMax * hpEnerPic) + 1) >> 1)); // adapted slice QP = (mean(QP) + max(QP)) / 2
+      iQPFixed += getGlaringColorQPOffset (pcPic, -1 /*ctuRsAddr*/, startAddr, boundingAddr, iBitDepth, meanLuma);
+
+      if (iQPFixed > MAX_QP
+#if SHARP_LUMA_DELTA_QP
+          && !useSharpLumaDQP
+#endif
+          ) iQPFixed = MAX_QP;
     }
 #if SHARP_LUMA_DELTA_QP
 
     // change new fixed QP based on average CTU luma value (Sharp)
     if (useSharpLumaDQP && (iQPIndex < MAX_QP) && (previouslyAdaptedLumaQP < 0))
     {
-      uint64_t uAvgLuma = 0;
-
-      for (uint32_t ctuTsAddr = startAddr; ctuTsAddr < boundingAddr; ctuTsAddr++)
+      if (meanLuma == MAX_UINT) // collect picture mean luma value
       {
-#if HEVC_TILES_WPP
-        const uint32_t ctuRsAddr = tileMap.getCtuTsToRsAddrMap (ctuTsAddr);
-#else
-        const uint32_t ctuRsAddr = ctuTsAddr;
-#endif
+        meanLuma = 0;
 
-        uAvgLuma += (uint64_t)pcPic->m_iOffsetCtu[ctuRsAddr];
-      }
-      uAvgLuma = (uAvgLuma + ((boundingAddr - startAddr) >> 1)) / (boundingAddr - startAddr);
+        for (uint32_t ctuTsAddr = startAddr; ctuTsAddr < boundingAddr; ctuTsAddr++)
+        {
+ #if HEVC_TILES_WPP
+          const uint32_t ctuRsAddr = tileMap.getCtuTsToRsAddrMap (ctuTsAddr);
+ #else
+          const uint32_t ctuRsAddr = ctuTsAddr;
+ #endif
 
-      iQPFixed = Clip3 (0, MAX_QP, iQPFixed + 1 - int((3 * uAvgLuma * uAvgLuma) >> uint64_t(2 * iBitDepth - 1)));
+          meanLuma += pcPic->m_iOffsetCtu[ctuRsAddr];  // CTU mean
+        }
+        meanLuma = (meanLuma + ((boundingAddr - startAddr) >> 1)) / (boundingAddr - startAddr);
+      }
+      iQPFixed = Clip3 (0, MAX_QP, iQPFixed + lumaDQPOffset (meanLuma, iBitDepth));
     }
 #endif
 
@@ -872,7 +969,7 @@ static bool applyQPAdaptation (Picture* const pcPic,     Slice* const pcSlice,
       pcPic->m_iOffsetCtu[ctuRsAddr] = (Pel)iQPFixed; // fixed QPs
     }
   }
-  else
+  else // CTU-wise QPA
   {
     for (uint32_t ctuTsAddr = startAddr; ctuTsAddr < boundingAddr; ctuTsAddr++)
     {
@@ -884,21 +981,30 @@ static bool applyQPAdaptation (Picture* const pcPic,     Slice* const pcSlice,
 
       int iQPAdapt = Clip3 (0, MAX_QP, iQPIndex + apprI3Log2 (pcPic->m_uEnerHpCtu[ctuRsAddr] * hpEnerPic));
 
-#if SHARP_LUMA_DELTA_QP
       if (pcv.widthInCtus > 1) // try to enforce CTU SNR greater than zero dB
-#else
-      if (!pcSlice->isIntra()) // try to enforce CTU SNR greater than zero dB
-#endif
       {
-        const Pel      dcOffset   = pcPic->m_iOffsetCtu[ctuRsAddr];
+        meanLuma = (uint32_t)pcPic->m_iOffsetCtu[ctuRsAddr];
+
+        if (isChromaEnabled (pcPic->chromaFormat))
+        {
+          iQPAdapt += getGlaringColorQPOffset (pcPic, (int)ctuRsAddr, startAddr, boundingAddr, iBitDepth, meanLuma);
+
+          if (iQPAdapt > MAX_QP
+#if SHARP_LUMA_DELTA_QP
+              && !useSharpLumaDQP
+#endif
+              ) iQPAdapt = MAX_QP;
+          CHECK (meanLuma != (uint32_t)pcPic->m_iOffsetCtu[ctuRsAddr], "luma DC offsets don't match");
+        }
 #if SHARP_LUMA_DELTA_QP
 
         // change adaptive QP based on mean CTU luma value (Sharp)
         if (useSharpLumaDQP)
         {
-          const uint64_t uAvgLuma   = (uint64_t)dcOffset;
-
-          iQPAdapt = std::max (0, iQPAdapt + 1 - int((3 * uAvgLuma * uAvgLuma) >> uint64_t(2 * iBitDepth - 1)));
+ #if ENABLE_QPA_SUB_CTU
+          pcPic->m_uEnerHpCtu[ctuRsAddr] = (double)meanLuma; // for sub-CTU QPA
+ #endif
+          iQPAdapt = Clip3 (0, MAX_QP, iQPAdapt + lumaDQPOffset (meanLuma, iBitDepth));
         }
 
 #endif
@@ -915,7 +1021,7 @@ static bool applyQPAdaptation (Picture* const pcPic,     Slice* const pcSlice,
         {
           for (SizeType w = 0; w < iSrcWidth; w++)
           {
-            uAbsDCless += (uint32_t)abs (pSrc[w] - dcOffset);
+            uAbsDCless += (uint32_t)abs (pSrc[w] - (Pel)meanLuma);
           }
           pSrc += iSrcStride;
         }
@@ -937,15 +1043,15 @@ static bool applyQPAdaptation (Picture* const pcPic,     Slice* const pcSlice,
 
           iQPAdapt = std::max (0, iQPAdapt + redVal);
         }
-#if SHARP_LUMA_DELTA_QP
-
-        if (iQPAdapt > MAX_QP) iQPAdapt = MAX_QP;
-#endif
       }
 
       pcPic->m_iOffsetCtu[ctuRsAddr] = (Pel)iQPAdapt; // adapted QPs
 
+#if ENABLE_QPA_SUB_CTU
+      if (pcv.widthInCtus > 1 && pcSlice->getPPS()->getMaxCuDQPDepth() == 0)  // reduce local DQP rate peaks
+#else
       if (pcv.widthInCtus > 1) // try to reduce local bitrate peaks via minimum smoothing of the adapted QPs
+#endif
       {
         iQPAdapt = ctuRsAddr % pcv.widthInCtus; // horizontal offset
         if (iQPAdapt == 0)
@@ -978,6 +1084,97 @@ static bool applyQPAdaptation (Picture* const pcPic,     Slice* const pcSlice,
 
   return sliceQPModified;
 }
+// Sub-CTU QPA: writes one adapted QP per max-transform-size luma block of the given CTU into cs.picture->m_subCtuQP and returns the CTU-level QP.
+#if ENABLE_QPA_SUB_CTU
+static int applyQPAdaptationSubCtu (CodingStructure &cs, const UnitArea ctuArea, const uint32_t ctuAddr, const bool useSharpLumaDQP)
+{
+  const PreCalcValues &pcv = *cs.pcv;
+  const Picture     *pcPic = cs.picture;
+  const int      iBitDepth = cs.slice->getSPS()->getBitDepth (CHANNEL_TYPE_LUMA); // overall image bit-depth
+  const int   adaptedCtuQP = pcPic ? pcPic->m_iOffsetCtu[ctuAddr] : cs.slice->getSliceQpBase(); // per-CTU value from m_iOffsetCtu, else slice base QP
+  // without a picture, or with a max. CU delta-QP depth of 0, only the CTU-level QP is returned and m_subCtuQP is left untouched
+  if (!pcPic || cs.pps->getMaxCuDQPDepth() == 0) return adaptedCtuQP;
+  // default: every m_subCtuQP entry starts out at the CTU-level QP
+  for (unsigned addr = 0; addr < cs.picture->m_subCtuQP.size(); addr++)
+  {
+    cs.picture->m_subCtuQP[addr] = (int8_t)adaptedCtuQP;
+  }
+  if (cs.slice->getSliceQp() < MAX_QP && pcv.widthInCtus > 1) // refine per block only below MAX_QP and when the picture is wider than one CTU
+  {
+#if SHARP_LUMA_DELTA_QP
+    const int   lumaCtuDQP = useSharpLumaDQP ? lumaDQPOffset ((uint32_t)pcPic->m_uEnerHpCtu[ctuAddr], iBitDepth) : 0; // CTU-level luma offset; applyQPAdaptation stores the CTU mean luma in m_uEnerHpCtu for this
+#endif
+    const unsigned     mts = std::min (cs.sps->getMaxTrSize(), pcv.maxCUWidth); // block size: max. transform size, capped at the CTU width
+    const unsigned mtsLog2 = (unsigned)g_aucLog2[mts];
+    const unsigned  stride = pcv.maxCUWidth >> mtsLog2; // number of blocks per CTU row
+    unsigned numAct = 0;    // number of block activities
+    double   sumAct = 0.0; // sum of all block activities
+    double   subAct[16];   // individual block activities
+#if SHARP_LUMA_DELTA_QP
+    uint32_t subMLV[16];   // individual mean luma values
+#endif
+    // NOTE(review): presumably this check bounds the grid to 4x4 blocks so that addr stays inside the 16-entry arrays above - confirm
+    CHECK (mts * 4 < pcv.maxCUWidth || mts * 4 < pcv.maxCUHeight, "max. transform size is too small for given CTU size");
+    // pass 1: measure the activity (and, for sharp luma DQP, the mean luma) of every block that starts inside the picture
+    for (unsigned h = 0; h < (pcv.maxCUHeight >> mtsLog2); h++)
+    {
+      for (unsigned w = 0; w < stride; w++)
+      {
+        const unsigned addr    = w + h * stride;
+        const PosType  x       = ctuArea.lx() + w * mts;
+        const PosType  y       = ctuArea.ly() + h * mts;
+        const CompArea fltArea = clipArea (CompArea (COMPONENT_Y, pcPic->chromaFormat, Area (x > 0 ? x - 1 : 0, y > 0 ? y - 1 : 0, mts + (x > 0 ? 2 : 1), mts + (y > 0 ? 2 : 1))), pcPic->Y());
+        const CPelBuf  picOrig = pcPic->getOrigBuf (fltArea); // block plus a one-sample border (no left/top border at x/y == 0), clipped to the picture
+        // blocks whose top-left corner lies outside the picture are skipped and keep the default QP
+        if (x >= pcPic->lwidth() || y >= pcPic->lheight())
+        {
+          continue;
+        }
+        filterAndCalculateAverageEnergies (picOrig.buf,    picOrig.stride, subAct[addr],
+                                           picOrig.height, picOrig.width,  iBitDepth);
+        numAct++;
+        sumAct += subAct[addr];
+#if SHARP_LUMA_DELTA_QP
+        // mean luma of the block without the border, used for the block-level luma offset in pass 2
+        if (useSharpLumaDQP)
+        {
+          const CompArea subArea = clipArea (CompArea (COMPONENT_Y, pcPic->chromaFormat, Area (x, y, mts, mts)), pcPic->Y());
+
+          subMLV[addr] = pcPic->getOrigBuf (subArea).mean();
+        }
+#endif
+      }
+    }
+    if (sumAct <= 0.0) return adaptedCtuQP; // no positive activity sum: keep the default QPs
+
+    sumAct = double(numAct) / sumAct; // 1.0 / (average CTU activity)
+    // pass 2: offset each measured block's QP by apprI3Log2 of its activity relative to the CTU average
+    for (unsigned h = 0; h < (pcv.maxCUHeight >> mtsLog2); h++)
+    {
+      for (unsigned w = 0; w < stride; w++)
+      {
+        const unsigned addr = w + h * stride;
+
+        if (ctuArea.lx() + w * mts >= pcPic->lwidth() || ctuArea.ly() + h * mts >= pcPic->lheight())
+        {
+          continue;
+        }
+        cs.picture->m_subCtuQP[addr] = (int8_t)Clip3 (0, MAX_QP, adaptedCtuQP + apprI3Log2 (subAct[addr] * sumAct));
+#if SHARP_LUMA_DELTA_QP
+        // the luma offset derived from the CTU mean is subtracted and the block's own luma offset is added instead
+        // change adapted QP based on mean sub-CTU luma value (Sharp)
+        if (useSharpLumaDQP)
+        {
+          cs.picture->m_subCtuQP[addr] = (int8_t)Clip3 (0, MAX_QP, (int)cs.picture->m_subCtuQP[addr] - lumaCtuDQP + lumaDQPOffset (subMLV[addr], iBitDepth));
+        }
+#endif
+      }
+    }
+  }
+
+  return adaptedCtuQP; // the return value is always the CTU-level QP; per-block QPs are only written to m_subCtuQP
+}
+#endif // ENABLE_QPA_SUB_CTU
 #endif // ENABLE_QPA
 
 // ====================================================================================================================
@@ -1261,12 +1458,9 @@ void EncSlice::compressSlice( Picture* pcPic, const bool bCompressEntireSlice, c
   CHECK( pcPic->m_prevQP[0] == std::numeric_limits<int>::max(), "Invalid previous QP" );
 
   CodingStructure&  cs          = *pcPic->cs;
-#if ENABLE_QPA || ENABLE_WPP_PARALLELISM
-  const PreCalcValues& pcv      = *cs.pcv;
-  const uint32_t        widthInCtus = pcv.widthInCtus;
-#endif
-
-  cs.slice = pcSlice;
+  cs.slice    = pcSlice;
+  cs.pcv      = pcSlice->getPPS()->pcv;
+  cs.fracBits = 0;
 
   if (startCtuTsAddr == 0)
   {
@@ -1274,74 +1468,19 @@ void EncSlice::compressSlice( Picture* pcPic, const bool bCompressEntireSlice, c
   }
 
 #if ENABLE_QPA
-  double hpEnerMax     = 1.0;
-  double hpEnerPic     = 0.0;
-  int    iSrcOffset;
-
-  if (m_pcCfg->getUsePerceptQPA() && !m_pcCfg->getUseRateCtrl())
-  {
-    for (uint32_t ctuTsAddr = startCtuTsAddr; ctuTsAddr < boundingCtuTsAddr; ctuTsAddr++)
-    {
- #if HEVC_TILES_WPP
-      const uint32_t ctuRsAddr  = tileMap.getCtuTsToRsAddrMap (ctuTsAddr);
- #else
-      const uint32_t ctuRsAddr  = ctuTsAddr;
- #endif
-      const Position pos ((ctuRsAddr % widthInCtus) * pcv.maxCUWidth, (ctuRsAddr / widthInCtus) * pcv.maxCUHeight);
-      const CompArea subArea    = clipArea (CompArea (COMPONENT_Y, pcPic->chromaFormat, Area (pos.x, pos.y, pcv.maxCUWidth, pcv.maxCUHeight)), pcPic->Y());
-      const CompArea fltArea    = clipArea (CompArea (COMPONENT_Y, pcPic->chromaFormat, Area (pos.x > 0 ? pos.x - 1 : 0, pos.y > 0 ? pos.y - 1 : 0, pcv.maxCUWidth + (pos.x > 0 ? 2 : 1), pcv.maxCUHeight + (pos.y > 0 ? 2 : 1))), pcPic->Y());
-      const SizeType iSrcStride = pcPic->getOrigBuf (subArea).stride;
-      const Pel*     pSrc       = pcPic->getOrigBuf (subArea).buf;
-      const SizeType iSrcHeight = pcPic->getOrigBuf (subArea).height;
-      const SizeType iSrcWidth  = pcPic->getOrigBuf (subArea).width;
-      const SizeType iFltHeight = pcPic->getOrigBuf (fltArea).height;
-      const SizeType iFltWidth  = pcPic->getOrigBuf (fltArea).width;
-      double hpEner = 0.0;
-
-      DTRACE_UPDATE (g_trace_ctx, std::make_pair ("ctu", ctuRsAddr));
-
-      // compute DC offset to be subtracted from luma values
-      iSrcOffset = 0;
-      for (SizeType h = 0; h < iSrcHeight; h++)
-      {
-        for (SizeType w = 0; w < iSrcWidth; w++)
-        {
-          iSrcOffset += pSrc[w];
-        }
-        pSrc += iSrcStride;
-      }
-      CHECK (iSrcOffset < 0, "DC offset cannot be negative!");
-
-      int x = iSrcHeight * iSrcWidth;
-      iSrcOffset = (iSrcOffset + (x >> 1)) / x; // slow division
-
-      filterAndCalculateAverageEnergies (pcPic->getOrigBuf (fltArea).buf, iSrcStride,
-                                         hpEner, iFltHeight, iFltWidth,
-                                         pcSlice->getSPS()->getBitDepth (CHANNEL_TYPE_LUMA));
-
-      if (hpEner > hpEnerMax) hpEnerMax = hpEner;
-      hpEnerPic += hpEner;
-      pcPic->m_uEnerHpCtu[ctuRsAddr] = hpEner;
-      pcPic->m_iOffsetCtu[ctuRsAddr] = (Pel)iSrcOffset;
-    } // end iteration over all CTUs in current slice
-
-  }
-
   if (m_pcCfg->getUsePerceptQPA() && !m_pcCfg->getUseRateCtrl() && (boundingCtuTsAddr > startCtuTsAddr))
   {
-    const double hpEnerAvg = hpEnerPic / double(boundingCtuTsAddr - startCtuTsAddr);
-
-    if (applyQPAdaptation (pcPic, pcSlice, pcv, startCtuTsAddr, boundingCtuTsAddr, m_pcCfg->getLumaLevelToDeltaQPMapping().mode == LUMALVL_TO_DQP_NUM_MODES,
-                           hpEnerAvg, hpEnerMax, (m_pcCfg->getBaseQP() >= 38) || (m_pcCfg->getSourceWidth() <= 512 && m_pcCfg->getSourceHeight() <= 320), m_adaptedLumaQP))
+    if (applyQPAdaptation (pcPic, pcSlice, *cs.pcv, startCtuTsAddr, boundingCtuTsAddr, m_pcCfg->getLumaLevelToDeltaQPMapping().mode == LUMALVL_TO_DQP_NUM_MODES,
+                           (m_pcCfg->getBaseQP() >= 38) || (m_pcCfg->getSourceWidth() <= 512 && m_pcCfg->getSourceHeight() <= 320), m_adaptedLumaQP))
     {
       m_CABACEstimator->initCtxModels (*pcSlice);
-  #if ENABLE_SPLIT_PARALLELISM || ENABLE_WPP_PARALLELISM
+#if ENABLE_SPLIT_PARALLELISM || ENABLE_WPP_PARALLELISM
       for (int jId = 1; jId < m_pcLib->getNumCuEncStacks(); jId++)
       {
         CABACWriter* cw = m_pcLib->getCABACEncoder (jId)->getCABACEstimator (pcSlice->getSPS());
         cw->initCtxModels (*pcSlice);
       }
-  #endif
+#endif
 #if HEVC_DEPENDENT_SLICES
       if (!pcSlice->getDependentSliceSegmentFlag())
       {
@@ -1358,10 +1497,6 @@ void EncSlice::compressSlice( Picture* pcPic, const bool bCompressEntireSlice, c
   }
 #endif // ENABLE_QPA
 
-  cs.pcv      = pcSlice->getPPS()->pcv;
-  cs.fracBits = 0;
-
-
 #if ENABLE_WPP_PARALLELISM
   bool bUseThreads = m_pcCfg->getNumWppThreads() > 1;
   if( bUseThreads )
@@ -1420,7 +1555,6 @@ void EncSlice::encodeCtus( Picture* pcPic, const bool bCompressEntireSlice, cons
 #endif
 #if ENABLE_QPA
   const int iQPIndex              = pcSlice->getSliceQpBase();
-  int iSrcOffset                  = 0;
 #endif
 
 #if ENABLE_WPP_PARALLELISM
@@ -1515,7 +1649,7 @@ void EncSlice::encodeCtus( Picture* pcPic, const bool bCompressEntireSlice, cons
 #else
 #endif
 
-#if RDOQ_CHROMA_LAMBDA && ENABLE_QPA
+#if RDOQ_CHROMA_LAMBDA && ENABLE_QPA && !ENABLE_QPA_SUB_CTU
     double oldLambdaArray[MAX_NUM_COMPONENT] = {0.0};
 #endif
     const double oldLambda = pRdCost->getLambda();
@@ -1561,9 +1695,14 @@ void EncSlice::encodeCtus( Picture* pcPic, const bool bCompressEntireSlice, cons
 #if ENABLE_QPA
     else if (pCfg->getUsePerceptQPA() && pcSlice->getPPS()->getUseDQP())
     {
-      iSrcOffset = pcPic->m_iOffsetCtu[ctuRsAddr];
-      const double newLambda = oldLambda * pow (2.0, double(iSrcOffset - iQPIndex) / 3.0);
-      pcPic->m_uEnerHpCtu[ctuRsAddr] = newLambda;
+#if ENABLE_QPA_SUB_CTU
+      const int adaptedQP    = applyQPAdaptationSubCtu (cs, ctuArea, ctuRsAddr, m_pcCfg->getLumaLevelToDeltaQPMapping().mode == LUMALVL_TO_DQP_NUM_MODES);
+#else
+      const int adaptedQP    = pcPic->m_iOffsetCtu[ctuRsAddr];
+#endif
+      const double newLambda = pcSlice->getLambdas()[0] * pow (2.0, double (adaptedQP - iQPIndex) / 3.0);
+      pcPic->m_uEnerHpCtu[ctuRsAddr] = newLambda; // for ALF and SAO
+#if !ENABLE_QPA_SUB_CTU
 #if RDOQ_CHROMA_LAMBDA
       pTrQuant->getLambdas (oldLambdaArray); // save the old lambdas
       const double chromaLambda = newLambda / pRdCost->getChromaWeight();
@@ -1573,7 +1712,8 @@ void EncSlice::encodeCtus( Picture* pcPic, const bool bCompressEntireSlice, cons
       pTrQuant->setLambda (newLambda);
 #endif
       pRdCost->setLambda (newLambda, pcSlice->getSPS()->getBitDepths());
-      currQP[0] = currQP[1] = iSrcOffset;
+#endif
+      currQP[0] = currQP[1] = adaptedQP;
     }
 #endif
 
@@ -1684,7 +1824,7 @@ void EncSlice::encodeCtus( Picture* pcPic, const bool bCompressEntireSlice, cons
       pRateCtrl->getRCPic()->updateAfterCTU( pRateCtrl->getRCPic()->getLCUCoded(), actualBits, actualQP, actualLambda,
                                              pcSlice->isIRAP() ? 0 : pCfg->getLCULevelRC() );
     }
-#if ENABLE_QPA
+#if ENABLE_QPA && !ENABLE_QPA_SUB_CTU
     else if (pCfg->getUsePerceptQPA() && pcSlice->getPPS()->getUseDQP())
     {
 #if RDOQ_CHROMA_LAMBDA