Commit bd81a916 authored by Muhammed Coban

Merge branch 'master' of https://vcgit.hhi.fraunhofer.de/jvet/VVCSoftware_VTM into R0083-SW-Cleanup

parents f4502c0a c14437d3
......@@ -2583,7 +2583,11 @@ bool EncAppCfg::xCheckParameter()
xConfirmPara( 0 < m_maxNumGeoCand && m_maxNumGeoCand < 2, "MaxNumGeoCand must be no less than 2 unless MaxNumGeoCand is 0." );
xConfirmPara( m_maxNumIBCMergeCand < 1, "MaxNumIBCMergeCand must be 1 or greater." );
xConfirmPara( m_maxNumIBCMergeCand > IBC_MRG_MAX_NUM_CANDS, "MaxNumIBCMergeCand must be no more than IBC_MRG_MAX_NUM_CANDS." );
#if JVET_R0371_MAX_NUM_SUB_BLK_MRG_CAND
xConfirmPara( m_maxNumAffineMergeCand < (m_SubPuMvpMode ? 1 : 0), "MaxNumAffineMergeCand must be greater or equal to SubPuMvp." );
#else
xConfirmPara( m_maxNumAffineMergeCand < 1, "MaxNumAffineMergeCand must be 1 or greater." );
#endif
xConfirmPara( m_maxNumAffineMergeCand > AFFINE_MRG_MAX_NUM_CANDS, "MaxNumAffineMergeCand must be no more than AFFINE_MRG_MAX_NUM_CANDS." );
if ( m_Affine == 0 )
{
......@@ -3411,6 +3415,7 @@ bool EncAppCfg::xCheckParameter()
}
}
xConfirmPara(m_log2ParallelMergeLevel < 2, "Log2ParallelMergeLevel should be larger than or equal to 2");
xConfirmPara(m_log2ParallelMergeLevel > m_uiCTUSize, "Log2ParallelMergeLevel should be less than or equal to CTU size");
#if U0033_ALTERNATIVE_TRANSFER_CHARACTERISTICS_SEI
xConfirmPara(m_preferredTransferCharacteristics > 255, "transfer_characteristics_idc should not be greater than 255.");
#endif
......
......@@ -217,6 +217,9 @@ void IntraPrediction::predIntraAng( const ComponentID compId, PelBuf &piPred, co
const int iWidth = piPred.width;
const int iHeight = piPred.height;
CHECK(iWidth == 2, "Width of 2 is not supported");
#if JVET_R0350_MIP_CHROMA_444_SINGLETREE
CHECK(PU::isMIP(pu, toChannelType(compId)), "We should not get here for MIP.");
#endif
const uint32_t uiDirMode = isLuma( compId ) && pu.cu->bdpcmMode ? BDPCM_IDX : !isLuma(compId) && pu.cu->bdpcmModeChroma ? BDPCM_IDX : PU::getFinalIntraMode(pu, channelType);
CHECK( floorLog2(iWidth) < 2 && pu.cs->pcv->noChroma2x2, "Size not allowed" );
......@@ -1836,24 +1839,62 @@ void IntraPrediction::initIntraMip( const PredictionUnit &pu, const CompArea &ar
// prepare input (boundary) data for prediction
CHECK( m_ipaParam.refFilterFlag, "ERROR: unfiltered refs expected for MIP" );
#if JVET_R0350_MIP_CHROMA_444_SINGLETREE
Pel *ptrSrc = getPredictorPtr(area.compID);
const int srcStride = m_refBufferStride[area.compID];
const int srcHStride = 2;
m_matrixIntraPred.prepareInputForPred(CPelBuf(ptrSrc, srcStride, srcHStride), area,
pu.cu->slice->getSPS()->getBitDepth(toChannelType(area.compID)), area.compID);
#else
Pel *ptrSrc = getPredictorPtr( COMPONENT_Y );
const int srcStride = m_refBufferStride[COMPONENT_Y];
const int srcHStride = 2;
m_matrixIntraPred.prepareInputForPred( CPelBuf( ptrSrc, srcStride, srcHStride ), area, pu.cu->slice->getSPS()->getBitDepth( CHANNEL_TYPE_LUMA ) );
#endif
}
void IntraPrediction::predIntraMip( const ComponentID compId, PelBuf &piPred, const PredictionUnit &pu )
{
#if !JVET_R0350_MIP_CHROMA_444_SINGLETREE
CHECK( compId != COMPONENT_Y, "Error: chroma not supported" );
#endif
CHECK( piPred.width > MIP_MAX_WIDTH || piPred.height > MIP_MAX_HEIGHT, "Error: block size not supported for MIP" );
CHECK( piPred.width != (1 << floorLog2(piPred.width)) || piPred.height != (1 << floorLog2(piPred.height)), "Error: expecting blocks of size 2^M x 2^N" );
// generate mode-specific prediction
#if JVET_R0350_MIP_CHROMA_444_SINGLETREE
uint32_t modeIdx = MAX_NUM_MIP_MODE;
bool transposeFlag = false;
if (compId == COMPONENT_Y)
{
modeIdx = pu.intraDir[CHANNEL_TYPE_LUMA];
transposeFlag = pu.mipTransposedFlag;
}
else
{
const PredictionUnit &coLocatedLumaPU = PU::getCoLocatedLumaPU(pu);
CHECK(pu.intraDir[CHANNEL_TYPE_CHROMA] != DM_CHROMA_IDX, "Error: MIP is only supported for chroma with DM_CHROMA.");
CHECK(!coLocatedLumaPU.cu->mipFlag, "Error: Co-located luma CU should use MIP.");
modeIdx = coLocatedLumaPU.intraDir[CHANNEL_TYPE_LUMA];
transposeFlag = coLocatedLumaPU.mipTransposedFlag;
}
const int bitDepth = pu.cu->slice->getSPS()->getBitDepth(toChannelType(compId));
CHECK(modeIdx >= getNumModesMip(piPred), "Error: Wrong MIP mode index");
#else
const int bitDepth = pu.cu->slice->getSPS()->getBitDepth( CHANNEL_TYPE_LUMA );
#endif
static_vector<int, MIP_MAX_WIDTH* MIP_MAX_HEIGHT> predMip( piPred.width * piPred.height );
#if JVET_R0350_MIP_CHROMA_444_SINGLETREE
m_matrixIntraPred.predBlock(predMip.data(), modeIdx, transposeFlag, bitDepth, compId);
#else
m_matrixIntraPred.predBlock( predMip.data(), pu.intraDir[CHANNEL_TYPE_LUMA], pu.mipTransposedFlag, bitDepth );
#endif
for( int y = 0; y < piPred.height; y++ )
{
......
......@@ -43,6 +43,9 @@
MatrixIntraPrediction::MatrixIntraPrediction():
#if JVET_R0350_MIP_CHROMA_444_SINGLETREE
m_component(MAX_NUM_COMPONENT),
#endif
m_reducedBoundary (MIP_MAX_INPUT_SIZE),
m_reducedBoundaryTransposed(MIP_MAX_INPUT_SIZE),
m_inputOffset ( 0 ),
......@@ -58,9 +61,16 @@ MatrixIntraPrediction::MatrixIntraPrediction():
{
}
#if JVET_R0350_MIP_CHROMA_444_SINGLETREE
void MatrixIntraPrediction::prepareInputForPred(const CPelBuf &pSrc, const Area &block, const int bitDepth,
const ComponentID compId)
{
m_component = compId;
#else
void MatrixIntraPrediction::prepareInputForPred(const CPelBuf &pSrc, const Area& block, const int bitDepth)
{
#endif
// Step 1: Save block size and calculate dependent values
initPredBlockParams(block);
......@@ -114,8 +124,16 @@ void MatrixIntraPrediction::prepareInputForPred(const CPelBuf &pSrc, const Area&
}
}
#if JVET_R0350_MIP_CHROMA_444_SINGLETREE
void MatrixIntraPrediction::predBlock(int *const result, const int modeIdx, const bool transpose, const int bitDepth,
const ComponentID compId)
{
CHECK(m_component != compId, "Boundary has not been prepared for this component.");
#else
void MatrixIntraPrediction::predBlock(int* const result, const int modeIdx, const bool transpose, const int bitDepth)
{
#endif
const bool needUpsampling = ( m_upsmpFactorHor > 1 ) || ( m_upsmpFactorVer > 1 );
const uint8_t* matrix = getMatrixData(modeIdx);
......
......@@ -50,10 +50,20 @@ class MatrixIntraPrediction
public:
MatrixIntraPrediction();
#if JVET_R0350_MIP_CHROMA_444_SINGLETREE
void prepareInputForPred(const CPelBuf &pSrc, const Area &block, const int bitDepth, const ComponentID compId);
void predBlock(int *const result, const int modeIdx, const bool transpose, const int bitDepth,
const ComponentID compId);
#else
void prepareInputForPred(const CPelBuf &pSrc, const Area& block, const int bitDepth);
void predBlock(int* const result, const int modeIdx, const bool transpose, const int bitDepth);
#endif
private:
#if JVET_R0350_MIP_CHROMA_444_SINGLETREE
ComponentID m_component;
#endif
static_vector<int, MIP_MAX_INPUT_SIZE> m_reducedBoundary; // downsampled boundary of a block
static_vector<int, MIP_MAX_INPUT_SIZE> m_reducedBoundaryTransposed; // downsampled, transposed boundary of a block
int m_inputOffset;
......
......@@ -1006,8 +1006,12 @@ void Quant::quant(TransformUnit &tu, const ComponentID &compID, const CCoeffBuf
const uint32_t lfnstIdx = tu.cu->lfnstIdx;
const int maxNumberOfCoeffs = lfnstIdx > 0 ? ((( uiWidth == 4 && uiHeight == 4 ) || ( uiWidth == 8 && uiHeight == 8) ) ? 8 : 16) : piQCoef.area();
memset( piQCoef.buf, 0, sizeof(TCoeff) * piQCoef.area() );
for (int uiBlockPos = 0; uiBlockPos < maxNumberOfCoeffs; uiBlockPos++ )
const ScanElement* scan = g_scanOrder[SCAN_GROUPED_4x4][SCAN_DIAG][gp_sizeIdxInfo->idxFrom(uiWidth)][gp_sizeIdxInfo->idxFrom(uiHeight)];
for (int uiScanPos = 0; uiScanPos < maxNumberOfCoeffs; uiScanPos++)
{
const int uiBlockPos = scan[uiScanPos].idx;
const TCoeff iLevel = piCoef.buf[uiBlockPos];
const TCoeff iSign = (iLevel < 0 ? -1: 1);
......
......@@ -2151,7 +2151,11 @@ Distortion RdCost::xCalcHADs2x2( const Pel *piOrg, const Pel *piCur, int iStride
m[2] = diff[0] - diff[2];
m[3] = diff[1] - diff[3];
#if JVET_R0164_MEAN_SCALED_SATD
satd += abs(m[0] + m[1]) >> 2;
#else
satd += abs(m[0] + m[1]);
#endif
satd += abs(m[0] - m[1]);
satd += abs(m[2] + m[3]);
satd += abs(m[2] - m[3]);
......@@ -2250,7 +2254,12 @@ Distortion RdCost::xCalcHADs4x4( const Pel *piOrg, const Pel *piCur, int iStride
{
satd += abs(d[k]);
}
satd = ((satd+1)>>1);
#if JVET_R0164_MEAN_SCALED_SATD
satd -= abs(d[0]);
satd += abs(d[0]) >> 2;
#endif
satd = ((satd+1)>>1);
return satd;
}
......@@ -2347,7 +2356,11 @@ Distortion RdCost::xCalcHADs8x8( const Pel *piOrg, const Pel *piCur, int iStride
}
}
sad=((sad+2)>>2);
#if JVET_R0164_MEAN_SCALED_SATD
sad -= abs(m2[0][0]);
sad += abs(m2[0][0]) >> 2;
#endif
sad = ((sad+2)>>2);
return sad;
}
......@@ -2493,7 +2506,11 @@ Distortion RdCost::xCalcHADs16x8( const Pel *piOrg, const Pel *piCur, int iStrid
}
}
sad = ( int ) ( sad / sqrt( 16.0 * 8 ) * 2 );
#if JVET_R0164_MEAN_SCALED_SATD
sad -= abs(m2[0][0]);
sad += abs(m2[0][0]) >> 2;
#endif
sad = ( int ) ( sad / sqrt( 16.0 * 8 ) * 2 );
return sad;
}
......@@ -2630,7 +2647,11 @@ Distortion RdCost::xCalcHADs8x16( const Pel *piOrg, const Pel *piCur, int iStrid
}
}
sad = ( int ) ( sad / sqrt( 16.0 * 8 ) * 2 );
#if JVET_R0164_MEAN_SCALED_SATD
sad -= abs(m2[0][0]);
sad += abs(m2[0][0]) >> 2;
#endif
sad = ( int ) ( sad / sqrt( 16.0 * 8 ) * 2 );
return sad;
}
......@@ -2703,7 +2724,11 @@ Distortion RdCost::xCalcHADs4x8( const Pel *piOrg, const Pel *piCur, int iStride
}
}
sad = ( int ) ( sad / sqrt( 4.0 * 8 ) * 2 );
#if JVET_R0164_MEAN_SCALED_SATD
sad -= abs(m2[0][0]);
sad += abs(m2[0][0]) >> 2;
#endif
sad = ( int ) ( sad / sqrt( 4.0 * 8 ) * 2 );
return sad;
}
......@@ -2782,7 +2807,11 @@ Distortion RdCost::xCalcHADs8x4( const Pel *piOrg, const Pel *piCur, int iStride
}
}
sad = ( int ) ( sad / sqrt( 4.0 * 8 ) * 2 );
#if JVET_R0164_MEAN_SCALED_SATD
sad -= abs(m2[0][0]);
sad += abs(m2[0][0]) >> 2;
#endif
sad = ( int ) ( sad / sqrt( 4.0 * 8 ) * 2 );
return sad;
}
......
......@@ -2376,6 +2376,9 @@ SPS::SPS()
, m_maxHeightInLumaSamples (288)
, m_subPicInfoPresentFlag (false)
, m_numSubPics(1)
#if JVET_R0156_ASPECT4_SPS_CLEANUP
, m_independentSubPicsFlag (false)
#endif
, m_subPicIdMappingExplicitlySignalledFlag ( false )
, m_subPicIdMappingInSpsFlag ( false )
, m_subPicIdLen(16)
......
......@@ -1217,6 +1217,9 @@ private:
Window m_conformanceWindow;
bool m_subPicInfoPresentFlag; // indicates the presence of sub-picture info
uint32_t m_numSubPics; //!< number of sub-pictures used
#if JVET_R0156_ASPECT4_SPS_CLEANUP
bool m_independentSubPicsFlag;
#endif
std::vector<uint32_t> m_subPicCtuTopLeftX;
std::vector<uint32_t> m_subPicCtuTopLeftY;
std::vector<uint32_t> m_subPicWidth;
......@@ -1402,6 +1405,10 @@ public:
m_loopFilterAcrossSubpicEnabledFlag.resize(m_numSubPics);
m_subPicId.resize(m_numSubPics);
}
#if JVET_R0156_ASPECT4_SPS_CLEANUP
//! Store the given value in m_independentSubPicsFlag.
void setIndependentSubPicsFlag(bool b)
{
  m_independentSubPicsFlag = b;
}
//! Return the current value of m_independentSubPicsFlag.
bool getIndependentSubPicsFlag() const
{
  return m_independentSubPicsFlag;
}
#endif
uint32_t getNumSubPics( ) const { return m_numSubPics; }
void setSubPicCtuTopLeftX( int i, uint32_t u ) { m_subPicCtuTopLeftX[i] = u; }
uint32_t getSubPicCtuTopLeftX( int i ) const { return m_subPicCtuTopLeftX[i]; }
......
......@@ -51,19 +51,42 @@
#include <cassert>
//########### place macros to be removed in next cycle below this line ###############
// Compile-time feature switches: each macro below is tested with #if elsewhere in the code base
// to select between the adopted behaviour (1) and the previous behaviour (0).
#define JVET_R0327_ONE_PASS_CCALF 1 // JVET-R0327: One-pass CCALF
#define JVET_R0200_MOVE_LMCS_AND_SCALING_LIST_SE 1 // JVET-R0200 Move the SH flags slice_lmcs_enabled_flag and slice_explicit_scaling_list_used_flag to be just after the ALF parameters
#define JVET_R0388_DBF_CLEANUP 1 // JVET-R0388: Cleanups on deblocking signalling
// NOTE(review): the macro name below is spelled "CELANUP" (sic); it is an identifier, so it is left untouched here.
#define JVET_R0071_SPS_PPS_CELANUP 1 // JVET-R0071 item 2-4: cleanups on subpicture signalling (item 1 has been ported in JVET_R0156_ASPECT4)
#define JVET_R0271_SLICE_LEVEL_DQ_SDH_RRC 1 // JVET-R0271/R0155: Slice level DQ and SDH granularity for mixed lossy/lossless.
#define JVET_R0143_TSRCdisableLL 1 // JVET-R0143: disable TSRC for lossless coding
#define JVET_R0371_MAX_NUM_SUB_BLK_MRG_CAND 1 // JVET-R0371: set the range of max number of subblock based merge candidate to 0 to 5 sps_sbtmvp_enabled_flag.
#define JVET_R0233_CCALF_LINE_BUFFER_REDUCTION 1 // JVET-R0233 method 2: Line buffer reduction for CCALF
// NOTE(review): JVET_Q0471_CHROMA_QT_SPLIT is defined twice below with different values (1, then 0).
// This looks like the removed/added pair of a diff view - confirm that only the reverted (0) definition
// exists in the real header.
#define JVET_Q0471_CHROMA_QT_SPLIT 1 // JVET-Q0471: Chroma QT split
#define JVET_Q0471_CHROMA_QT_SPLIT 0 // JVET-Q0471: Chroma QT split, reverted by JVET-R0131
#define JVET_R0208_ALF_VB_ROUNDING_FIX 1 // JVET-R0208: Rounding offset fix for ALF virtual boundary processing
#define JVET_R0232_CCALF_APS_CONSTRAINT 1 // JVET-R0232 section 3.2: APS constraint for CCALF
#define JVET_R0210_NUMTILESINSLICE_SIGNALLING 1 // JVET-R0210 section 3.3: Don't signal NumTilesInSlice syntax element when numTilesInPic - slice_address is 1.
#define JVET_R0156_ASPECT4_SPS_CLEANUP 1 // JVET-R0071 #1, R0156 #4, R0284 #1: Condition sps_independent_subpics_flag on "sps_num_subpics_minus1 > 0"
#define JVET_R0156_ASPECT3_SPS_CLEANUP 1 // Condition sps_sublayer_dpb_params_flag on sps_ptl_dpb_hrd_params_present_flag, in addition to sps_max_sublayer_minus1, JVET-R0156 proposal 3, JVET-R0170, JVET-R0222 proposal 2
#define JVET_R0350_MIP_CHROMA_444_SINGLETREE 1 // JVET-R0350: MIP for chroma in case of 4:4:4 format and single tree
//########### place macros to be kept below this line ###############
#define JVET_R0164_MEAN_SCALED_SATD 1 // JVET-R0164: Use a mean scaled version of SATD in encoder decisions
#define JVET_M0497_MATRIX_MULT 0 // 0: Fast method; 1: Matrix multiplication
#define APPLY_SBT_SL_ON_MTS 1 // apply save & load fast algorithm on inter MTS when SBT is on
......
......@@ -553,9 +553,27 @@ int PU::getIntraMPMs( const PredictionUnit &pu, unsigned* mpm, const ChannelType
// Tells whether the given channel type of this PU is predicted with MIP.
//  - luma   : decided by the CU-level mipFlag.
//  - chroma : (R0350 enabled) requires isDMChromaMIP(pu) and the chroma mode to be DM_CHROMA_IDX;
//             (R0350 disabled) never MIP.
bool PU::isMIP(const PredictionUnit &pu, const ChannelType &chType)
{
#if JVET_R0350_MIP_CHROMA_444_SINGLETREE
  if (chType != CHANNEL_TYPE_LUMA)
  {
    // Chroma path: only the direct mode can inherit MIP from the co-located luma block.
    return isDMChromaMIP(pu) && (pu.intraDir[CHANNEL_TYPE_CHROMA] == DM_CHROMA_IDX);
  }

  // Luma path; this is also what callers get when chType is left at its default.
  return pu.cu->mipFlag;
#else
  if (chType != CHANNEL_TYPE_LUMA)
  {
    return false;
  }
  return pu.cu->mipFlag;
#endif
}
#if JVET_R0350_MIP_CHROMA_444_SINGLETREE
// True when all three hold, checked in this order: the CU is not coded with a separate tree,
// the PU's chroma format is CHROMA_444, and the co-located luma PU's CU has mipFlag set.
bool PU::isDMChromaMIP(const PredictionUnit &pu)
{
  if (pu.cu->isSepTree())
  {
    return false;
  }
  if (pu.chromaFormat != CHROMA_444)
  {
    return false;
  }
  // Only now look up the co-located luma PU, matching the original short-circuit order.
  return getCoLocatedLumaPU(pu).cu->mipFlag;
}
#endif
uint32_t PU::getIntraDirLuma( const PredictionUnit &pu )
{
......@@ -582,6 +600,14 @@ void PU::getIntraChromaCandModes( const PredictionUnit &pu, unsigned modeList[NU
modeList[6] = MDLM_T_IDX;
modeList[7] = DM_CHROMA_IDX;
#if JVET_R0350_MIP_CHROMA_444_SINGLETREE
// If Direct Mode is MIP, mode cannot be already in the list.
if (isDMChromaMIP(pu))
{
return;
}
#endif
const uint32_t lumaMode = getCoLocatedIntraLumaMode(pu);
for( int i = 0; i < 4; i++ )
{
......@@ -638,6 +664,23 @@ uint32_t PU::getFinalIntraMode( const PredictionUnit &pu, const ChannelType &chT
return uiIntraMode;
}
#if JVET_R0350_MIP_CHROMA_444_SINGLETREE
// Returns the luma prediction unit co-located with pu.
//  - separate tree: looked up in the picture-level coding structure at the luma position shifted
//    from the top-left corner by half the luma width and half the luma height;
//  - otherwise    : looked up in pu's own coding structure at the top-left luma position.
const PredictionUnit &PU::getCoLocatedLumaPU(const PredictionUnit &pu)
{
  const auto &curBlock = pu.blocks[pu.chType];

  Position lumaTopLeft = curBlock.lumaPos();
  Position lumaCentre  = lumaTopLeft.offset(curBlock.lumaSize().width >> 1, curBlock.lumaSize().height >> 1);

  if (pu.cu->isSepTree())
  {
    return *pu.cs->picture->cs->getPU(lumaCentre, CHANNEL_TYPE_LUMA);
  }
  return *pu.cs->getPU(lumaTopLeft, CHANNEL_TYPE_LUMA);
}
uint32_t PU::getCoLocatedIntraLumaMode(const PredictionUnit &pu)
{
return PU::getIntraDirLuma(PU::getCoLocatedLumaPU(pu));
}
#else
uint32_t PU::getCoLocatedIntraLumaMode( const PredictionUnit &pu )
{
Position topLeftPos = pu.blocks[pu.chType].lumaPos();
......@@ -646,6 +689,7 @@ uint32_t PU::getCoLocatedIntraLumaMode( const PredictionUnit &pu )
return PU::getIntraDirLuma( lumaPU );
}
#endif
int PU::getWideAngIntraMode( const TransformUnit &tu, const uint32_t dirMode, const ComponentID compID )
{
......
......@@ -129,8 +129,14 @@ namespace PU
int getLMSymbolList(const PredictionUnit &pu, int *modeList);
int getIntraMPMs(const PredictionUnit &pu, unsigned *mpm, const ChannelType &channelType = CHANNEL_TYPE_LUMA);
bool isMIP (const PredictionUnit &pu, const ChannelType &chType = CHANNEL_TYPE_LUMA);
#if JVET_R0350_MIP_CHROMA_444_SINGLETREE
bool isDMChromaMIP (const PredictionUnit &pu);
#endif
uint32_t getIntraDirLuma (const PredictionUnit &pu);
void getIntraChromaCandModes (const PredictionUnit &pu, unsigned modeList[NUM_CHROMA_MODE]);
#if JVET_R0350_MIP_CHROMA_444_SINGLETREE
const PredictionUnit &getCoLocatedLumaPU(const PredictionUnit &pu);
#endif
uint32_t getFinalIntraMode (const PredictionUnit &pu, const ChannelType &chType);
uint32_t getCoLocatedIntraLumaMode (const PredictionUnit &pu);
int getWideAngIntraMode ( const TransformUnit &tu, const uint32_t dirMode, const ComponentID compID );
......
......@@ -523,6 +523,9 @@ static uint32_t xCalcHAD4x4_SSE( const Torg *piOrg, const Tcur *piCur, const int
// abs
__m128i Sum = _mm_abs_epi16( r0 );
#if JVET_R0164_MEAN_SCALED_SATD
uint32_t absDc = _mm_cvtsi128_si32( Sum ) & 0x0000ffff;
#endif
Sum = _mm_add_epi16( Sum, _mm_abs_epi16( r2 ) );
Sum = _mm_add_epi16( Sum, _mm_abs_epi16( r3 ) );
Sum = _mm_add_epi16( Sum, _mm_abs_epi16( r5 ) );
......@@ -534,7 +537,11 @@ static uint32_t xCalcHAD4x4_SSE( const Torg *piOrg, const Tcur *piCur, const int
uint32_t sad = _mm_cvtsi128_si32( Sum );
sad = ( ( sad + 1 ) >> 1 );
#if JVET_R0164_MEAN_SCALED_SATD
sad -= absDc;
sad += absDc >> 2;
#endif
sad = ( ( sad + 1 ) >> 1 );
return sad;
}
......@@ -663,7 +670,12 @@ static uint32_t xCalcHAD8x8_SSE( const Torg *piOrg, const Tcur *piCur, const int
iSum = _mm_hadd_epi32( iSum, iSum );
uint32_t sad = _mm_cvtsi128_si32( iSum );
sad = ( ( sad + 2 ) >> 2 );
#if JVET_R0164_MEAN_SCALED_SATD
uint32_t absDc = _mm_cvtsi128_si32( n1[0][0] );
sad -= absDc;
sad += absDc >> 2;
#endif
sad = ( ( sad + 2 ) >> 2 );
return sad;
}
......@@ -725,6 +737,9 @@ static uint32_t xCalcHAD16x8_SSE( const Torg *piOrg, const Tcur *piCur, const in
// 4 x 8x4 blocks
// 0 1
// 2 3
#if JVET_R0164_MEAN_SCALED_SATD
uint32_t absDc = 0;
#endif
// transpose and do horizontal in two steps
for( int l = 0; l < 2; l++ )
......@@ -841,6 +856,11 @@ static uint32_t xCalcHAD16x8_SSE( const Torg *piOrg, const Tcur *piCur, const in
n1[14] = _mm_abs_epi32( _mm_add_epi32( n2[14], n2[15] ) );
n1[15] = _mm_abs_epi32( _mm_sub_epi32( n2[14], n2[15] ) );
#if JVET_R0164_MEAN_SCALED_SATD
if (l == 0)
absDc = _mm_cvtsi128_si32( n1[0] );
#endif
// sum up
n1[0] = _mm_add_epi32( n1[0], n1[1] );
n1[2] = _mm_add_epi32( n1[2], n1[3] );
......@@ -868,7 +888,11 @@ static uint32_t xCalcHAD16x8_SSE( const Torg *piOrg, const Tcur *piCur, const in
uint32_t sad = _mm_cvtsi128_si32( iSum );
sad = (uint32_t)(sad / sqrt(16.0 * 8) * 2);
#if JVET_R0164_MEAN_SCALED_SATD
sad -= absDc;
sad += absDc >> 2;
#endif
sad = (uint32_t)(sad / sqrt(16.0 * 8) * 2);
return sad;
}
......@@ -984,6 +1008,10 @@ static uint32_t xCalcHAD8x16_SSE( const Torg *piOrg, const Tcur *piCur, const in
}
}
#if JVET_R0164_MEAN_SCALED_SATD
uint32_t absDc = 0;
#endif
for( int l = 0; l < 2; l++ )
{
int off = l * 8;
......@@ -1028,6 +1056,11 @@ static uint32_t xCalcHAD8x16_SSE( const Torg *piOrg, const Tcur *piCur, const in
n1[i][5] = _mm_abs_epi32( _mm_sub_epi32( n2[i][4], n2[i][5] ) );
n1[i][6] = _mm_abs_epi32( _mm_add_epi32( n2[i][6], n2[i][7] ) );
n1[i][7] = _mm_abs_epi32( _mm_sub_epi32( n2[i][6], n2[i][7] ) );
#if JVET_R0164_MEAN_SCALED_SATD
if ( l + i == 0 )
absDc = _mm_cvtsi128_si32( n1[i][0] );
#endif
}
for( int i = 0; i < 8; i++ )
......@@ -1050,7 +1083,11 @@ static uint32_t xCalcHAD8x16_SSE( const Torg *piOrg, const Tcur *piCur, const in
uint32_t sad = _mm_cvtsi128_si32( iSum );
sad = (uint32_t)(sad / sqrt(16.0 * 8) * 2);
#if JVET_R0164_MEAN_SCALED_SATD
sad -= absDc;
sad += absDc >> 2;
#endif
sad = (uint32_t)(sad / sqrt(16.0 * 8) * 2);
return sad;
}
......@@ -1177,6 +1214,9 @@ static uint32_t xCalcHAD8x4_SSE( const Torg *piOrg, const Tcur *piCur, const int
}
}
#if JVET_R0164_MEAN_SCALED_SATD
uint32_t absDc = _mm_cvtsi128_si32( m1[0] );
#endif
m1[0] = _mm_add_epi32( m1[0], m1[1] );
m1[1] = _mm_add_epi32( m1[2], m1[3] );
......@@ -1193,7 +1233,11 @@ static uint32_t xCalcHAD8x4_SSE( const Torg *piOrg, const Tcur *piCur, const int
uint32_t sad = _mm_cvtsi128_si32( iSum );
//sad = ((sad + 2) >> 2);
sad = (uint32_t)(sad / sqrt(4.0 * 8) * 2);
#if JVET_R0164_MEAN_SCALED_SATD
sad -= absDc;
sad += absDc >> 2;
#endif
sad = (uint32_t)(sad / sqrt(4.0 * 8) * 2);
return sad;
}
......@@ -1261,6 +1305,10 @@ static uint32_t xCalcHAD4x8_SSE( const Torg *piOrg, const Tcur *piCur, const int
m2[3] = _mm_unpackhi_epi64( m1[1], m1[3] );
}
#if JVET_R0164_MEAN_SCALED_SATD
uint32_t absDc = 0;
#endif
if( iBitDepth >= 10 /*sizeof( Torg ) > 1 || sizeof( Tcur ) > 1*/ )
{
__m128i n1[4][2];
......@@ -1288,6 +1336,10 @@ static uint32_t xCalcHAD4x8_SSE( const Torg *piOrg, const Tcur *piCur, const int
{
m1[i] = _mm_add_epi32( n1[i][0], n1[i][1] );
}
#if JVET_R0164_MEAN_SCALED_SATD
absDc = _mm_cvtsi128_si32( n1[0][0] );
#endif
}
else
{
......@@ -1310,6 +1362,10 @@ static uint32_t xCalcHAD4x8_SSE( const Torg *piOrg, const Tcur *piCur, const int
ma2 = _mm_unpackhi_epi16( m2[i], vzero );
m1[i] = _mm_add_epi32( ma1, ma2 );
}
#if JVET_R0164_MEAN_SCALED_SATD
absDc = _mm_cvtsi128_si32( m2[0] ) & 0x0000ffff;
#endif
}
m1[0] = _mm_add_epi32( m1[0], m1[1] );
......@@ -1323,7 +1379,11 @@ static uint32_t xCalcHAD4x8_SSE( const Torg *piOrg, const Tcur *piCur, const int
uint32_t sad = _mm_cvtsi128_si32( iSum );
//sad = ((sad + 2) >> 2);
sad = (uint32_t)(sad / sqrt(4.0 * 8) * 2);
#if JVET_R0164_MEAN_SCALED_SATD
sad -= absDc;
sad += absDc >> 2;
#endif
sad = (uint32_t)(sad / sqrt(4.0 * 8) * 2);
return sad;
}
......@@ -1462,6 +1522,11 @@ static uint32_t xCalcHAD16x16_AVX2( const Torg *piOrg, const Tcur *piCur, const
m2[i][7] = _mm256_abs_epi32( _mm256_sub_epi32( m1[i][6], m1[i][7] ) );
}
#if JVET_R0164_MEAN_SCALED_SATD
uint32_t absDc0 = _mm_cvtsi128_si32( _mm256_castsi256_si128( m2[0][0] ) );
uint32_t absDc1 = _mm_cvtsi128_si32( _mm256_castsi256_si128( _mm256_permute2x128_si256( m2[0][0], m2[0][0], 0x11 ) ) );
#endif
for( int i = 0; i < 8; i++ )
{
m1[0][i] = _mm256_add_epi32( m2[0][i], m2[1][i] );
......@@ -1481,12 +1546,20 @@ static uint32_t xCalcHAD16x16_AVX2( const Torg *piOrg, const Tcur *piCur, const
iSum = _mm256_hadd_epi32( iSum, iSum );
uint32_t tmp;
tmp = _mm_cvtsi128_si32( _mm256_castsi256_si128( iSum ) );
tmp = ( ( tmp + 2 ) >> 2 );
tmp = _mm_cvtsi128_si32( _mm256_castsi256_si128( iSum ) );
#if JVET_R0164_MEAN_SCALED_SATD
tmp -= absDc0;
tmp += absDc0 >> 2;
#endif
tmp = ( ( tmp + 2 ) >> 2 );
sad += tmp;
tmp = _mm_cvtsi128_si32( _mm256_castsi256_si128( _mm256_permute2x128_si256( iSum, iSum, 0x11 ) ) );
tmp = ( ( tmp + 2 ) >> 2 );
tmp = _mm_cvtsi128_si32( _mm256_castsi256_si128( _mm256_permute2x128_si256( iSum, iSum, 0x11 ) ) );
#if JVET_R0164_MEAN_SCALED_SATD