diff --git a/source/App/EncoderApp/EncApp.cpp b/source/App/EncoderApp/EncApp.cpp index a2881df31554040e244d372d2ca78c24a3912e29..6713cb839e84e1f97994f5ff32603b8f670e9489 100644 --- a/source/App/EncoderApp/EncApp.cpp +++ b/source/App/EncoderApp/EncApp.cpp @@ -1256,6 +1256,9 @@ void EncApp::xInitLibCfg() #endif #if JVET_AH0057_CCALF_COEFF_PRECISION m_cEncLib.setUseCCALFPrecision ( m_ccalfPrecision ); +#endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + m_cEncLib.setAlfLumaFixedFilterAdjust ( m_alfLumaFixedFilterAdjust ); #endif m_cEncLib.setTestSAODisableAtPictureLevel ( m_bTestSAODisableAtPictureLevel ); m_cEncLib.setSaoEncodingRate ( m_saoEncodingRate ); diff --git a/source/App/EncoderApp/EncAppCfg.cpp b/source/App/EncoderApp/EncAppCfg.cpp index 737460da0d59fcf4437eb1a11f688ece2af21650..49aa88e1dbec1b60d4ee66748bfa6f026006bc3d 100644 --- a/source/App/EncoderApp/EncAppCfg.cpp +++ b/source/App/EncoderApp/EncAppCfg.cpp @@ -1567,6 +1567,9 @@ bool EncAppCfg::parseCfg( int argc, char* argv[] ) #endif #if JVET_AH0057_CCALF_COEFF_PRECISION ("CCALFPrecision", m_ccalfPrecision, true, "Cross-component Alf with variable precision coefficients") +#endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + ("AlfLumaFixedFilterAdjust", m_alfLumaFixedFilterAdjust, true, "Alf Luma Fixed Filter Adjustment" ) #endif ("TestSAODisableAtPictureLevel", m_bTestSAODisableAtPictureLevel, false, "Enables the testing of disabling SAO at the picture level after having analysed all blocks") ("SaoEncodingRate", m_saoEncodingRate, 0.75, "When >0 SAO early picture termination is enabled for luma and chroma") diff --git a/source/App/EncoderApp/EncAppCfg.h b/source/App/EncoderApp/EncAppCfg.h index 9a478c16a0901ac17536b60ee3dbf14f38c0769d..2b1b81859dbbe74e8f9f24fbd01cc41e9871808d 100644 --- a/source/App/EncoderApp/EncAppCfg.h +++ b/source/App/EncoderApp/EncAppCfg.h @@ -756,6 +756,9 @@ protected: #endif #if JVET_AH0057_CCALF_COEFF_PRECISION bool m_ccalfPrecision; +#endif +#if 
JVET_AJ0188_CODING_INFO_CLASSIFICATION + bool m_alfLumaFixedFilterAdjust; #endif bool m_bTestSAODisableAtPictureLevel; double m_saoEncodingRate; ///< When >0 SAO early picture termination is enabled for luma and chroma diff --git a/source/Lib/CommonLib/AdaptiveLoopFilter.cpp b/source/Lib/CommonLib/AdaptiveLoopFilter.cpp index 4a8a3c36089e9cc4441f4dead5b38e76c28f1669..ceaa4d3d184ca1d214f7bc3878b571862737a2b8 100644 --- a/source/Lib/CommonLib/AdaptiveLoopFilter.cpp +++ b/source/Lib/CommonLib/AdaptiveLoopFilter.cpp @@ -64,6 +64,12 @@ AdaptiveLoopFilter::AdaptiveLoopFilter() { m_classifier[i] = nullptr; } +#endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + for( int i = 0; i < 1; i++ ) + { + m_classifierCodingInfo[i] = nullptr; + } #endif for (size_t i = 0; i < NUM_DIRECTIONS; i++) { @@ -163,6 +169,10 @@ AdaptiveLoopFilter::AdaptiveLoopFilter() #else m_deriveClassificationBlk = deriveClassificationBlk; #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + m_textureClassMapping = textureClassMapping; + m_calcAlfLumaCodingInfoBlk = calcAlfLumaCodingInfoBlk; +#endif #if ENABLE_SIMD_OPT_ALF #ifdef TARGET_SIMD_X86 @@ -671,6 +681,9 @@ void AdaptiveLoopFilter::ALFProcess(CodingStructure& cs) #endif #if JVET_AE0139_ALF_IMPROVED_FIXFILTER memset(m_ctuPadFlag, 0, sizeof(uint8_t) * m_numCTUsInPic); +#endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + PelUnitBuf tmpYuvCodingInfo = m_tempBufCodingInfo.getBuf( cs.area ); #endif const PreCalcValues& pcv = *cs.pcv; @@ -769,6 +782,23 @@ void AdaptiveLoopFilter::ALFProcess(CodingStructure& cs) #endif buf = buf.subBuf( UnitArea( cs.area.chromaFormat, Area( clipL ? 0 : MAX_ALF_PADDING_SIZE, clipT ? 0 : MAX_ALF_PADDING_SIZE, w, h ) ) ); +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + PelUnitBuf bufCodingInfo = m_tempBufCodingInfo2.subBuf( UnitArea( CHROMA_400, Area( 0, 0, wBuf, hBuf ) ) ); + bufCodingInfo.copyFrom( tmpYuvCodingInfo.subBuf( UnitArea( CHROMA_400, Area( xStart - ( clipL ? 0 : MAX_ALF_PADDING_SIZE ), yStart - ( clipT ? 
0 : MAX_ALF_PADDING_SIZE ), wBuf, hBuf ) ) ) ); + // pad top-left unavailable samples for raster slice + if( xStart == xPos && yStart == yPos && ( rasterSliceAlfPad & 1 ) ) + { + bufCodingInfo.padBorderPel( MAX_ALF_PADDING_SIZE, 1 ); + } + + // pad bottom-right unavailable samples for raster slice + if( xEnd == xPos + width && yEnd == yPos + height && ( rasterSliceAlfPad & 2 ) ) + { + bufCodingInfo.padBorderPel( MAX_ALF_PADDING_SIZE, 2 ); + } + mirroredPaddingForAlf(cs, bufCodingInfo, MAX_ALF_PADDING_SIZE, true, false); + bufCodingInfo = bufCodingInfo.subBuf( UnitArea( CHROMA_400, Area( clipL ? 0 : MAX_ALF_PADDING_SIZE, clipT ? 0 : MAX_ALF_PADDING_SIZE, w, h ) ) ); +#endif #if JVET_AC0162_ALF_RESIDUAL_SAMPLES_INPUT #if JVET_AF0197_LUMA_RESIDUAL_TAP_IN_CCALF PelUnitBuf bufResi = m_tempBufResi2.subBuf(UnitArea(CHROMA_400, Area(0, 0, wBuf, hBuf))); @@ -817,6 +847,9 @@ void AdaptiveLoopFilter::ALFProcess(CodingStructure& cs) bufDb.extendBorderPel(NUM_DB_PAD); bufDb = bufDb.subBuf(UnitArea(CHROMA_400, Area(clipL ? 0 : NUM_DB_PAD, clipT ? 
0 : NUM_DB_PAD, w, h))); #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + calcAlfLumaCodingInfoBlk(cs, m_classifierCodingInfo[0], blkDst, blkSrc, buf.get(COMPONENT_Y), 2, 2, m_inputBitDepth[CHANNEL_TYPE_LUMA], bufResi.get(COMPONENT_Y), m_laplacian[0], bufCodingInfo.get(COMPONENT_Y) ); +#endif #if JVET_X0071_ALF_BAND_CLASSIFIER deriveClassification( m_classifier, buf.get(COMPONENT_Y), #if JVET_AC0162_ALF_RESIDUAL_SAMPLES_INPUT @@ -1213,6 +1246,9 @@ void AdaptiveLoopFilter::ALFProcess(CodingStructure& cs) { Area blk( xPos, yPos, width, height ); short filterSetIndex = alfCtuFilterIndex[ctuIdx]; +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + calcAlfLumaCodingInfoBlk(cs, m_classifierCodingInfo[0], blk, blk, recYuv.get(COMPONENT_Y), 2, 2, m_inputBitDepth[CHANNEL_TYPE_LUMA], tmpYuvResi.get(COMPONENT_Y), m_laplacian[0], tmpYuvCodingInfo.get(COMPONENT_Y) ); +#endif #if JVET_X0071_ALF_BAND_CLASSIFIER deriveClassification( m_classifier, tmpYuv.get(COMPONENT_Y), #if JVET_AC0162_ALF_RESIDUAL_SAMPLES_INPUT @@ -1232,13 +1268,21 @@ void AdaptiveLoopFilter::ALFProcess(CodingStructure& cs) if( filterSetIndex != 0 ) { #if JVET_AG0157_ALF_CHROMA_FIXED_FILTER - deriveFixedFilterResultsCtuBoundary( m_classifier, m_fixFilterResult[COMPONENT_Y], tmpYuv.get( COMPONENT_Y ), tmpYuvBeforeDb.get( COMPONENT_Y ), blk, m_inputBitDepth[CHANNEL_TYPE_LUMA], cs, m_clpRngs.comp[COMPONENT_Y], m_alfClippingValues[CHANNEL_TYPE_LUMA], cs.slice->getSliceQp(), fixedFilterSetIdx, m_mappingDir, m_laplacian, m_ctuEnableFlag[COMPONENT_Y], m_ctuEnableOnlineLumaFlag, ctuIdx, 0 ); + deriveFixedFilterResultsCtuBoundary( m_classifier, m_fixFilterResult[COMPONENT_Y], tmpYuv.get( COMPONENT_Y ), tmpYuvBeforeDb.get( COMPONENT_Y ), blk, m_inputBitDepth[CHANNEL_TYPE_LUMA], cs, m_clpRngs.comp[COMPONENT_Y], m_alfClippingValues[CHANNEL_TYPE_LUMA], cs.slice->getSliceQp(), fixedFilterSetIdx, m_mappingDir, m_laplacian, m_ctuEnableFlag[COMPONENT_Y], m_ctuEnableOnlineLumaFlag, ctuIdx, 0 +#if 
JVET_AJ0188_CODING_INFO_CLASSIFICATION + , tmpYuvCodingInfo.get(COMPONENT_Y), tmpYuvResi.get( COMPONENT_Y ) +#endif + ); #else deriveFixedFilterResultsCtuBoundary( m_classifier, m_fixFilterResult, tmpYuv.get( COMPONENT_Y ), tmpYuvBeforeDb.get( COMPONENT_Y ), blk, m_inputBitDepth[CHANNEL_TYPE_LUMA], cs, m_clpRngs.comp[COMPONENT_Y], m_alfClippingValues[CHANNEL_TYPE_LUMA], cs.slice->getSliceQp(), fixedFilterSetIdx, m_mappingDir, m_laplacian, m_ctuEnableFlag[COMPONENT_Y], m_ctuEnableOnlineLumaFlag, ctuIdx, 0 ); #endif deriveFixedFilterResults( m_classifier, tmpYuv.get( COMPONENT_Y ), m_tempBufBeforeDb.get( COMPONENT_Y ), blk, blk, cs, 1, fixedFilterSetIdx ); #if JVET_AG0157_ALF_CHROMA_FIXED_FILTER - deriveFixedFilterResultsCtuBoundary( m_classifier, m_fixFilterResult[COMPONENT_Y], tmpYuv.get( COMPONENT_Y ), tmpYuvBeforeDb.get( COMPONENT_Y ), blk, m_inputBitDepth[CHANNEL_TYPE_LUMA], cs, m_clpRngs.comp[COMPONENT_Y], m_alfClippingValues[CHANNEL_TYPE_LUMA], cs.slice->getSliceQp(), fixedFilterSetIdx, m_mappingDir, m_laplacian, m_ctuEnableFlag[COMPONENT_Y], m_ctuEnableOnlineLumaFlag, ctuIdx, 1 ); + deriveFixedFilterResultsCtuBoundary( m_classifier, m_fixFilterResult[COMPONENT_Y], tmpYuv.get( COMPONENT_Y ), tmpYuvBeforeDb.get( COMPONENT_Y ), blk, m_inputBitDepth[CHANNEL_TYPE_LUMA], cs, m_clpRngs.comp[COMPONENT_Y], m_alfClippingValues[CHANNEL_TYPE_LUMA], cs.slice->getSliceQp(), fixedFilterSetIdx, m_mappingDir, m_laplacian, m_ctuEnableFlag[COMPONENT_Y], m_ctuEnableOnlineLumaFlag, ctuIdx, 1 +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , tmpYuvCodingInfo.get(COMPONENT_Y), tmpYuvResi.get( COMPONENT_Y ) +#endif + ); #else deriveFixedFilterResultsCtuBoundary( m_classifier, m_fixFilterResult, tmpYuv.get( COMPONENT_Y ), tmpYuvBeforeDb.get( COMPONENT_Y ), blk, m_inputBitDepth[CHANNEL_TYPE_LUMA], cs, m_clpRngs.comp[COMPONENT_Y], m_alfClippingValues[CHANNEL_TYPE_LUMA], cs.slice->getSliceQp(), fixedFilterSetIdx, m_mappingDir, m_laplacian, m_ctuEnableFlag[COMPONENT_Y], 
m_ctuEnableOnlineLumaFlag, ctuIdx, 1 ); #endif @@ -1842,6 +1886,12 @@ void AdaptiveLoopFilter::create(const int picWidth, const int picHeight, const C m_tempBufSAO2.destroy(); m_tempBufSAO2.create(format, Area(0, 0, maxCUWidth + (MAX_ALF_PADDING_SIZE << 1), maxCUHeight + (MAX_ALF_PADDING_SIZE << 1)), maxCUWidth, MAX_ALF_PADDING_SIZE, 0, false); #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + m_tempBufCodingInfo.destroy(); + m_tempBufCodingInfo.create(CHROMA_400, Area(0, 0, picWidth, picHeight), maxCUWidth, MAX_FILTER_LENGTH_FIXED, 0, false); + m_tempBufCodingInfo2.destroy(); + m_tempBufCodingInfo2.create(CHROMA_400, Area( 0, 0, maxCUWidth + (MAX_ALF_PADDING_SIZE << 1), maxCUHeight + (MAX_ALF_PADDING_SIZE << 1) ), maxCUWidth, MAX_ALF_PADDING_SIZE, 0, false ); +#endif #if ALF_IMPROVEMENT int numFixedFilters = EXT_LENGTH << 1; @@ -2142,6 +2192,21 @@ void AdaptiveLoopFilter::create(const int picWidth, const int picHeight, const C } } #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + for( int classifier = 0; classifier < 1; classifier++ ) + { + if( m_classifierCodingInfo[classifier] == nullptr ) + { + m_classifierCodingInfo[classifier] = new AlfClassifier*[picHeight]; + m_classifierCodingInfo[classifier][0] = new AlfClassifier[picWidth * picHeight]; + + for( int i = 1; i < picHeight; i++ ) + { + m_classifierCodingInfo[classifier][i] = m_classifierCodingInfo[classifier][0] + i * picWidth; + } + } + } +#endif #if !ALF_IMPROVEMENT for (int filterSetIndex = 0; filterSetIndex < NUM_FIXED_FILTER_SETS; filterSetIndex++) { @@ -2199,6 +2264,17 @@ void AdaptiveLoopFilter::destroy() m_classifier = nullptr; } #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + for (int classifier = 0; classifier < 1; classifier++) + { + if (m_classifierCodingInfo[classifier] ) + { + delete[] m_classifierCodingInfo[classifier][0]; + delete[] m_classifierCodingInfo[classifier]; + m_classifierCodingInfo[classifier] = nullptr; + } + } +#endif #if ALF_IMPROVEMENT #if 
JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS @@ -2391,6 +2467,10 @@ void AdaptiveLoopFilter::destroy() #if JVET_AI0166_CCALF_CHROMA_SAO_INPUT m_tempBufSAO.destroy(); m_tempBufSAO2.destroy(); +#endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + m_tempBufCodingInfo.destroy(); + m_tempBufCodingInfo2.destroy(); #endif m_filterShapes[CHANNEL_TYPE_LUMA].clear(); m_filterShapes[CHANNEL_TYPE_CHROMA].clear(); @@ -3127,7 +3207,11 @@ void AdaptiveLoopFilter::deriveFixFilterResultsBlkChroma(AlfClassifier ***classi { if (fixedFiltSetInd == targetFixedFilterSetInd || targetFixedFilterSetInd == -1) { - alfFixedFilterBlk(classifier[ALF_NUM_CLASSIFIER + 1], src, blk, blkDst, srcBeforeDb, fixedFilterResults, m_picWidth, fixedFiltInd, fixedFiltSetInd, 0, clpRng, clippingValues, false); + alfFixedFilterBlk(classifier[ALF_NUM_CLASSIFIER + 1], src, blk, blkDst, srcBeforeDb, fixedFilterResults, m_picWidth, fixedFiltInd, fixedFiltSetInd, 0, clpRng, clippingValues, false +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , cs +#endif + ); } fixedFiltInd++; } @@ -3485,54 +3569,89 @@ void AdaptiveLoopFilter::calcClass0Var( AlfClassifier **classifier, const Area & { for( int jj = posXDst + j; jj < posXDst + j + subBlkSize; jj++ ) { + classifier[ii][jj] = (actDirInd << 2) + transposeIdx; + } + } #else for( int ii = curBlk.y + i; ii < curBlk.y + i + subBlkSize; ii++ ) { for( int jj = curBlk.x + j; jj < curBlk.x + j + subBlkSize; jj++ ) { -#endif classifier[ii][jj] = (actDirInd << 2) + transposeIdx; } } +#endif } } } #if JVET_AG0157_ALF_CHROMA_FIXED_FILTER -void AdaptiveLoopFilter::alfFixedFilterBlkNonSimd( AlfClassifier **classifier, const CPelBuf &src, const Area &curBlk, const Area &blkDst, const CPelBuf &srcBeforeDb, Pel ***fixedFilterResults, int picWidth, const int fixedFiltInd, int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4], bool isLuma ) +void AdaptiveLoopFilter::alfFixedFilterBlkNonSimd( AlfClassifier **classifier, const CPelBuf &src, const Area 
&curBlk, const Area &blkDst, const CPelBuf &srcBeforeDb, Pel ***fixedFilterResults, int picWidth, const int fixedFiltInd, int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4], bool isLuma +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , CodingStructure &cs +#endif + ) { if( isLuma ) { if( dirWindSize == 0 ) { - fixedFilterBlk<ALF_FIXED_FILTER_9_DB_9>( classifier, src, curBlk, blkDst, srcBeforeDb, fixedFilterResults, picWidth, fixedFiltInd, m_classIdnFixedFilter9Db9[fixedFiltQpInd], fixedFiltQpInd, dirWindSize, clpRng, clippingValues ); + fixedFilterBlk<ALF_FIXED_FILTER_9_DB_9>( classifier, src, curBlk, blkDst, srcBeforeDb, fixedFilterResults, picWidth, fixedFiltInd, m_classIdnFixedFilter9Db9[fixedFiltQpInd], fixedFiltQpInd, dirWindSize, clpRng, clippingValues +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , isLuma, cs, m_classifierCodingInfo[0] +#endif + ); } else { - fixedFilterBlk<ALF_FIXED_FILTER_13_DB_9>( classifier, src, curBlk, blkDst, srcBeforeDb, fixedFilterResults, picWidth, fixedFiltInd, m_classIdnFixedFilter13Db9[fixedFiltQpInd], fixedFiltQpInd, dirWindSize, clpRng, clippingValues ); + fixedFilterBlk<ALF_FIXED_FILTER_13_DB_9>( classifier, src, curBlk, blkDst, srcBeforeDb, fixedFilterResults, picWidth, fixedFiltInd, m_classIdnFixedFilter13Db9[fixedFiltQpInd], fixedFiltQpInd, dirWindSize, clpRng, clippingValues +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , isLuma, cs, m_classifierCodingInfo[0] +#endif + ); } } else { - fixedFilterBlk<ALF_FIXED_FILTER_9_DB_9>(classifier, src, curBlk, blkDst, srcBeforeDb, fixedFilterResults, picWidth, fixedFiltInd, m_classIdnFixedFilter9Db9[fixedFiltQpInd], fixedFiltQpInd, dirWindSize, clpRng, clippingValues); + fixedFilterBlk<ALF_FIXED_FILTER_9_DB_9>(classifier, src, curBlk, blkDst, srcBeforeDb, fixedFilterResults, picWidth, fixedFiltInd, m_classIdnFixedFilter9Db9[fixedFiltQpInd], fixedFiltQpInd, dirWindSize, clpRng, clippingValues +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , isLuma, cs, 
m_classifierCodingInfo[0] +#endif + ); } } -void AdaptiveLoopFilter::alfFixedFilterBlk( AlfClassifier **classifier, const CPelBuf &src, const Area &curBlk, const Area &blkDst, const CPelBuf &srcBeforeDb, Pel ***fixedFilterResults, int picWidth, const int fixedFiltInd, int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4], bool isLuma ) +void AdaptiveLoopFilter::alfFixedFilterBlk( AlfClassifier **classifier, const CPelBuf &src, const Area &curBlk, const Area &blkDst, const CPelBuf &srcBeforeDb, Pel ***fixedFilterResults, int picWidth, const int fixedFiltInd, int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4], bool isLuma +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , CodingStructure &cs +#endif + ) { if( isLuma ) { if( dirWindSize == 0 ) { - m_fixFilter9x9Db9Blk( classifier, src, curBlk, blkDst, srcBeforeDb, fixedFilterResults, picWidth, fixedFiltInd, m_classIdnFixedFilter9Db9[fixedFiltQpInd], fixedFiltQpInd, dirWindSize, clpRng, clippingValues ); + m_fixFilter9x9Db9Blk( classifier, src, curBlk, blkDst, srcBeforeDb, fixedFilterResults, picWidth, fixedFiltInd, m_classIdnFixedFilter9Db9[fixedFiltQpInd], fixedFiltQpInd, dirWindSize, clpRng, clippingValues +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , isLuma, cs, m_classifierCodingInfo[0] +#endif + ); } else { - m_fixFilter13x13Db9Blk( classifier, src, curBlk, blkDst, srcBeforeDb, fixedFilterResults, picWidth, fixedFiltInd, m_classIdnFixedFilter13Db9[fixedFiltQpInd], fixedFiltQpInd, dirWindSize, clpRng, clippingValues ); + m_fixFilter13x13Db9Blk( classifier, src, curBlk, blkDst, srcBeforeDb, fixedFilterResults, picWidth, fixedFiltInd, m_classIdnFixedFilter13Db9[fixedFiltQpInd], fixedFiltQpInd, dirWindSize, clpRng, clippingValues +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , isLuma, cs, m_classifierCodingInfo[0] +#endif + ); } } else { - m_fixFilter9x9Db9Blk( classifier, src, curBlk, blkDst, srcBeforeDb, fixedFilterResults, picWidth, fixedFiltInd, 
m_classIdnFixedFilter9Db9[fixedFiltQpInd], fixedFiltQpInd, dirWindSize, clpRng, clippingValues ); + m_fixFilter9x9Db9Blk( classifier, src, curBlk, blkDst, srcBeforeDb, fixedFilterResults, picWidth, fixedFiltInd, m_classIdnFixedFilter9Db9[fixedFiltQpInd], fixedFiltQpInd, dirWindSize, clpRng, clippingValues +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , isLuma, cs, m_classifierCodingInfo[0] +#endif + ); } } #else @@ -3562,7 +3681,11 @@ void AdaptiveLoopFilter::alfFixedFilterBlk( AlfClassifier **classifier, const CP template<AlfFixedFilterType filtType> #if JVET_AG0157_ALF_CHROMA_FIXED_FILTER -void AdaptiveLoopFilter::fixedFilterBlk( AlfClassifier **classifier, const CPelBuf &src, const Area &curBlk, const Area &blkDst, const CPelBuf &srcBeforeDb, Pel ***fixedFilterResults, int picWidth, const int fixedFiltInd, const short classIndFixed[NUM_CLASSES_FIX], int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4] ) +void AdaptiveLoopFilter::fixedFilterBlk( AlfClassifier **classifier, const CPelBuf &src, const Area &curBlk, const Area &blkDst, const CPelBuf &srcBeforeDb, Pel ***fixedFilterResults, int picWidth, const int fixedFiltInd, const short classIndFixed[NUM_CLASSES_FIX], int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4] +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , bool applyCodingInfo, CodingStructure &cs, AlfClassifier** classifierCodingInfo +#endif + ) #else void AdaptiveLoopFilter::fixedFilterBlk( AlfClassifier **classifier, const CPelBuf &srcLuma, const Area &curBlk, const Area &blkDst, const CPelBuf &srcLumaBeforeDb, Pel ***fixedFilterResults, int picWidth, const int fixedFiltInd, const short classIndFixed[NUM_CLASSES_FIX], int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4] ) #endif @@ -3604,6 +3727,15 @@ void AdaptiveLoopFilter::fixedFilterBlk( AlfClassifier **classifier, const CPelB const Pel *pImgYBeforeDbPad0 = srcLumaBeforeDb.buf + posY * 
srcBeforeDbStride + posX; #endif const int srcBeforeDbStride2 = srcBeforeDbStride * clsSizeY; +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + const bool isIntraSlice = cs.slice->isIntra(); + const bool isSpsAdjust = cs.sps->getAlfLumaFixedFilterAdjust(); + const bool useCodingInfo = true; + + const bool useBounCondition = applyCodingInfo && !( !isSpsAdjust && isIntraSlice ) && useCodingInfo; + const bool useResiCondition = applyCodingInfo && !isIntraSlice && useCodingInfo; + const int offsetClipValue = 1 << ( clpRng.bd - 1 ); +#endif int fixedFiltIndF0 = -1; int numCoeff; @@ -3635,7 +3767,25 @@ void AdaptiveLoopFilter::fixedFilterBlk( AlfClassifier **classifier, const CPelB int classIdx = classifier[posY + i][posX + j] >> 2; int transposeIdx = classifier[posY + i][posX + j] & 0x3; #endif - int filterIdx = classIndFixed[classIdx]; + int filterIdx = classIndFixed[classIdx]; +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + int classIdxBs = 0; + if( useBounCondition ) + { + //0: BS0 Resi0, 1:BS0 Resi1, 2:BS1 Resi0 3:BS1 Resi1 + // A >> 1 = BS + // A & 1 = Resi + classIdxBs = classifierCodingInfo[posYDst + i][posXDst + j] >> 1; + } + int classIdxResi = 0; + if( useResiCondition ) + { + //0: BS0 Resi0, 1:BS0 Resi1, 2:BS1 Resi0 3:BS1 Resi1 + // A >> 1 = BS + // A & 1 = Resi + classIdxResi = classifierCodingInfo[posYDst + i][posXDst + j] & 1; + } +#endif const short* coeff; const short* clipp; @@ -3889,6 +4039,26 @@ void AdaptiveLoopFilter::fixedFilterBlk( AlfClassifier **classifier, const CPelB CHECK( 1, "not supported" ); } sum = ( sum + offset ) >> shift; +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + if( useBounCondition ) + { + sum = Clip3(-offsetClipValue, +offsetClipValue, sum); + + int sign = sum < 0 ? -1 : +1; + + int boundaryStrengthFactor = isIntraSlice ? 4 : 3; + sum = classIdxBs ? 
sign * ((abs(sum) * (16 + boundaryStrengthFactor) + 8 ) >> 4) : sum; + } + + if( useResiCondition ) + { + sum = Clip3(-offsetClipValue, +offsetClipValue, sum); + + int sign = sum < 0 ? -1 : +1; + int resiStrengthFactor = 3 >> ( !isSpsAdjust ? 1 : 0 ); /* isIntraSlice is always false here: useResiCondition requires !isIntraSlice */ + sum = classIdxResi ? sign * ((abs(sum) * (16 + resiStrengthFactor) + 8 ) >> 4) : sum; + } +#endif sum += curr; #if JVET_Z0105_LOOP_FILTER_VIRTUAL_BOUNDARY #if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS @@ -3918,6 +4088,9 @@ void AdaptiveLoopFilter::fixedFilterBlk( AlfClassifier **classifier, const CPelB void AdaptiveLoopFilter::calcClassNew( AlfClassifier **classifier, const Area &blkDst, const Area &curBlk, const CPelBuf& srcLuma, int subBlkSize, AlfClassifier **classifier0, int classifierIdx, int bitDepth #if JVET_AD0222_ALF_RESI_CLASS , const CPelBuf& srcLumaResi, uint32_t **buffer +#endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , AlfClassifier ** classifierCodingInfo #endif ) { @@ -3939,7 +4112,21 @@ void AdaptiveLoopFilter::calcClassNew( AlfClassifier **classifier, const Area &b const Pel *pY0 = src0 + xOffset; const Pel *pY1 = src1 + xOffset; int sum = pY0[0] + pY0[1] + pY1[0] + pY1[1]; +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + int boundShift = classifierCodingInfo[curBlk.y + i][curBlk.x + j ] >> 1; + + int classIdx = 0; + if( boundShift == 0 ) + { + classIdx = ( sum * 12 ) >> (bitDepth + 2); + } + else + { + classIdx = 12 + (( sum * 12 ) >> (bitDepth + 2)); + } +#else int classIdx = (sum * ALF_NUM_CLASSES_CLASSIFIER[classifierIdx]) >> (bitDepth + 2); +#endif for (int ii = curBlk.y + i; ii < curBlk.y + i + subBlkSize; ii++) { for (int jj = curBlk.x + j; jj < curBlk.x + j + subBlkSize; jj++) @@ -3996,11 +4183,34 @@ void AdaptiveLoopFilter::calcClassNew( AlfClassifier **classifier, const Area &b int j2 = j >> 1; int sum = buffer[i2][j2] + buffer[i2][j2 + 2] + buffer[i2 + 2][j2] + buffer[i2 + 2][j2 + 2]; int shiftOffset = ALF_RESI_SHIFT_OFFSET; +#if 
JVET_AJ0188_CODING_INFO_CLASSIFICATION + int boundShift = classifierCodingInfo[curBlk.y + i][curBlk.x + j] >> 1; + + int classIdx = sum >> (bitDepth - shiftOffset); + // Merge Neighbor Class, Then Clip + classIdx >>= 1; + if( boundShift == 0) + { + if( classIdx > 12 - 1) + { + classIdx = 12 - 1; + } + } + else + { + if( classIdx > 12 - 1) + { + classIdx = 12 - 1; + } + classIdx += 12; + } +#else int classIdx = sum >> (bitDepth - shiftOffset); if (classIdx > 24) { classIdx = 24; } +#endif for (int ii = curBlk.y + i; ii < curBlk.y + i + subBlkSize; ii++) { for (int jj = curBlk.x + j; jj < curBlk.x + j + subBlkSize; jj++) @@ -4223,15 +4433,18 @@ void AdaptiveLoopFilter::calcClass(AlfClassifier **classifier, const Area &blkDs { for (int jj = posXDst + j; jj < posXDst + j + subBlkSize; jj++) { + classifier[ii][jj] = (actDirInd << 2) + transposeIdx; + } + } #else for (int ii = curBlk.y + i; ii < curBlk.y + i + subBlkSize; ii++) { for (int jj = curBlk.x + j; jj < curBlk.x + j + subBlkSize; jj++) { -#endif classifier[ii][jj] = (actDirInd << 2) + transposeIdx; } } +#endif } } } @@ -4411,7 +4624,11 @@ void AdaptiveLoopFilter::fixedFilteringResi(AlfClassifier **classifier, const CP #endif Pel ***fixedFilterResiResults, int picWidth, const int fixedFiltInd, const short classIndFixed[NUM_CLASSES_FIX], int fixedFiltQpInd, - int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4]) + int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4] +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , bool applyCodingInfo, CodingStructure &cs, AlfClassifier** classifierCodingInfo +#endif + ) { const int shift = m_NUM_BITS_FIXED_FILTER - 1; const int offset = 1 << (shift - 1); @@ -4432,6 +4649,14 @@ void AdaptiveLoopFilter::fixedFilteringResi(AlfClassifier **classifier, const CP const int clsSizeY = 2; const int clsSizeX = 2; const int srcStride2 = srcStride * clsSizeY; +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + const bool isIntraSlice = cs.slice->isIntra(); + const bool 
isSpsAdjust = cs.sps->getAlfLumaFixedFilterAdjust(); + const bool useCodingInfo = isSpsAdjust; + const bool useBounCondition = applyCodingInfo && !( !isSpsAdjust && isIntraSlice ) && useCodingInfo; + const bool useResiCondition = applyCodingInfo && !isIntraSlice && useCodingInfo; + const int offsetClipValue = 1 << ( clpRng.bd - 1 ); +#endif for (int i = 0; i < curBlk.height; i += clsSizeY) { @@ -4445,6 +4670,25 @@ void AdaptiveLoopFilter::fixedFilteringResi(AlfClassifier **classifier, const CP int transposeIdx = classifier[posY + i][posX + j] & 0x3; #endif int filterIdx = classIndFixed[classIdx]; +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + int classIdxBs = 0; + if( useBounCondition ) + { + //0: BS0 Resi0, 1:BS0 Resi1, 2:BS1 Resi0 3:BS1 Resi1 + // A >> 1 = BS + // A & 1 = Resi + classIdxBs = classifierCodingInfo[posYDst + i][posXDst + j] >> 1; + } + + int classIdxResi = 0; + if( useResiCondition ) + { + //0: BS0 Resi0, 1:BS0 Resi1, 2:BS1 Resi0 3:BS1 Resi1 + // A >> 1 = BS + // A & 1 = Resi + classIdxResi = classifierCodingInfo[posYDst + i][posXDst + j] & 1; + } +#endif #if JVET_AE0139_ALF_IMPROVED_FIXFILTER std::array<short, FIX_FILTER_NUM_COEFF_9_DB_9> filterCoeff; std::array<short, FIX_FILTER_NUM_COEFF_9_DB_9> filterClipp; @@ -4656,6 +4900,25 @@ void AdaptiveLoopFilter::fixedFilteringResi(AlfClassifier **classifier, const CP #endif sum = (sum + offset) >> shift; +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + if( useBounCondition ) + { + sum = Clip3(-offsetClipValue, +offsetClipValue, sum); + + int sign = sum < 0 ? -1 : +1; + + int boundaryStrengthFactor = isIntraSlice ? 4 : 3; + sum = classIdxBs ? sign * ((abs(sum) * (16 + boundaryStrengthFactor) + 8 ) >> 4) : sum; + } + if( useResiCondition ) + { + sum = Clip3(-offsetClipValue, +offsetClipValue, sum); + + int sign = sum < 0 ? -1 : +1; + int resiStrengthFactor = isIntraSlice ? 0 >> (!isSpsAdjust ? 
1 : 0) : 3 >> (!isSpsAdjust ? 1 : 0); + sum = classIdxResi ? sign * ((abs(sum) * (16 + resiStrengthFactor) + 8 ) >> 4) : sum; + } +#endif sum += curr; #if JVET_AE0139_ALF_IMPROVED_FIXFILTER @@ -4901,7 +5164,11 @@ void AdaptiveLoopFilter::deriveClassificationAndFixFilterResultsBlk( AlfClassifi #if JVET_X0071_ALF_BAND_CLASSIFIER #if JVET_AE0139_ALF_IMPROVED_FIXFILTER #if JVET_AG0157_ALF_CHROMA_FIXED_FILTER - alfFixedFilterBlk( classifier[storeIdx], srcLuma, blkNew, blkDstNew, srcLumaBeforeDb, fixedFilterResults, m_picWidth, fixedFiltInd, fixedFiltSetInd, dirWindSize, clpRng, clippingValues, true ); + alfFixedFilterBlk( classifier[storeIdx], srcLuma, blkNew, blkDstNew, srcLumaBeforeDb, fixedFilterResults, m_picWidth, fixedFiltInd, fixedFiltSetInd, dirWindSize, clpRng, clippingValues, true +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , cs +#endif + ); #else alfFixedFilterBlk( classifier[storeIdx], srcLuma, blkNew, blkDstNew, srcLumaBeforeDb, fixedFilterResults, m_picWidth, fixedFiltInd, fixedFiltSetInd, dirWindSize, clpRng, clippingValues ); #endif @@ -4969,7 +5236,11 @@ void AdaptiveLoopFilter::deriveClassificationAndFixFilterResultsBlk( AlfClassifi if (bResiFixed && dirWindSize == 0) { #if JVET_AE0139_ALF_IMPROVED_FIXFILTER - m_filterResi9x9Blk(classifier[0], srcResiLuma, blk, blkDst, fixedFilterResiResults, m_picWidth, fixedFiltInd, m_classIdnFixedFilter9Db9[fixedFiltSetInd], fixedFiltSetInd, dirWindSize, clpRng, clippingValues); + m_filterResi9x9Blk(classifier[0], srcResiLuma, blk, blkDst, fixedFilterResiResults, m_picWidth, fixedFiltInd, m_classIdnFixedFilter9Db9[fixedFiltSetInd], fixedFiltSetInd, dirWindSize, clpRng, clippingValues +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , true, cs, m_classifierCodingInfo[0] +#endif + ); #else m_filterResi13x13Blk(classifier[0], srcResiLuma, #if JVET_Z0105_LOOP_FILTER_VIRTUAL_BOUNDARY @@ -5005,6 +5276,9 @@ void AdaptiveLoopFilter::deriveClassificationAndFixFilterResultsBlk( AlfClassifi #else m_calcClass1(classifier[0], 
blkDst, Area(blkDst.pos().x, blkDst.pos().y, blkDst.width, blkDst.height), 5, 0, 5, 5, bits, 2, mappingDir, laplacian); +#endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + m_textureClassMapping(classifier[0], blkDst, 0, 2, m_classifierCodingInfo[0] ); #endif } for( int curClassifierIdx = 1; curClassifierIdx < ALF_NUM_CLASSIFIER; curClassifierIdx++ ) @@ -5017,7 +5291,11 @@ void AdaptiveLoopFilter::deriveClassificationAndFixFilterResultsBlk( AlfClassifi { continue; } - m_calcClass2( classifier[curClassifierIdx], blk, Area( blkDst.pos().x, blkDst.pos().y, blkDst.width, blkDst.height ), srcLuma, 2, classifier[0], curClassifierIdx, bits, srcResiLuma, laplacian[0] ); + m_calcClass2( classifier[curClassifierIdx], blk, Area( blkDst.pos().x, blkDst.pos().y, blkDst.width, blkDst.height ), srcLuma, 2, classifier[0], curClassifierIdx, bits, srcResiLuma, laplacian[0] +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , m_classifierCodingInfo[0] +#endif + ); #else m_calcClass2( classifier[curClassifierIdx], blk, Area( blkDst.pos().x, blkDst.pos().y, blkDst.width, blkDst.height ), srcLuma, 2, classifier[0], curClassifierIdx, bits ); #endif @@ -6855,7 +7133,11 @@ void AdaptiveLoopFilter::deriveFixedFilterResultsBlk(AlfClassifier*** classifier if (fixedFiltSetInd == targetFixedFilterSetInd || targetFixedFilterSetInd == -1) { #if JVET_AG0157_ALF_CHROMA_FIXED_FILTER - alfFixedFilterBlk(winIdx == 0 ? classifier[0] : classifier[ALF_NUM_CLASSIFIER], srcLuma, blk, blkDst, srcLumaBeforeDb, m_fixFilterResult[COMPONENT_Y], m_picWidth, fixedFiltInd, fixedFiltSetInd, winIdx, clpRng, clippingValues, true); + alfFixedFilterBlk(winIdx == 0 ? classifier[0] : classifier[ALF_NUM_CLASSIFIER], srcLuma, blk, blkDst, srcLumaBeforeDb, m_fixFilterResult[COMPONENT_Y], m_picWidth, fixedFiltInd, fixedFiltSetInd, winIdx, clpRng, clippingValues, true +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , cs +#endif + ); #else alfFixedFilterBlk(winIdx == 0 ? 
classifier[0] : classifier[ALF_NUM_CLASSIFIER], srcLuma, blk, blkDst, srcLumaBeforeDb, m_fixFilterResult, m_picWidth, fixedFiltInd, fixedFiltSetInd, winIdx, clpRng, clippingValues); #endif @@ -6905,14 +7187,22 @@ void AdaptiveLoopFilter::deriveFixedFilterResultsPerBlkChroma(AlfClassifier ***c m_deriveVariance(src, blk, blk, laplacian); m_deriveClassificationLaplacian(src, blk, blk, laplacian, ALF_CLASSIFIER_FL_CHROMA); m_calcClass0(classifier[ALF_NUM_CLASSIFIER + 1], blk, blk, ALF_CLASSIFIER_FL_CHROMA + 10, 1, NUM_DIR_FIX, NUM_ACT_FIX, bits, 2, mappingDir, laplacian); - alfFixedFilterBlk(classifier[ALF_NUM_CLASSIFIER + 1], src, blk, blk, srcBeforeDb, fixedFilterResults, m_picWidth, fixedFilterSetIdx, targetFixedFilterSetInd, 0, clpRng, clippingValues, false); + alfFixedFilterBlk(classifier[ALF_NUM_CLASSIFIER + 1], src, blk, blk, srcBeforeDb, fixedFilterResults, m_picWidth, fixedFilterSetIdx, targetFixedFilterSetInd, 0, clpRng, clippingValues, false +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , cs +#endif + ); } else { deriveVariance(src, blk, blk, laplacian); deriveClassificationLaplacian(src, blk, blk, laplacian, ALF_CLASSIFIER_FL_CHROMA); calcClass0Var(classifier[ALF_NUM_CLASSIFIER + 1], blk, blk, ALF_CLASSIFIER_FL_CHROMA + 10, 1, NUM_DIR_FIX, NUM_ACT_FIX, bits, 2, mappingDir, laplacian); - alfFixedFilterBlkNonSimd(classifier[ALF_NUM_CLASSIFIER + 1], src, blk, blk, srcBeforeDb, fixedFilterResults, m_picWidth, fixedFilterSetIdx, targetFixedFilterSetInd, 0, clpRng, clippingValues, false); + alfFixedFilterBlkNonSimd(classifier[ALF_NUM_CLASSIFIER + 1], src, blk, blk, srcBeforeDb, fixedFilterResults, m_picWidth, fixedFilterSetIdx, targetFixedFilterSetInd, 0, clpRng, clippingValues, false +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , cs +#endif + ); } } @@ -7000,7 +7290,11 @@ void AdaptiveLoopFilter::deriveFixedFilterResultsCtuBoundaryChroma(AlfClassifier } #endif -void AdaptiveLoopFilter::deriveFixedFilterResultsCtuBoundary(AlfClassifier ***classifier, Pel 
***fixedFilterResults, const CPelBuf &srcLuma, const CPelBuf &srcLumaBeforeDb, const Area &blkDst, const int bits, CodingStructure& cs, const ClpRng &clpRng, const Pel clippingValues[4], int qp, int fixedFilterSetIdx, int mappingDir[NUM_DIR_FIX][NUM_DIR_FIX], uint32_t **laplacian[NUM_DIRECTIONS], uint8_t* ctuEnableFlagLuma, uint8_t* ctuEnableOnlineLuma, int ctuIdx, int classifierIdx) +void AdaptiveLoopFilter::deriveFixedFilterResultsCtuBoundary(AlfClassifier ***classifier, Pel ***fixedFilterResults, const CPelBuf &srcLuma, const CPelBuf &srcLumaBeforeDb, const Area &blkDst, const int bits, CodingStructure& cs, const ClpRng &clpRng, const Pel clippingValues[4], int qp, int fixedFilterSetIdx, int mappingDir[NUM_DIR_FIX][NUM_DIR_FIX], uint32_t **laplacian[NUM_DIRECTIONS], uint8_t* ctuEnableFlagLuma, uint8_t* ctuEnableOnlineLuma, int ctuIdx, int classifierIdx +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , const CPelBuf& srcCodingInfo, const CPelBuf& srcResi +#endif + ) #else void AdaptiveLoopFilter::deriveFixedFilterResultsCtuBoundary(AlfClassifier **classifier, Pel ***fixedFilterResults, const CPelBuf &srcLuma, const Area &blkDst, const int bits, CodingStructure& cs, const ClpRng &clpRng, const Pel clippingValues[4], int qp, int fixedFilterSetIdx, int mappingDir[NUM_DIR_FIX][NUM_DIR_FIX], uint32_t **laplacian[NUM_DIRECTIONS], uint8_t* ctuEnableFlagLuma, uint8_t* ctuEnableOnlineLuma, int ctuIdx ) #endif @@ -7080,6 +7374,13 @@ void AdaptiveLoopFilter::deriveFixedFilterResultsCtuBoundary(AlfClassifier **cla { if(isBoundaryValid[boundaryIdx] && !isNeighborAvai[boundaryIdx]) { +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + //classifierIdx = 1 can reuse coding info results + if( classifierIdx == 0 ) + { + calcAlfLumaCodingInfoBlk(cs, m_classifierCodingInfo[0], blkCur[boundaryIdx], blkCur[boundaryIdx], srcLuma, 2, 2, m_inputBitDepth[CHANNEL_TYPE_LUMA], srcResi, m_laplacian[0], srcCodingInfo); + } +#endif #if JVET_AE0139_ALF_IMPROVED_FIXFILTER if (boundaryIdx == 1) { @@ 
-7429,11 +7730,19 @@ void AdaptiveLoopFilter::deriveFixedFilterResultsPerBlk( AlfClassifier **classif #if JVET_AG0157_ALF_CHROMA_FIXED_FILTER if( useSimd) { - alfFixedFilterBlk( classifier[ALF_NUM_CLASSIFIER], srcLuma, blkCur, blkCur, srcLumaBeforeDb, fixedFilterResults, m_picWidth, classifierIdx * NUM_FIXED_FILTER_SETS + fixedFilterSetIdx, targetFixedFilterSetInd, classifierIdx, clpRng, clippingValues, true ); + alfFixedFilterBlk( classifier[ALF_NUM_CLASSIFIER], srcLuma, blkCur, blkCur, srcLumaBeforeDb, fixedFilterResults, m_picWidth, classifierIdx * NUM_FIXED_FILTER_SETS + fixedFilterSetIdx, targetFixedFilterSetInd, classifierIdx, clpRng, clippingValues, true +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , cs +#endif + ); } else { - alfFixedFilterBlkNonSimd( classifier[ALF_NUM_CLASSIFIER], srcLuma, blkCur, blkCur, srcLumaBeforeDb, fixedFilterResults, m_picWidth, classifierIdx * NUM_FIXED_FILTER_SETS + fixedFilterSetIdx, targetFixedFilterSetInd, classifierIdx, clpRng, clippingValues, true ); + alfFixedFilterBlkNonSimd( classifier[ALF_NUM_CLASSIFIER], srcLuma, blkCur, blkCur, srcLumaBeforeDb, fixedFilterResults, m_picWidth, classifierIdx * NUM_FIXED_FILTER_SETS + fixedFilterSetIdx, targetFixedFilterSetInd, classifierIdx, clpRng, clippingValues, true +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , cs +#endif + ); } #else if( useSimd) @@ -7456,7 +7765,11 @@ void AdaptiveLoopFilter::deriveFixedFilterResultsPerBlk( AlfClassifier **classif #if JVET_AE0139_ALF_IMPROVED_FIXFILTER m_calcClass0( classifier[0], blkCur, blkCur, usedWindowIdx[0], 1, NUM_DIR_FIX, NUM_ACT_FIX, bits, 2, mappingDir, laplacian); #if JVET_AG0157_ALF_CHROMA_FIXED_FILTER - alfFixedFilterBlk( classifier[0], srcLuma, blkCur, blkCur, srcLumaBeforeDb, fixedFilterResults, m_picWidth, classifierIdx * NUM_FIXED_FILTER_SETS + fixedFilterSetIdx, targetFixedFilterSetInd, classifierIdx, clpRng, clippingValues, true ); + alfFixedFilterBlk( classifier[0], srcLuma, blkCur, blkCur, srcLumaBeforeDb, 
fixedFilterResults, m_picWidth, classifierIdx * NUM_FIXED_FILTER_SETS + fixedFilterSetIdx, targetFixedFilterSetInd, classifierIdx, clpRng, clippingValues, true +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , cs +#endif + ); #else alfFixedFilterBlk(classifier[0], srcLuma, blkCur, blkCur, srcLumaBeforeDb, fixedFilterResults, m_picWidth, classifierIdx * NUM_FIXED_FILTER_SETS + fixedFilterSetIdx, targetFixedFilterSetInd, classifierIdx, clpRng, clippingValues ); #endif @@ -7471,7 +7784,11 @@ void AdaptiveLoopFilter::deriveFixedFilterResultsPerBlk( AlfClassifier **classif #if JVET_AE0139_ALF_IMPROVED_FIXFILTER calcClass0Var( classifier[0], blkCur, blkCur, usedWindowIdx[0], 1, NUM_DIR_FIX, NUM_ACT_FIX, bits, 2, mappingDir, laplacian ); #if JVET_AG0157_ALF_CHROMA_FIXED_FILTER - alfFixedFilterBlkNonSimd( classifier[0], srcLuma, blkCur, blkCur, srcLumaBeforeDb, fixedFilterResults, m_picWidth, classifierIdx * NUM_FIXED_FILTER_SETS + fixedFilterSetIdx, targetFixedFilterSetInd, classifierIdx, clpRng, clippingValues, true ); + alfFixedFilterBlkNonSimd( classifier[0], srcLuma, blkCur, blkCur, srcLumaBeforeDb, fixedFilterResults, m_picWidth, classifierIdx * NUM_FIXED_FILTER_SETS + fixedFilterSetIdx, targetFixedFilterSetInd, classifierIdx, clpRng, clippingValues, true +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , cs +#endif + ); #else alfFixedFilterBlkNonSimd( classifier[0], srcLuma, blkCur, blkCur, srcLumaBeforeDb, fixedFilterResults, m_picWidth, classifierIdx * NUM_FIXED_FILTER_SETS + fixedFilterSetIdx, targetFixedFilterSetInd, classifierIdx, clpRng, clippingValues ); #endif @@ -7779,15 +8096,27 @@ void AdaptiveLoopFilter::deriveGaussResultsBlk(Pel ***gaussPic, const CPelBuf &s bool useSimd = blkDst.size().width % 8 == 0 ? 
true : false; if( useSimd ) { - m_gaussFiltering(cs, gaussPic, srcLuma, blkDst, blk, clpRng, clippingValues, filterSetIdx, storeIdx); + m_gaussFiltering(cs, gaussPic, srcLuma, blkDst, blk, clpRng, clippingValues, filterSetIdx, storeIdx +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , true, m_classifierCodingInfo[0] +#endif + ); } else { - gaussFiltering(cs, gaussPic, srcLuma, blkDst, blk, clpRng, clippingValues, filterSetIdx, storeIdx); + gaussFiltering(cs, gaussPic, srcLuma, blkDst, blk, clpRng, clippingValues, filterSetIdx, storeIdx +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , true, m_classifierCodingInfo[0] +#endif + ); } } -void AdaptiveLoopFilter::gaussFiltering(CodingStructure &cs, Pel ***gaussPic, const CPelBuf &srcLuma, const Area &blkDst, const Area &blk, const ClpRng &clpRng, const Pel clippingValues[4], int filterSetIdx, int storeIdx ) +void AdaptiveLoopFilter::gaussFiltering(CodingStructure &cs, Pel ***gaussPic, const CPelBuf &srcLuma, const Area &blkDst, const Area &blk, const ClpRng &clpRng, const Pel clippingValues[4], int filterSetIdx, int storeIdx +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , bool applyCodingInfo, AlfClassifier** classifierCodingInfo +#endif + ) { int strideSrc = srcLuma.stride; int xPosSrc = blk.pos().x; @@ -7798,6 +8127,13 @@ void AdaptiveLoopFilter::gaussFiltering(CodingStructure &cs, Pel ***gaussPic, co int shift = 10; const int numCoeff = 12; int diffTH = 32; +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + const bool isIntraSlice = cs.slice->isIntra(); + const bool isSpsAdjust = cs.sps->getAlfLumaFixedFilterAdjust(); + const bool useBounCondition = applyCodingInfo && !(!isSpsAdjust && isIntraSlice); + const bool useResiCondition = applyCodingInfo && (isSpsAdjust || !isSpsAdjust) && !isIntraSlice && false; + const int offsetClipValue = 1 << ( clpRng.bd - 1 ); +#endif int gaussTable[NUM_GAUSS_FILTERED_SOURCE][25] = { @@ -7827,6 +8163,24 @@ void AdaptiveLoopFilter::gaussFiltering(CodingStructure &cs, Pel ***gaussPic, co { for 
(int j = 0; j < width; j++) { +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + int classIdxBs = 0; + if( useBounCondition ) + { + //0: BS0 Resi0, 1:BS0 Resi1, 2:BS1 Resi0 3:BS1 Resi1 + // A >> 1 = BS + // A - ( A >> 1) * 2 + classIdxBs = classifierCodingInfo[blkDst.y + i][blkDst.x + j] >> 1; + } + int classIdxResi = 0; + if( useResiCondition ) + { + //0: BS0 Resi0, 1:BS0 Resi1, 2:BS1 Resi0 3:BS1 Resi1 + // A >> 1 = BS + // A - ( A >> 1) * 2 + classIdxResi = classifierCodingInfo[blkDst.y + i][blkDst.x + j] - ((classifierCodingInfo[blkDst.y + i][blkDst.x + j] >> 1 ) * 2); + } +#endif int dstPosY = blkDst.y + i + padSize; int dstPosX = blkDst.x + j + padSize; @@ -7867,6 +8221,27 @@ void AdaptiveLoopFilter::gaussFiltering(CodingStructure &cs, Pel ***gaussPic, co sum += 1 << (shift - 1); sum >>= shift; +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + if( useBounCondition ) + { + sum = Clip3(-offsetClipValue, +offsetClipValue, sum); + + int sign = sum < 0 ? -1 : +1; + + int boundaryStrengthFactor = isIntraSlice ? 4 + 2 : 3 + 2; + + sum = classIdxBs ? sign * ((abs(sum) * (16 + boundaryStrengthFactor) + 8 ) >> 4) : sum; + } + + if( useResiCondition ) + { + sum = Clip3(-offsetClipValue, +offsetClipValue, sum); + + int sign = sum < 0 ? -1 : +1; + int resiStrengthFactor = isIntraSlice ? 0 >> (!isSpsAdjust ? 1 : 0) : 3 >> (!isSpsAdjust ? 1 : 0); + sum = classIdxResi ? 
sign * ((abs(sum) * (16 + resiStrengthFactor) + 8 ) >> 4) : sum; + } +#endif int diff = Clip3<int>(-diffTH, +diffTH, sum); sum = curr + diff; @@ -7876,3 +8251,143 @@ void AdaptiveLoopFilter::gaussFiltering(CodingStructure &cs, Pel ***gaussPic, co }//height } #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION +void AdaptiveLoopFilter::textureClassMapping(AlfClassifier **classifier, const Area& blk, int classifierIdx, int subBlkSize, AlfClassifier **classifierCodingInfo ) +{ /* For each subBlkSize x subBlkSize unit of blk: takes the unit's class index (classifier value >> 2), remaps it through bsMappingTable using the row and class offset selected by (classifierCodingInfo value >> 1), re-attaches the low two transpose bits and stores the result in every sample of the unit. Only classifierIdx 0 is accepted. */ + CHECK(classifierIdx != 0, "Wrong Classifier Index for DBF-BS Mapping"); + + static const int bsMappingTable[2][25] = /* read-only lookup table, so it is initialised once instead of on every call */ + { + //A: (0)(12)(3)(4) + //D: (0)(12)(34) + //4A x 3D + { + 0, 1, 1, 2, 2, + 3, 4, 4, 5, 5, + 3, 4, 4, 5, 5, + 6, 7, 7, 8, 8, + 9, 10, 10, 11, 11, + }, + //4A x 3D + { + 0, 1, 1, 2, 2, + 3, 4, 4, 5, 5, + 3, 4, 4, 5, 5, + 6, 7, 7, 8, 8, + 9, 10, 10, 11, 11, + }, + }; + + int width = blk.width; + int height = blk.height; + int posY = blk.pos().y; + int posX = blk.pos().x; + + int bsUnit = 0; + int transposeIdx = 0; + int classIdx = 0; + int classIdxMap = 0; + int bsClassOffet = 12; + + for(int y = 0; y < height; y += subBlkSize) + { + for(int x = 0; x < width; x += subBlkSize) + { + bsUnit = classifierCodingInfo[posY + y][posX + x] >> 1; + + transposeIdx = classifier[posY + y][posX + x] & 0x3; + classIdx = classifier[posY + y][posX + x] >> 2; + + classIdxMap = bsMappingTable[bsUnit][classIdx] + bsUnit * bsClassOffet; + + classIdxMap = (classIdxMap << 2) + transposeIdx; + + for(int ii = 0; ii < subBlkSize; ii++) + { + for(int jj = 0; jj < subBlkSize; jj++) + { + int curPosY = posY + y + ii; + int curPosX = posX + x + jj; + classifier[curPosY][curPosX] = classIdxMap; + } + } + //Unit 2x2 + } + } +} + +void AdaptiveLoopFilter::calcAlfLumaCodingInfoBlk( CodingStructure& cs, AlfClassifier** classifier, const Area &blkDst, const Area &blkSrc, const CPelBuf& srcLuma, int subBlkSize, int classifierIdx, int bitDepth, const CPelBuf& srcLumaResi, uint32_t **buffer, const CPelBuf& srcCodingInfo ) +{ /* Fills classifier over blkDst with one code per subBlkSize x subBlkSize unit: twice the coding-info flag (1 when the sum of the unit's 2x2 srcCodingInfo samples is positive) plus a residual flag. The residual flag is derived from absolute srcLumaResi sums held in buffer and stays 0 for intra slices. The residual path indexes buffer with i >> 1 and j >> 1. */ + 
const bool isIntraSlice = cs.slice->isIntra(); + const bool isSpsAdjust = cs.sps->getAlfLumaFixedFilterAdjust(); + const bool calcResi = !isIntraSlice && (isSpsAdjust || !isSpsAdjust); + + const Pel *srcResiPtr = srcLumaResi.buf; + int srcResiStride = srcLumaResi.stride; + int yOffset = blkSrc.pos().y * srcResiStride; + const Pel *srcResi0 = &srcResiPtr[yOffset]; + int srcResiStride2 = srcResiStride * 2; + + const Pel *srcResiUp = srcResi0 - 1 * srcResiStride + blkSrc.pos().x - 1; + + const Pel *srcResiDn = srcResiUp + srcResiStride; + + if( calcResi ) + { + //2x2 sum + for (int i = 0; i < blkSrc.height + 1 * 2; i += 2) + { + for (int j = 0; j < blkSrc.width + 1 * 2; j += 2) + { + buffer[i >> 1][j >> 1] = abs(srcResiUp[j]) + abs(srcResiUp[j + 1]) + abs(srcResiDn[j]) + abs(srcResiDn[j + 1]); + } + srcResiUp += srcResiStride2; + srcResiDn += srcResiStride2; + } + } + + const int srcCodingInfoStride = srcCodingInfo.stride; + const Pel* srcCodingInfoPtr = srcCodingInfo.buf + blkSrc.pos().y * srcCodingInfoStride + blkSrc.pos().x; + const Pel* srcCodingInfoPtr0 = srcCodingInfoPtr; + const Pel* srcCodingInfoPtr1 = srcCodingInfoPtr; + + for (int i = 0; i < blkDst.height; i += subBlkSize) + { + for (int j = 0; j < blkDst.width; j += subBlkSize) + { + int classIdxResi = 0; + if( calcResi ) + { + int i2 = i >> 1; + int j2 = j >> 1; + + int sum = buffer[i2][j2] + buffer[i2][j2 + 1] + buffer[i2 + 1][j2] + buffer[i2 + 1][j2 + 1]; + int shiftOffset = 6; + + int avgResi = sum >> (bitDepth - shiftOffset); + int highResiTh = +4; + classIdxResi = avgResi > highResiTh ? 
1 : 0; + } + + int classIdxCodingInfo = 0; + srcCodingInfoPtr0 = srcCodingInfoPtr + i * srcCodingInfoStride + j; + srcCodingInfoPtr1 = srcCodingInfoPtr0 + srcCodingInfoStride; + classIdxCodingInfo = ( srcCodingInfoPtr0[+0] + srcCodingInfoPtr0[+1] + srcCodingInfoPtr1[+0] + srcCodingInfoPtr1[+1] ) > 0 ? 1 : 0; + + for (int ii = blkDst.y + i; ii < blkDst.y + i + subBlkSize; ii++) + { + for (int jj = blkDst.x + j; jj < blkDst.x + j + subBlkSize; jj++) + { + //0: BS0 Resi0, 1:BS0 Resi1, 2:BS1 Resi0 3:BS1 Resi1 + // A >> 1 = BS + // A - ( A >> 1) * 2 + classifier[ii][jj] = classIdxCodingInfo * 2 + classIdxResi; + } + } + + } + } + +} +#endif diff --git a/source/Lib/CommonLib/AdaptiveLoopFilter.h b/source/Lib/CommonLib/AdaptiveLoopFilter.h index 9808eb42a09f654bd07fcb88dbe22f97414cf661..d2dacb490aee9c719ad26b12472efe41bcd319eb 100644 --- a/source/Lib/CommonLib/AdaptiveLoopFilter.h +++ b/source/Lib/CommonLib/AdaptiveLoopFilter.h @@ -113,6 +113,9 @@ public: #if JVET_AC0162_ALF_RESIDUAL_SAMPLES_INPUT void copyResiData(CodingStructure &cs) { m_tempBufResi.bufs[COMPONENT_Y].copyFrom(cs.getResiBuf().bufs[COMPONENT_Y]); } #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + PelUnitBuf callCodingInfoBuf( CodingStructure &cs ) { return m_tempBufCodingInfo; } +#endif static constexpr int AlfNumClippingValues[MAX_NUM_CHANNEL_TYPE] = { 4, 4 }; static constexpr int MaxAlfNumClippingValues = 4; @@ -160,7 +163,11 @@ public: #endif Pel ***fixedFilterResiResults, int picWidth, const int fixedFiltInd, const short classIndFixed[NUM_CLASSES_FIX], int fixedFiltQpInd, int dirWindSize, - const ClpRng &clpRng, const Pel clippingValues[4]); + const ClpRng &clpRng, const Pel clippingValues[4] +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , bool applyCodingInfo, CodingStructure &cs, AlfClassifier** classifierCodingInfo +#endif + ); #endif #if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS #if JVET_AG0157_ALF_CHROMA_FIXED_FILTER @@ -174,7 +181,11 @@ public: void deriveFixedFilterResults( AlfClassifier*** classifier, const CPelBuf& srcLuma, const CPelBuf& srcLumaBeforeDb, const Area& blkDst, const Area& blk, CodingStructure &cs, int winIdx, int fixedFilterSetIdx ); static void calcClass0Var( AlfClassifier **classifier, const Area &blkDst, const Area &cu, int dirWindSize, int classDir, int noDir, 
int noAct, int bitDepth, int subBlkSize, int mappingDir[NUM_DIR_FIX][NUM_DIR_FIX], uint32_t **laplacian[NUM_DIRECTIONS] ); static void deriveVariance( const CPelBuf &srcLuma, const Area &blkDst, const Area &blk, uint32_t ***laplacian ); - void deriveFixedFilterResultsCtuBoundary( AlfClassifier ***classifier, Pel ***fixedFilterResults, const CPelBuf &srcLuma, const CPelBuf &srcLumaBeforeDb, const Area &blkDst, const int bits, CodingStructure& cs, const ClpRng &clpRng, const Pel clippingValues[4], int qp, int fixedFilterSetIdx, int mappingDir[NUM_DIR_FIX][NUM_DIR_FIX], uint32_t **laplacian[NUM_DIRECTIONS], uint8_t* ctuEnableFlagLuma, uint8_t* ctuEnableOnlineLuma, int ctuIdx, int classifierIdx ); + void deriveFixedFilterResultsCtuBoundary( AlfClassifier ***classifier, Pel ***fixedFilterResults, const CPelBuf &srcLuma, const CPelBuf &srcLumaBeforeDb, const Area &blkDst, const int bits, CodingStructure& cs, const ClpRng &clpRng, const Pel clippingValues[4], int qp, int fixedFilterSetIdx, int mappingDir[NUM_DIR_FIX][NUM_DIR_FIX], uint32_t **laplacian[NUM_DIRECTIONS], uint8_t* ctuEnableFlagLuma, uint8_t* ctuEnableOnlineLuma, int ctuIdx, int classifierIdx +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , const CPelBuf& srcCodingInfo, const CPelBuf& srcResi +#endif + ); void deriveFixedFilterResultsPerBlk( AlfClassifier ***classifier, Pel ***fixedFilterResults, const CPelBuf &srcLuma, const CPelBuf &srcLumaBeforeDb, const Area &blkCur, const int bits, CodingStructure& cs, const ClpRng &clpRng, const Pel clippingValues[4], int qp, int fixedFilterSetIdx, int mappingDir[NUM_DIR_FIX][NUM_DIR_FIX], uint32_t **laplacian[NUM_DIRECTIONS], const int classifierIdx ); void(*m_deriveVariance)(const CPelBuf &srcLuma, const Area &blkDst, const Area &blk, uint32_t ***variance); #if JVET_AG0157_ALF_CHROMA_FIXED_FILTER @@ -182,17 +193,37 @@ public: void deriveFixedFilterResultsPerBlkChroma(AlfClassifier ***classifier, Pel ***fixedFilterResults, const CPelBuf &src, const CPelBuf &srcBeforeDb, 
const Area &blk, const int bits, CodingStructure& cs, const ClpRng &clpRng, const Pel clippingValues[4], int qp, int fixedFilterSetIdx, int mappingDir[NUM_DIR_FIX][NUM_DIR_FIX], uint32_t **laplacian[NUM_DIRECTIONS]); void deriveFixFilterResultsBlkChroma( AlfClassifier ***classifier, Pel ***fixedFilterResults, const CPelBuf &src, const CPelBuf &srcBeforeDb, const Area &blkDst, const Area &blk, const int bits, CodingStructure& cs, const ClpRng &clpRng, const Pel clippingValues[4], int qp, int fixedFilterSetIdx, int mappingDir[NUM_DIR_FIX][NUM_DIR_FIX], uint32_t **laplacian[NUM_DIRECTIONS] ); void deriveFixedFilterChroma(AlfClassifier*** classifier, const PelUnitBuf& src, const PelUnitBuf& srcBeforeDb, const Area& blkDst, const Area& blk, CodingStructure &cs, const int classifierIdx, ComponentID compID); - void alfFixedFilterBlkNonSimd(AlfClassifier **classifier, const CPelBuf &src, const Area &curBlk, const Area &blkDst, const CPelBuf &srcBeforeDb, Pel ***fixedFilterResults, int picWidth, const int fixedFiltInd, int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4], bool isLuma); - void alfFixedFilterBlk(AlfClassifier **classifier, const CPelBuf &src, const Area &curBlk, const Area &blkDst, const CPelBuf &srcBeforeDb, Pel ***fixedFilterResults, int picWidth, const int fixedFiltInd, int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4], bool isLuma); + void alfFixedFilterBlkNonSimd(AlfClassifier **classifier, const CPelBuf &src, const Area &curBlk, const Area &blkDst, const CPelBuf &srcBeforeDb, Pel ***fixedFilterResults, int picWidth, const int fixedFiltInd, int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4], bool isLuma +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , CodingStructure &cs +#endif + ); + void alfFixedFilterBlk(AlfClassifier **classifier, const CPelBuf &src, const Area &curBlk, const Area &blkDst, const CPelBuf &srcBeforeDb, Pel ***fixedFilterResults, 
int picWidth, const int fixedFiltInd, int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4], bool isLuma +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , CodingStructure &cs +#endif + ); template<AlfFixedFilterType filtType> #else void alfFixedFilterBlkNonSimd(AlfClassifier **classifier, const CPelBuf &srcLuma, const Area &curBlk, const Area &blkDst, const CPelBuf &srcLumaBeforeDb, Pel ***fixedFilterResults, int picWidth, const int fixedFiltInd, int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4]); void alfFixedFilterBlk(AlfClassifier **classifier, const CPelBuf &srcLuma, const Area &curBlk, const Area &blkDst, const CPelBuf &srcLumaBeforeDb, Pel ***fixedFilterResults, int picWidth, const int fixedFiltInd, int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4]); template<AlfFixedFilterType filtType> #endif - static void fixedFilterBlk(AlfClassifier **classifier, const CPelBuf &srcLuma, const Area &curBlk, const Area &blkDst, const CPelBuf &srcLumaBeforeDb, Pel ***fixedFilterResults, int picWidth, const int fixedFiltInd, const short classIndFixed[NUM_CLASSES_FIX], int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4]); - void(*m_fixFilter13x13Db9Blk)(AlfClassifier **classifier, const CPelBuf &srcLuma, const Area &curBlk, const Area &blkDst, const CPelBuf &srcLumaBeforeDb, Pel ***fixedFilterResults, int picWidth, const int fixedFiltInd, const short classIndFixed[NUM_CLASSES_FIX], int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4]); - void(*m_fixFilter9x9Db9Blk)(AlfClassifier **classifier, const CPelBuf &srcLuma, const Area &curBlk, const Area &blkDst, const CPelBuf &srcLumaBeforeDb, Pel ***fixedFilterResults, int picWidth, const int fixedFiltInd, const short classIndFixed[NUM_CLASSES_FIX], int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4]); + static void 
fixedFilterBlk(AlfClassifier **classifier, const CPelBuf &srcLuma, const Area &curBlk, const Area &blkDst, const CPelBuf &srcLumaBeforeDb, Pel ***fixedFilterResults, int picWidth, const int fixedFiltInd, const short classIndFixed[NUM_CLASSES_FIX], int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4] +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , bool applyCodingInfo, CodingStructure &cs, AlfClassifier** classifierCodingInfo +#endif + ); + void(*m_fixFilter13x13Db9Blk)(AlfClassifier **classifier, const CPelBuf &srcLuma, const Area &curBlk, const Area &blkDst, const CPelBuf &srcLumaBeforeDb, Pel ***fixedFilterResults, int picWidth, const int fixedFiltInd, const short classIndFixed[NUM_CLASSES_FIX], int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4] +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , bool applyCodingInfo, CodingStructure &cs, AlfClassifier** classifierCodingInfo +#endif + ); + void(*m_fixFilter9x9Db9Blk)(AlfClassifier **classifier, const CPelBuf &srcLuma, const Area &curBlk, const Area &blkDst, const CPelBuf &srcLumaBeforeDb, Pel ***fixedFilterResults, int picWidth, const int fixedFiltInd, const short classIndFixed[NUM_CLASSES_FIX], int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4] +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , bool applyCodingInfo, CodingStructure &cs, AlfClassifier** classifierCodingInfo +#endif + ); #else void paddingFixedFilterResultsCtu(Pel*** fixedFilterResultsPic, Pel*** fixedFilterResultsCtu, const int fixedFilterSetIdx, const Area &blk); void deriveFixedFilterResultsCtuBoundary(AlfClassifier **classifier, Pel ***fixedFilterResults, const CPelBuf &srcLuma, const Area &blkDst, const int bits, CodingStructure& cs, const ClpRng &clpRng, const Pel clippingValues[4], int qp, int fixedFilterSetIdx, int mappingDir[NUM_DIR_FIX][NUM_DIR_FIX], uint32_t **laplacian[NUM_DIRECTIONS], uint8_t* ctuEnableFlagLuma, uint8_t* 
ctuEnableOnlineLuma, int ctuIdx); @@ -206,8 +237,23 @@ public: void deriveGaussResultsBlk( Pel*** gaussPic, const CPelBuf &srcLuma, const Area &blkDst, const Area &blk, CodingStructure& cs, const ClpRng &clpRng, const Pel clippingValues[4], int filterSetIdx, const int storeIdx); void deriveGaussResults(const CPelBuf& srcLumaDb, const Area& blkDst, const Area& blk, CodingStructure &cs, const int filterSetIdx, const int storeIdx ); - static void gaussFiltering(CodingStructure &cs, Pel ***gaussPic, const CPelBuf &srcLuma, const Area &blkDst, const Area &blk, const ClpRng &clpRng, const Pel clippingValues[4], int filterSetIdx, int storeIdx ); - void(*m_gaussFiltering) (CodingStructure &cs, Pel ***gaussPic, const CPelBuf &srcLuma, const Area &blkDst, const Area &blk, const ClpRng &clpRng, const Pel clippingValues[4], int filterSetIdx, int storeIdx ); + static void gaussFiltering(CodingStructure &cs, Pel ***gaussPic, const CPelBuf &srcLuma, const Area &blkDst, const Area &blk, const ClpRng &clpRng, const Pel clippingValues[4], int filterSetIdx, int storeIdx +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , bool applyCodingInfo, AlfClassifier** classifierCodingInfo +#endif + ); + void(*m_gaussFiltering) (CodingStructure &cs, Pel ***gaussPic, const CPelBuf &srcLuma, const Area &blkDst, const Area &blk, const ClpRng &clpRng, const Pel clippingValues[4], int filterSetIdx, int storeIdx +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , bool applyCodingInfo, AlfClassifier** classifierCodingInfo +#endif + ); +#endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + static void textureClassMapping(AlfClassifier **classifier, const Area& blk, int classifierIdx, int subBlkSize, AlfClassifier **classifierCodingInfo); + void( *m_textureClassMapping ) (AlfClassifier **classifier, const Area& blk, int classifierIdx, int subBlkSize, AlfClassifier **classifierCodingInfo); + + static void calcAlfLumaCodingInfoBlk( CodingStructure& cs, AlfClassifier** classifier, const Area &blkDst, const Area 
&blkSrc, const CPelBuf& srcLuma, int subBlkSize, int classifierIdx, int bitDepth, const CPelBuf& srcLumaResi, uint32_t **buffer, const CPelBuf& srcCodingInfo ); + void( *m_calcAlfLumaCodingInfoBlk )( CodingStructure& cs, AlfClassifier** classifier, const Area &blkDst, const Area &blkSrc, const CPelBuf& srcLuma, int subBlkSize, int classifierIdx, int bitDepth, const CPelBuf& srcLumaResi, uint32_t **buffer, const CPelBuf& srcCodingInfo ); #endif int assignAct(int avg_varPrec, int shift, int noAct); @@ -230,6 +276,9 @@ public: static void calcClassNew( AlfClassifier **classifier, const Area &blkDst, const Area &cu, const CPelBuf& srcLuma, int subBlkSize, AlfClassifier **classifier0, int classifierIdx, int bitDepth #if JVET_AD0222_ALF_RESI_CLASS , const CPelBuf& srcResiLuma, uint32_t **buffer +#endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , AlfClassifier **classifierCodingInfo #endif ); #else @@ -292,7 +341,11 @@ public: void(*m_calcClass1)(AlfClassifier **classifier, const Area &blkDst, const Area &cu, int dirWindSize, int classDir, int noDir, int noAct, int bitDepth, int subBlkSize, int mappingDir[NUM_DIR_FIX][NUM_DIR_FIX], uint32_t **laplacian[NUM_DIRECTIONS]); #if JVET_X0071_ALF_BAND_CLASSIFIER #if JVET_AD0222_ALF_RESI_CLASS - void(*m_calcClass2)(AlfClassifier **classifier, const Area &blkDst, const Area &cu, const CPelBuf& srcLuma, int subBlkSize, AlfClassifier **classifier0, int classifierIdx, int bitDepth, const CPelBuf& srcLumaResi, uint32_t **buffer); + void(*m_calcClass2)(AlfClassifier **classifier, const Area &blkDst, const Area &cu, const CPelBuf& srcLuma, int subBlkSize, AlfClassifier **classifier0, int classifierIdx, int bitDepth, const CPelBuf& srcLumaResi, uint32_t **buffer +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , AlfClassifier **classifierCodingInfo +#endif + ); #else void(*m_calcClass2)(AlfClassifier **classifier, const Area &blkDst, const Area &cu, const CPelBuf& srcLuma, int subBlkSize, AlfClassifier **classifier0, int classifierIdx, 
int bitDepth); #endif @@ -318,7 +371,11 @@ public: #if JVET_Z0105_LOOP_FILTER_VIRTUAL_BOUNDARY const Area &blkDst, #endif - Pel ***fixedFilterResiResults, int picWidth, const int fixedFiltInd, const short classIndFixed[NUM_CLASSES_FIX], int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4]); + Pel ***fixedFilterResiResults, int picWidth, const int fixedFiltInd, const short classIndFixed[NUM_CLASSES_FIX], int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4] +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , bool applyCodingInfo, CodingStructure &cs, AlfClassifier** classifierCodingInfo +#endif + ); #else void (*m_filterResi13x13Blk)(AlfClassifier **classifier, const CPelBuf &srcResiLuma, const Area &curBlk, #if JVET_Z0105_LOOP_FILTER_VIRTUAL_BOUNDARY @@ -783,6 +840,9 @@ protected: #else AlfClassifier** m_classifier; #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + AlfClassifier** m_classifierCodingInfo[1]; +#endif #if ALF_IMPROVEMENT int m_numLumaAltAps[ALF_CTB_MAX_NUM_APS]; short m_coeffApsLuma[ALF_CTB_MAX_NUM_APS][MAX_NUM_ALF_ALTERNATIVES_LUMA][MAX_NUM_ALF_LUMA_COEFF * MAX_NUM_ALF_CLASSES]; @@ -861,6 +921,10 @@ protected: #if JVET_AI0166_CCALF_CHROMA_SAO_INPUT PelStorage m_tempBufSAO; PelStorage m_tempBufSAO2; +#endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + PelStorage m_tempBufCodingInfo; + PelStorage m_tempBufCodingInfo2; #endif int m_inputBitDepth[MAX_NUM_CHANNEL_TYPE]; int m_picWidth; diff --git a/source/Lib/CommonLib/CommonDef.h b/source/Lib/CommonLib/CommonDef.h index a5a6c32fc81feba3cbfcafb07e19305a1a398b66..2eb2493d741c3cb8b6f8aaec6e689be212e07ca1 100644 --- a/source/Lib/CommonLib/CommonDef.h +++ b/source/Lib/CommonLib/CommonDef.h @@ -417,7 +417,11 @@ static const int MAX_CCSAO_BAND_NUM_U_BAND_BITS = 4; #endif static const int MAX_NUM_ALF_ALTERNATIVES_CHROMA = 8; +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION +static const int MAX_NUM_ALF_CLASSES = 24; +#else static const int 
MAX_NUM_ALF_CLASSES = 25; +#endif #if ALF_IMPROVEMENT static const int MAX_NUM_ALF_ALTERNATIVES_LUMA = 4; static const int EXT_LENGTH = 2; @@ -544,8 +548,13 @@ static const int ALF_RESI_SHIFT_OFFSET = 4; static const int NUM_RESI_ABS_PAD = 8; static const int ALF_PADDING_SIZE_PRED = 3; static const int ALF_NUM_CLASSIFIER = 3; +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION +static const int ALF_CLASSES_RESI = 24; +static const int ALF_CLASSES_NEW = 24; +#else static const int ALF_CLASSES_RESI = 25; static const int ALF_CLASSES_NEW = 25; +#endif static const int ALF_NUM_CLASSES_CLASSIFIER[ALF_NUM_CLASSIFIER] = { MAX_NUM_ALF_CLASSES, ALF_CLASSES_NEW, ALF_CLASSES_RESI }; #else static const int ALF_NUM_CLASSIFIER = 2; diff --git a/source/Lib/CommonLib/LoopFilter.cpp b/source/Lib/CommonLib/LoopFilter.cpp index a5c2be2e0d7590bf3a54564b77804397f0f3000e..cb24c5113b75d1420bdcbaf86f0ce8d46e16d8f9 100644 --- a/source/Lib/CommonLib/LoopFilter.cpp +++ b/source/Lib/CommonLib/LoopFilter.cpp @@ -151,6 +151,9 @@ void LoopFilter::destroy() \param pcPic picture class (Pic) pointer */ void LoopFilter::loopFilterPic( CodingStructure& cs +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , PelUnitBuf& alfCodingInfo, bool storeInfo +#endif ) { const PreCalcValues& pcv = *cs.pcv; @@ -193,7 +196,11 @@ void LoopFilter::loopFilterPic( CodingStructure& cs continue; } #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + xDeblockCU( currCU, EDGE_VER, storeInfo, alfCodingInfo ); +#else xDeblockCU( currCU, EDGE_VER ); +#endif } @@ -215,7 +222,11 @@ void LoopFilter::loopFilterPic( CodingStructure& cs continue; } #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + xDeblockCU( currCU, EDGE_VER, storeInfo, alfCodingInfo ); +#else xDeblockCU( currCU, EDGE_VER ); +#endif } } } @@ -245,7 +256,11 @@ void LoopFilter::loopFilterPic( CodingStructure& cs continue; } #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + xDeblockCU( currCU, EDGE_HOR, storeInfo, alfCodingInfo ); +#else xDeblockCU( currCU, EDGE_HOR ); 
+#endif } #if JVET_AI0136_ADAPTIVE_DUAL_TREE @@ -266,7 +281,11 @@ void LoopFilter::loopFilterPic( CodingStructure& cs continue; } #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + xDeblockCU( currCU, EDGE_HOR, storeInfo, alfCodingInfo ); +#else xDeblockCU( currCU, EDGE_HOR ); +#endif } } } @@ -299,7 +318,11 @@ void LoopFilter::resetFilterLengths() \param cu the CU to be deblocked \param edgeDir the direction of the edge in block boundary (horizontal/vertical), which is added newly */ +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION +void LoopFilter::xDeblockCU( CodingUnit& cu, const DeblockEdgeDir edgeDir, bool storeInfo, PelUnitBuf& alfCodingInfo ) +#else void LoopFilter::xDeblockCU( CodingUnit& cu, const DeblockEdgeDir edgeDir ) +#endif { const PreCalcValues& pcv = *cu.cs->pcv; const Area area = cu.Y().valid() ? cu.Y() : Area( recalcPosition( cu.chromaFormat, cu.chType, CHANNEL_TYPE_LUMA, cu.blocks[cu.chType].pos() ), recalcSize( cu.chromaFormat, cu.chType, CHANNEL_TYPE_LUMA, cu.blocks[cu.chType].size() ) ); @@ -425,6 +448,46 @@ void LoopFilter::xDeblockCU( CodingUnit& cu, const DeblockEdgeDir edgeDir ) } const unsigned uiPelsInPart = pcv.minCUWidth; +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + auto storeBoundaryInfo = [this, &cu, uiPelsInPart, edgeDir]( char bs, Position pos, ComponentID comp, PelUnitBuf& bsBuf) -> void + { /* Local helper: raises (via max) a 0/1 flag, 1 when the boundary strength of comp is positive, in bsBuf over the two sample rows or columns straddling the edge segment at pos. cu is captured by reference because only cu.chromaFormat is read, so copying the whole CodingUnit is unnecessary. */ + Size sz; + Position posPQ; + + int edgeLongside = uiPelsInPart; + int edgeShortside = 2; + + int scaleX = getComponentScaleX( comp, cu.chromaFormat ); + int scaleY = getComponentScaleY( comp, cu.chromaFormat ); + + if(edgeDir == EDGE_HOR) + { + sz.width = edgeLongside; + sz.height= edgeShortside << scaleY; + posPQ = Position( pos.x, pos.y - ( 1 << scaleY) ); + } + else + { + sz.height= edgeLongside; + sz.width = edgeShortside << scaleX; + posPQ = Position( pos.x - ( 1 << scaleX) , pos.y); + } + + auto bsComp = BsGet(bs, comp); + Pel toFill = bsComp > 0 ? 
1 : 0; + + int bsStride = bsBuf.get(comp).stride; + Pel* bsPtr = bsBuf.get(comp).buf + (posPQ.y >> scaleY) * bsStride + (posPQ.x >> scaleX); + for(int y = 0; y < sz.height >> scaleY; y++) + { + for(int x = 0; x < sz.width >> scaleX; x++) + { + bsPtr[x] = std::max( bsPtr[x], toFill ); + } + bsPtr += bsStride; + } + }; +#endif for( int y = 0; y < area.height; y += uiPelsInPart ) { @@ -442,6 +505,12 @@ void LoopFilter::xDeblockCU( CodingUnit& cu, const DeblockEdgeDir edgeDir ) #endif { bS |= xGetBoundaryStrengthSingle( cu, edgeDir, localPos, CHANNEL_TYPE_LUMA ); +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + if( cu.blocks[COMPONENT_Y].valid() && storeInfo ) + { + storeBoundaryInfo(bS, Position(area.x + x, area.y + y), COMPONENT_Y, alfCodingInfo); + } +#endif } #if !INTRA_RM_SMALL_BLOCK_SIZE_CONSTRAINTS if(cu.treeType != TREE_L && cu.chromaFormat != CHROMA_400 && cu.blocks[COMPONENT_Cb].valid()) diff --git a/source/Lib/CommonLib/LoopFilter.h b/source/Lib/CommonLib/LoopFilter.h index 0059914d42b3a59d22aafab0fbf6747f0e20e3d6..db56b9cea17a8343cf73863054b3f59ff6be63f1 100644 --- a/source/Lib/CommonLib/LoopFilter.h +++ b/source/Lib/CommonLib/LoopFilter.h @@ -129,7 +129,11 @@ public: ~LoopFilter(); /// CU-level deblocking function +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + void xDeblockCU(CodingUnit& cu, const DeblockEdgeDir edgeDir, bool storeInfo, PelUnitBuf& alfCodingInfo); +#else void xDeblockCU(CodingUnit& cu, const DeblockEdgeDir edgeDir); +#endif void initEncPicYuvBuffer(ChromaFormat chromaFormat, const Size &size, const unsigned maxCUSize); PelStorage& getDbEncPicYuvBuffer() { return m_encPicYuvBuffer; } void setEnc(bool b) { m_enc = b; } @@ -139,6 +143,9 @@ public: /// picture-level deblocking filter void loopFilterPic ( CodingStructure& cs +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , PelUnitBuf& alfCodingInfo, bool storeInfo +#endif ); static int getBeta ( const int qp ) diff --git a/source/Lib/CommonLib/Slice.h b/source/Lib/CommonLib/Slice.h index 
4e132f78c0cf89d346a997752c0b1f695e89ea10..f523b5be70dd2b47289ac351bde04b2c0728083c 100644 --- a/source/Lib/CommonLib/Slice.h +++ b/source/Lib/CommonLib/Slice.h @@ -1775,6 +1775,9 @@ private: bool m_alfEnabledFlag; bool m_ccalfEnabledFlag; +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + bool m_alfLumaFixedFilterAdjust; +#endif bool m_wrapAroundEnabledFlag; unsigned m_IBCFlag; #if JVET_AD0208_IBC_ADAPT_FOR_CAM_CAPTURED_CONTENTS @@ -2239,6 +2242,10 @@ public: void setALFEnabledFlag( bool b ) { m_alfEnabledFlag = b; } bool getCCALFEnabledFlag() const { return m_ccalfEnabledFlag; } void setCCALFEnabledFlag( bool b ) { m_ccalfEnabledFlag = b; } +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + bool getAlfLumaFixedFilterAdjust() const { return m_alfLumaFixedFilterAdjust; } + void setAlfLumaFixedFilterAdjust( bool b ) { m_alfLumaFixedFilterAdjust = b; } +#endif void setJointCbCrEnabledFlag(bool bVal) { m_JointCbCrEnabledFlag = bVal; } bool getJointCbCrEnabledFlag() const { return m_JointCbCrEnabledFlag; } diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h index c2c5e192c06ced35c0536cd3819d17e6a412e109..6a8daf3b5f335ec492032221c60387793009695f 100644 --- a/source/Lib/CommonLib/TypeDef.h +++ b/source/Lib/CommonLib/TypeDef.h @@ -482,6 +482,7 @@ #define JVET_AI0084_ALF_RESIDUALS_SCALING 1 // JVET_AI0084: non-fixed ALF residuals scaling #define JVET_AI0058_ALF_RELAXED_RDO_LUMA 1 // JVET-AI0058: Relaxed ALF Luma RDO #define JVET_AI0166_CCALF_CHROMA_SAO_INPUT 1 // JVET-AI0166: CCALF with Chroma inputs +#define JVET_AJ0188_CODING_INFO_CLASSIFICATION 1 // JVET-AJ0188: Coding Information based Classification for ALF // SIMD optimizations #if IF_12TAP diff --git a/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h b/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h index fd0efa2db34a06577a2e59f6067a90a99ca4216f..bd42c0c21426d14f05823d270766a21357508e72 100644 --- a/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h +++ b/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h 
@@ -1193,14 +1193,302 @@ static void simdFilter9x9Blk(AlfClassifier **classifier, const PelUnitBuf &recDs const Pel *src = srcBuffer.buf + blk.y * srcStride + blk.x; Pel * dst = dstBuffer.buf + blkDst.y * dstStride + blkDst.x; +#if !( USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION ) const __m128i mmOffset = _mm_set1_epi32(round); const __m128i mmMin = _mm_set1_epi16(clpRng.min); const __m128i mmMax = _mm_set1_epi16(clpRng.max); +#endif #if JVET_AG0157_ALF_CHROMA_FIXED_FILTER const int padSize = ALF_PADDING_SIZE_FIXED_RESULTS; #endif +#if USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION + const bool use256BitSimd = vext >= AVX2 && blkDst.width % 16 == 0 ? true : false; + + if( use256BitSimd ) + { + const __m256i mmOffset = _mm256_set1_epi32(round); + const __m256i mmMin = _mm256_set1_epi16(clpRng.min); + const __m256i mmMax = _mm256_set1_epi16(clpRng.max); + + for (size_t i = 0; i < height; i += stepY) + { + const AlfClassifier *pClass = isChroma(compId) ? nullptr : classifier[blkDst.y + i] + blkDst.x; + for (size_t j = 0; j < width; j += stepX * 2) + { +#if JVET_AG0157_ALF_CHROMA_FIXED_FILTER + __m256i params[2][2][13]; +#else + __m256i params[2][2][10]; +#endif + for (int k = 0; k < 2; k++) + { +#if JVET_AG0157_ALF_CHROMA_FIXED_FILTER + __m128i rawCoeffTmp[2][2][4], rawClipTmp[2][2][4], s0Tmp[2], s1Tmp[2], s2Tmp[2], s3Tmp[2]; + __m256i rawCoeff[2][4], rawClip[2][4]; +#else + __m128i rawCoeffTmp[2][2][3], rawClipTmp[2][2][3], s0Tmp[2], s1Tmp[2], s2Tmp[2], s3Tmp[2]; + __m256i rawCoeff[2][3], rawClip[2][3]; +#endif + + for (int l = 0; l < 2; l++) + { + const int transposeIdx0 = pClass ? (pClass[j + 4 * k + 2 * l + 0] & 0x3) : 0; + const int classIdx0 = pClass ? 
(pClass[j + 4 * k + 2 * l + 0] >> 2) : 0; + + rawCoeffTmp[0][l][0] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF)); + rawCoeffTmp[0][l][1] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF + 8)); +#if JVET_AG0157_ALF_CHROMA_FIXED_FILTER + rawCoeffTmp[0][l][2] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF + 16)); + rawCoeffTmp[0][l][3] = _mm_loadl_epi64((const __m128i *) (filterSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF + 24)); +#else + rawCoeffTmp[0][l][2] = _mm_loadl_epi64((const __m128i *) (filterSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF + 16)); +#endif + rawClipTmp[0][l][0] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF)); + rawClipTmp[0][l][1] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF + 8)); +#if JVET_AG0157_ALF_CHROMA_FIXED_FILTER + rawClipTmp[0][l][2] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF + 16)); + rawClipTmp[0][l][3] = _mm_loadl_epi64((const __m128i *) (fClipSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF + 24)); +#else + rawClipTmp[0][l][2] = _mm_loadl_epi64((const __m128i *) (fClipSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF + 16)); +#endif + + for (int m = 0; m < shuffleTime9[transposeIdx0]; m++) + { + int op0 = shuffleOp9[transposeIdx0][m][0]; + int op1 = shuffleOp9[transposeIdx0][m][1]; + + s0Tmp[0] = _mm_loadu_si128((const __m128i *) shuffleTab9[transposeIdx0][m][0]); + s1Tmp[0] = _mm_xor_si128(s0Tmp[0], _mm_set1_epi8((char) 0x80)); + s2Tmp[0] = _mm_loadu_si128((const __m128i *) shuffleTab9[transposeIdx0][m][1]); + s3Tmp[0] = _mm_xor_si128(s2Tmp[0], _mm_set1_epi8((char) 0x80)); + + __m128i rawTmp0 = _mm_or_si128(_mm_shuffle_epi8(rawCoeffTmp[0][l][op0], s0Tmp[0]), _mm_shuffle_epi8(rawCoeffTmp[0][l][op1], s1Tmp[0])); + __m128i rawTmp1 = _mm_or_si128(_mm_shuffle_epi8(rawCoeffTmp[0][l][op0], s2Tmp[0]), _mm_shuffle_epi8(rawCoeffTmp[0][l][op1], 
s3Tmp[0])); + rawCoeffTmp[0][l][op0] = rawTmp0; + rawCoeffTmp[0][l][op1] = rawTmp1; + + rawTmp0 = _mm_or_si128(_mm_shuffle_epi8(rawClipTmp[0][l][op0], s0Tmp[0]), _mm_shuffle_epi8(rawClipTmp[0][l][op1], s1Tmp[0])); + rawTmp1 = _mm_or_si128(_mm_shuffle_epi8(rawClipTmp[0][l][op0], s2Tmp[0]), _mm_shuffle_epi8(rawClipTmp[0][l][op1], s3Tmp[0])); + rawClipTmp[0][l][op0] = rawTmp0; + rawClipTmp[0][l][op1] = rawTmp1; + } + + const int transposeIdx1 = pClass ? (pClass[j + 4 * k + 2 * l + 8] & 0x3) : 0; + const int classIdx1 = pClass ? (pClass[j + 4 * k + 2 * l + 8] >> 2) : 0; + + rawCoeffTmp[1][l][0] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF)); + rawCoeffTmp[1][l][1] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF + 8)); +#if JVET_AG0157_ALF_CHROMA_FIXED_FILTER + rawCoeffTmp[1][l][2] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF + 16)); + rawCoeffTmp[1][l][3] = _mm_loadl_epi64((const __m128i *) (filterSet + classIdx1* MAX_NUM_ALF_LUMA_COEFF + 24)); +#else + rawCoeffTmp[1][l][2] = _mm_loadl_epi64((const __m128i *) (filterSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF + 16)); +#endif + rawClipTmp[1][l][0] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF)); + rawClipTmp[1][l][1] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF + 8)); +#if JVET_AG0157_ALF_CHROMA_FIXED_FILTER + rawClipTmp[1][l][2] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF + 16)); + rawClipTmp[1][l][3] = _mm_loadl_epi64((const __m128i *) (fClipSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF + 24)); +#else + rawClipTmp[1][l][2] = _mm_loadl_epi64((const __m128i *) (fClipSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF + 16)); +#endif + + for (int m = 0; m < shuffleTime9[transposeIdx1]; m++) + { + int op0 = shuffleOp9[transposeIdx1][m][0]; + int op1 = shuffleOp9[transposeIdx1][m][1]; + + s0Tmp[1] = 
_mm_loadu_si128((const __m128i *) shuffleTab9[transposeIdx1][m][0]); + s1Tmp[1] = _mm_xor_si128(s0Tmp[1], _mm_set1_epi8((char) 0x80)); + s2Tmp[1] = _mm_loadu_si128((const __m128i *) shuffleTab9[transposeIdx1][m][1]); + s3Tmp[1] = _mm_xor_si128(s2Tmp[1], _mm_set1_epi8((char) 0x80)); + + __m128i rawTmp0 = _mm_or_si128(_mm_shuffle_epi8(rawCoeffTmp[1][l][op0], s0Tmp[1]), _mm_shuffle_epi8(rawCoeffTmp[1][l][op1], s1Tmp[1])); + __m128i rawTmp1 = _mm_or_si128(_mm_shuffle_epi8(rawCoeffTmp[1][l][op0], s2Tmp[1]), _mm_shuffle_epi8(rawCoeffTmp[1][l][op1], s3Tmp[1])); + rawCoeffTmp[1][l][op0] = rawTmp0; + rawCoeffTmp[1][l][op1] = rawTmp1; + + rawTmp0 = _mm_or_si128(_mm_shuffle_epi8(rawClipTmp[1][l][op0], s0Tmp[1]), _mm_shuffle_epi8(rawClipTmp[1][l][op1], s1Tmp[1])); + rawTmp1 = _mm_or_si128(_mm_shuffle_epi8(rawClipTmp[1][l][op0], s2Tmp[1]), _mm_shuffle_epi8(rawClipTmp[1][l][op1], s3Tmp[1])); + rawClipTmp[1][l][op0] = rawTmp0; + rawClipTmp[1][l][op1] = rawTmp1; + } + + /* Merge the two separately transposed 128-bit parameter sets into one 256-bit vector per group: */ + /* set 0 (classifier entry at offset +0) fills the low lane, set 1 (classifier entry at offset +8) the high lane. */ + /* Group 3 is declared only with JVET_AG0157_ALF_CHROMA_FIXED_FILTER (3 entries otherwise), hence the guards below. */ + auto joinLanes = [](const __m128i lo, const __m128i hi) { return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); }; + rawCoeff[l][0] = joinLanes(rawCoeffTmp[0][l][0], rawCoeffTmp[1][l][0]); + rawCoeff[l][1] = joinLanes(rawCoeffTmp[0][l][1], rawCoeffTmp[1][l][1]); + rawCoeff[l][2] = joinLanes(rawCoeffTmp[0][l][2], rawCoeffTmp[1][l][2]); +#if JVET_AG0157_ALF_CHROMA_FIXED_FILTER + rawCoeff[l][3] = joinLanes(rawCoeffTmp[0][l][3], rawCoeffTmp[1][l][3]); +#endif
+ + rawClip[l][0] = joinLanes(rawClipTmp[0][l][0], rawClipTmp[1][l][0]); + rawClip[l][1] = joinLanes(rawClipTmp[0][l][1], rawClipTmp[1][l][1]); + rawClip[l][2] = joinLanes(rawClipTmp[0][l][2], rawClipTmp[1][l][2]); +#if JVET_AG0157_ALF_CHROMA_FIXED_FILTER + rawClip[l][3] = joinLanes(rawClipTmp[0][l][3], rawClipTmp[1][l][3]); +#endif + } + + params[k][0][0] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoeff[0][0], 0x00), _mm256_shuffle_epi32(rawCoeff[1][0], 0x00)); + params[k][0][1] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoeff[0][0], 0x55), _mm256_shuffle_epi32(rawCoeff[1][0], 0x55)); + params[k][0][2] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoeff[0][0], 0xaa), _mm256_shuffle_epi32(rawCoeff[1][0], 0xaa)); + params[k][0][3] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoeff[0][0], 0xff), _mm256_shuffle_epi32(rawCoeff[1][0], 0xff)); + params[k][0][4] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoeff[0][1], 0x00), _mm256_shuffle_epi32(rawCoeff[1][1], 0x00)); + params[k][0][5] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoeff[0][1], 0x55), _mm256_shuffle_epi32(rawCoeff[1][1], 0x55)); + params[k][0][6] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoeff[0][1], 0xaa), _mm256_shuffle_epi32(rawCoeff[1][1], 0xaa)); + params[k][0][7] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoeff[0][1], 0xff), _mm256_shuffle_epi32(rawCoeff[1][1], 0xff)); + params[k][0][8] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoeff[0][2], 0x00), _mm256_shuffle_epi32(rawCoeff[1][2], 0x00)); + params[k][0][9] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoeff[0][2], 0x55), _mm256_shuffle_epi32(rawCoeff[1][2], 0x55)); +#if JVET_AG0157_ALF_CHROMA_FIXED_FILTER + params[k][0][10] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoeff[0][2], 0xaa), _mm256_shuffle_epi32(rawCoeff[1][2], 0xaa)); + params[k][0][11] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoeff[0][2], 0xff), _mm256_shuffle_epi32(rawCoeff[1][2], 0xff)); + params[k][0][12] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoeff[0][3], 0x00), _mm256_shuffle_epi32(rawCoeff[1][3], 0x00)); +#endif + + params[k][1][0] = 
_mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][0], 0x00), _mm256_shuffle_epi32(rawClip[1][0], 0x00)); + params[k][1][1] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][0], 0x55), _mm256_shuffle_epi32(rawClip[1][0], 0x55)); + params[k][1][2] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][0], 0xaa), _mm256_shuffle_epi32(rawClip[1][0], 0xaa)); + params[k][1][3] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][0], 0xff), _mm256_shuffle_epi32(rawClip[1][0], 0xff)); + params[k][1][4] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][1], 0x00), _mm256_shuffle_epi32(rawClip[1][1], 0x00)); + params[k][1][5] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][1], 0x55), _mm256_shuffle_epi32(rawClip[1][1], 0x55)); + params[k][1][6] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][1], 0xaa), _mm256_shuffle_epi32(rawClip[1][1], 0xaa)); + params[k][1][7] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][1], 0xff), _mm256_shuffle_epi32(rawClip[1][1], 0xff)); + params[k][1][8] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][2], 0x00), _mm256_shuffle_epi32(rawClip[1][2], 0x00)); + params[k][1][9] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][2], 0x55), _mm256_shuffle_epi32(rawClip[1][2], 0x55)); +#if JVET_AG0157_ALF_CHROMA_FIXED_FILTER + params[k][1][10] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][2], 0xaa), _mm256_shuffle_epi32(rawClip[1][2], 0xaa)); + params[k][1][11] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][2], 0xff), _mm256_shuffle_epi32(rawClip[1][2], 0xff)); + params[k][1][12] = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][3], 0x00), _mm256_shuffle_epi32(rawClip[1][3], 0x00)); +#endif + } + + for (size_t ii = 0; ii < stepY; ii++) + { + const Pel *pImg0, *pImg1, *pImg2, *pImg3, *pImg4, *pImg5, *pImg6, *pImg7, *pImg8; + + pImg0 = src + j + ii * srcStride; + pImg1 = pImg0 + srcStride; + pImg2 = pImg0 - srcStride; + pImg3 = pImg1 + srcStride; + pImg4 = pImg2 - srcStride; + 
pImg5 = pImg3 + srcStride; + pImg6 = pImg4 - srcStride; + pImg7 = pImg5 + srcStride; + pImg8 = pImg6 - srcStride; + +#if JVET_AG0157_ALF_CHROMA_FIXED_FILTER + const Pel *pImg0FixedBased = fixedFilterResults[fixedFilterSetIdx][blkDst.y + i + ii + padSize + 0] + blkDst.x + j + padSize; + const Pel *pImg1FixedBased = fixedFilterResults[fixedFilterSetIdx][blkDst.y + i + ii + padSize + 1] + blkDst.x + j + padSize; + const Pel *pImg2FixedBased = fixedFilterResults[fixedFilterSetIdx][blkDst.y + i + ii + padSize - 1] + blkDst.x + j + padSize; + const Pel *pImg3FixedBased = fixedFilterResults[fixedFilterSetIdx][blkDst.y + i + ii + padSize + 2] + blkDst.x + j + padSize; + const Pel *pImg4FixedBased = fixedFilterResults[fixedFilterSetIdx][blkDst.y + i + ii + padSize - 2] + blkDst.x + j + padSize; +#endif + __m256i cur = _mm256_loadu_si256((const __m256i *) pImg0); + __m256i accumA = mmOffset; + __m256i accumB = mmOffset; + + auto process2coeffs = [&](const int i, const Pel *ptr0, const Pel *ptr1, const Pel *ptr2, const Pel *ptr3) + { + const __m256i val00 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) ptr0), cur); + const __m256i val10 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) ptr2), cur); + const __m256i val01 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) ptr1), cur); + const __m256i val11 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) ptr3), cur); + + __m256i val01A = _mm256_unpacklo_epi16(val00, val10); + __m256i val01B = _mm256_unpackhi_epi16(val00, val10); + __m256i val01C = _mm256_unpacklo_epi16(val01, val11); + __m256i val01D = _mm256_unpackhi_epi16(val01, val11); + + __m256i limit01A = params[0][1][i]; + __m256i limit01B = params[1][1][i]; + + val01A = _mm256_min_epi16(val01A, limit01A); + val01B = _mm256_min_epi16(val01B, limit01B); + val01C = _mm256_min_epi16(val01C, limit01A); + val01D = _mm256_min_epi16(val01D, limit01B); + + limit01A = _mm256_sub_epi16(_mm256_setzero_si256(), limit01A); + limit01B = 
_mm256_sub_epi16(_mm256_setzero_si256(), limit01B); + + val01A = _mm256_max_epi16(val01A, limit01A); + val01B = _mm256_max_epi16(val01B, limit01B); + val01C = _mm256_max_epi16(val01C, limit01A); + val01D = _mm256_max_epi16(val01D, limit01B); + + val01A = _mm256_add_epi16(val01A, val01C); + val01B = _mm256_add_epi16(val01B, val01D); + + const __m256i coeff01A = params[0][0][i]; + const __m256i coeff01B = params[1][0][i]; + + accumA = _mm256_add_epi32(accumA, _mm256_madd_epi16(val01A, coeff01A)); + accumB = _mm256_add_epi32(accumB, _mm256_madd_epi16(val01B, coeff01B)); + }; + + process2coeffs(0, pImg7 + 0, pImg8 + 0, pImg5 + 1, pImg6 - 1); + process2coeffs(1, pImg5 + 0, pImg6 + 0, pImg5 - 1, pImg6 + 1); + process2coeffs(2, pImg3 + 2, pImg4 - 2, pImg3 + 1, pImg4 - 1); + process2coeffs(3, pImg3 + 0, pImg4 + 0, pImg3 - 1, pImg4 + 1); + process2coeffs(4, pImg3 - 2, pImg4 + 2, pImg1 + 3, pImg2 - 3); + process2coeffs(5, pImg1 + 2, pImg2 - 2, pImg1 + 1, pImg2 - 1); + process2coeffs(6, pImg1 + 0, pImg2 + 0, pImg1 - 1, pImg2 + 1); + process2coeffs(7, pImg1 - 2, pImg2 + 2, pImg1 - 3, pImg2 + 3); + process2coeffs(8, pImg0 + 4, pImg0 - 4, pImg0 + 3, pImg0 - 3); + process2coeffs(9, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1); + +#if JVET_AG0157_ALF_CHROMA_FIXED_FILTER + process2coeffs(10, pImg3FixedBased + 0, pImg4FixedBased + 0, pImg1FixedBased + 0, pImg2FixedBased + 0); + process2coeffs(11, pImg0FixedBased - 2, pImg0FixedBased + 2, pImg0FixedBased - 1, pImg0FixedBased + 1); + + __m256i val00 = _mm256_sub_epi16( _mm256_loadu_si256((const __m256i *) (fixedFilterResults[fixedFilterSetIdx][blkDst.y + i + ii + padSize] + blkDst.x + j + padSize)), cur); + __m256i val10 = _mm256_setzero_si256(); + __m256i val01A = _mm256_unpacklo_epi16(val00, val10); + __m256i val01B = _mm256_unpackhi_epi16(val00, val10); + __m256i limit01A = params[0][1][12]; + __m256i limit01B = params[1][1][12]; + + val01A = _mm256_min_epi16(val01A, limit01A); + val01B = _mm256_min_epi16(val01B, limit01B); + 
limit01A = _mm256_sub_epi16(_mm256_setzero_si256(), limit01A); + limit01B = _mm256_sub_epi16(_mm256_setzero_si256(), limit01B); + val01A = _mm256_max_epi16(val01A, limit01A); + val01B = _mm256_max_epi16(val01B, limit01B); + + __m256i coeff01A = params[0][0][12]; + __m256i coeff01B = params[1][0][12]; + accumA = _mm256_add_epi32(accumA, _mm256_madd_epi16(val01A, coeff01A)); + accumB = _mm256_add_epi32(accumB, _mm256_madd_epi16(val01B, coeff01B)); +#endif + + accumA = _mm256_srai_epi32(accumA, shift); + accumB = _mm256_srai_epi32(accumB, shift); + + accumA = _mm256_packs_epi32(accumA, accumB); + accumA = _mm256_add_epi16(accumA, cur); + accumA = _mm256_min_epi16(mmMax, _mm256_max_epi16(accumA, mmMin)); + + _mm256_storeu_si256((__m256i *) (dst + ii * dstStride + j), accumA); + + } // for (size_t ii = 0; ii < stepY; ii++) + } // for (size_t j = 0; j < width; j += stepX) + src += srcStride * stepY; + dst += dstStride * stepY; + } + } + else + { + + const __m128i mmOffset = _mm_set1_epi32(round); + const __m128i mmMin = _mm_set1_epi16(clpRng.min); + const __m128i mmMax = _mm_set1_epi16(clpRng.max); +#endif + for (size_t i = 0; i < height; i += stepY) { const AlfClassifier *pClass = isChroma(compId) ? 
nullptr : classifier[blkDst.y + i] + blkDst.x; @@ -1413,6 +1701,9 @@ static void simdFilter9x9Blk(AlfClassifier **classifier, const PelUnitBuf &recDs src += srcStride * stepY; dst += dstStride * stepY; } +#if USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION + }//use 256 Bit Simd +#endif } #endif @@ -3473,7 +3764,9 @@ static void simdFilter13x13BlkExtDbResiDirect( const Pel currBase = 512; int round = 1 << (shift - 1); +#if !( USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION ) __m128i curBase = _mm_set_epi16( currBase, currBase, currBase, currBase, currBase, currBase, currBase, currBase ); +#endif #else #if JVET_AG0158_ALF_LUMA_COEFF_PRECISION int shift = coeffBits; @@ -3491,9 +3784,11 @@ static void simdFilter13x13BlkExtDbResiDirect( constexpr size_t stepX = 8; size_t stepY = 1; +#if !( USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION ) const __m128i mmOffset = _mm_set1_epi32(round); const __m128i mmMin = _mm_set1_epi16(clpRng.min); const __m128i mmMax = _mm_set1_epi16(clpRng.max); +#endif static_assert(sizeof(*filterSet) == 2, "ALF coeffs must be 16-bit wide"); static_assert(sizeof(*fClipSet) == 2, "ALF clip values must be 16-bit wide"); @@ -3509,600 +3804,1798 @@ static void simdFilter13x13BlkExtDbResiDirect( const int padSizeGauss = ALF_PADDING_SIZE_GAUSS_RESULTS; #endif - for (size_t i = 0; i < height; i += stepY) +#if USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION + const bool use256BitSimd = vext >= AVX2 && blkDst.width % 16 == 0 ? 
true : false; + + if( use256BitSimd ) { - const AlfClassifier *pClass = classifier[blkDst.y + i] + blkDst.x; - for (size_t j = 0; j < width; j += stepX) + const __m256i mmOffset = _mm256_set1_epi32(round); + const __m256i mmMin = _mm256_set1_epi16(clpRng.min); + const __m256i mmMax = _mm256_set1_epi16(clpRng.max); +#if JVET_AI0084_ALF_RESIDUALS_SCALING + const __m256i curBase = _mm256_set1_epi16(currBase); +#endif + + for (size_t i = 0; i < height; i += stepY) { + const AlfClassifier *pClass = classifier[blkDst.y + i] + blkDst.x; + for (size_t j = 0; j < width; j += stepX * 2) + { #if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS #if JVET_AD0222_ALF_LONG_FIXFILTER && JVET_AD0222_ADDITONAL_ALF_FIXFILTER - __m128i params[2][2][20]; + __m256i params[2][2][20]; #elif JVET_AD0222_ALF_LONG_FIXFILTER - __m128i params[2][2][17]; + __m256i params[2][2][17]; #elif JVET_AD0222_ADDITONAL_ALF_FIXFILTER - __m128i params[2][2][19]; + __m256i params[2][2][19]; #else - __m128i params[2][2][16]; + __m256i params[2][2][16]; #endif #else - __m128i params[2][2][13]; + __m256i params[2][2][13]; #endif - for (int k = 0; k < 2; k++) - { + for (int k = 0; k < 2; k++) + { #if JVET_AD0222_ALF_LONG_FIXFILTER || JVET_AD0222_ADDITONAL_ALF_FIXFILTER - __m128i rawCoef[4][5], rawClip[4][5], s0, s1, s2, s3, rawTmp0, rawTmp1; + __m256i rawCoef[4][5], rawClip[4][5], s0, s1; + __m128i rawCoefTmp[2][4][5], rawClipTmp[2][4][5], s0Tmp[2], s1Tmp[2], s2Tmp[2], s3Tmp[2]; #else - __m128i rawCoef[4][4], rawClip[4][4], s0, s1, s2, s3, rawTmp0, rawTmp1; + __m256i rawCoef[4][4], rawClip[4][4], s0, s1; + __m128i rawCoefTmp[2][4][4], rawClipTmp[2][4][4], s0Tmp[2], s1Tmp[2], s2Tmp[2], s3Tmp[2]; #endif - for (int l = 0; l < 4; l++) - { - const int transposeIdx = pClass[j + 4 * k + l] & 0x3; - const int classIdx = pClass[j + 4 * k + l] >> 2; + for (int l = 0; l < 4; l++) + { + const int transposeIdx0 = pClass[j + 4 * k + l + 0] & 0x3; + const int classIdx0 = pClass[j + 4 * k + l + 0] >> 2; - rawCoef[l][0] = 
_mm_loadu_si128((const __m128i *) (filterSet + classIdx * MAX_NUM_ALF_LUMA_COEFF)); - rawCoef[l][1] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx * MAX_NUM_ALF_LUMA_COEFF + 8)); - rawCoef[l][2] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx * MAX_NUM_ALF_LUMA_COEFF + 16)); - rawCoef[l][3] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx * MAX_NUM_ALF_LUMA_COEFF + 24)); + rawCoefTmp[0][l][0] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF)); + rawCoefTmp[0][l][1] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF + 8)); + rawCoefTmp[0][l][2] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF + 16)); + rawCoefTmp[0][l][3] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF + 24)); #if JVET_AD0222_ALF_LONG_FIXFILTER || JVET_AD0222_ADDITONAL_ALF_FIXFILTER - rawCoef[l][4] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx * MAX_NUM_ALF_LUMA_COEFF + 32)); + rawCoefTmp[0][l][4] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF + 32)); #endif - rawClip[l][0] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx * MAX_NUM_ALF_LUMA_COEFF)); - rawClip[l][1] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx * MAX_NUM_ALF_LUMA_COEFF + 8)); - rawClip[l][2] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx * MAX_NUM_ALF_LUMA_COEFF + 16)); - rawClip[l][3] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx * MAX_NUM_ALF_LUMA_COEFF + 24)); + rawClipTmp[0][l][0] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF)); + rawClipTmp[0][l][1] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF + 8)); + rawClipTmp[0][l][2] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF + 16)); + rawClipTmp[0][l][3] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx0 * 
MAX_NUM_ALF_LUMA_COEFF + 24)); #if JVET_AD0222_ALF_LONG_FIXFILTER || JVET_AD0222_ADDITONAL_ALF_FIXFILTER - rawClip[l][4] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx * MAX_NUM_ALF_LUMA_COEFF + 32)); + rawClipTmp[0][l][4] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF + 32)); #endif #if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS - for (int m = 0; m < shuffleTime13FixedBasedLongLength[transposeIdx]; m++) + for (int m = 0; m < shuffleTime13FixedBasedLongLength[transposeIdx0]; m++) #else - for (int m = 0; m < shuffleTime13LongLength[transposeIdx]; m++) + for (int m = 0; m < shuffleTime13LongLength[transposeIdx0]; m++) #endif - { + { #if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS - int op0 = shuffleOp13FixedBasedLongLength[transposeIdx][m][0]; - int op1 = shuffleOp13FixedBasedLongLength[transposeIdx][m][1]; + int op0 = shuffleOp13FixedBasedLongLength[transposeIdx0][m][0]; + int op1 = shuffleOp13FixedBasedLongLength[transposeIdx0][m][1]; #else - int op0 = shuffleOp13LongLength[transposeIdx][m][0]; - int op1 = shuffleOp13LongLength[transposeIdx][m][1]; + int op0 = shuffleOp13LongLength[transposeIdx0][m][0]; + int op1 = shuffleOp13LongLength[transposeIdx0][m][1]; #endif #if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS - s0 = _mm_loadu_si128((const __m128i *) shuffleTab13FixedBasedLongLength[transposeIdx][m][0]); - s1 = _mm_xor_si128(s0, _mm_set1_epi8((char) 0x80)); - s2 = _mm_loadu_si128((const __m128i *) shuffleTab13FixedBasedLongLength[transposeIdx][m][1]); - s3 = _mm_xor_si128(s2, _mm_set1_epi8((char) 0x80)); + s0Tmp[0] = _mm_loadu_si128((const __m128i *) shuffleTab13FixedBasedLongLength[transposeIdx0][m][0]); + s1Tmp[0] = _mm_xor_si128(s0Tmp[0], _mm_set1_epi8((char) 0x80)); + s2Tmp[0] = _mm_loadu_si128((const __m128i *) shuffleTab13FixedBasedLongLength[transposeIdx0][m][1]); + s3Tmp[0] = _mm_xor_si128(s2Tmp[0], _mm_set1_epi8((char) 0x80)); +#else + s0Tmp[0] = _mm_loadu_si128((const __m128i *) 
shuffleTab13LongLength[transposeIdx0][m][0]); + s1Tmp[0] = _mm_xor_si128(s0Tmp[0], _mm_set1_epi8((char) 0x80)); + s2Tmp[0] = _mm_loadu_si128((const __m128i *) shuffleTab13LongLength[transposeIdx0][m][1]); + s3Tmp[0] = _mm_xor_si128(s2Tmp[0], _mm_set1_epi8((char) 0x80)); +#endif + + __m128i rawTmp0 = _mm_or_si128(_mm_shuffle_epi8(rawCoefTmp[0][l][op0], s0Tmp[0]), _mm_shuffle_epi8(rawCoefTmp[0][l][op1], s1Tmp[0])); + __m128i rawTmp1 = _mm_or_si128(_mm_shuffle_epi8(rawCoefTmp[0][l][op0], s2Tmp[0]), _mm_shuffle_epi8(rawCoefTmp[0][l][op1], s3Tmp[0])); + rawCoefTmp[0][l][op0] = rawTmp0; + rawCoefTmp[0][l][op1] = rawTmp1; + + rawTmp0 = _mm_or_si128(_mm_shuffle_epi8(rawClipTmp[0][l][op0], s0Tmp[0]), _mm_shuffle_epi8(rawClipTmp[0][l][op1], s1Tmp[0])); + rawTmp1 = _mm_or_si128(_mm_shuffle_epi8(rawClipTmp[0][l][op0], s2Tmp[0]), _mm_shuffle_epi8(rawClipTmp[0][l][op1], s3Tmp[0])); + rawClipTmp[0][l][op0] = rawTmp0; + rawClipTmp[0][l][op1] = rawTmp1; + } + + const int transposeIdx1 = pClass[j + 4 * k + l + 8] & 0x3; + const int classIdx1 = pClass[j + 4 * k + l + 8] >> 2; + + rawCoefTmp[1][l][0] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF)); + rawCoefTmp[1][l][1] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF + 8)); + rawCoefTmp[1][l][2] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF + 16)); + rawCoefTmp[1][l][3] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF + 24)); +#if JVET_AD0222_ALF_LONG_FIXFILTER || JVET_AD0222_ADDITONAL_ALF_FIXFILTER + rawCoefTmp[1][l][4] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF + 32)); +#endif + rawClipTmp[1][l][0] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF)); + rawClipTmp[1][l][1] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF + 8)); + rawClipTmp[1][l][2] = _mm_loadu_si128((const __m128i 
*) (fClipSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF + 16)); + rawClipTmp[1][l][3] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF + 24)); +#if JVET_AD0222_ALF_LONG_FIXFILTER || JVET_AD0222_ADDITONAL_ALF_FIXFILTER + rawClipTmp[1][l][4] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF + 32)); +#endif +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + for (int m = 0; m < shuffleTime13FixedBasedLongLength[transposeIdx1]; m++) #else - s0 = _mm_loadu_si128((const __m128i *) shuffleTab13LongLength[transposeIdx][m][0]); - s1 = _mm_xor_si128(s0, _mm_set1_epi8((char) 0x80)); - s2 = _mm_loadu_si128((const __m128i *) shuffleTab13LongLength[transposeIdx][m][1]); - s3 = _mm_xor_si128(s2, _mm_set1_epi8((char) 0x80)); + for (int m = 0; m < shuffleTime13LongLength[transposeIdx1]; m++) #endif + { +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + int op0 = shuffleOp13FixedBasedLongLength[transposeIdx1][m][0]; + int op1 = shuffleOp13FixedBasedLongLength[transposeIdx1][m][1]; +#else + int op0 = shuffleOp13LongLength[transposeIdx1][m][0]; + int op1 = shuffleOp13LongLength[transposeIdx1][m][1]; +#endif +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + s0Tmp[1] = _mm_loadu_si128((const __m128i *) shuffleTab13FixedBasedLongLength[transposeIdx1][m][0]); + s1Tmp[1] = _mm_xor_si128(s0Tmp[1], _mm_set1_epi8((char) 0x80)); + s2Tmp[1] = _mm_loadu_si128((const __m128i *) shuffleTab13FixedBasedLongLength[transposeIdx1][m][1]); + s3Tmp[1] = _mm_xor_si128(s2Tmp[1], _mm_set1_epi8((char) 0x80)); +#else + s0Tmp[1] = _mm_loadu_si128((const __m128i *) shuffleTab13LongLength[transposeIdx1][m][0]); + s1Tmp[1] = _mm_xor_si128(s0Tmp[1], _mm_set1_epi8((char) 0x80)); + s2Tmp[1] = _mm_loadu_si128((const __m128i *) shuffleTab13LongLength[transposeIdx1][m][1]); + s3Tmp[1] = _mm_xor_si128(s2Tmp[1], _mm_set1_epi8((char) 0x80)); +#endif + + __m128i rawTmp0 = _mm_or_si128(_mm_shuffle_epi8(rawCoefTmp[1][l][op0], s0Tmp[1]), 
_mm_shuffle_epi8(rawCoefTmp[1][l][op1], s1Tmp[1])); + __m128i rawTmp1 = _mm_or_si128(_mm_shuffle_epi8(rawCoefTmp[1][l][op0], s2Tmp[1]), _mm_shuffle_epi8(rawCoefTmp[1][l][op1], s3Tmp[1])); + rawCoefTmp[1][l][op0] = rawTmp0; + rawCoefTmp[1][l][op1] = rawTmp1; + + rawTmp0 = _mm_or_si128(_mm_shuffle_epi8(rawClipTmp[1][l][op0], s0Tmp[1]), _mm_shuffle_epi8(rawClipTmp[1][l][op1], s1Tmp[1])); + rawTmp1 = _mm_or_si128(_mm_shuffle_epi8(rawClipTmp[1][l][op0], s2Tmp[1]), _mm_shuffle_epi8(rawClipTmp[1][l][op1], s3Tmp[1])); + rawClipTmp[1][l][op0] = rawTmp0; + rawClipTmp[1][l][op1] = rawTmp1; + } - rawTmp0 = _mm_or_si128(_mm_shuffle_epi8(rawCoef[l][op0], s0), _mm_shuffle_epi8(rawCoef[l][op1], s1)); - rawTmp1 = _mm_or_si128(_mm_shuffle_epi8(rawCoef[l][op0], s2), _mm_shuffle_epi8(rawCoef[l][op1], s3)); - rawCoef[l][op0] = rawTmp0; - rawCoef[l][op1] = rawTmp1; - - rawTmp0 = _mm_or_si128(_mm_shuffle_epi8(rawClip[l][op0], s0), _mm_shuffle_epi8(rawClip[l][op1], s1)); - rawTmp1 = _mm_or_si128(_mm_shuffle_epi8(rawClip[l][op0], s2), _mm_shuffle_epi8(rawClip[l][op1], s3)); - rawClip[l][op0] = rawTmp0; - rawClip[l][op1] = rawTmp1; - } - } // for l + rawCoef[l][0] = _mm256_castsi128_si256(rawCoefTmp[0][l][0]); + rawCoef[l][0] = _mm256_insertf128_si256(rawCoef[l][0], rawCoefTmp[1][l][0], 1); + rawCoef[l][1] = _mm256_castsi128_si256(rawCoefTmp[0][l][1]); + rawCoef[l][1] = _mm256_insertf128_si256(rawCoef[l][1], rawCoefTmp[1][l][1], 1); + rawCoef[l][2] = _mm256_castsi128_si256(rawCoefTmp[0][l][2]); + rawCoef[l][2] = _mm256_insertf128_si256(rawCoef[l][2], rawCoefTmp[1][l][2], 1); + rawCoef[l][3] = _mm256_castsi128_si256(rawCoefTmp[0][l][3]); + rawCoef[l][3] = _mm256_insertf128_si256(rawCoef[l][3], rawCoefTmp[1][l][3], 1); + rawCoef[l][4] = _mm256_castsi128_si256(rawCoefTmp[0][l][4]); + rawCoef[l][4] = _mm256_insertf128_si256(rawCoef[l][4], rawCoefTmp[1][l][4], 1); + + rawClip[l][0] = _mm256_castsi128_si256(rawClipTmp[0][l][0]); + rawClip[l][0] = _mm256_insertf128_si256(rawClip[l][0], 
rawClipTmp[1][l][0], 1); + rawClip[l][1] = _mm256_castsi128_si256(rawClipTmp[0][l][1]); + rawClip[l][1] = _mm256_insertf128_si256(rawClip[l][1], rawClipTmp[1][l][1], 1); + rawClip[l][2] = _mm256_castsi128_si256(rawClipTmp[0][l][2]); + rawClip[l][2] = _mm256_insertf128_si256(rawClip[l][2], rawClipTmp[1][l][2], 1); + rawClip[l][3] = _mm256_castsi128_si256(rawClipTmp[0][l][3]); + rawClip[l][3] = _mm256_insertf128_si256(rawClip[l][3], rawClipTmp[1][l][3], 1); + rawClip[l][4] = _mm256_castsi128_si256(rawClipTmp[0][l][4]); + rawClip[l][4] = _mm256_insertf128_si256(rawClip[l][4], rawClipTmp[1][l][4], 1); + } // for l #if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS - int limR, lim0, lim1, lim2, lim3; + int limR, lim0, lim1, lim2, lim3; #if JVET_AD0222_ALF_LONG_FIXFILTER && JVET_AD0222_ADDITONAL_ALF_FIXFILTER - limR = 5, lim0 = 5, lim1 = 5, lim2 = 5, lim3 = 5; + limR = 5, lim0 = 5, lim1 = 5, lim2 = 5, lim3 = 5; #elif JVET_AD0222_ALF_LONG_FIXFILTER - limR = 5, lim0 = 5, lim1 = 4, lim2 = 4, lim3 = 4; + limR = 5, lim0 = 5, lim1 = 4, lim2 = 4, lim3 = 4; #elif JVET_AD0222_ADDITONAL_ALF_FIXFILTER - limR = 5, lim0 = 5, lim1 = 5, lim2 = 5, lim3 = 4; + limR = 5, lim0 = 5, lim1 = 5, lim2 = 5, lim3 = 4; #else - limR = 4, lim0 = 4, lim1 = 4, lim2 = 4, lim3 = 4; + limR = 4, lim0 = 4, lim1 = 4, lim2 = 4, lim3 = 4; #endif - for (unsigned char l = 0; l < limR; l++) + for (unsigned char l = 0; l < limR; l++) #else - for (unsigned char l = 0; l < 4; l++) + for (unsigned char l = 0; l < 4; l++) #endif - { - int m = l << 2; -#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS - if (l < lim0) { + int m = l << 2; +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + if (l < lim0) + { #endif - s0 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawCoef[0][l], 0x00), _mm_shuffle_epi32(rawCoef[1][l], 0x00)); - s1 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawCoef[2][l], 0x00), _mm_shuffle_epi32(rawCoef[3][l], 0x00)); - params[k][0][0 + m] = _mm_blend_epi16(_mm_shuffle_epi32(s0, 0x88), _mm_shuffle_epi32(s1, 0x88), 
0xf0); - s0 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawClip[0][l], 0x00), _mm_shuffle_epi32(rawClip[1][l], 0x00)); - s1 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawClip[2][l], 0x00), _mm_shuffle_epi32(rawClip[3][l], 0x00)); - params[k][1][0 + m] = _mm_blend_epi16(_mm_shuffle_epi32(s0, 0x88), _mm_shuffle_epi32(s1, 0x88), 0xf0); + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[0][l], 0x00), _mm256_shuffle_epi32(rawCoef[1][l], 0x00)); + s1 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[2][l], 0x00), _mm256_shuffle_epi32(rawCoef[3][l], 0x00)); + params[k][0][0 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), _mm256_shuffle_epi32(s1, 0x88), 0xf0); + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][l], 0x00), _mm256_shuffle_epi32(rawClip[1][l], 0x00)); + s1 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[2][l], 0x00), _mm256_shuffle_epi32(rawClip[3][l], 0x00)); + params[k][1][0 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), _mm256_shuffle_epi32(s1, 0x88), 0xf0); #if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS - } - if (l < lim1) - { + } + if (l < lim1) + { #endif - s0 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawCoef[0][l], 0x55), _mm_shuffle_epi32(rawCoef[1][l], 0x55)); - s1 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawCoef[2][l], 0x55), _mm_shuffle_epi32(rawCoef[3][l], 0x55)); - params[k][0][1 + m] = _mm_blend_epi16(_mm_shuffle_epi32(s0, 0x88), _mm_shuffle_epi32(s1, 0x88), 0xf0); - s0 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawClip[0][l], 0x55), _mm_shuffle_epi32(rawClip[1][l], 0x55)); - s1 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawClip[2][l], 0x55), _mm_shuffle_epi32(rawClip[3][l], 0x55)); - params[k][1][1 + m] = _mm_blend_epi16(_mm_shuffle_epi32(s0, 0x88), _mm_shuffle_epi32(s1, 0x88), 0xf0); + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[0][l], 0x55), _mm256_shuffle_epi32(rawCoef[1][l], 0x55)); + s1 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[2][l], 0x55), _mm256_shuffle_epi32(rawCoef[3][l], 0x55)); + 
params[k][0][1 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), _mm256_shuffle_epi32(s1, 0x88), 0xf0); + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][l], 0x55), _mm256_shuffle_epi32(rawClip[1][l], 0x55)); + s1 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[2][l], 0x55), _mm256_shuffle_epi32(rawClip[3][l], 0x55)); + params[k][1][1 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), _mm256_shuffle_epi32(s1, 0x88), 0xf0); #if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS - } - if (l < lim2) - { + } + if (l < lim2) + { #endif - s0 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawCoef[0][l], 0xaa), _mm_shuffle_epi32(rawCoef[1][l], 0xaa)); - s1 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawCoef[2][l], 0xaa), _mm_shuffle_epi32(rawCoef[3][l], 0xaa)); - params[k][0][2 + m] = _mm_blend_epi16(_mm_shuffle_epi32(s0, 0x88), _mm_shuffle_epi32(s1, 0x88), 0xf0); - s0 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawClip[0][l], 0xaa), _mm_shuffle_epi32(rawClip[1][l], 0xaa)); - s1 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawClip[2][l], 0xaa), _mm_shuffle_epi32(rawClip[3][l], 0xaa)); - params[k][1][2 + m] = _mm_blend_epi16(_mm_shuffle_epi32(s0, 0x88), _mm_shuffle_epi32(s1, 0x88), 0xf0); + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[0][l], 0xaa), _mm256_shuffle_epi32(rawCoef[1][l], 0xaa)); + s1 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[2][l], 0xaa), _mm256_shuffle_epi32(rawCoef[3][l], 0xaa)); + params[k][0][2 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), _mm256_shuffle_epi32(s1, 0x88), 0xf0); + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][l], 0xaa), _mm256_shuffle_epi32(rawClip[1][l], 0xaa)); + s1 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[2][l], 0xaa), _mm256_shuffle_epi32(rawClip[3][l], 0xaa)); + params[k][1][2 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), _mm256_shuffle_epi32(s1, 0x88), 0xf0); #if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS - } - if (l < lim3) - { + } + if (l < lim3) + { #endif - 
s0 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawCoef[0][l], 0xff), _mm_shuffle_epi32(rawCoef[1][l], 0xff)); - s1 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawCoef[2][l], 0xff), _mm_shuffle_epi32(rawCoef[3][l], 0xff)); - params[k][0][3 + m] = _mm_blend_epi16(_mm_shuffle_epi32(s0, 0x88), _mm_shuffle_epi32(s1, 0x88), 0xf0); - s0 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawClip[0][l], 0xff), _mm_shuffle_epi32(rawClip[1][l], 0xff)); - s1 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawClip[2][l], 0xff), _mm_shuffle_epi32(rawClip[3][l], 0xff)); - params[k][1][3 + m] = _mm_blend_epi16(_mm_shuffle_epi32(s0, 0x88), _mm_shuffle_epi32(s1, 0x88), 0xf0); + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[0][l], 0xff), _mm256_shuffle_epi32(rawCoef[1][l], 0xff)); + s1 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[2][l], 0xff), _mm256_shuffle_epi32(rawCoef[3][l], 0xff)); + params[k][0][3 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), _mm256_shuffle_epi32(s1, 0x88), 0xf0); + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][l], 0xff), _mm256_shuffle_epi32(rawClip[1][l], 0xff)); + s1 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[2][l], 0xff), _mm256_shuffle_epi32(rawClip[3][l], 0xff)); + params[k][1][3 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), _mm256_shuffle_epi32(s1, 0x88), 0xf0); #if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS - } + } #endif - } // for l - } // for k + } // for l + } // for k - const Pel *pImg0, *pImg1, *pImg2, *pImg3, *pImg4, *pImg5, *pImg6, *pImg7, *pImg8; + const Pel *pImg0, *pImg1, *pImg2, *pImg3, *pImg4, *pImg5, *pImg6, *pImg7, *pImg8; #if !JVET_AD0222_ALF_LONG_FIXFILTER - const Pel *pImg9, *pImg10, *pImg11, *pImg12; + const Pel *pImg9, *pImg10, *pImg11, *pImg12; #endif - const Pel *pImgP0; + const Pel *pImgP0; - pImg0 = src + j; - pImg1 = pImg0 + srcStride; - pImg2 = pImg0 - srcStride; - pImg3 = pImg1 + srcStride; - pImg4 = pImg2 - srcStride; - pImg5 = pImg3 + srcStride; - pImg6 = pImg4 - srcStride; - pImg7 = 
pImg5 + srcStride; - pImg8 = pImg6 - srcStride; -#if !JVET_AD0222_ALF_LONG_FIXFILTER - pImg9 = pImg7 + srcStride; - pImg10 = pImg8 - srcStride; - pImg11 = pImg9 + srcStride; - pImg12 = pImg10 - srcStride; -#endif -#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + pImg0 = src + j; + pImg1 = pImg0 + srcStride; + pImg2 = pImg0 - srcStride; + pImg3 = pImg1 + srcStride; + pImg4 = pImg2 - srcStride; + pImg5 = pImg3 + srcStride; + pImg6 = pImg4 - srcStride; + pImg7 = pImg5 + srcStride; + pImg8 = pImg6 - srcStride; +#if !JVET_AD0222_ALF_LONG_FIXFILTER + pImg9 = pImg7 + srcStride; + pImg10 = pImg8 - srcStride; + pImg11 = pImg9 + srcStride; + pImg12 = pImg10 - srcStride; +#endif +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS #if JVET_AE0139_ALF_IMPROVED_FIXFILTER - int filterSetIdx = 2 + fixedFilterSetIdx; + int filterSetIdx = 2 + fixedFilterSetIdx; #else - int filterSetIdx = 0 + fixedFilterSetIdx; + int filterSetIdx = 0 + fixedFilterSetIdx; #endif - const Pel *pImg0FixedBased, *pImg1FixedBased, *pImg2FixedBased, *pImg3FixedBased, *pImg4FixedBased; + const Pel *pImg0FixedBased, *pImg1FixedBased, *pImg2FixedBased, *pImg3FixedBased, *pImg4FixedBased; #if JVET_AD0222_ALF_LONG_FIXFILTER - const Pel *pImg5FixedBased, *pImg6FixedBased, *pImg7FixedBased, *pImg8FixedBased, *pImg9FixedBased, *pImg10FixedBased, *pImg11FixedBased, *pImg12FixedBased; + const Pel *pImg5FixedBased, *pImg6FixedBased, *pImg7FixedBased, *pImg8FixedBased, *pImg9FixedBased, + *pImg10FixedBased, *pImg11FixedBased, *pImg12FixedBased; #endif - if (isFixedFilterPaddedPerCtu) - { - pImg0FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 0] + j + padSize; - pImg1FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 1] + j + padSize; - pImg2FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 1] + j + padSize; - pImg3FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 2] + j + padSize; - pImg4FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 
2] + j + padSize; + if (isFixedFilterPaddedPerCtu) + { + pImg0FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 0] + j + padSize; + pImg1FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 1] + j + padSize; + pImg2FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 1] + j + padSize; + pImg3FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 2] + j + padSize; + pImg4FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 2] + j + padSize; #if JVET_AD0222_ALF_LONG_FIXFILTER - pImg5FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 3] + j + padSize; - pImg6FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 3] + j + padSize; - pImg7FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 4] + j + padSize; - pImg8FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 4] + j + padSize; - pImg9FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 5] + j + padSize; - pImg10FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 5] + j + padSize; - pImg11FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 6] + j + padSize; - pImg12FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 6] + j + padSize; + pImg5FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 3] + j + padSize; + pImg6FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 3] + j + padSize; + pImg7FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 4] + j + padSize; + pImg8FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 4] + j + padSize; + pImg9FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 5] + j + padSize; + pImg10FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 5] + j + padSize; + pImg11FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 6] + j + padSize; + pImg12FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 6] + j + 
padSize; #endif - } - else - { - pImg0FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 0] + blkDst.x + j + padSize; - pImg1FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 1] + blkDst.x + j + padSize; - pImg2FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 1] + blkDst.x + j + padSize; - pImg3FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 2] + blkDst.x + j + padSize; - pImg4FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 2] + blkDst.x + j + padSize; + } + else + { + pImg0FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 0] + blkDst.x + j + padSize; + pImg1FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 1] + blkDst.x + j + padSize; + pImg2FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 1] + blkDst.x + j + padSize; + pImg3FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 2] + blkDst.x + j + padSize; + pImg4FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 2] + blkDst.x + j + padSize; #if JVET_AD0222_ALF_LONG_FIXFILTER - pImg5FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 3] + blkDst.x + j + padSize; - pImg6FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 3] + blkDst.x + j + padSize; - pImg7FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 4] + blkDst.x + j + padSize; - pImg8FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 4] + blkDst.x + j + padSize; - pImg9FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 5] + blkDst.x + j + padSize; - pImg10FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 5] + blkDst.x + j + padSize; - pImg11FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 6] + blkDst.x + j + padSize; - pImg12FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 6] + blkDst.x + j 
+ padSize; + pImg5FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 3] + blkDst.x + j + padSize; + pImg6FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 3] + blkDst.x + j + padSize; + pImg7FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 4] + blkDst.x + j + padSize; + pImg8FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 4] + blkDst.x + j + padSize; + pImg9FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 5] + blkDst.x + j + padSize; + pImg10FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 5] + blkDst.x + j + padSize; + pImg11FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 6] + blkDst.x + j + padSize; + pImg12FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 6] + blkDst.x + j + padSize; #endif - } + } #endif #if JVET_AD0222_ADDITONAL_ALF_FIXFILTER - const Pel *pImg0Gauss[NUM_GAUSS_FILTERED_SOURCE]; - const Pel *pImg1Gauss[NUM_GAUSS_FILTERED_SOURCE], *pImg2Gauss[NUM_GAUSS_FILTERED_SOURCE]; - const Pel *pImg3Gauss[NUM_GAUSS_FILTERED_SOURCE], *pImg4Gauss[NUM_GAUSS_FILTERED_SOURCE]; + const Pel *pImg0Gauss[NUM_GAUSS_FILTERED_SOURCE]; + const Pel *pImg1Gauss[NUM_GAUSS_FILTERED_SOURCE], *pImg2Gauss[NUM_GAUSS_FILTERED_SOURCE]; + const Pel *pImg3Gauss[NUM_GAUSS_FILTERED_SOURCE], *pImg4Gauss[NUM_GAUSS_FILTERED_SOURCE]; - for( int gaussIdx = 0; gaussIdx < NUM_GAUSS_FILTERED_SOURCE; gaussIdx++ ) - { - if( isFixedFilterPaddedPerCtu ) - { - pImg0Gauss[gaussIdx] = gaussCtu[gaussIdx][i + padSizeGauss + 0] + j + padSizeGauss; - pImg1Gauss[gaussIdx] = gaussCtu[gaussIdx][i + padSizeGauss + 1] + j + padSizeGauss; - pImg2Gauss[gaussIdx] = gaussCtu[gaussIdx][i + padSizeGauss - 1] + j + padSizeGauss; - pImg3Gauss[gaussIdx] = gaussCtu[gaussIdx][i + padSizeGauss + 2] + j + padSizeGauss; - pImg4Gauss[gaussIdx] = gaussCtu[gaussIdx][i + padSizeGauss - 2] + j + padSizeGauss; - } - else + for (int gaussIdx = 0; gaussIdx < 
NUM_GAUSS_FILTERED_SOURCE; gaussIdx++) { - pImg0Gauss[gaussIdx] = gaussPic[gaussIdx][blkDst.y + i + padSizeGauss + 0] + blkDst.x + j + padSizeGauss; - pImg1Gauss[gaussIdx] = gaussPic[gaussIdx][blkDst.y + i + padSizeGauss + 1] + blkDst.x + j + padSizeGauss; - pImg2Gauss[gaussIdx] = gaussPic[gaussIdx][blkDst.y + i + padSizeGauss - 1] + blkDst.x + j + padSizeGauss; - pImg3Gauss[gaussIdx] = gaussPic[gaussIdx][blkDst.y + i + padSizeGauss + 2] + blkDst.x + j + padSizeGauss; - pImg4Gauss[gaussIdx] = gaussPic[gaussIdx][blkDst.y + i + padSizeGauss - 2] + blkDst.x + j + padSizeGauss; + if (isFixedFilterPaddedPerCtu) + { + pImg0Gauss[gaussIdx] = gaussCtu[gaussIdx][i + padSizeGauss + 0] + j + padSizeGauss; + pImg1Gauss[gaussIdx] = gaussCtu[gaussIdx][i + padSizeGauss + 1] + j + padSizeGauss; + pImg2Gauss[gaussIdx] = gaussCtu[gaussIdx][i + padSizeGauss - 1] + j + padSizeGauss; + pImg3Gauss[gaussIdx] = gaussCtu[gaussIdx][i + padSizeGauss + 2] + j + padSizeGauss; + pImg4Gauss[gaussIdx] = gaussCtu[gaussIdx][i + padSizeGauss - 2] + j + padSizeGauss; + } + else + { + pImg0Gauss[gaussIdx] = gaussPic[gaussIdx][blkDst.y + i + padSizeGauss + 0] + blkDst.x + j + padSizeGauss; + pImg1Gauss[gaussIdx] = gaussPic[gaussIdx][blkDst.y + i + padSizeGauss + 1] + blkDst.x + j + padSizeGauss; + pImg2Gauss[gaussIdx] = gaussPic[gaussIdx][blkDst.y + i + padSizeGauss - 1] + blkDst.x + j + padSizeGauss; + pImg3Gauss[gaussIdx] = gaussPic[gaussIdx][blkDst.y + i + padSizeGauss + 2] + blkDst.x + j + padSizeGauss; + pImg4Gauss[gaussIdx] = gaussPic[gaussIdx][blkDst.y + i + padSizeGauss - 2] + blkDst.x + j + padSizeGauss; + } } - } #endif - __m128i cur = _mm_loadu_si128((const __m128i *) pImg0); - __m128i accumA = mmOffset; - __m128i accumB = mmOffset; + __m256i cur = _mm256_loadu_si256((const __m256i *) pImg0); + __m256i accumA = mmOffset; + __m256i accumB = mmOffset; - auto process2coeffs = [&](const int i, const Pel *ptr0, const Pel *ptr1, const Pel *ptr2, const Pel *ptr3) - { - const __m128i val00 = 
_mm_sub_epi16(_mm_loadu_si128((const __m128i *) ptr0), cur); - const __m128i val10 = _mm_sub_epi16(_mm_loadu_si128((const __m128i *) ptr2), cur); - const __m128i val01 = _mm_sub_epi16(_mm_loadu_si128((const __m128i *) ptr1), cur); - const __m128i val11 = _mm_sub_epi16(_mm_loadu_si128((const __m128i *) ptr3), cur); + auto process2coeffs = [&](const int i, const Pel *ptr0, const Pel *ptr1, const Pel *ptr2, const Pel *ptr3) + { + const __m256i val00 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) ptr0), cur); + const __m256i val10 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) ptr2), cur); + const __m256i val01 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) ptr1), cur); + const __m256i val11 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) ptr3), cur); - __m128i val01A = _mm_unpacklo_epi16(val00, val10); - __m128i val01B = _mm_unpackhi_epi16(val00, val10); - __m128i val01C = _mm_unpacklo_epi16(val01, val11); - __m128i val01D = _mm_unpackhi_epi16(val01, val11); + __m256i val01A = _mm256_unpacklo_epi16(val00, val10); + __m256i val01B = _mm256_unpackhi_epi16(val00, val10); + __m256i val01C = _mm256_unpacklo_epi16(val01, val11); + __m256i val01D = _mm256_unpackhi_epi16(val01, val11); - __m128i limit01A = params[0][1][i]; - __m128i limit01B = params[1][1][i]; + __m256i limit01A = params[0][1][i]; + __m256i limit01B = params[1][1][i]; - val01A = _mm_min_epi16(val01A, limit01A); - val01B = _mm_min_epi16(val01B, limit01B); - val01C = _mm_min_epi16(val01C, limit01A); - val01D = _mm_min_epi16(val01D, limit01B); + val01A = _mm256_min_epi16(val01A, limit01A); + val01B = _mm256_min_epi16(val01B, limit01B); + val01C = _mm256_min_epi16(val01C, limit01A); + val01D = _mm256_min_epi16(val01D, limit01B); - limit01A = _mm_sub_epi16(_mm_setzero_si128(), limit01A); - limit01B = _mm_sub_epi16(_mm_setzero_si128(), limit01B); + limit01A = _mm256_sub_epi16(_mm256_setzero_si256(), limit01A); + limit01B = _mm256_sub_epi16(_mm256_setzero_si256(), limit01B); - 
val01A = _mm_max_epi16(val01A, limit01A); - val01B = _mm_max_epi16(val01B, limit01B); - val01C = _mm_max_epi16(val01C, limit01A); - val01D = _mm_max_epi16(val01D, limit01B); + val01A = _mm256_max_epi16(val01A, limit01A); + val01B = _mm256_max_epi16(val01B, limit01B); + val01C = _mm256_max_epi16(val01C, limit01A); + val01D = _mm256_max_epi16(val01D, limit01B); - val01A = _mm_add_epi16(val01A, val01C); - val01B = _mm_add_epi16(val01B, val01D); + val01A = _mm256_add_epi16(val01A, val01C); + val01B = _mm256_add_epi16(val01B, val01D); - const __m128i coeff01A = params[0][0][i]; - const __m128i coeff01B = params[1][0][i]; + const __m256i coeff01A = params[0][0][i]; + const __m256i coeff01B = params[1][0][i]; - accumA = _mm_add_epi32(accumA, _mm_madd_epi16(val01A, coeff01A)); - accumB = _mm_add_epi32(accumB, _mm_madd_epi16(val01B, coeff01B)); - }; + accumA = _mm256_add_epi32(accumA, _mm256_madd_epi16(val01A, coeff01A)); + accumB = _mm256_add_epi32(accumB, _mm256_madd_epi16(val01B, coeff01B)); + }; #if JVET_AD0222_ALF_LONG_FIXFILTER && JVET_AD0222_ADDITONAL_ALF_FIXFILTER - process2coeffs(0, pImg7 + 0, pImg8 - 0, pImg5 + 0, pImg6 - 0); - process2coeffs(1, pImg3 + 0, pImg4 - 0, pImg1 + 1, pImg2 - 1); - process2coeffs(2, pImg1 + 0, pImg2 - 0, pImg1 - 1, pImg2 + 1); - process2coeffs(3, pImg0 + 4, pImg0 - 4, pImg0 + 3, pImg0 - 3); - process2coeffs(4, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1); - process2coeffs(5, pImg12FixedBased - 0, pImg11FixedBased + 0, pImg10FixedBased - 0, pImg9FixedBased + 0); - process2coeffs(6, pImg8FixedBased - 0, pImg7FixedBased + 0, pImg6FixedBased - 0, pImg5FixedBased + 0); - process2coeffs(7, pImg4FixedBased - 1, pImg3FixedBased + 1, pImg4FixedBased - 0, pImg3FixedBased + 0); - process2coeffs(8, pImg4FixedBased + 1, pImg3FixedBased - 1, pImg2FixedBased - 2, pImg1FixedBased + 2); - process2coeffs(9, pImg2FixedBased - 1, pImg1FixedBased + 1, pImg2FixedBased - 0, pImg1FixedBased + 0); - process2coeffs(10, pImg2FixedBased + 1, pImg1FixedBased - 1, 
pImg2FixedBased + 2, pImg1FixedBased - 2); - process2coeffs(11, pImg0FixedBased - 6, pImg0FixedBased + 6, pImg0FixedBased - 5, pImg0FixedBased + 5); - process2coeffs(12, pImg0FixedBased - 4, pImg0FixedBased + 4, pImg0FixedBased - 3, pImg0FixedBased + 3); - process2coeffs(13, pImg0FixedBased - 2, pImg0FixedBased + 2, pImg0FixedBased - 1, pImg0FixedBased + 1); - process2coeffs(14, pImg3Gauss[0] - 0, pImg4Gauss[0] + 0, pImg1Gauss[0] - 0, pImg2Gauss[0] + 0); - process2coeffs(15, pImg0Gauss[0] - 2, pImg0Gauss[0] + 2, pImg0Gauss[0] - 1, pImg0Gauss[0] + 1); + process2coeffs(0, pImg7 + 0, pImg8 - 0, pImg5 + 0, pImg6 - 0); + process2coeffs(1, pImg3 + 0, pImg4 - 0, pImg1 + 1, pImg2 - 1); + process2coeffs(2, pImg1 + 0, pImg2 - 0, pImg1 - 1, pImg2 + 1); + process2coeffs(3, pImg0 + 4, pImg0 - 4, pImg0 + 3, pImg0 - 3); + process2coeffs(4, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1); + process2coeffs(5, pImg12FixedBased - 0, pImg11FixedBased + 0, pImg10FixedBased - 0, pImg9FixedBased + 0); + process2coeffs(6, pImg8FixedBased - 0, pImg7FixedBased + 0, pImg6FixedBased - 0, pImg5FixedBased + 0); + process2coeffs(7, pImg4FixedBased - 1, pImg3FixedBased + 1, pImg4FixedBased - 0, pImg3FixedBased + 0); + process2coeffs(8, pImg4FixedBased + 1, pImg3FixedBased - 1, pImg2FixedBased - 2, pImg1FixedBased + 2); + process2coeffs(9, pImg2FixedBased - 1, pImg1FixedBased + 1, pImg2FixedBased - 0, pImg1FixedBased + 0); + process2coeffs(10, pImg2FixedBased + 1, pImg1FixedBased - 1, pImg2FixedBased + 2, pImg1FixedBased - 2); + process2coeffs(11, pImg0FixedBased - 6, pImg0FixedBased + 6, pImg0FixedBased - 5, pImg0FixedBased + 5); + process2coeffs(12, pImg0FixedBased - 4, pImg0FixedBased + 4, pImg0FixedBased - 3, pImg0FixedBased + 3); + process2coeffs(13, pImg0FixedBased - 2, pImg0FixedBased + 2, pImg0FixedBased - 1, pImg0FixedBased + 1); + process2coeffs(14, pImg3Gauss[0] - 0, pImg4Gauss[0] + 0, pImg1Gauss[0] - 0, pImg2Gauss[0] + 0); + process2coeffs(15, pImg0Gauss[0] - 2, pImg0Gauss[0] + 2, 
pImg0Gauss[0] - 1, pImg0Gauss[0] + 1); #elif JVET_AD0222_ALF_LONG_FIXFILTER - process2coeffs(0, pImg7 + 0, pImg8 - 0, pImg5 + 0, pImg6 - 0); - process2coeffs(1, pImg3 + 0, pImg4 - 0, pImg1 + 1, pImg2 - 1); - process2coeffs(2, pImg1 + 0, pImg2 - 0, pImg1 - 1, pImg2 + 1); - process2coeffs(3, pImg0 + 4, pImg0 - 4, pImg0 + 3, pImg0 - 3); - process2coeffs(4, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1); - process2coeffs(5, pImg12FixedBased - 0, pImg11FixedBased + 0, pImg10FixedBased - 0, pImg9FixedBased + 0); - process2coeffs(6, pImg8FixedBased - 0, pImg7FixedBased + 0, pImg6FixedBased - 0, pImg5FixedBased + 0); - process2coeffs(7, pImg4FixedBased - 1, pImg3FixedBased + 1, pImg4FixedBased - 0, pImg3FixedBased + 0); - process2coeffs(8, pImg4FixedBased + 1, pImg3FixedBased - 1, pImg2FixedBased - 2, pImg1FixedBased + 2); - process2coeffs(9, pImg2FixedBased - 1, pImg1FixedBased + 1, pImg2FixedBased - 0, pImg1FixedBased + 0); - process2coeffs(10, pImg2FixedBased + 1, pImg1FixedBased - 1, pImg2FixedBased + 2, pImg1FixedBased - 2); - process2coeffs(11, pImg0FixedBased - 6, pImg0FixedBased + 6, pImg0FixedBased - 5, pImg0FixedBased + 5); - process2coeffs(12, pImg0FixedBased - 4, pImg0FixedBased + 4, pImg0FixedBased - 3, pImg0FixedBased + 3); - process2coeffs(13, pImg0FixedBased - 2, pImg0FixedBased + 2, pImg0FixedBased - 1, pImg0FixedBased + 1); + process2coeffs(0, pImg7 + 0, pImg8 - 0, pImg5 + 0, pImg6 - 0); + process2coeffs(1, pImg3 + 0, pImg4 - 0, pImg1 + 1, pImg2 - 1); + process2coeffs(2, pImg1 + 0, pImg2 - 0, pImg1 - 1, pImg2 + 1); + process2coeffs(3, pImg0 + 4, pImg0 - 4, pImg0 + 3, pImg0 - 3); + process2coeffs(4, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1); + process2coeffs(5, pImg12FixedBased - 0, pImg11FixedBased + 0, pImg10FixedBased - 0, pImg9FixedBased + 0); + process2coeffs(6, pImg8FixedBased - 0, pImg7FixedBased + 0, pImg6FixedBased - 0, pImg5FixedBased + 0); + process2coeffs(7, pImg4FixedBased - 1, pImg3FixedBased + 1, pImg4FixedBased - 0, pImg3FixedBased + 0); + 
process2coeffs(8, pImg4FixedBased + 1, pImg3FixedBased - 1, pImg2FixedBased - 2, pImg1FixedBased + 2); + process2coeffs(9, pImg2FixedBased - 1, pImg1FixedBased + 1, pImg2FixedBased - 0, pImg1FixedBased + 0); + process2coeffs(10, pImg2FixedBased + 1, pImg1FixedBased - 1, pImg2FixedBased + 2, pImg1FixedBased - 2); + process2coeffs(11, pImg0FixedBased - 6, pImg0FixedBased + 6, pImg0FixedBased - 5, pImg0FixedBased + 5); + process2coeffs(12, pImg0FixedBased - 4, pImg0FixedBased + 4, pImg0FixedBased - 3, pImg0FixedBased + 3); + process2coeffs(13, pImg0FixedBased - 2, pImg0FixedBased + 2, pImg0FixedBased - 1, pImg0FixedBased + 1); #elif JVET_AD0222_ADDITONAL_ALF_FIXFILTER - process2coeffs(0, pImg11 + 0, pImg12 - 0, pImg9 + 0, pImg10 - 0); - process2coeffs(1, pImg7 + 0, pImg8 - 0, pImg5 + 0, pImg6 - 0); - process2coeffs(2, pImg3 + 2, pImg4 - 2, pImg3 + 1, pImg4 - 1); - process2coeffs(3, pImg3 + 0, pImg4 - 0, pImg3 - 1, pImg4 + 1); - process2coeffs(4, pImg3 - 2, pImg4 + 2, pImg1 + 2, pImg2 - 2); - process2coeffs(5, pImg1 + 1, pImg2 - 1, pImg1 + 0, pImg2 - 0); - process2coeffs(6, pImg1 - 1, pImg2 + 1, pImg1 - 2, pImg2 + 2); - process2coeffs(7, pImg0 + 6, pImg0 - 6, pImg0 + 5, pImg0 - 5); - process2coeffs(8, pImg0 + 4, pImg0 - 4, pImg0 + 3, pImg0 - 3); - process2coeffs(9, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1); - process2coeffs(10, pImg4FixedBased - 0, pImg3FixedBased + 0, pImg2FixedBased - 1, pImg1FixedBased + 1); - process2coeffs(11, pImg2FixedBased - 0, pImg1FixedBased + 0, pImg2FixedBased + 1, pImg1FixedBased - 1); - process2coeffs(12, pImg0FixedBased - 2, pImg0FixedBased + 2, pImg0FixedBased - 1, pImg0FixedBased + 1); - process2coeffs(13, pImg3Gauss[0] - 0, pImg4Gauss[0] + 0, pImg1Gauss[0] - 0, pImg2Gauss[0] + 0); - process2coeffs(14, pImg0Gauss[0] - 2, pImg0Gauss[0] + 2, pImg0Gauss[0] - 1, pImg0Gauss[0] + 1); -#else - process2coeffs(0, pImg11 + 0, pImg12 - 0, pImg9 + 0, pImg10 - 0); - process2coeffs(1, pImg7 + 0, pImg8 - 0, pImg5 + 0, pImg6 - 0); - 
process2coeffs(2, pImg3 + 2, pImg4 - 2, pImg3 + 1, pImg4 - 1); - process2coeffs(3, pImg3 + 0, pImg4 - 0, pImg3 - 1, pImg4 + 1); - process2coeffs(4, pImg3 - 2, pImg4 + 2, pImg1 + 2, pImg2 - 2); - process2coeffs(5, pImg1 + 1, pImg2 - 1, pImg1 + 0, pImg2 - 0); - process2coeffs(6, pImg1 - 1, pImg2 + 1, pImg1 - 2, pImg2 + 2); - process2coeffs(7, pImg0 + 6, pImg0 - 6, pImg0 + 5, pImg0 - 5); - process2coeffs(8, pImg0 + 4, pImg0 - 4, pImg0 + 3, pImg0 - 3); - process2coeffs(9, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1); + process2coeffs(0, pImg11 + 0, pImg12 - 0, pImg9 + 0, pImg10 - 0); + process2coeffs(1, pImg7 + 0, pImg8 - 0, pImg5 + 0, pImg6 - 0); + process2coeffs(2, pImg3 + 2, pImg4 - 2, pImg3 + 1, pImg4 - 1); + process2coeffs(3, pImg3 + 0, pImg4 - 0, pImg3 - 1, pImg4 + 1); + process2coeffs(4, pImg3 - 2, pImg4 + 2, pImg1 + 2, pImg2 - 2); + process2coeffs(5, pImg1 + 1, pImg2 - 1, pImg1 + 0, pImg2 - 0); + process2coeffs(6, pImg1 - 1, pImg2 + 1, pImg1 - 2, pImg2 + 2); + process2coeffs(7, pImg0 + 6, pImg0 - 6, pImg0 + 5, pImg0 - 5); + process2coeffs(8, pImg0 + 4, pImg0 - 4, pImg0 + 3, pImg0 - 3); + process2coeffs(9, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1); + process2coeffs(10, pImg4FixedBased - 0, pImg3FixedBased + 0, pImg2FixedBased - 1, pImg1FixedBased + 1); + process2coeffs(11, pImg2FixedBased - 0, pImg1FixedBased + 0, pImg2FixedBased + 1, pImg1FixedBased - 1); + process2coeffs(12, pImg0FixedBased - 2, pImg0FixedBased + 2, pImg0FixedBased - 1, pImg0FixedBased + 1); + process2coeffs(13, pImg3Gauss[0] - 0, pImg4Gauss[0] + 0, pImg1Gauss[0] - 0, pImg2Gauss[0] + 0); + process2coeffs(14, pImg0Gauss[0] - 2, pImg0Gauss[0] + 2, pImg0Gauss[0] - 1, pImg0Gauss[0] + 1); +#else + process2coeffs(0, pImg11 + 0, pImg12 - 0, pImg9 + 0, pImg10 - 0); + process2coeffs(1, pImg7 + 0, pImg8 - 0, pImg5 + 0, pImg6 - 0); + process2coeffs(2, pImg3 + 2, pImg4 - 2, pImg3 + 1, pImg4 - 1); + process2coeffs(3, pImg3 + 0, pImg4 - 0, pImg3 - 1, pImg4 + 1); + process2coeffs(4, pImg3 - 2, pImg4 + 2, 
pImg1 + 2, pImg2 - 2); + process2coeffs(5, pImg1 + 1, pImg2 - 1, pImg1 + 0, pImg2 - 0); + process2coeffs(6, pImg1 - 1, pImg2 + 1, pImg1 - 2, pImg2 + 2); + process2coeffs(7, pImg0 + 6, pImg0 - 6, pImg0 + 5, pImg0 - 5); + process2coeffs(8, pImg0 + 4, pImg0 - 4, pImg0 + 3, pImg0 - 3); + process2coeffs(9, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1); #if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS - process2coeffs(10, pImg4FixedBased - 0, pImg3FixedBased + 0, pImg2FixedBased - 1, pImg1FixedBased + 1); - process2coeffs(11, pImg2FixedBased - 0, pImg1FixedBased + 0, pImg2FixedBased + 1, pImg1FixedBased - 1); - process2coeffs(12, pImg0FixedBased - 2, pImg0FixedBased + 2, pImg0FixedBased - 1, pImg0FixedBased + 1); + process2coeffs(10, pImg4FixedBased - 0, pImg3FixedBased + 0, pImg2FixedBased - 1, pImg1FixedBased + 1); + process2coeffs(11, pImg2FixedBased - 0, pImg1FixedBased + 0, pImg2FixedBased + 1, pImg1FixedBased - 1); + process2coeffs(12, pImg0FixedBased - 2, pImg0FixedBased + 2, pImg0FixedBased - 1, pImg0FixedBased + 1); #endif #endif - pImg0 = srcBeforeDb + j; - pImg1 = pImg0 + srcBeforeDbStride; - pImg2 = pImg0 - srcBeforeDbStride; + pImg0 = srcBeforeDb + j; + pImg1 = pImg0 + srcBeforeDbStride; + pImg2 = pImg0 - srcBeforeDbStride; - pImgP0 = srcResi + j; + pImgP0 = srcResi + j; #if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS #if JVET_AD0222_ALF_LONG_FIXFILTER && JVET_AD0222_ADDITONAL_ALF_FIXFILTER - process2coeffs(16, pImg1 + 0, pImg2 + 0, pImg0 + 1, pImg0 - 1); + process2coeffs(16, pImg1 + 0, pImg2 + 0, pImg0 + 1, pImg0 - 1); #elif JVET_AD0222_ALF_LONG_FIXFILTER - process2coeffs(14, pImg1 + 0, pImg2 + 0, pImg0 + 1, pImg0 - 1); + process2coeffs(14, pImg1 + 0, pImg2 + 0, pImg0 + 1, pImg0 - 1); #elif JVET_AD0222_ADDITONAL_ALF_FIXFILTER - process2coeffs(15, pImg1 + 0, pImg2 + 0, pImg0 + 1, pImg0 - 1); + process2coeffs(15, pImg1 + 0, pImg2 + 0, pImg0 + 1, pImg0 - 1); #else - process2coeffs(13, pImg1 + 0, pImg2 + 0, pImg0 + 1, pImg0 - 1); + process2coeffs(13, pImg1 + 
0, pImg2 + 0, pImg0 + 1, pImg0 - 1); #endif #else - process2coeffs(10, pImg1 + 0, pImg2 + 0, pImg0 + 1, pImg0 - 1); + process2coeffs(10, pImg1 + 0, pImg2 + 0, pImg0 + 1, pImg0 - 1); #endif #if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS - __m128i val00 = _mm_sub_epi16(_mm_loadu_si128((const __m128i *)(fixedFilterResults[0 + fixedFilterSetIdx][blkDst.y + i + padSize] + blkDst.x + j + padSize)), - cur); - __m128i val10 = _mm_sub_epi16(_mm_loadu_si128((const __m128i *)(fixedFilterResults[2 + fixedFilterSetIdx][blkDst.y + i + padSize] + blkDst.x + j + padSize)), - cur); + __m256i val00 = _mm256_sub_epi16( _mm256_loadu_si256((const __m256i *) (fixedFilterResults[0 + fixedFilterSetIdx][blkDst.y + i + padSize] + blkDst.x + j + padSize)), cur); + __m256i val10 = _mm256_sub_epi16( _mm256_loadu_si256((const __m256i *) (fixedFilterResults[2 + fixedFilterSetIdx][blkDst.y + i + padSize] + blkDst.x + j + padSize)), cur); #else - __m128i val00 = _mm_sub_epi16(_mm_loadu_si128((const __m128i *) pImg0), cur); - __m128i val10 = _mm_sub_epi16( - _mm_loadu_si128((const __m128i *) (fixedFilterResults[fixedFilterSetIdx][blkDst.y + i] + blkDst.x + j)), cur); + __m256i val00 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) pImg0), cur); + __m256i val10 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) (fixedFilterResults[fixedFilterSetIdx][blkDst.y + i] + blkDst.x + j)), cur); #endif - __m128i val01A = _mm_unpacklo_epi16(val00, val10); - __m128i val01B = _mm_unpackhi_epi16(val00, val10); + __m256i val01A = _mm256_unpacklo_epi16(val00, val10); + __m256i val01B = _mm256_unpackhi_epi16(val00, val10); #if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS #if JVET_AD0222_ALF_LONG_FIXFILTER && JVET_AD0222_ADDITONAL_ALF_FIXFILTER - __m128i limit01A = params[0][1][17]; - __m128i limit01B = params[1][1][17]; + __m256i limit01A = params[0][1][17]; + __m256i limit01B = params[1][1][17]; #elif JVET_AD0222_ALF_LONG_FIXFILTER - __m128i limit01A = params[0][1][15]; - __m128i limit01B =
params[1][1][15]; + __m256i limit01A = params[0][1][15]; + __m256i limit01B = params[1][1][15]; #elif JVET_AD0222_ADDITONAL_ALF_FIXFILTER - __m128i limit01A = params[0][1][16]; - __m128i limit01B = params[1][1][16]; + __m256i limit01A = params[0][1][16]; + __m256i limit01B = params[1][1][16]; #else - __m128i limit01A = params[0][1][14]; - __m128i limit01B = params[1][1][14]; + __m256i limit01A = params[0][1][14]; + __m256i limit01B = params[1][1][14]; #endif #else - __m128i limit01A = params[0][1][11]; - __m128i limit01B = params[1][1][11]; + __m256i limit01A = params[0][1][11]; + __m256i limit01B = params[1][1][11]; #endif - val01A = _mm_min_epi16(val01A, limit01A); - val01B = _mm_min_epi16(val01B, limit01B); - limit01A = _mm_sub_epi16(_mm_setzero_si128(), limit01A); - limit01B = _mm_sub_epi16(_mm_setzero_si128(), limit01B); - val01A = _mm_max_epi16(val01A, limit01A); - val01B = _mm_max_epi16(val01B, limit01B); + val01A = _mm256_min_epi16(val01A, limit01A); + val01B = _mm256_min_epi16(val01B, limit01B); + limit01A = _mm256_sub_epi16(_mm256_setzero_si256(), limit01A); + limit01B = _mm256_sub_epi16(_mm256_setzero_si256(), limit01B); + val01A = _mm256_max_epi16(val01A, limit01A); + val01B = _mm256_max_epi16(val01B, limit01B); #if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS #if JVET_AD0222_ALF_LONG_FIXFILTER && JVET_AD0222_ADDITONAL_ALF_FIXFILTER - __m128i coeff01A = params[0][0][17]; - __m128i coeff01B = params[1][0][17]; + __m256i coeff01A = params[0][0][17]; + __m256i coeff01B = params[1][0][17]; #elif JVET_AD0222_ALF_LONG_FIXFILTER - __m128i coeff01A = params[0][0][15]; - __m128i coeff01B = params[1][0][15]; + __m256i coeff01A = params[0][0][15]; + __m256i coeff01B = params[1][0][15]; #elif JVET_AD0222_ADDITONAL_ALF_FIXFILTER - __m128i coeff01A = params[0][0][16]; - __m128i coeff01B = params[1][0][16]; + __m256i coeff01A = params[0][0][16]; + __m256i coeff01B = params[1][0][16]; #else - __m128i coeff01A = params[0][0][14]; - __m128i coeff01B = params[1][0][14]; 
+ __m256i coeff01A = params[0][0][14]; + __m256i coeff01B = params[1][0][14]; #endif #else - __m128i coeff01A = params[0][0][11]; - __m128i coeff01B = params[1][0][11]; + __m256i coeff01A = params[0][0][11]; + __m256i coeff01B = params[1][0][11]; #endif - accumA = _mm_add_epi32(accumA, _mm_madd_epi16(val01A, coeff01A)); - accumB = _mm_add_epi32(accumB, _mm_madd_epi16(val01B, coeff01B)); + accumA = _mm256_add_epi32(accumA, _mm256_madd_epi16(val01A, coeff01A)); + accumB = _mm256_add_epi32(accumB, _mm256_madd_epi16(val01B, coeff01B)); - // start prediction fixed filter - __m128i zero = _mm_setzero_si128(); - val00 = _mm_sub_epi16(_mm_loadu_si128((const __m128i *) (pImg0)), cur); - val10 = _mm_sub_epi16(_mm_loadu_si128((const __m128i *) pImgP0), zero); - val01A = _mm_unpacklo_epi16(val00, val10); - val01B = _mm_unpackhi_epi16(val00, val10); + // start prediction fixed filter + __m256i zero = _mm256_setzero_si256(); + val00 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) (pImg0)), cur); + val10 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) pImgP0), zero); + val01A = _mm256_unpacklo_epi16(val00, val10); + val01B = _mm256_unpackhi_epi16(val00, val10); #if JVET_AD0222_ALF_LONG_FIXFILTER && JVET_AD0222_ADDITONAL_ALF_FIXFILTER - limit01A = params[0][1][18]; - limit01B = params[1][1][18]; + limit01A = params[0][1][18]; + limit01B = params[1][1][18]; #elif JVET_AD0222_ALF_LONG_FIXFILTER - limit01A = params[0][1][16]; - limit01B = params[1][1][16]; + limit01A = params[0][1][16]; + limit01B = params[1][1][16]; #elif JVET_AD0222_ADDITONAL_ALF_FIXFILTER - limit01A = params[0][1][17]; - limit01B = params[1][1][17]; + limit01A = params[0][1][17]; + limit01B = params[1][1][17]; #else - limit01A = params[0][1][15]; - limit01B = params[1][1][15]; + limit01A = params[0][1][15]; + limit01B = params[1][1][15]; #endif - val01A = _mm_min_epi16(val01A, limit01A); - val01B = _mm_min_epi16(val01B, limit01B); - limit01A = _mm_sub_epi16(_mm_setzero_si128(), limit01A); + val01A 
= _mm256_min_epi16(val01A, limit01A); + val01B = _mm256_min_epi16(val01B, limit01B); + limit01A = _mm256_sub_epi16(_mm256_setzero_si256(), limit01A); + limit01B = _mm256_sub_epi16(_mm256_setzero_si256(), limit01B); + val01A = _mm256_max_epi16(val01A, limit01A); + val01B = _mm256_max_epi16(val01B, limit01B); +#if JVET_AD0222_ALF_LONG_FIXFILTER && JVET_AD0222_ADDITONAL_ALF_FIXFILTER + coeff01A = params[0][0][18]; + coeff01B = params[1][0][18]; +#elif JVET_AD0222_ALF_LONG_FIXFILTER + coeff01A = params[0][0][16]; + coeff01B = params[1][0][16]; +#elif JVET_AD0222_ADDITONAL_ALF_FIXFILTER + coeff01A = params[0][0][17]; + coeff01B = params[1][0][17]; +#else + coeff01A = params[0][0][15]; + coeff01B = params[1][0][15]; +#endif + + accumA = _mm256_add_epi32(accumA, _mm256_madd_epi16(val01A, coeff01A)); + accumB = _mm256_add_epi32(accumB, _mm256_madd_epi16(val01B, coeff01B)); + // end prediction fixed filter +#if JVET_AD0222_ADDITONAL_ALF_FIXFILTER + val00 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) (pImg0Gauss[0])), cur); + val10 = _mm256_setzero_si256(); + val01A = _mm256_unpacklo_epi16(val00, val10); + val01B = _mm256_unpackhi_epi16(val00, val10); +#if JVET_AD0222_ALF_LONG_FIXFILTER + limit01A = params[0][1][19]; + limit01B = params[1][1][19]; +#else + limit01A = params[0][1][18]; + limit01B = params[1][1][18]; +#endif + + val01A = _mm256_min_epi16(val01A, limit01A); + val01B = _mm256_min_epi16(val01B, limit01B); + limit01A = _mm256_sub_epi16(_mm256_setzero_si256(), limit01A); + limit01B = _mm256_sub_epi16(_mm256_setzero_si256(), limit01B); + val01A = _mm256_max_epi16(val01A, limit01A); + val01B = _mm256_max_epi16(val01B, limit01B); +#if JVET_AD0222_ALF_LONG_FIXFILTER + coeff01A = params[0][0][19]; + coeff01B = params[1][0][19]; +#else + coeff01A = params[0][0][18]; + coeff01B = params[1][0][18]; +#endif + accumA = _mm256_add_epi32(accumA, _mm256_madd_epi16(val01A, coeff01A)); + accumB = _mm256_add_epi32(accumB, _mm256_madd_epi16(val01B, coeff01B)); +#endif + 
accumA = _mm256_srai_epi32(accumA, shift); + accumB = _mm256_srai_epi32(accumB, shift); + + accumA = _mm256_packs_epi32(accumA, accumB); +#if JVET_AI0084_ALF_RESIDUALS_SCALING + if( bScalingCorr ) + { + accumA = _mm256_add_epi16(accumA, curBase); + } + else +#endif + accumA = _mm256_add_epi16(accumA, cur); + accumA = _mm256_min_epi16(mmMax, _mm256_max_epi16(accumA, mmMin)); + + _mm256_storeu_si256((__m256i *) (dst + j), accumA); + } // for j + src += srcStride * stepY; + dst += dstStride * stepY; + srcBeforeDb += srcBeforeDbStride * stepY; + srcResi += srcResiStride * stepY; + } // for i + } + else + { + + const __m128i mmOffset = _mm_set1_epi32(round); + const __m128i mmMin = _mm_set1_epi16(clpRng.min); + const __m128i mmMax = _mm_set1_epi16(clpRng.max); +#if JVET_AI0084_ALF_RESIDUALS_SCALING + const __m128i curBase = _mm_set1_epi16(currBase); +#endif +#endif//Use AVX2 SIMD + for (size_t i = 0; i < height; i += stepY) + { + const AlfClassifier *pClass = classifier[blkDst.y + i] + blkDst.x; + for (size_t j = 0; j < width; j += stepX) + { +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS +#if JVET_AD0222_ALF_LONG_FIXFILTER && JVET_AD0222_ADDITONAL_ALF_FIXFILTER + __m128i params[2][2][20]; +#elif JVET_AD0222_ALF_LONG_FIXFILTER + __m128i params[2][2][17]; +#elif JVET_AD0222_ADDITONAL_ALF_FIXFILTER + __m128i params[2][2][19]; +#else + __m128i params[2][2][16]; +#endif +#else + __m128i params[2][2][13]; +#endif + for (int k = 0; k < 2; k++) + { +#if JVET_AD0222_ALF_LONG_FIXFILTER || JVET_AD0222_ADDITONAL_ALF_FIXFILTER + __m128i rawCoef[4][5], rawClip[4][5], s0, s1, s2, s3, rawTmp0, rawTmp1; +#else + __m128i rawCoef[4][4], rawClip[4][4], s0, s1, s2, s3, rawTmp0, rawTmp1; +#endif + for (int l = 0; l < 4; l++) + { + const int transposeIdx = pClass[j + 4 * k + l] & 0x3; + const int classIdx = pClass[j + 4 * k + l] >> 2; + + rawCoef[l][0] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx * MAX_NUM_ALF_LUMA_COEFF)); + rawCoef[l][1] = _mm_loadu_si128((const __m128i *) 
(filterSet + classIdx * MAX_NUM_ALF_LUMA_COEFF + 8)); + rawCoef[l][2] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx * MAX_NUM_ALF_LUMA_COEFF + 16)); + rawCoef[l][3] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx * MAX_NUM_ALF_LUMA_COEFF + 24)); +#if JVET_AD0222_ALF_LONG_FIXFILTER || JVET_AD0222_ADDITONAL_ALF_FIXFILTER + rawCoef[l][4] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx * MAX_NUM_ALF_LUMA_COEFF + 32)); +#endif + rawClip[l][0] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx * MAX_NUM_ALF_LUMA_COEFF)); + rawClip[l][1] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx * MAX_NUM_ALF_LUMA_COEFF + 8)); + rawClip[l][2] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx * MAX_NUM_ALF_LUMA_COEFF + 16)); + rawClip[l][3] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx * MAX_NUM_ALF_LUMA_COEFF + 24)); +#if JVET_AD0222_ALF_LONG_FIXFILTER || JVET_AD0222_ADDITONAL_ALF_FIXFILTER + rawClip[l][4] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx * MAX_NUM_ALF_LUMA_COEFF + 32)); +#endif +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + for (int m = 0; m < shuffleTime13FixedBasedLongLength[transposeIdx]; m++) +#else + for (int m = 0; m < shuffleTime13LongLength[transposeIdx]; m++) +#endif + { +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + int op0 = shuffleOp13FixedBasedLongLength[transposeIdx][m][0]; + int op1 = shuffleOp13FixedBasedLongLength[transposeIdx][m][1]; +#else + int op0 = shuffleOp13LongLength[transposeIdx][m][0]; + int op1 = shuffleOp13LongLength[transposeIdx][m][1]; +#endif +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + s0 = _mm_loadu_si128((const __m128i *) shuffleTab13FixedBasedLongLength[transposeIdx][m][0]); + s1 = _mm_xor_si128(s0, _mm_set1_epi8((char) 0x80)); + s2 = _mm_loadu_si128((const __m128i *) shuffleTab13FixedBasedLongLength[transposeIdx][m][1]); + s3 = _mm_xor_si128(s2, _mm_set1_epi8((char) 0x80)); +#else + s0 = _mm_loadu_si128((const __m128i *) 
shuffleTab13LongLength[transposeIdx][m][0]); + s1 = _mm_xor_si128(s0, _mm_set1_epi8((char) 0x80)); + s2 = _mm_loadu_si128((const __m128i *) shuffleTab13LongLength[transposeIdx][m][1]); + s3 = _mm_xor_si128(s2, _mm_set1_epi8((char) 0x80)); +#endif + + rawTmp0 = _mm_or_si128(_mm_shuffle_epi8(rawCoef[l][op0], s0), _mm_shuffle_epi8(rawCoef[l][op1], s1)); + rawTmp1 = _mm_or_si128(_mm_shuffle_epi8(rawCoef[l][op0], s2), _mm_shuffle_epi8(rawCoef[l][op1], s3)); + rawCoef[l][op0] = rawTmp0; + rawCoef[l][op1] = rawTmp1; + + rawTmp0 = _mm_or_si128(_mm_shuffle_epi8(rawClip[l][op0], s0), _mm_shuffle_epi8(rawClip[l][op1], s1)); + rawTmp1 = _mm_or_si128(_mm_shuffle_epi8(rawClip[l][op0], s2), _mm_shuffle_epi8(rawClip[l][op1], s3)); + rawClip[l][op0] = rawTmp0; + rawClip[l][op1] = rawTmp1; + } + } // for l +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + int limR, lim0, lim1, lim2, lim3; +#if JVET_AD0222_ALF_LONG_FIXFILTER && JVET_AD0222_ADDITONAL_ALF_FIXFILTER + limR = 5, lim0 = 5, lim1 = 5, lim2 = 5, lim3 = 5; +#elif JVET_AD0222_ALF_LONG_FIXFILTER + limR = 5, lim0 = 5, lim1 = 4, lim2 = 4, lim3 = 4; +#elif JVET_AD0222_ADDITONAL_ALF_FIXFILTER + limR = 5, lim0 = 5, lim1 = 5, lim2 = 5, lim3 = 4; +#else + limR = 4, lim0 = 4, lim1 = 4, lim2 = 4, lim3 = 4; +#endif + for (unsigned char l = 0; l < limR; l++) +#else + for (unsigned char l = 0; l < 4; l++) +#endif + { + int m = l << 2; +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + if (l < lim0) + { +#endif + s0 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawCoef[0][l], 0x00), _mm_shuffle_epi32(rawCoef[1][l], 0x00)); + s1 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawCoef[2][l], 0x00), _mm_shuffle_epi32(rawCoef[3][l], 0x00)); + params[k][0][0 + m] = _mm_blend_epi16(_mm_shuffle_epi32(s0, 0x88), _mm_shuffle_epi32(s1, 0x88), 0xf0); + s0 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawClip[0][l], 0x00), _mm_shuffle_epi32(rawClip[1][l], 0x00)); + s1 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawClip[2][l], 0x00), _mm_shuffle_epi32(rawClip[3][l], 0x00)); + 
params[k][1][0 + m] = _mm_blend_epi16(_mm_shuffle_epi32(s0, 0x88), _mm_shuffle_epi32(s1, 0x88), 0xf0); +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + } + if (l < lim1) + { +#endif + s0 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawCoef[0][l], 0x55), _mm_shuffle_epi32(rawCoef[1][l], 0x55)); + s1 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawCoef[2][l], 0x55), _mm_shuffle_epi32(rawCoef[3][l], 0x55)); + params[k][0][1 + m] = _mm_blend_epi16(_mm_shuffle_epi32(s0, 0x88), _mm_shuffle_epi32(s1, 0x88), 0xf0); + s0 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawClip[0][l], 0x55), _mm_shuffle_epi32(rawClip[1][l], 0x55)); + s1 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawClip[2][l], 0x55), _mm_shuffle_epi32(rawClip[3][l], 0x55)); + params[k][1][1 + m] = _mm_blend_epi16(_mm_shuffle_epi32(s0, 0x88), _mm_shuffle_epi32(s1, 0x88), 0xf0); +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + } + if (l < lim2) + { +#endif + s0 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawCoef[0][l], 0xaa), _mm_shuffle_epi32(rawCoef[1][l], 0xaa)); + s1 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawCoef[2][l], 0xaa), _mm_shuffle_epi32(rawCoef[3][l], 0xaa)); + params[k][0][2 + m] = _mm_blend_epi16(_mm_shuffle_epi32(s0, 0x88), _mm_shuffle_epi32(s1, 0x88), 0xf0); + s0 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawClip[0][l], 0xaa), _mm_shuffle_epi32(rawClip[1][l], 0xaa)); + s1 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawClip[2][l], 0xaa), _mm_shuffle_epi32(rawClip[3][l], 0xaa)); + params[k][1][2 + m] = _mm_blend_epi16(_mm_shuffle_epi32(s0, 0x88), _mm_shuffle_epi32(s1, 0x88), 0xf0); +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + } + if (l < lim3) + { +#endif + s0 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawCoef[0][l], 0xff), _mm_shuffle_epi32(rawCoef[1][l], 0xff)); + s1 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawCoef[2][l], 0xff), _mm_shuffle_epi32(rawCoef[3][l], 0xff)); + params[k][0][3 + m] = _mm_blend_epi16(_mm_shuffle_epi32(s0, 0x88), _mm_shuffle_epi32(s1, 0x88), 0xf0); + s0 = 
_mm_unpacklo_epi64(_mm_shuffle_epi32(rawClip[0][l], 0xff), _mm_shuffle_epi32(rawClip[1][l], 0xff)); + s1 = _mm_unpacklo_epi64(_mm_shuffle_epi32(rawClip[2][l], 0xff), _mm_shuffle_epi32(rawClip[3][l], 0xff)); + params[k][1][3 + m] = _mm_blend_epi16(_mm_shuffle_epi32(s0, 0x88), _mm_shuffle_epi32(s1, 0x88), 0xf0); +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + } +#endif + } // for l + } // for k + + const Pel *pImg0, *pImg1, *pImg2, *pImg3, *pImg4, *pImg5, *pImg6, *pImg7, *pImg8; +#if !JVET_AD0222_ALF_LONG_FIXFILTER + const Pel *pImg9, *pImg10, *pImg11, *pImg12; +#endif + const Pel *pImgP0; + + pImg0 = src + j; + pImg1 = pImg0 + srcStride; + pImg2 = pImg0 - srcStride; + pImg3 = pImg1 + srcStride; + pImg4 = pImg2 - srcStride; + pImg5 = pImg3 + srcStride; + pImg6 = pImg4 - srcStride; + pImg7 = pImg5 + srcStride; + pImg8 = pImg6 - srcStride; +#if !JVET_AD0222_ALF_LONG_FIXFILTER + pImg9 = pImg7 + srcStride; + pImg10 = pImg8 - srcStride; + pImg11 = pImg9 + srcStride; + pImg12 = pImg10 - srcStride; +#endif +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS +#if JVET_AE0139_ALF_IMPROVED_FIXFILTER + int filterSetIdx = 2 + fixedFilterSetIdx; +#else + int filterSetIdx = 0 + fixedFilterSetIdx; +#endif + const Pel *pImg0FixedBased, *pImg1FixedBased, *pImg2FixedBased, *pImg3FixedBased, *pImg4FixedBased; +#if JVET_AD0222_ALF_LONG_FIXFILTER + const Pel *pImg5FixedBased, *pImg6FixedBased, *pImg7FixedBased, *pImg8FixedBased, *pImg9FixedBased, *pImg10FixedBased, *pImg11FixedBased, *pImg12FixedBased; +#endif + if (isFixedFilterPaddedPerCtu) + { + pImg0FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 0] + j + padSize; + pImg1FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 1] + j + padSize; + pImg2FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 1] + j + padSize; + pImg3FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 2] + j + padSize; + pImg4FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 2] + j + 
padSize; +#if JVET_AD0222_ALF_LONG_FIXFILTER + pImg5FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 3] + j + padSize; + pImg6FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 3] + j + padSize; + pImg7FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 4] + j + padSize; + pImg8FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 4] + j + padSize; + pImg9FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 5] + j + padSize; + pImg10FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 5] + j + padSize; + pImg11FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 6] + j + padSize; + pImg12FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 6] + j + padSize; +#endif + } + else + { + pImg0FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 0] + blkDst.x + j + padSize; + pImg1FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 1] + blkDst.x + j + padSize; + pImg2FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 1] + blkDst.x + j + padSize; + pImg3FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 2] + blkDst.x + j + padSize; + pImg4FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 2] + blkDst.x + j + padSize; +#if JVET_AD0222_ALF_LONG_FIXFILTER + pImg5FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 3] + blkDst.x + j + padSize; + pImg6FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 3] + blkDst.x + j + padSize; + pImg7FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 4] + blkDst.x + j + padSize; + pImg8FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 4] + blkDst.x + j + padSize; + pImg9FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 5] + blkDst.x + j + padSize; + pImg10FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 5] + blkDst.x + 
j + padSize; + pImg11FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 6] + blkDst.x + j + padSize; + pImg12FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 6] + blkDst.x + j + padSize; +#endif + } +#endif +#if JVET_AD0222_ADDITONAL_ALF_FIXFILTER + const Pel *pImg0Gauss[NUM_GAUSS_FILTERED_SOURCE]; + const Pel *pImg1Gauss[NUM_GAUSS_FILTERED_SOURCE], *pImg2Gauss[NUM_GAUSS_FILTERED_SOURCE]; + const Pel *pImg3Gauss[NUM_GAUSS_FILTERED_SOURCE], *pImg4Gauss[NUM_GAUSS_FILTERED_SOURCE]; + + for( int gaussIdx = 0; gaussIdx < NUM_GAUSS_FILTERED_SOURCE; gaussIdx++ ) + { + if( isFixedFilterPaddedPerCtu ) + { + pImg0Gauss[gaussIdx] = gaussCtu[gaussIdx][i + padSizeGauss + 0] + j + padSizeGauss; + pImg1Gauss[gaussIdx] = gaussCtu[gaussIdx][i + padSizeGauss + 1] + j + padSizeGauss; + pImg2Gauss[gaussIdx] = gaussCtu[gaussIdx][i + padSizeGauss - 1] + j + padSizeGauss; + pImg3Gauss[gaussIdx] = gaussCtu[gaussIdx][i + padSizeGauss + 2] + j + padSizeGauss; + pImg4Gauss[gaussIdx] = gaussCtu[gaussIdx][i + padSizeGauss - 2] + j + padSizeGauss; + } + else + { + pImg0Gauss[gaussIdx] = gaussPic[gaussIdx][blkDst.y + i + padSizeGauss + 0] + blkDst.x + j + padSizeGauss; + pImg1Gauss[gaussIdx] = gaussPic[gaussIdx][blkDst.y + i + padSizeGauss + 1] + blkDst.x + j + padSizeGauss; + pImg2Gauss[gaussIdx] = gaussPic[gaussIdx][blkDst.y + i + padSizeGauss - 1] + blkDst.x + j + padSizeGauss; + pImg3Gauss[gaussIdx] = gaussPic[gaussIdx][blkDst.y + i + padSizeGauss + 2] + blkDst.x + j + padSizeGauss; + pImg4Gauss[gaussIdx] = gaussPic[gaussIdx][blkDst.y + i + padSizeGauss - 2] + blkDst.x + j + padSizeGauss; + } + } +#endif + __m128i cur = _mm_loadu_si128((const __m128i *) pImg0); + __m128i accumA = mmOffset; + __m128i accumB = mmOffset; + + auto process2coeffs = [&](const int i, const Pel *ptr0, const Pel *ptr1, const Pel *ptr2, const Pel *ptr3) + { + const __m128i val00 = _mm_sub_epi16(_mm_loadu_si128((const __m128i *) ptr0), cur); + const __m128i val10 = 
_mm_sub_epi16(_mm_loadu_si128((const __m128i *) ptr2), cur); + const __m128i val01 = _mm_sub_epi16(_mm_loadu_si128((const __m128i *) ptr1), cur); + const __m128i val11 = _mm_sub_epi16(_mm_loadu_si128((const __m128i *) ptr3), cur); + + __m128i val01A = _mm_unpacklo_epi16(val00, val10); + __m128i val01B = _mm_unpackhi_epi16(val00, val10); + __m128i val01C = _mm_unpacklo_epi16(val01, val11); + __m128i val01D = _mm_unpackhi_epi16(val01, val11); + + __m128i limit01A = params[0][1][i]; + __m128i limit01B = params[1][1][i]; + + val01A = _mm_min_epi16(val01A, limit01A); + val01B = _mm_min_epi16(val01B, limit01B); + val01C = _mm_min_epi16(val01C, limit01A); + val01D = _mm_min_epi16(val01D, limit01B); + + limit01A = _mm_sub_epi16(_mm_setzero_si128(), limit01A); + limit01B = _mm_sub_epi16(_mm_setzero_si128(), limit01B); + + val01A = _mm_max_epi16(val01A, limit01A); + val01B = _mm_max_epi16(val01B, limit01B); + val01C = _mm_max_epi16(val01C, limit01A); + val01D = _mm_max_epi16(val01D, limit01B); + + val01A = _mm_add_epi16(val01A, val01C); + val01B = _mm_add_epi16(val01B, val01D); + + const __m128i coeff01A = params[0][0][i]; + const __m128i coeff01B = params[1][0][i]; + + accumA = _mm_add_epi32(accumA, _mm_madd_epi16(val01A, coeff01A)); + accumB = _mm_add_epi32(accumB, _mm_madd_epi16(val01B, coeff01B)); + }; +#if JVET_AD0222_ALF_LONG_FIXFILTER && JVET_AD0222_ADDITONAL_ALF_FIXFILTER + process2coeffs(0, pImg7 + 0, pImg8 - 0, pImg5 + 0, pImg6 - 0); + process2coeffs(1, pImg3 + 0, pImg4 - 0, pImg1 + 1, pImg2 - 1); + process2coeffs(2, pImg1 + 0, pImg2 - 0, pImg1 - 1, pImg2 + 1); + process2coeffs(3, pImg0 + 4, pImg0 - 4, pImg0 + 3, pImg0 - 3); + process2coeffs(4, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1); + process2coeffs(5, pImg12FixedBased - 0, pImg11FixedBased + 0, pImg10FixedBased - 0, pImg9FixedBased + 0); + process2coeffs(6, pImg8FixedBased - 0, pImg7FixedBased + 0, pImg6FixedBased - 0, pImg5FixedBased + 0); + process2coeffs(7, pImg4FixedBased - 1, pImg3FixedBased + 1, 
pImg4FixedBased - 0, pImg3FixedBased + 0); + process2coeffs(8, pImg4FixedBased + 1, pImg3FixedBased - 1, pImg2FixedBased - 2, pImg1FixedBased + 2); + process2coeffs(9, pImg2FixedBased - 1, pImg1FixedBased + 1, pImg2FixedBased - 0, pImg1FixedBased + 0); + process2coeffs(10, pImg2FixedBased + 1, pImg1FixedBased - 1, pImg2FixedBased + 2, pImg1FixedBased - 2); + process2coeffs(11, pImg0FixedBased - 6, pImg0FixedBased + 6, pImg0FixedBased - 5, pImg0FixedBased + 5); + process2coeffs(12, pImg0FixedBased - 4, pImg0FixedBased + 4, pImg0FixedBased - 3, pImg0FixedBased + 3); + process2coeffs(13, pImg0FixedBased - 2, pImg0FixedBased + 2, pImg0FixedBased - 1, pImg0FixedBased + 1); + process2coeffs(14, pImg3Gauss[0] - 0, pImg4Gauss[0] + 0, pImg1Gauss[0] - 0, pImg2Gauss[0] + 0); + process2coeffs(15, pImg0Gauss[0] - 2, pImg0Gauss[0] + 2, pImg0Gauss[0] - 1, pImg0Gauss[0] + 1); +#elif JVET_AD0222_ALF_LONG_FIXFILTER + process2coeffs(0, pImg7 + 0, pImg8 - 0, pImg5 + 0, pImg6 - 0); + process2coeffs(1, pImg3 + 0, pImg4 - 0, pImg1 + 1, pImg2 - 1); + process2coeffs(2, pImg1 + 0, pImg2 - 0, pImg1 - 1, pImg2 + 1); + process2coeffs(3, pImg0 + 4, pImg0 - 4, pImg0 + 3, pImg0 - 3); + process2coeffs(4, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1); + process2coeffs(5, pImg12FixedBased - 0, pImg11FixedBased + 0, pImg10FixedBased - 0, pImg9FixedBased + 0); + process2coeffs(6, pImg8FixedBased - 0, pImg7FixedBased + 0, pImg6FixedBased - 0, pImg5FixedBased + 0); + process2coeffs(7, pImg4FixedBased - 1, pImg3FixedBased + 1, pImg4FixedBased - 0, pImg3FixedBased + 0); + process2coeffs(8, pImg4FixedBased + 1, pImg3FixedBased - 1, pImg2FixedBased - 2, pImg1FixedBased + 2); + process2coeffs(9, pImg2FixedBased - 1, pImg1FixedBased + 1, pImg2FixedBased - 0, pImg1FixedBased + 0); + process2coeffs(10, pImg2FixedBased + 1, pImg1FixedBased - 1, pImg2FixedBased + 2, pImg1FixedBased - 2); + process2coeffs(11, pImg0FixedBased - 6, pImg0FixedBased + 6, pImg0FixedBased - 5, pImg0FixedBased + 5); + process2coeffs(12, 
pImg0FixedBased - 4, pImg0FixedBased + 4, pImg0FixedBased - 3, pImg0FixedBased + 3); + process2coeffs(13, pImg0FixedBased - 2, pImg0FixedBased + 2, pImg0FixedBased - 1, pImg0FixedBased + 1); +#elif JVET_AD0222_ADDITONAL_ALF_FIXFILTER + process2coeffs(0, pImg11 + 0, pImg12 - 0, pImg9 + 0, pImg10 - 0); + process2coeffs(1, pImg7 + 0, pImg8 - 0, pImg5 + 0, pImg6 - 0); + process2coeffs(2, pImg3 + 2, pImg4 - 2, pImg3 + 1, pImg4 - 1); + process2coeffs(3, pImg3 + 0, pImg4 - 0, pImg3 - 1, pImg4 + 1); + process2coeffs(4, pImg3 - 2, pImg4 + 2, pImg1 + 2, pImg2 - 2); + process2coeffs(5, pImg1 + 1, pImg2 - 1, pImg1 + 0, pImg2 - 0); + process2coeffs(6, pImg1 - 1, pImg2 + 1, pImg1 - 2, pImg2 + 2); + process2coeffs(7, pImg0 + 6, pImg0 - 6, pImg0 + 5, pImg0 - 5); + process2coeffs(8, pImg0 + 4, pImg0 - 4, pImg0 + 3, pImg0 - 3); + process2coeffs(9, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1); + process2coeffs(10, pImg4FixedBased - 0, pImg3FixedBased + 0, pImg2FixedBased - 1, pImg1FixedBased + 1); + process2coeffs(11, pImg2FixedBased - 0, pImg1FixedBased + 0, pImg2FixedBased + 1, pImg1FixedBased - 1); + process2coeffs(12, pImg0FixedBased - 2, pImg0FixedBased + 2, pImg0FixedBased - 1, pImg0FixedBased + 1); + process2coeffs(13, pImg3Gauss[0] - 0, pImg4Gauss[0] + 0, pImg1Gauss[0] - 0, pImg2Gauss[0] + 0); + process2coeffs(14, pImg0Gauss[0] - 2, pImg0Gauss[0] + 2, pImg0Gauss[0] - 1, pImg0Gauss[0] + 1); +#else + process2coeffs(0, pImg11 + 0, pImg12 - 0, pImg9 + 0, pImg10 - 0); + process2coeffs(1, pImg7 + 0, pImg8 - 0, pImg5 + 0, pImg6 - 0); + process2coeffs(2, pImg3 + 2, pImg4 - 2, pImg3 + 1, pImg4 - 1); + process2coeffs(3, pImg3 + 0, pImg4 - 0, pImg3 - 1, pImg4 + 1); + process2coeffs(4, pImg3 - 2, pImg4 + 2, pImg1 + 2, pImg2 - 2); + process2coeffs(5, pImg1 + 1, pImg2 - 1, pImg1 + 0, pImg2 - 0); + process2coeffs(6, pImg1 - 1, pImg2 + 1, pImg1 - 2, pImg2 + 2); + process2coeffs(7, pImg0 + 6, pImg0 - 6, pImg0 + 5, pImg0 - 5); + process2coeffs(8, pImg0 + 4, pImg0 - 4, pImg0 + 3, pImg0 - 3); + 
process2coeffs(9, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1); +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + process2coeffs(10, pImg4FixedBased - 0, pImg3FixedBased + 0, pImg2FixedBased - 1, pImg1FixedBased + 1); + process2coeffs(11, pImg2FixedBased - 0, pImg1FixedBased + 0, pImg2FixedBased + 1, pImg1FixedBased - 1); + process2coeffs(12, pImg0FixedBased - 2, pImg0FixedBased + 2, pImg0FixedBased - 1, pImg0FixedBased + 1); +#endif +#endif + pImg0 = srcBeforeDb + j; + pImg1 = pImg0 + srcBeforeDbStride; + pImg2 = pImg0 - srcBeforeDbStride; + + pImgP0 = srcResi + j; + +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS +#if JVET_AD0222_ALF_LONG_FIXFILTER && JVET_AD0222_ADDITONAL_ALF_FIXFILTER + process2coeffs(16, pImg1 + 0, pImg2 + 0, pImg0 + 1, pImg0 - 1); +#elif JVET_AD0222_ALF_LONG_FIXFILTER + process2coeffs(14, pImg1 + 0, pImg2 + 0, pImg0 + 1, pImg0 - 1); +#elif JVET_AD0222_ADDITONAL_ALF_FIXFILTER + process2coeffs(15, pImg1 + 0, pImg2 + 0, pImg0 + 1, pImg0 - 1); +#else + process2coeffs(13, pImg1 + 0, pImg2 + 0, pImg0 + 1, pImg0 - 1); +#endif +#else + process2coeffs(10, pImg1 + 0, pImg2 + 0, pImg0 + 1, pImg0 - 1); +#endif + +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + __m128i val00 = _mm_sub_epi16(_mm_loadu_si128((const __m128i *)(fixedFilterResults[0 + fixedFilterSetIdx][blkDst.y + i + padSize] + blkDst.x + j + padSize)), + cur); + __m128i val10 = _mm_sub_epi16(_mm_loadu_si128((const __m128i *)(fixedFilterResults[2 + fixedFilterSetIdx][blkDst.y + i + padSize] + blkDst.x + j + padSize)), + cur); +#else + __m128i val00 = _mm_sub_epi16(_mm_loadu_si128((const __m128i *) pImg0), cur); + __m128i val10 = _mm_sub_epi16( + _mm_loadu_si128((const __m128i *) (fixedFilterResults[fixedFilterSetIdx][blkDst.y + i] + blkDst.x + j)), cur); +#endif + __m128i val01A = _mm_unpacklo_epi16(val00, val10); + __m128i val01B = _mm_unpackhi_epi16(val00, val10); +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS +#if JVET_AD0222_ALF_LONG_FIXFILTER && 
JVET_AD0222_ADDITONAL_ALF_FIXFILTER + __m128i limit01A = params[0][1][17]; + __m128i limit01B = params[1][1][17]; +#elif JVET_AD0222_ALF_LONG_FIXFILTER + __m128i limit01A = params[0][1][15]; + __m128i limit01B = params[1][1][15]; +#elif JVET_AD0222_ADDITONAL_ALF_FIXFILTER + __m128i limit01A = params[0][1][16]; + __m128i limit01B = params[1][1][16]; +#else + __m128i limit01A = params[0][1][14]; + __m128i limit01B = params[1][1][14]; +#endif +#else + __m128i limit01A = params[0][1][11]; + __m128i limit01B = params[1][1][11]; +#endif + val01A = _mm_min_epi16(val01A, limit01A); + val01B = _mm_min_epi16(val01B, limit01B); + limit01A = _mm_sub_epi16(_mm_setzero_si128(), limit01A); + limit01B = _mm_sub_epi16(_mm_setzero_si128(), limit01B); + val01A = _mm_max_epi16(val01A, limit01A); + val01B = _mm_max_epi16(val01B, limit01B); +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS +#if JVET_AD0222_ALF_LONG_FIXFILTER && JVET_AD0222_ADDITONAL_ALF_FIXFILTER + __m128i coeff01A = params[0][0][17]; + __m128i coeff01B = params[1][0][17]; +#elif JVET_AD0222_ALF_LONG_FIXFILTER + __m128i coeff01A = params[0][0][15]; + __m128i coeff01B = params[1][0][15]; +#elif JVET_AD0222_ADDITONAL_ALF_FIXFILTER + __m128i coeff01A = params[0][0][16]; + __m128i coeff01B = params[1][0][16]; +#else + __m128i coeff01A = params[0][0][14]; + __m128i coeff01B = params[1][0][14]; +#endif +#else + __m128i coeff01A = params[0][0][11]; + __m128i coeff01B = params[1][0][11]; +#endif + accumA = _mm_add_epi32(accumA, _mm_madd_epi16(val01A, coeff01A)); + accumB = _mm_add_epi32(accumB, _mm_madd_epi16(val01B, coeff01B)); + + // start prediction fixed filter + __m128i zero = _mm_setzero_si128(); + val00 = _mm_sub_epi16(_mm_loadu_si128((const __m128i *) (pImg0)), cur); + val10 = _mm_sub_epi16(_mm_loadu_si128((const __m128i *) pImgP0), zero); + val01A = _mm_unpacklo_epi16(val00, val10); + val01B = _mm_unpackhi_epi16(val00, val10); +#if JVET_AD0222_ALF_LONG_FIXFILTER && JVET_AD0222_ADDITONAL_ALF_FIXFILTER + limit01A = 
params[0][1][18]; + limit01B = params[1][1][18]; +#elif JVET_AD0222_ALF_LONG_FIXFILTER + limit01A = params[0][1][16]; + limit01B = params[1][1][16]; +#elif JVET_AD0222_ADDITONAL_ALF_FIXFILTER + limit01A = params[0][1][17]; + limit01B = params[1][1][17]; +#else + limit01A = params[0][1][15]; + limit01B = params[1][1][15]; +#endif + + val01A = _mm_min_epi16(val01A, limit01A); + val01B = _mm_min_epi16(val01B, limit01B); + limit01A = _mm_sub_epi16(_mm_setzero_si128(), limit01A); + limit01B = _mm_sub_epi16(_mm_setzero_si128(), limit01B); + val01A = _mm_max_epi16(val01A, limit01A); + val01B = _mm_max_epi16(val01B, limit01B); +#if JVET_AD0222_ALF_LONG_FIXFILTER && JVET_AD0222_ADDITONAL_ALF_FIXFILTER + coeff01A = params[0][0][18]; + coeff01B = params[1][0][18]; +#elif JVET_AD0222_ALF_LONG_FIXFILTER + coeff01A = params[0][0][16]; + coeff01B = params[1][0][16]; +#elif JVET_AD0222_ADDITONAL_ALF_FIXFILTER + coeff01A = params[0][0][17]; + coeff01B = params[1][0][17]; +#else + coeff01A = params[0][0][15]; + coeff01B = params[1][0][15]; +#endif + + accumA = _mm_add_epi32(accumA, _mm_madd_epi16(val01A, coeff01A)); + accumB = _mm_add_epi32(accumB, _mm_madd_epi16(val01B, coeff01B)); + // end prediction fixed filter +#if JVET_AD0222_ADDITONAL_ALF_FIXFILTER + val00 = _mm_sub_epi16(_mm_loadu_si128((const __m128i *) (pImg0Gauss[0])), cur); + val10 = _mm_setzero_si128(); + val01A = _mm_unpacklo_epi16(val00, val10); + val01B = _mm_unpackhi_epi16(val00, val10); +#if JVET_AD0222_ALF_LONG_FIXFILTER + limit01A = params[0][1][19]; + limit01B = params[1][1][19]; +#else + limit01A = params[0][1][18]; + limit01B = params[1][1][18]; +#endif + + val01A = _mm_min_epi16(val01A, limit01A); + val01B = _mm_min_epi16(val01B, limit01B); + limit01A = _mm_sub_epi16(_mm_setzero_si128(), limit01A); limit01B = _mm_sub_epi16(_mm_setzero_si128(), limit01B); val01A = _mm_max_epi16(val01A, limit01A); val01B = _mm_max_epi16(val01B, limit01B); +#if JVET_AD0222_ALF_LONG_FIXFILTER + coeff01A = params[0][0][19]; + 
coeff01B = params[1][0][19]; +#else + coeff01A = params[0][0][18]; + coeff01B = params[1][0][18]; +#endif + accumA = _mm_add_epi32(accumA, _mm_madd_epi16(val01A, coeff01A)); + accumB = _mm_add_epi32(accumB, _mm_madd_epi16(val01B, coeff01B)); +#endif + accumA = _mm_srai_epi32(accumA, shift); + accumB = _mm_srai_epi32(accumB, shift); + + accumA = _mm_packs_epi32(accumA, accumB); +#if JVET_AI0084_ALF_RESIDUALS_SCALING + if ( bScalingCorr ) + { + accumA = _mm_add_epi16(accumA, curBase); + } + else +#endif + accumA = _mm_add_epi16(accumA, cur); + accumA = _mm_min_epi16(mmMax, _mm_max_epi16(accumA, mmMin)); + + _mm_storeu_si128((__m128i *) (dst + j), accumA); + } // for j + src += srcStride * stepY; + dst += dstStride * stepY; + srcBeforeDb += srcBeforeDbStride * stepY; + srcResi += srcResiStride * stepY; + } // for i +#if USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION + }//Use 256 Bit Simd + #endif +} + +template<X86_VEXT vext> +static void simdFilter13x13BlkExtDbResi( + AlfClassifier * *classifier, const PelUnitBuf &recDst, const PelUnitBuf &recBeforeDb, const PelUnitBuf &resi, + const CPelUnitBuf &recSrc, const Area &blkDst, const Area &blk, const ComponentID compId, const short *filterSet, +#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT + const Pel *fClipSet +#else + const short *fClipSet +#endif + ,const ClpRng &clpRng, CodingStructure &cs, Pel ***fixedFilterResults, Pel ***fixedFilterResiResults, int fixedFilterSetIdx +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + ,Pel ***fixedFilterResultsPerCtu, bool isFixedFilterPaddedPerCtu +#endif +#if JVET_AD0222_ADDITONAL_ALF_FIXFILTER + , Pel ***gaussPic, Pel ***gaussCtu +#endif +#if JVET_AG0158_ALF_LUMA_COEFF_PRECISION + , char coeffBits +#endif +) +{ + const CPelBuf srcBuffer = recSrc.get(compId); + PelBuf dstBuffer = recDst.get(compId); + const CPelBuf scrBufferBeforeDb = recBeforeDb.get(compId); + const CPelBuf scrBufferResi = resi.get(compId); + + const size_t srcStride = srcBuffer.stride; + const size_t dstStride = 
dstBuffer.stride; + const size_t srcBeforeDbStride = scrBufferBeforeDb.stride; + const size_t srcResiStride = scrBufferResi.stride; +#if JVET_AI0084_ALF_RESIDUALS_SCALING + int adjustShift = coeffBits - 1; + const bool bScalingCorr = isLuma(compId) && fixedFilterSetIdx < 0; + if ( bScalingCorr ) + { + fixedFilterSetIdx = -fixedFilterSetIdx - 1; + adjustShift -= shiftPrecis; // add more precision + } + const int shift = adjustShift; + const Pel currBase = 512; + int round = 1 << (shift - 1); + +#if !( USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION ) + __m128i curBase = _mm_set_epi16( currBase, currBase, currBase, currBase, currBase, currBase, currBase, currBase ); +#endif +#else +#if JVET_AG0158_ALF_LUMA_COEFF_PRECISION + int shift = coeffBits; + shift -= 1; + int round = 1 << (shift - 1); +#else + constexpr int shift = AdaptiveLoopFilter::m_NUM_BITS - 1; + constexpr int round = 1 << (shift - 1); +#endif +#endif + + const size_t width = blk.width; + const size_t height = blk.height; + + constexpr size_t stepX = 8; + size_t stepY = 1; + +#if !( USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION ) + const __m128i mmOffset = _mm_set1_epi32(round); + const __m128i mmMin = _mm_set1_epi16(clpRng.min); + const __m128i mmMax = _mm_set1_epi16(clpRng.max); +#endif + + static_assert(sizeof(*filterSet) == 2, "ALF coeffs must be 16-bit wide"); + static_assert(sizeof(*fClipSet) == 2, "ALF clip values must be 16-bit wide"); + + const Pel *src = srcBuffer.buf + blk.y * srcStride + blk.x; + Pel *dst = dstBuffer.buf + blkDst.y * dstStride + blkDst.x; + const Pel *srcBeforeDb = scrBufferBeforeDb.buf + blk.y * srcBeforeDbStride + blk.x; + const Pel *srcResi = scrBufferResi.buf + blk.y * srcResiStride + blk.x; +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + const int padSize = ALF_PADDING_SIZE_FIXED_RESULTS; +#endif +#if JVET_AD0222_ADDITONAL_ALF_FIXFILTER + const int padSizeGauss = ALF_PADDING_SIZE_GAUSS_RESULTS; +#endif + +#if USE_AVX2 && 
JVET_AJ0188_CODING_INFO_CLASSIFICATION + const bool use256BitSimd = vext >= AVX2 && blkDst.width % 16 == 0 ? true : false; + + if( use256BitSimd ) + { + const __m256i mmOffset = _mm256_set1_epi32(round); + const __m256i mmMin = _mm256_set1_epi16(clpRng.min); + const __m256i mmMax = _mm256_set1_epi16(clpRng.max); +#if JVET_AI0084_ALF_RESIDUALS_SCALING + const __m256i curBase = _mm256_set1_epi16(currBase); +#endif + for (size_t i = 0; i < height; i += stepY) + { + const AlfClassifier *pClass = classifier[blkDst.y + i] + blkDst.x; + for (size_t j = 0; j < width; j += stepX * 2) + { +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS +#if JVET_AD0222_ALF_LONG_FIXFILTER + __m256i params[2][2][18]; +#else + __m256i params[2][2][17]; +#endif +#else + __m256i params[2][2][13]; +#endif + for (int k = 0; k < 2; k++) + { + __m256i rawCoef[4][5], rawClip[4][5], s0, s1; + __m128i rawCoefTmp[2][4][5], rawClipTmp[2][4][5], s0Tmp[2], s1Tmp[2], s2Tmp[2], s3Tmp[2]; + for (int l = 0; l < 4; l++) + { + const int transposeIdx0 = pClass[j + 4 * k + l + 0] & 0x3; + const int classIdx0 = pClass[j + 4 * k + l + 0] >> 2; + + rawCoefTmp[0][l][0] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF)); + rawCoefTmp[0][l][1] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF + 8)); + rawCoefTmp[0][l][2] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF + 16)); + rawCoefTmp[0][l][3] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF + 24)); + rawCoefTmp[0][l][4] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF + 32)); + + rawClipTmp[0][l][0] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF)); + rawClipTmp[0][l][1] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF + 8)); + rawClipTmp[0][l][2] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx0 * 
MAX_NUM_ALF_LUMA_COEFF + 16)); + rawClipTmp[0][l][3] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF + 24)); + rawClipTmp[0][l][4] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx0 * MAX_NUM_ALF_LUMA_COEFF + 32)); + + const int transposeIdx1 = pClass[j + 4 * k + l + 8] & 0x3; + const int classIdx1 = pClass[j + 4 * k + l + 8] >> 2; + + rawCoefTmp[1][l][0] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF)); + rawCoefTmp[1][l][1] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF + 8)); + rawCoefTmp[1][l][2] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF + 16)); + rawCoefTmp[1][l][3] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF + 24)); + rawCoefTmp[1][l][4] = _mm_loadu_si128((const __m128i *) (filterSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF + 32)); + + rawClipTmp[1][l][0] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF)); + rawClipTmp[1][l][1] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF + 8)); + rawClipTmp[1][l][2] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF + 16)); + rawClipTmp[1][l][3] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF + 24)); + rawClipTmp[1][l][4] = _mm_loadu_si128((const __m128i *) (fClipSet + classIdx1 * MAX_NUM_ALF_LUMA_COEFF + 32)); + +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + for (int m = 0; m < shuffleTime13FixedBasedLongLength[transposeIdx0]; m++) +#else + for (int m = 0; m < shuffleTime13LongLength[transposeIdx0]; m++) +#endif + { +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + int op0 = shuffleOp13FixedBasedLongLength[transposeIdx0][m][0]; + int op1 = shuffleOp13FixedBasedLongLength[transposeIdx0][m][1]; +#else + int op0 = shuffleOp13LongLength[transposeIdx0][m][0]; + int op1 = 
shuffleOp13LongLength[transposeIdx0][m][1]; +#endif +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + s0Tmp[0] = _mm_loadu_si128((const __m128i *) shuffleTab13FixedBasedLongLength[transposeIdx0][m][0]); + s1Tmp[0] = _mm_xor_si128(s0Tmp[0], _mm_set1_epi8((char) 0x80)); + s2Tmp[0] = _mm_loadu_si128((const __m128i *) shuffleTab13FixedBasedLongLength[transposeIdx0][m][1]); + s3Tmp[0] = _mm_xor_si128(s2Tmp[0], _mm_set1_epi8((char) 0x80)); +#else + s0Tmp[0] = _mm_loadu_si128((const __m128i *) shuffleTab13LongLength[transposeIdx0][m][0]); + s1Tmp[0] = _mm_xor_si128(s0Tmp[0], _mm_set1_epi8((char) 0x80)); + s2Tmp[0] = _mm_loadu_si128((const __m128i *) shuffleTab13LongLength[transposeIdx0][m][1]); + s3Tmp[0] = _mm_xor_si128(s2Tmp[0], _mm_set1_epi8((char) 0x80)); +#endif + + __m128i rawTmp0 = _mm_or_si128(_mm_shuffle_epi8(rawCoefTmp[0][l][op0], s0Tmp[0]), _mm_shuffle_epi8(rawCoefTmp[0][l][op1], s1Tmp[0])); + __m128i rawTmp1 = _mm_or_si128(_mm_shuffle_epi8(rawCoefTmp[0][l][op0], s2Tmp[0]), _mm_shuffle_epi8(rawCoefTmp[0][l][op1], s3Tmp[0])); + rawCoefTmp[0][l][op0] = rawTmp0; + rawCoefTmp[0][l][op1] = rawTmp1; + + rawTmp0 = _mm_or_si128(_mm_shuffle_epi8(rawClipTmp[0][l][op0], s0Tmp[0]), _mm_shuffle_epi8(rawClipTmp[0][l][op1], s1Tmp[0])); + rawTmp1 = _mm_or_si128(_mm_shuffle_epi8(rawClipTmp[0][l][op0], s2Tmp[0]), _mm_shuffle_epi8(rawClipTmp[0][l][op1], s3Tmp[0])); + rawClipTmp[0][l][op0] = rawTmp0; + rawClipTmp[0][l][op1] = rawTmp1; + } + +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + for (int m = 0; m < shuffleTime13FixedBasedLongLength[transposeIdx1]; m++) +#else + for (int m = 0; m < shuffleTime13LongLength[transposeIdx1]; m++) +#endif + { +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + int op0 = shuffleOp13FixedBasedLongLength[transposeIdx1][m][0]; + int op1 = shuffleOp13FixedBasedLongLength[transposeIdx1][m][1]; +#else + int op0 = shuffleOp13LongLength[transposeIdx1][m][0]; + int op1 = shuffleOp13LongLength[transposeIdx1][m][1]; +#endif +#if 
JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + s0Tmp[1] = _mm_loadu_si128((const __m128i *) shuffleTab13FixedBasedLongLength[transposeIdx1][m][0]); + s1Tmp[1] = _mm_xor_si128(s0Tmp[1], _mm_set1_epi8((char) 0x80)); + s2Tmp[1] = _mm_loadu_si128((const __m128i *) shuffleTab13FixedBasedLongLength[transposeIdx1][m][1]); + s3Tmp[1] = _mm_xor_si128(s2Tmp[1], _mm_set1_epi8((char) 0x80)); +#else + s0Tmp[1] = _mm_loadu_si128((const __m128i *) shuffleTab13LongLength[transposeIdx1][m][0]); + s1Tmp[1] = _mm_xor_si128(s0Tmp[1], _mm_set1_epi8((char) 0x80)); + s2Tmp[1] = _mm_loadu_si128((const __m128i *) shuffleTab13LongLength[transposeIdx1][m][1]); + s3Tmp[1] = _mm_xor_si128(s2Tmp[1], _mm_set1_epi8((char) 0x80)); +#endif + + __m128i rawTmp0 = _mm_or_si128(_mm_shuffle_epi8(rawCoefTmp[1][l][op0], s0Tmp[1]), _mm_shuffle_epi8(rawCoefTmp[1][l][op1], s1Tmp[1])); + __m128i rawTmp1 = _mm_or_si128(_mm_shuffle_epi8(rawCoefTmp[1][l][op0], s2Tmp[1]), _mm_shuffle_epi8(rawCoefTmp[1][l][op1], s3Tmp[1])); + rawCoefTmp[1][l][op0] = rawTmp0; + rawCoefTmp[1][l][op1] = rawTmp1; + + rawTmp0 = _mm_or_si128(_mm_shuffle_epi8(rawClipTmp[1][l][op0], s0Tmp[1]), _mm_shuffle_epi8(rawClipTmp[1][l][op1], s1Tmp[1])); + rawTmp1 = _mm_or_si128(_mm_shuffle_epi8(rawClipTmp[1][l][op0], s2Tmp[1]), _mm_shuffle_epi8(rawClipTmp[1][l][op1], s3Tmp[1])); + rawClipTmp[1][l][op0] = rawTmp0; + rawClipTmp[1][l][op1] = rawTmp1; + } + + rawCoef[l][0] = _mm256_castsi128_si256(rawCoefTmp[0][l][0] ); + rawCoef[l][0] = _mm256_insertf128_si256(rawCoef[l][0], rawCoefTmp[1][l][0], 1 ); + rawCoef[l][1] = _mm256_castsi128_si256(rawCoefTmp[0][l][1]); + rawCoef[l][1] = _mm256_insertf128_si256(rawCoef[l][1], rawCoefTmp[1][l][1], 1); + rawCoef[l][2] = _mm256_castsi128_si256(rawCoefTmp[0][l][2]); + rawCoef[l][2] = _mm256_insertf128_si256(rawCoef[l][2], rawCoefTmp[1][l][2], 1); + rawCoef[l][3] = _mm256_castsi128_si256(rawCoefTmp[0][l][3]); + rawCoef[l][3] = _mm256_insertf128_si256(rawCoef[l][3], rawCoefTmp[1][l][3], 1); + rawCoef[l][4] = 
_mm256_castsi128_si256(rawCoefTmp[0][l][4]); + rawCoef[l][4] = _mm256_insertf128_si256(rawCoef[l][4], rawCoefTmp[1][l][4], 1); + + rawClip[l][0] = _mm256_castsi128_si256(rawClipTmp[0][l][0]); + rawClip[l][0] = _mm256_insertf128_si256(rawClip[l][0], rawClipTmp[1][l][0], 1); + rawClip[l][1] = _mm256_castsi128_si256(rawClipTmp[0][l][1]); + rawClip[l][1] = _mm256_insertf128_si256(rawClip[l][1], rawClipTmp[1][l][1], 1); + rawClip[l][2] = _mm256_castsi128_si256(rawClipTmp[0][l][2]); + rawClip[l][2] = _mm256_insertf128_si256(rawClip[l][2], rawClipTmp[1][l][2], 1); + rawClip[l][3] = _mm256_castsi128_si256(rawClipTmp[0][l][3]); + rawClip[l][3] = _mm256_insertf128_si256(rawClip[l][3], rawClipTmp[1][l][3], 1); + rawClip[l][4] = _mm256_castsi128_si256(rawClipTmp[0][l][4]); + rawClip[l][4] = _mm256_insertf128_si256(rawClip[l][4], rawClipTmp[1][l][4], 1); + } // for l + +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + int limR, lim0, lim1, lim2, lim3; +#if JVET_AD0222_ALF_LONG_FIXFILTER && JVET_AD0222_ADDITONAL_ALF_FIXFILTER + limR = 5, lim0 = 5, lim1 = 5, lim2 = 4, lim3 = 4; +#elif JVET_AD0222_ALF_LONG_FIXFILTER + limR = 5, lim0 = 5, lim1 = 5, lim2 = 4, lim3 = 4; +#elif JVET_AD0222_ADDITONAL_ALF_FIXFILTER + limR = 5, lim0 = 5, lim1 = 4, lim2 = 4, lim3 = 4; +#else + limR = 5, lim0 = 5, lim1 = 4, lim2 = 4, lim3 = 4; +#endif + for (unsigned char l = 0; l < limR; l++) +#else + for (unsigned char l = 0; l < 5; l++) +#endif + { + int m = l << 2; +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + if (l < lim0) + { +#endif + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[0][l], 0x00), _mm256_shuffle_epi32(rawCoef[1][l], 0x00)); + s1 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[2][l], 0x00), _mm256_shuffle_epi32(rawCoef[3][l], 0x00)); + params[k][0][0 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), _mm256_shuffle_epi32(s1, 0x88), 0xf0); + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][l], 0x00), _mm256_shuffle_epi32(rawClip[1][l], 0x00)); + 
s1 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[2][l], 0x00), _mm256_shuffle_epi32(rawClip[3][l], 0x00)); + params[k][1][0 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), _mm256_shuffle_epi32(s1, 0x88), 0xf0); +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + } + if (l < lim1) + { +#endif + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[0][l], 0x55), _mm256_shuffle_epi32(rawCoef[1][l], 0x55)); + s1 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[2][l], 0x55), _mm256_shuffle_epi32(rawCoef[3][l], 0x55)); + params[k][0][1 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), _mm256_shuffle_epi32(s1, 0x88), 0xf0); + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][l], 0x55), _mm256_shuffle_epi32(rawClip[1][l], 0x55)); + s1 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[2][l], 0x55), _mm256_shuffle_epi32(rawClip[3][l], 0x55)); + params[k][1][1 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), _mm256_shuffle_epi32(s1, 0x88), 0xf0); +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + } + if (l < lim2) + { +#endif + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[0][l], 0xaa), _mm256_shuffle_epi32(rawCoef[1][l], 0xaa)); + s1 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[2][l], 0xaa), _mm256_shuffle_epi32(rawCoef[3][l], 0xaa)); + params[k][0][2 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), _mm256_shuffle_epi32(s1, 0x88), 0xf0); + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][l], 0xaa), _mm256_shuffle_epi32(rawClip[1][l], 0xaa)); + s1 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[2][l], 0xaa), _mm256_shuffle_epi32(rawClip[3][l], 0xaa)); + params[k][1][2 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), _mm256_shuffle_epi32(s1, 0x88), 0xf0); +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + } + if (l < lim3) + { +#endif + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[0][l], 0xff), _mm256_shuffle_epi32(rawCoef[1][l], 0xff)); + s1 = 
_mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[2][l], 0xff), _mm256_shuffle_epi32(rawCoef[3][l], 0xff)); + params[k][0][3 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), _mm256_shuffle_epi32(s1, 0x88), 0xf0); + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][l], 0xff), _mm256_shuffle_epi32(rawClip[1][l], 0xff)); + s1 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[2][l], 0xff), _mm256_shuffle_epi32(rawClip[3][l], 0xff)); + params[k][1][3 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), _mm256_shuffle_epi32(s1, 0x88), 0xf0); +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + } +#endif + } // for l + } // for k + + const Pel *pImg0, *pImg1, *pImg2, *pImg3, *pImg4, *pImg5, *pImg6, *pImg7, *pImg8; +#if !JVET_AD0222_ALF_LONG_FIXFILTER + const Pel *pImg9, *pImg10, *pImg11, *pImg12; +#endif + const Pel *pImgP0; + + pImg0 = src + j; + pImg1 = pImg0 + srcStride; + pImg2 = pImg0 - srcStride; + pImg3 = pImg1 + srcStride; + pImg4 = pImg2 - srcStride; + pImg5 = pImg3 + srcStride; + pImg6 = pImg4 - srcStride; + pImg7 = pImg5 + srcStride; + pImg8 = pImg6 - srcStride; +#if !JVET_AD0222_ALF_LONG_FIXFILTER + pImg9 = pImg7 + srcStride; + pImg10 = pImg8 - srcStride; + pImg11 = pImg9 + srcStride; + pImg12 = pImg10 - srcStride; +#endif +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS +#if JVET_AE0139_ALF_IMPROVED_FIXFILTER + int filterSetIdx = 2 + fixedFilterSetIdx; +#else + int filterSetIdx = 0 + fixedFilterSetIdx; +#endif + const Pel *pImg0FixedBased, *pImg1FixedBased, *pImg2FixedBased, *pImg3FixedBased, *pImg4FixedBased; +#if JVET_AD0222_ALF_LONG_FIXFILTER + const Pel *pImg5FixedBased, *pImg6FixedBased, *pImg7FixedBased, *pImg8FixedBased, *pImg9FixedBased, + *pImg10FixedBased, *pImg11FixedBased, *pImg12FixedBased; +#endif + if (isFixedFilterPaddedPerCtu) + { + pImg0FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 0] + j + padSize; + pImg1FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 1] + j + padSize; + 
pImg2FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 1] + j + padSize; + pImg3FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 2] + j + padSize; + pImg4FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 2] + j + padSize; +#if JVET_AD0222_ALF_LONG_FIXFILTER + pImg5FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 3] + j + padSize; + pImg6FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 3] + j + padSize; + pImg7FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 4] + j + padSize; + pImg8FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 4] + j + padSize; + pImg9FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 5] + j + padSize; + pImg10FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 5] + j + padSize; + pImg11FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize + 6] + j + padSize; + pImg12FixedBased = fixedFilterResultsPerCtu[filterSetIdx][i + padSize - 6] + j + padSize; +#endif + } + else + { + pImg0FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 0] + blkDst.x + j + padSize; + pImg1FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 1] + blkDst.x + j + padSize; + pImg2FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 1] + blkDst.x + j + padSize; + pImg3FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 2] + blkDst.x + j + padSize; + pImg4FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 2] + blkDst.x + j + padSize; +#if JVET_AD0222_ALF_LONG_FIXFILTER + pImg5FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 3] + blkDst.x + j + padSize; + pImg6FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 3] + blkDst.x + j + padSize; + pImg7FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 4] + blkDst.x + j + padSize; + pImg8FixedBased = 
fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 4] + blkDst.x + j + padSize; + pImg9FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 5] + blkDst.x + j + padSize; + pImg10FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 5] + blkDst.x + j + padSize; + pImg11FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize + 6] + blkDst.x + j + padSize; + pImg12FixedBased = fixedFilterResults[filterSetIdx][blkDst.y + i + padSize - 6] + blkDst.x + j + padSize; +#endif + } +#endif +#if JVET_AD0222_ADDITONAL_ALF_FIXFILTER + const Pel *pImg0Gauss[NUM_GAUSS_FILTERED_SOURCE]; + + for (int gaussIdx = 0; gaussIdx < NUM_GAUSS_FILTERED_SOURCE; gaussIdx++) + { + if (isFixedFilterPaddedPerCtu) + { + pImg0Gauss[gaussIdx] = gaussCtu[gaussIdx][i + padSizeGauss + 0] + j + padSizeGauss; + } + else + { + pImg0Gauss[gaussIdx] = gaussPic[gaussIdx][blkDst.y + i + padSizeGauss + 0] + blkDst.x + j + padSizeGauss; + } + } +#endif + __m256i cur = _mm256_loadu_si256((const __m256i *) pImg0); + __m256i accumA = mmOffset; + __m256i accumB = mmOffset; + + auto process2coeffs = [&](const int i, const Pel *ptr0, const Pel *ptr1, const Pel *ptr2, const Pel *ptr3) + { + const __m256i val00 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) ptr0), cur); + const __m256i val10 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) ptr2), cur); + const __m256i val01 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) ptr1), cur); + const __m256i val11 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) ptr3), cur); + + __m256i val01A = _mm256_unpacklo_epi16(val00, val10); + __m256i val01B = _mm256_unpackhi_epi16(val00, val10); + __m256i val01C = _mm256_unpacklo_epi16(val01, val11); + __m256i val01D = _mm256_unpackhi_epi16(val01, val11); + + __m256i limit01A = params[0][1][i]; + __m256i limit01B = params[1][1][i]; + + val01A = _mm256_min_epi16(val01A, limit01A); + val01B = _mm256_min_epi16(val01B, limit01B); + val01C = 
_mm256_min_epi16(val01C, limit01A); + val01D = _mm256_min_epi16(val01D, limit01B); + + limit01A = _mm256_sub_epi16(_mm256_setzero_si256(), limit01A); + limit01B = _mm256_sub_epi16(_mm256_setzero_si256(), limit01B); + + val01A = _mm256_max_epi16(val01A, limit01A); + val01B = _mm256_max_epi16(val01B, limit01B); + val01C = _mm256_max_epi16(val01C, limit01A); + val01D = _mm256_max_epi16(val01D, limit01B); + + val01A = _mm256_add_epi16(val01A, val01C); + val01B = _mm256_add_epi16(val01B, val01D); + + const __m256i coeff01A = params[0][0][i]; + const __m256i coeff01B = params[1][0][i]; + + accumA = _mm256_add_epi32(accumA, _mm256_madd_epi16(val01A, coeff01A)); + accumB = _mm256_add_epi32(accumB, _mm256_madd_epi16(val01B, coeff01B)); + }; #if JVET_AD0222_ALF_LONG_FIXFILTER && JVET_AD0222_ADDITONAL_ALF_FIXFILTER - coeff01A = params[0][0][18]; - coeff01B = params[1][0][18]; + process2coeffs(0, pImg7 + 0, pImg8 - 0, pImg5 + 0, pImg6 - 0); + process2coeffs(1, pImg3 + 0, pImg4 - 0, pImg1 + 1, pImg2 - 1); + process2coeffs(2, pImg1 + 0, pImg2 - 0, pImg1 - 1, pImg2 + 1); + process2coeffs(3, pImg0 + 4, pImg0 - 4, pImg0 + 3, pImg0 - 3); + process2coeffs(4, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1); + process2coeffs(5, pImg12FixedBased - 0, pImg11FixedBased + 0, pImg10FixedBased - 0, pImg9FixedBased + 0); + process2coeffs(6, pImg8FixedBased - 0, pImg7FixedBased + 0, pImg6FixedBased - 0, pImg5FixedBased + 0); + process2coeffs(7, pImg4FixedBased - 1, pImg3FixedBased + 1, pImg4FixedBased - 0, pImg3FixedBased + 0); + process2coeffs(8, pImg4FixedBased + 1, pImg3FixedBased - 1, pImg2FixedBased - 2, pImg1FixedBased + 2); + process2coeffs(9, pImg2FixedBased - 1, pImg1FixedBased + 1, pImg2FixedBased - 0, pImg1FixedBased + 0); + process2coeffs(10, pImg2FixedBased + 1, pImg1FixedBased - 1, pImg2FixedBased + 2, pImg1FixedBased - 2); + process2coeffs(11, pImg0FixedBased - 6, pImg0FixedBased + 6, pImg0FixedBased - 5, pImg0FixedBased + 5); + process2coeffs(12, pImg0FixedBased - 4, 
pImg0FixedBased + 4, pImg0FixedBased - 3, pImg0FixedBased + 3); + process2coeffs(13, pImg0FixedBased - 2, pImg0FixedBased + 2, pImg0FixedBased - 1, pImg0FixedBased + 1); #elif JVET_AD0222_ALF_LONG_FIXFILTER - coeff01A = params[0][0][16]; - coeff01B = params[1][0][16]; + process2coeffs(0, pImg7 + 0, pImg8 - 0, pImg5 + 0, pImg6 - 0); + process2coeffs(1, pImg3 + 0, pImg4 - 0, pImg1 + 1, pImg2 - 1); + process2coeffs(2, pImg1 + 0, pImg2 - 0, pImg1 - 1, pImg2 + 1); + process2coeffs(3, pImg0 + 4, pImg0 - 4, pImg0 + 3, pImg0 - 3); + process2coeffs(4, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1); + process2coeffs(5, pImg12FixedBased - 0, pImg11FixedBased + 0, pImg10FixedBased - 0, pImg9FixedBased + 0); + process2coeffs(6, pImg8FixedBased - 0, pImg7FixedBased + 0, pImg6FixedBased - 0, pImg5FixedBased + 0); + process2coeffs(7, pImg4FixedBased - 1, pImg3FixedBased + 1, pImg4FixedBased - 0, pImg3FixedBased + 0); + process2coeffs(8, pImg4FixedBased + 1, pImg3FixedBased - 1, pImg2FixedBased - 2, pImg1FixedBased + 2); + process2coeffs(9, pImg2FixedBased - 1, pImg1FixedBased + 1, pImg2FixedBased - 0, pImg1FixedBased + 0); + process2coeffs(10, pImg2FixedBased + 1, pImg1FixedBased - 1, pImg2FixedBased + 2, pImg1FixedBased - 2); + process2coeffs(11, pImg0FixedBased - 6, pImg0FixedBased + 6, pImg0FixedBased - 5, pImg0FixedBased + 5); + process2coeffs(12, pImg0FixedBased - 4, pImg0FixedBased + 4, pImg0FixedBased - 3, pImg0FixedBased + 3); + process2coeffs(13, pImg0FixedBased - 2, pImg0FixedBased + 2, pImg0FixedBased - 1, pImg0FixedBased + 1); #elif JVET_AD0222_ADDITONAL_ALF_FIXFILTER - coeff01A = params[0][0][17]; - coeff01B = params[1][0][17]; + process2coeffs(0, pImg11 + 0, pImg12 - 0, pImg9 + 0, pImg10 - 0); + process2coeffs(1, pImg7 + 0, pImg8 - 0, pImg5 + 0, pImg6 - 0); + process2coeffs(2, pImg3 + 2, pImg4 - 2, pImg3 + 1, pImg4 - 1); + process2coeffs(3, pImg3 + 0, pImg4 - 0, pImg3 - 1, pImg4 + 1); + process2coeffs(4, pImg3 - 2, pImg4 + 2, pImg1 + 2, pImg2 - 2); + process2coeffs(5, 
pImg1 + 1, pImg2 - 1, pImg1 + 0, pImg2 - 0); + process2coeffs(6, pImg1 - 1, pImg2 + 1, pImg1 - 2, pImg2 + 2); + process2coeffs(7, pImg0 + 6, pImg0 - 6, pImg0 + 5, pImg0 - 5); + process2coeffs(8, pImg0 + 4, pImg0 - 4, pImg0 + 3, pImg0 - 3); + process2coeffs(9, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1); + process2coeffs(10, pImg4FixedBased - 0, pImg3FixedBased + 0, pImg2FixedBased - 1, pImg1FixedBased + 1); + process2coeffs(11, pImg2FixedBased - 0, pImg1FixedBased + 0, pImg2FixedBased + 1, pImg1FixedBased - 1); + process2coeffs(12, pImg0FixedBased - 2, pImg0FixedBased + 2, pImg0FixedBased - 1, pImg0FixedBased + 1); #else - coeff01A = params[0][0][15]; - coeff01B = params[1][0][15]; + process2coeffs(0, pImg11 + 0, pImg12 - 0, pImg9 + 0, pImg10 - 0); + process2coeffs(1, pImg7 + 0, pImg8 - 0, pImg5 + 0, pImg6 - 0); + process2coeffs(2, pImg3 + 2, pImg4 - 2, pImg3 + 1, pImg4 - 1); + process2coeffs(3, pImg3 + 0, pImg4 - 0, pImg3 - 1, pImg4 + 1); + process2coeffs(4, pImg3 - 2, pImg4 + 2, pImg1 + 2, pImg2 - 2); + process2coeffs(5, pImg1 + 1, pImg2 - 1, pImg1 + 0, pImg2 - 0); + process2coeffs(6, pImg1 - 1, pImg2 + 1, pImg1 - 2, pImg2 + 2); + process2coeffs(7, pImg0 + 6, pImg0 - 6, pImg0 + 5, pImg0 - 5); + process2coeffs(8, pImg0 + 4, pImg0 - 4, pImg0 + 3, pImg0 - 3); + process2coeffs(9, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1); +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + process2coeffs(10, pImg4FixedBased - 0, pImg3FixedBased + 0, pImg2FixedBased - 1, pImg1FixedBased + 1); + process2coeffs(11, pImg2FixedBased - 0, pImg1FixedBased + 0, pImg2FixedBased + 1, pImg1FixedBased - 1); + process2coeffs(12, pImg0FixedBased - 2, pImg0FixedBased + 2, pImg0FixedBased - 1, pImg0FixedBased + 1); #endif +#endif + pImg0 = srcBeforeDb + j; + pImg1 = pImg0 + srcBeforeDbStride; + pImg2 = pImg0 - srcBeforeDbStride; + pImgP0 = srcResi + j; - accumA = _mm_add_epi32(accumA, _mm_madd_epi16(val01A, coeff01A)); - accumB = _mm_add_epi32(accumB, _mm_madd_epi16(val01B, coeff01B)); - // end 
prediction fixed filter -#if JVET_AD0222_ADDITONAL_ALF_FIXFILTER - val00 = _mm_sub_epi16(_mm_loadu_si128((const __m128i *) (pImg0Gauss[0])), cur); - val10 = _mm_setzero_si128(); - val01A = _mm_unpacklo_epi16(val00, val10); - val01B = _mm_unpackhi_epi16(val00, val10); +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS #if JVET_AD0222_ALF_LONG_FIXFILTER - limit01A = params[0][1][19]; - limit01B = params[1][1][19]; + process2coeffs(14, pImg1 + 0, pImg2 + 0, pImg0 + 1, pImg0 - 1); #else - limit01A = params[0][1][18]; - limit01B = params[1][1][18]; + process2coeffs(13, pImg1 + 0, pImg2 + 0, pImg0 + 1, pImg0 - 1); +#endif +#else + process2coeffs(10, pImg1 + 0, pImg2 + 0, pImg0 + 1, pImg0 - 1); #endif - val01A = _mm_min_epi16(val01A, limit01A); - val01B = _mm_min_epi16(val01B, limit01B); - limit01A = _mm_sub_epi16(_mm_setzero_si128(), limit01A); - limit01B = _mm_sub_epi16(_mm_setzero_si128(), limit01B); - val01A = _mm_max_epi16(val01A, limit01A); - val01B = _mm_max_epi16(val01B, limit01B); +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS + __m256i val00 = _mm256_sub_epi16( _mm256_loadu_si256((const __m256i *) (fixedFilterResults[0 + fixedFilterSetIdx][blkDst.y + i + padSize] + blkDst.x + j + padSize)), cur); + __m256i val10 = _mm256_sub_epi16( _mm256_loadu_si256((const __m256i *) (fixedFilterResults[2 + fixedFilterSetIdx][blkDst.y + i + padSize] + blkDst.x + j + padSize)), cur); +#else + __m256i val00 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) pImg0), cur); + __m256i val10 = _mm256_sub_epi16( _mm256_loadu_si256((const __m256i *) (fixedFilterResults[fixedFilterSetIdx][blkDst.y + i] + blkDst.x + j)), cur); +#endif + __m256i val01A = _mm256_unpacklo_epi16(val00, val10); + __m256i val01B = _mm256_unpackhi_epi16(val00, val10); +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS #if JVET_AD0222_ALF_LONG_FIXFILTER - coeff01A = params[0][0][19]; - coeff01B = params[1][0][19]; + __m256i limit01A = params[0][1][15]; + __m256i limit01B = params[1][1][15]; #else - 
coeff01A = params[0][0][18]; - coeff01B = params[1][0][18]; + __m256i limit01A = params[0][1][14]; + __m256i limit01B = params[1][1][14]; #endif - accumA = _mm_add_epi32(accumA, _mm_madd_epi16(val01A, coeff01A)); - accumB = _mm_add_epi32(accumB, _mm_madd_epi16(val01B, coeff01B)); +#else + __m256i limit01A = params[0][1][11]; + __m256i limit01B = params[1][1][11]; #endif - accumA = _mm_srai_epi32(accumA, shift); - accumB = _mm_srai_epi32(accumB, shift); - - accumA = _mm_packs_epi32(accumA, accumB); -#if JVET_AI0084_ALF_RESIDUALS_SCALING - if ( bScalingCorr ) - { - accumA = _mm_add_epi16(accumA, curBase); - } - else + val01A = _mm256_min_epi16(val01A, limit01A); + val01B = _mm256_min_epi16(val01B, limit01B); + limit01A = _mm256_sub_epi16(_mm256_setzero_si256(), limit01A); + limit01B = _mm256_sub_epi16(_mm256_setzero_si256(), limit01B); + val01A = _mm256_max_epi16(val01A, limit01A); + val01B = _mm256_max_epi16(val01B, limit01B); +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS +#if JVET_AD0222_ALF_LONG_FIXFILTER + __m256i coeff01A = params[0][0][15]; + __m256i coeff01B = params[1][0][15]; +#else + __m256i coeff01A = params[0][0][14]; + __m256i coeff01B = params[1][0][14]; #endif - accumA = _mm_add_epi16(accumA, cur); - accumA = _mm_min_epi16(mmMax, _mm_max_epi16(accumA, mmMin)); +#else + __m256i coeff01A = params[0][0][11]; + __m256i coeff01B = params[1][0][11]; +#endif + accumA = _mm256_add_epi32(accumA, _mm256_madd_epi16(val01A, coeff01A)); + accumB = _mm256_add_epi32(accumB, _mm256_madd_epi16(val01B, coeff01B)); - _mm_storeu_si128((__m128i *) (dst + j), accumA); - } // for j - src += srcStride * stepY; - dst += dstStride * stepY; - srcBeforeDb += srcBeforeDbStride * stepY; - srcResi += srcResiStride * stepY; - } // for i -} + // start residual fixed filter + __m256i zero = _mm256_setzero_si256(); + val00 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) (fixedFilterResiResults[1 - fixedFilterSetIdx][blkDst.y + i] + blkDst.x + j)), zero); + val10 = 
_mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) (pImg0)), cur); + val01A = _mm256_unpacklo_epi16(val00, val10); + val01B = _mm256_unpackhi_epi16(val00, val10); +#if JVET_AD0222_ALF_LONG_FIXFILTER + limit01A = params[0][1][16]; + limit01B = params[1][1][16]; +#else + limit01A = params[0][1][15]; + limit01B = params[1][1][15]; +#endif -template<X86_VEXT vext> -static void simdFilter13x13BlkExtDbResi( - AlfClassifier * *classifier, const PelUnitBuf &recDst, const PelUnitBuf &recBeforeDb, const PelUnitBuf &resi, - const CPelUnitBuf &recSrc, const Area &blkDst, const Area &blk, const ComponentID compId, const short *filterSet, -#if JVET_R0351_HIGH_BIT_DEPTH_SUPPORT - const Pel *fClipSet + val01A = _mm256_min_epi16(val01A, limit01A); + val01B = _mm256_min_epi16(val01B, limit01B); + limit01A = _mm256_sub_epi16(_mm256_setzero_si256(), limit01A); + limit01B = _mm256_sub_epi16(_mm256_setzero_si256(), limit01B); + val01A = _mm256_max_epi16(val01A, limit01A); + val01B = _mm256_max_epi16(val01B, limit01B); +#if JVET_AD0222_ALF_LONG_FIXFILTER + coeff01A = params[0][0][16]; + coeff01B = params[1][0][16]; #else - const short *fClipSet + coeff01A = params[0][0][15]; + coeff01B = params[1][0][15]; #endif - ,const ClpRng &clpRng, CodingStructure &cs, Pel ***fixedFilterResults, Pel ***fixedFilterResiResults, int fixedFilterSetIdx + + accumA = _mm256_add_epi32(accumA, _mm256_madd_epi16(val01A, coeff01A)); + accumB = _mm256_add_epi32(accumB, _mm256_madd_epi16(val01B, coeff01B)); + // end residual fixed filter + #if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS - ,Pel ***fixedFilterResultsPerCtu, bool isFixedFilterPaddedPerCtu -#endif + val00 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) pImgP0), zero); #if JVET_AD0222_ADDITONAL_ALF_FIXFILTER - , Pel ***gaussPic, Pel ***gaussCtu -#endif -#if JVET_AG0158_ALF_LUMA_COEFF_PRECISION - , char coeffBits + val10 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) (pImg0Gauss[0])), cur); +#else + val10 = _mm256_sub_epi16(cur, 
cur); #endif -) -{ - const CPelBuf srcBuffer = recSrc.get(compId); - PelBuf dstBuffer = recDst.get(compId); - const CPelBuf scrBufferBeforeDb = recBeforeDb.get(compId); - const CPelBuf scrBufferResi = resi.get(compId); - - const size_t srcStride = srcBuffer.stride; - const size_t dstStride = dstBuffer.stride; - const size_t srcBeforeDbStride = scrBufferBeforeDb.stride; - const size_t srcResiStride = scrBufferResi.stride; -#if JVET_AI0084_ALF_RESIDUALS_SCALING - int adjustShift = coeffBits - 1; - const bool bScalingCorr = isLuma(compId) && fixedFilterSetIdx < 0; - if ( bScalingCorr ) - { - fixedFilterSetIdx = -fixedFilterSetIdx - 1; - adjustShift -= shiftPrecis; // add more precision - } - const int shift = adjustShift; - const Pel currBase = 512; - int round = 1 << (shift - 1); - __m128i curBase = _mm_set_epi16( currBase, currBase, currBase, currBase, currBase, currBase, currBase, currBase ); + val01A = _mm256_unpacklo_epi16(val00, val10); + val01B = _mm256_unpackhi_epi16(val00, val10); #else -#if JVET_AG0158_ALF_LUMA_COEFF_PRECISION - int shift = coeffBits; - shift -= 1; - int round = 1 << (shift - 1); + __m256i val = _mm256_sub_epi16( _mm256_loadu_si256( (const __m256i *) (fixedFilterResults[EXT_LENGTH + fixedFilterSetIdx][blkDst.y + i] + blkDst.x + j)), cur); + val01A = _mm256_shuffle_epi8(val, _mm256_setr_epi8(0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7)); + val01B = _mm256_shuffle_epi8(val, _mm256_setr_epi8(8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15)); +#endif +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS +#if JVET_AD0222_ALF_LONG_FIXFILTER + limit01A = params[0][1][17]; + limit01B = params[1][1][17]; #else - constexpr int shift = AdaptiveLoopFilter::m_NUM_BITS - 1; - constexpr int round = 1 << (shift - 1); + limit01A = params[0][1][16]; + limit01B = params[1][1][16]; +#endif +#else + limit01A = params[0][1][12]; + limit01B = 
params[1][1][12]; +#endif + val01A = _mm256_min_epi16(val01A, limit01A); + val01B = _mm256_min_epi16(val01B, limit01B); + limit01A = _mm256_sub_epi16(_mm256_setzero_si256(), limit01A); + limit01B = _mm256_sub_epi16(_mm256_setzero_si256(), limit01B); + val01A = _mm256_max_epi16(val01A, limit01A); + val01B = _mm256_max_epi16(val01B, limit01B); +#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS +#if JVET_AD0222_ALF_LONG_FIXFILTER + coeff01A = params[0][0][17]; + coeff01B = params[1][0][17]; +#else + coeff01A = params[0][0][16]; + coeff01B = params[1][0][16]; #endif +#else + coeff01A = params[0][0][12]; + coeff01B = params[1][0][12]; #endif + accumA = _mm256_add_epi32(accumA, _mm256_madd_epi16(val01A, coeff01A)); + accumB = _mm256_add_epi32(accumB, _mm256_madd_epi16(val01B, coeff01B)); - const size_t width = blk.width; - const size_t height = blk.height; + accumA = _mm256_srai_epi32(accumA, shift); + accumB = _mm256_srai_epi32(accumB, shift); - constexpr size_t stepX = 8; - size_t stepY = 1; + accumA = _mm256_packs_epi32(accumA, accumB); +#if JVET_AI0084_ALF_RESIDUALS_SCALING + if( bScalingCorr ) + { + accumA = _mm256_add_epi16(accumA, curBase); + } + else +#endif + accumA = _mm256_add_epi16(accumA, cur); + accumA = _mm256_min_epi16(mmMax, _mm256_max_epi16(accumA, mmMin)); + + _mm256_storeu_si256((__m256i *) (dst + j), accumA); + } // for j + src += srcStride * stepY; + dst += dstStride * stepY; + srcBeforeDb += srcBeforeDbStride * stepY; + srcResi += srcResiStride * stepY; + } // for i + } + else + { const __m128i mmOffset = _mm_set1_epi32(round); const __m128i mmMin = _mm_set1_epi16(clpRng.min); const __m128i mmMax = _mm_set1_epi16(clpRng.max); - - static_assert(sizeof(*filterSet) == 2, "ALF coeffs must be 16-bit wide"); - static_assert(sizeof(*fClipSet) == 2, "ALF clip values must be 16-bit wide"); - - const Pel *src = srcBuffer.buf + blk.y * srcStride + blk.x; - Pel *dst = dstBuffer.buf + blkDst.y * dstStride + blkDst.x; - const Pel *srcBeforeDb = 
scrBufferBeforeDb.buf + blk.y * srcBeforeDbStride + blk.x; - const Pel *srcResi = scrBufferResi.buf + blk.y * srcResiStride + blk.x; -#if JVET_AB0184_ALF_MORE_FIXED_FILTER_OUTPUT_TAPS - const int padSize = ALF_PADDING_SIZE_FIXED_RESULTS; -#endif -#if JVET_AD0222_ADDITONAL_ALF_FIXFILTER - const int padSizeGauss = ALF_PADDING_SIZE_GAUSS_RESULTS; +#if JVET_AI0084_ALF_RESIDUALS_SCALING + const __m128i curBase = _mm_set1_epi16(currBase); #endif +#endif //Use AVX2 SIMD for (size_t i = 0; i < height; i += stepY) { const AlfClassifier *pClass = classifier[blkDst.y + i] + blkDst.x; @@ -4586,13 +6079,20 @@ static void simdFilter13x13BlkExtDbResi( srcBeforeDb += srcBeforeDbStride * stepY; srcResi += srcResiStride * stepY; } // for i +#if USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION + }//Use 256 Bit Simd + #endif } #endif #if JVET_AD0222_ADDITONAL_ALF_FIXFILTER //Gauss Filter template<X86_VEXT vext> -static void simdGaussFiltering(CodingStructure &cs, Pel ***gaussPic, const CPelBuf &srcLuma, const Area &blkDst, const Area &blk, const ClpRng &clpRng, const Pel clippingValues[4], int filterSetIdx, int storeIdx ) +static void simdGaussFiltering(CodingStructure &cs, Pel ***gaussPic, const CPelBuf &srcLuma, const Area &blkDst, const Area &blk, const ClpRng &clpRng, const Pel clippingValues[4], int filterSetIdx, int storeIdx +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , bool applyCodingInfo, AlfClassifier** classifierCodingInfo +#endif + ) { int16_t gaussCoefTable[NUM_GAUSS_FILTERED_SOURCE][25] = { @@ -4617,8 +6117,18 @@ static void simdGaussFiltering(CodingStructure &cs, Pel ***gaussPic, const CPelB } int16_t diffTH = 32; +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + const bool isIntraSlice = cs.slice->isIntra(); + const bool isSpsAdjust = cs.sps->getAlfLumaFixedFilterAdjust(); + + const bool useBounCondition = applyCodingInfo && !(!isSpsAdjust && isIntraSlice); + const bool useResiCondition = applyCodingInfo && (isSpsAdjust || !isSpsAdjust) && !isIntraSlice && false; + 
const int offsetClipValue = 1 << ( clpRng.bd - 1 ); +#endif +#if !( USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION ) const __m128i offsetMax = _mm_set1_epi16(diffTH); const __m128i offsetMin = _mm_sub_epi16(_mm_setzero_si128(), offsetMax); +#endif const CPelBuf srcBuffer = srcLuma; const int srcStride = srcBuffer.stride; @@ -4632,9 +6142,11 @@ static void simdGaussFiltering(CodingStructure &cs, Pel ***gaussPic, const CPelB constexpr int stepX = 8; int stepY = 1; +#if !( USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION ) const __m128i mmOffset = _mm_set1_epi32(round); const __m128i mmMin = _mm_set1_epi16(clpRng.min); const __m128i mmMax = _mm_set1_epi16(clpRng.max); +#endif static_assert(sizeof(*gaussCoefTable[0]) == 2, "ALF coeffs must be 16-bit wide"); static_assert(sizeof(*gaussClipTable ) == 2, "ALF clip values must be 16-bit wide"); @@ -4642,8 +6154,296 @@ static void simdGaussFiltering(CodingStructure &cs, Pel ***gaussPic, const CPelB const Pel *src = srcBuffer.buf + blk.y * srcStride + blk.x; const int padSizeGauss = ALF_PADDING_SIZE_GAUSS_RESULTS; +#if USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION + const bool use256BitSimd = vext >= AVX2 && blkDst.width % 16 == 0 ? true : false; + + if( use256BitSimd ) + { +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + __m256i mmClassIdxBsP, mmClassIdxResiP, mmClassIdxBsN, mmClassIdxResiN, mmClassIdxTmp; + __m256i mmOriOffset; + __m256i mmSignOffsetP, mmSignOffsetN; + __m256i mmAbsOffset; + __m256i mmAdjOffset; + __m256i mmZeroVector = _mm256_set1_epi16( 0 ); + __m256i mm01Vector = _mm256_set1_epi16( 1 ); + __m256i mm08Vector = _mm256_set1_epi16( 8 ); + __m256i mm16Vector = _mm256_set1_epi16( 16 ); + __m256i mmPOffsetClipVector = _mm256_set1_epi16( +offsetClipValue ); + __m256i mmNOffsetClipVector = _mm256_set1_epi16( -offsetClipValue ); + // Set Factor + __m256i mmBsFactor = isIntraSlice ? _mm256_set1_epi16( 4 + 2 ) : _mm256_set1_epi16( 3 + 2 ); + __m256i mmResiFactor = isIntraSlice ? 
_mm256_set1_epi16( 0 >> (!isSpsAdjust ? 1 : 0)) : _mm256_set1_epi16( 3 >> (!isSpsAdjust ? 1 : 0)); +#endif + const __m256i offsetMax = _mm256_set1_epi16(diffTH); + const __m256i offsetMin = _mm256_sub_epi16(_mm256_set1_epi16( 0 ), offsetMax); + const __m256i mmOffset = _mm256_set1_epi32(round); + const __m256i mmMin = _mm256_set1_epi16(clpRng.min); + const __m256i mmMax = _mm256_set1_epi16(clpRng.max); + + for (int i = 0; i < height; i += stepY) + { +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + AlfClassifier *pClassCodingInfo = nullptr; + if( useBounCondition || useResiCondition ) + { + pClassCodingInfo = classifierCodingInfo[blkDst.y + i] + blkDst.x; + } +#endif + for (int j = 0; j < width; j += stepX * 2) + { + __m256i params[2][2][6]; + + for (int k = 0; k < 2; k++) + { + __m256i rawCoef[4][2], rawClip[4][2], s0, s1; + __m128i rawCoefTmp[4][2], rawClipTmp[4][2]; + + for (int l = 0; l < 4; l++) + { + rawCoefTmp[l][0] = _mm_loadu_si128((const __m128i *) (gaussCoefTable[filterSetIdx] + 0)); + rawCoefTmp[l][1] = _mm_loadu_si128((const __m128i *) (gaussCoefTable[filterSetIdx] + 8)); + + rawClipTmp[l][0] = _mm_loadu_si128((const __m128i *) (gaussClipTable + 0)); + rawClipTmp[l][1] = _mm_loadu_si128((const __m128i *) (gaussClipTable + 8)); + + rawCoef[l][0] = _mm256_castsi128_si256( rawCoefTmp[l][0]); + rawCoef[l][0] = _mm256_insertf128_si256(rawCoef[l][0], rawCoefTmp[l][0], 1); + rawCoef[l][1] = _mm256_castsi128_si256( rawCoefTmp[l][1]); + rawCoef[l][1] = _mm256_insertf128_si256(rawCoef[l][1], rawCoefTmp[l][1], 1); + + rawClip[l][0] = _mm256_castsi128_si256(rawClipTmp[l][0]); + rawClip[l][0] = _mm256_insertf128_si256(rawClip[l][0], rawClipTmp[l][0], 1); + rawClip[l][1] = _mm256_castsi128_si256(rawClipTmp[l][1]); + rawClip[l][1] = _mm256_insertf128_si256(rawClip[l][1], rawClipTmp[l][1], 1); + } // for l + + for (unsigned char l = 0; l < 2; l++) + { + int m = l << 2; + + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[0][l], 0x00), 
_mm256_shuffle_epi32(rawCoef[1][l], 0x00)); + s1 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[2][l], 0x00), _mm256_shuffle_epi32(rawCoef[3][l], 0x00)); + params[k][0][0 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), _mm256_shuffle_epi32(s1, 0x88), 0xf0); + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][l], 0x00), _mm256_shuffle_epi32(rawClip[1][l], 0x00)); + s1 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[2][l], 0x00), _mm256_shuffle_epi32(rawClip[3][l], 0x00)); + params[k][1][0 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), _mm256_shuffle_epi32(s1, 0x88), 0xf0); + + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[0][l], 0x55), _mm256_shuffle_epi32(rawCoef[1][l], 0x55)); + s1 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[2][l], 0x55), _mm256_shuffle_epi32(rawCoef[3][l], 0x55)); + params[k][0][1 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), _mm256_shuffle_epi32(s1, 0x88), 0xf0); + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][l], 0x55), _mm256_shuffle_epi32(rawClip[1][l], 0x55)); + s1 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[2][l], 0x55), _mm256_shuffle_epi32(rawClip[3][l], 0x55)); + params[k][1][1 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), _mm256_shuffle_epi32(s1, 0x88), 0xf0); + + if (l < 1) + { + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[0][l], 0xaa), _mm256_shuffle_epi32(rawCoef[1][l], 0xaa)); + s1 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[2][l], 0xaa), _mm256_shuffle_epi32(rawCoef[3][l], 0xaa)); + params[k][0][2 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), _mm256_shuffle_epi32(s1, 0x88), 0xf0); + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][l], 0xaa), _mm256_shuffle_epi32(rawClip[1][l], 0xaa)); + s1 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[2][l], 0xaa), _mm256_shuffle_epi32(rawClip[3][l], 0xaa)); + params[k][1][2 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), 
_mm256_shuffle_epi32(s1, 0x88), 0xf0); + + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[0][l], 0xff), _mm256_shuffle_epi32(rawCoef[1][l], 0xff)); + s1 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawCoef[2][l], 0xff), _mm256_shuffle_epi32(rawCoef[3][l], 0xff)); + params[k][0][3 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), _mm256_shuffle_epi32(s1, 0x88), 0xf0); + s0 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[0][l], 0xff), _mm256_shuffle_epi32(rawClip[1][l], 0xff)); + s1 = _mm256_unpacklo_epi64(_mm256_shuffle_epi32(rawClip[2][l], 0xff), _mm256_shuffle_epi32(rawClip[3][l], 0xff)); + params[k][1][3 + m] = _mm256_blend_epi16(_mm256_shuffle_epi32(s0, 0x88), _mm256_shuffle_epi32(s1, 0x88), 0xf0); + } + } // for l + } // for k +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + mmClassIdxBsP = _mm256_set1_epi16( 0 ); + mmClassIdxBsN = _mm256_set1_epi16( 0 ); + if( useBounCondition ) + { + mmClassIdxTmp = _mm256_loadu_si256((const __m256i *) (pClassCodingInfo + j)); + mmClassIdxBsP = _mm256_srai_epi16(mmClassIdxTmp, 1); + mmClassIdxBsN = _mm256_sub_epi16( mm01Vector, mmClassIdxBsP); + } + mmClassIdxResiP = _mm256_set1_epi16( 0 ); + mmClassIdxResiN = _mm256_set1_epi16( 0 ); + if( useResiCondition ) + { + mmClassIdxTmp = _mm256_loadu_si256((const __m256i *) (pClassCodingInfo + j)); + mmClassIdxBsP = _mm256_srai_epi16(mmClassIdxTmp, 1); + mmClassIdxResiP = _mm256_sub_epi16(mmClassIdxTmp, _mm256_add_epi16(mmClassIdxBsP, mmClassIdxBsP)); + mmClassIdxResiN = _mm256_sub_epi16( mm01Vector, mmClassIdxResiP); + } +#endif + + const Pel *pImg0, *pImg1, *pImg2, *pImg3, *pImg4, *pImg5, *pImg6; + pImg0 = src + j; + pImg1 = pImg0 + srcStride; + pImg2 = pImg0 - srcStride; + pImg3 = pImg1 + srcStride; + pImg4 = pImg2 - srcStride; + pImg5 = pImg3 + srcStride; + pImg6 = pImg4 - srcStride; + + __m256i cur = _mm256_loadu_si256((const __m256i *) pImg0); + __m256i accumA = mmOffset; + __m256i accumB = mmOffset; + + auto process2coeffs = [&](const int i, const Pel *ptr0, 
const Pel *ptr1, const Pel *ptr2, const Pel *ptr3) + { + const __m256i val00 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) ptr0), cur); + const __m256i val10 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) ptr2), cur); + const __m256i val01 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) ptr1), cur); + const __m256i val11 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) ptr3), cur); + + __m256i val01A = _mm256_unpacklo_epi16(val00, val10); + __m256i val01B = _mm256_unpackhi_epi16(val00, val10); + __m256i val01C = _mm256_unpacklo_epi16(val01, val11); + __m256i val01D = _mm256_unpackhi_epi16(val01, val11); + + __m256i limit01A = params[0][1][i]; + __m256i limit01B = params[1][1][i]; + + val01A = _mm256_min_epi16(val01A, limit01A); + val01B = _mm256_min_epi16(val01B, limit01B); + val01C = _mm256_min_epi16(val01C, limit01A); + val01D = _mm256_min_epi16(val01D, limit01B); + + limit01A = _mm256_sub_epi16(_mm256_setzero_si256(), limit01A); + limit01B = _mm256_sub_epi16(_mm256_setzero_si256(), limit01B); + + val01A = _mm256_max_epi16(val01A, limit01A); + val01B = _mm256_max_epi16(val01B, limit01B); + val01C = _mm256_max_epi16(val01C, limit01A); + val01D = _mm256_max_epi16(val01D, limit01B); + + val01A = _mm256_add_epi16(val01A, val01C); + val01B = _mm256_add_epi16(val01B, val01D); + + const __m256i coeff01A = params[0][0][i]; + const __m256i coeff01B = params[1][0][i]; + + accumA = _mm256_add_epi32(accumA, _mm256_madd_epi16(val01A, coeff01A)); + accumB = _mm256_add_epi32(accumB, _mm256_madd_epi16(val01B, coeff01B)); + }; + + process2coeffs(0, pImg6 - 0, pImg5 + 0, pImg4 - 1, pImg3 + 1); + process2coeffs(1, pImg4 - 0, pImg3 + 0, pImg4 + 1, pImg3 - 1); + process2coeffs(2, pImg2 - 2, pImg1 + 2, pImg2 - 1, pImg1 + 1); + process2coeffs(3, pImg2 - 0, pImg1 + 0, pImg2 + 1, pImg1 - 1); + process2coeffs(4, pImg2 + 2, pImg1 - 2, pImg0 - 3, pImg0 + 3); + process2coeffs(5, pImg0 - 2, pImg0 + 2, pImg0 - 1, pImg0 + 1); + + accumA = 
_mm256_srai_epi32(accumA, shift); + accumB = _mm256_srai_epi32(accumB, shift); + + accumA = _mm256_packs_epi32(accumA, accumB); +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + if ( useBounCondition ) + { + accumA = _mm256_min_epi16( mmPOffsetClipVector, accumA); + accumA = _mm256_max_epi16( mmNOffsetClipVector, accumA); + // accumA is Ori Offset + mmOriOffset = accumA; + // Calc Sign + // P = 1, N = 0 + mmSignOffsetP = _mm256_abs_epi16( _mm256_cmpgt_epi16(mmOriOffset, mmZeroVector) ); + // P = 0, N = 1 + mmSignOffsetN = _mm256_abs_epi16( _mm256_sub_epi16( mm01Vector, mmSignOffsetP)); + // Calc Abs Offset + mmAbsOffset = _mm256_abs_epi16( mmOriOffset ); + // BS based Adjustment + mmAdjOffset = _mm256_mullo_epi16(mmAbsOffset, _mm256_add_epi16( mm16Vector, mmBsFactor)); + mmAdjOffset = _mm256_add_epi16(mmAdjOffset, mm08Vector); + mmAdjOffset = _mm256_srai_epi16(mmAdjOffset, 4); + + __m256i mmTmpAdj = _mm256_mullo_epi16(mmClassIdxBsP, mmAdjOffset); + __m256i mmTmpOrg = _mm256_mullo_epi16(mmClassIdxBsN, mmAbsOffset); + + __m256i mmTmpFin = _mm256_add_epi16(mmTmpAdj, mmTmpOrg); + + __m256i mmTmpSignP = _mm256_mullo_epi16(mmSignOffsetP, mmTmpFin); + __m256i mmTmpSignN = _mm256_sub_epi16( mmZeroVector, _mm256_mullo_epi16(mmSignOffsetN, mmTmpFin) ); + + accumA = _mm256_add_epi16(mmTmpSignP, mmTmpSignN); + } + + if ( useResiCondition ) + { + accumA = _mm256_min_epi16( mmPOffsetClipVector, accumA); + accumA = _mm256_max_epi16( mmNOffsetClipVector, accumA); + // accumA is Ori Offset + mmOriOffset = accumA; + // Calc Sign + // P = 1, N = 0 + mmSignOffsetP = _mm256_abs_epi16( _mm256_cmpgt_epi16(mmOriOffset, mmZeroVector)); + // P = 0, N = 1 + mmSignOffsetN = _mm256_abs_epi16( _mm256_sub_epi16( mm01Vector, mmSignOffsetP)); + // Calc Abs Offset + mmAbsOffset = _mm256_abs_epi16(mmOriOffset); + // Resi based Adjustment + mmAdjOffset = _mm256_mullo_epi16(mmAbsOffset, _mm256_add_epi16( mm16Vector, mmResiFactor)); + mmAdjOffset = _mm256_add_epi16(mmAdjOffset, mm08Vector); + mmAdjOffset = 
_mm256_srai_epi16(mmAdjOffset, 4); + + __m256i mmTmpAdj = _mm256_mullo_epi16(mmClassIdxResiP, mmAdjOffset); + __m256i mmTmpOrg = _mm256_mullo_epi16(mmClassIdxResiN, mmAbsOffset); + + __m256i mmTmpFin = _mm256_add_epi16(mmTmpAdj, mmTmpOrg); + + __m256i mmTmpSignP = _mm256_mullo_epi16(mmSignOffsetP, mmTmpFin); + __m256i mmTmpSignN = _mm256_sub_epi16(mmZeroVector, _mm256_mullo_epi16(mmSignOffsetN, mmTmpFin)); + + accumA = _mm256_add_epi16(mmTmpSignP, mmTmpSignN); + } +#endif + // Clip Offset + accumA = _mm256_min_epi16(accumA, offsetMax); + accumA = _mm256_max_epi16(accumA, offsetMin); + + accumA = _mm256_add_epi16(accumA, cur); + accumA = _mm256_min_epi16(mmMax, _mm256_max_epi16(accumA, mmMin)); + + int curY = blkDst.y + i + padSizeGauss; + int curX = blkDst.x + j + padSizeGauss; + + _mm256_storeu_si256((__m256i *) (gaussPic[storeIdx][curY] + curX), accumA); + } // for j + src += srcStride * stepY; + } // for i + } + else //use256BitSimd + { + + const __m128i offsetMax = _mm_set1_epi16(diffTH); + const __m128i offsetMin = _mm_sub_epi16(_mm_setzero_si128(), offsetMax); + const __m128i mmOffset = _mm_set1_epi32(round); + const __m128i mmMin = _mm_set1_epi16(clpRng.min); + const __m128i mmMax = _mm_set1_epi16(clpRng.max); +#endif //Use AVX2 SIMD +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + __m128i mmClassIdxBsP, mmClassIdxResiP, mmClassIdxBsN, mmClassIdxResiN, mmClassIdxTmp; + __m128i mmOriOffset; + __m128i mmSignOffsetP, mmSignOffsetN; + __m128i mmAbsOffset; + __m128i mmAdjOffset; + __m128i mmZeroVector = _mm_set1_epi16( 0 ); + __m128i mm01Vector = _mm_set1_epi16( 1 ); + __m128i mm08Vector = _mm_set1_epi16( 8 ); + __m128i mm16Vector = _mm_set1_epi16( 16 ); + __m128i mmPOffsetClipVector = _mm_set1_epi16( +offsetClipValue ); + __m128i mmNOffsetClipVector = _mm_set1_epi16( -offsetClipValue ); + //Set Factor + __m128i mmBsFactor = isIntraSlice ? _mm_set1_epi16( 4 + 2 ) : _mm_set1_epi16( 3 + 2 ); + __m128i mmResiFactor = isIntraSlice ? _mm_set1_epi16( 0 >> (!isSpsAdjust ? 
1 : 0) ) : _mm_set1_epi16( 3 >> (!isSpsAdjust ? 1 : 0) ); +#endif for (int i = 0; i < height; i += stepY) { +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + AlfClassifier *pClassCodingInfo = nullptr; + if( useBounCondition || useResiCondition ) + { + pClassCodingInfo = classifierCodingInfo[blkDst.y + i] + blkDst.x; + } +#endif for (int j = 0; j < width; j += stepX) { __m128i params[2][2][6]; @@ -4697,6 +6497,25 @@ static void simdGaussFiltering(CodingStructure &cs, Pel ***gaussPic, const CPelB } }//for l }//for k +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + mmClassIdxBsP = _mm_set1_epi16( 0 ); + mmClassIdxBsN = _mm_set1_epi16( 0 ); + if( useBounCondition ) + { + mmClassIdxTmp = _mm_loadu_si128( (const __m128i *) (pClassCodingInfo + j)); + mmClassIdxBsP = _mm_srai_epi16( mmClassIdxTmp, 1 ); + mmClassIdxBsN = _mm_sub_epi16( mm01Vector, mmClassIdxBsP ); + } + mmClassIdxResiP = _mm_set1_epi16( 0 ); + mmClassIdxResiN = _mm_set1_epi16( 0 ); + if( useResiCondition ) + { + mmClassIdxTmp = _mm_loadu_si128( (const __m128i *) (pClassCodingInfo + j)); + mmClassIdxBsP = _mm_srai_epi16( mmClassIdxTmp, 1 ); + mmClassIdxResiP = _mm_sub_epi16( mmClassIdxTmp, _mm_add_epi16( mmClassIdxBsP, mmClassIdxBsP) ); + mmClassIdxResiN = _mm_sub_epi16( mm01Vector, mmClassIdxResiP ); + } +#endif const Pel *pImg0, *pImg1, *pImg2, *pImg3, *pImg4, *pImg5, *pImg6; pImg0 = src + j; @@ -4759,6 +6578,65 @@ static void simdGaussFiltering(CodingStructure &cs, Pel ***gaussPic, const CPelB accumB = _mm_srai_epi32(accumB, shift); accumA = _mm_packs_epi32(accumA, accumB); +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + if( useBounCondition ) + { + accumA = _mm_min_epi16( mmPOffsetClipVector, accumA ); + accumA = _mm_max_epi16( mmNOffsetClipVector, accumA ); + //accumA is Ori Offset + mmOriOffset = accumA; + //Calc Sign + //P = 1, N = 0 + mmSignOffsetP = _mm_abs_epi16( _mm_cmpgt_epi16( mmOriOffset , mmZeroVector )); + //P = 0, N = 1 + mmSignOffsetN = _mm_abs_epi16( _mm_sub_epi16( mm01Vector, mmSignOffsetP )); + 
//Calc Abs Offset + mmAbsOffset = _mm_abs_epi16( mmOriOffset ); + //BS based Adjustment + mmAdjOffset = _mm_mullo_epi16( mmAbsOffset, _mm_add_epi16( mm16Vector, mmBsFactor ) ); + mmAdjOffset = _mm_add_epi16( mmAdjOffset, mm08Vector ); + mmAdjOffset = _mm_srai_epi16( mmAdjOffset, 4 ); + + __m128i mmTmpAdj = _mm_mullo_epi16( mmClassIdxBsP, mmAdjOffset ); + __m128i mmTmpOrg = _mm_mullo_epi16( mmClassIdxBsN, mmAbsOffset ); + + __m128i mmTmpFin = _mm_add_epi16( mmTmpAdj, mmTmpOrg ); + + __m128i mmTmpSignP = _mm_mullo_epi16( mmSignOffsetP, mmTmpFin ); + __m128i mmTmpSignN = _mm_sub_epi16( mmZeroVector, _mm_mullo_epi16( mmSignOffsetN, mmTmpFin )); + + accumA = _mm_add_epi16( mmTmpSignP, mmTmpSignN ); + } + + if( useResiCondition ) + { + accumA = _mm_min_epi16( mmPOffsetClipVector, accumA ); + accumA = _mm_max_epi16( mmNOffsetClipVector, accumA ); + //accumA is Ori Offset + mmOriOffset = accumA; + //Calc Sign + //P = 1, N = 0 + mmSignOffsetP = _mm_abs_epi16( _mm_cmpgt_epi16( mmOriOffset , mmZeroVector )); + //P = 0, N = 1 + mmSignOffsetN = _mm_abs_epi16( _mm_sub_epi16( mm01Vector, mmSignOffsetP)); + //Calc Abs Offset + mmAbsOffset = _mm_abs_epi16( mmOriOffset ); + //Resi based Adjustment + mmAdjOffset = _mm_mullo_epi16( mmAbsOffset, _mm_add_epi16( mm16Vector, mmResiFactor ) ); + mmAdjOffset = _mm_add_epi16( mmAdjOffset, mm08Vector ); + mmAdjOffset = _mm_srai_epi16( mmAdjOffset, 4 ); + + __m128i mmTmpAdj = _mm_mullo_epi16( mmClassIdxResiP, mmAdjOffset ); + __m128i mmTmpOrg = _mm_mullo_epi16( mmClassIdxResiN, mmAbsOffset ); + + __m128i mmTmpFin = _mm_add_epi16( mmTmpAdj, mmTmpOrg ); + + __m128i mmTmpSignP = _mm_mullo_epi16( mmSignOffsetP, mmTmpFin ); + __m128i mmTmpSignN = _mm_sub_epi16( mmZeroVector, _mm_mullo_epi16( mmSignOffsetN, mmTmpFin )); + + accumA = _mm_add_epi16( mmTmpSignP, mmTmpSignN ); + } +#endif //Clip Offset accumA = _mm_min_epi16(accumA, offsetMax); @@ -4774,6 +6652,9 @@ static void simdGaussFiltering(CodingStructure &cs, Pel ***gaussPic, const CPelB }//for 
j src += srcStride * stepY; }//for i +#if USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION + }//use256BitSimd +#endif } #endif @@ -4893,7 +6774,11 @@ static void simdFixFilter13x13Db9Blk( AlfClassifier **classifier, const CPelBuf #if JVET_AE0139_ALF_IMPROVED_FIXFILTER const CPelBuf &srcLumaBeforeDb, #endif - Pel ***fixedFilterResults, int picWidth, const int fixedFiltInd, const short classIndFixed[NUM_CLASSES_FIX], int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4] ) + Pel ***fixedFilterResults, int picWidth, const int fixedFiltInd, const short classIndFixed[NUM_CLASSES_FIX], int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4] +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , bool applyCodingInfo, CodingStructure &cs, AlfClassifier** classifierCodingInfo +#endif + ) #else static void simdFilter13x13Blk( AlfClassifier **classifier, const CPelBuf &srcLuma, const Area& curBlk, #if JVET_Z0105_LOOP_FILTER_VIRTUAL_BOUNDARY @@ -4925,6 +6810,15 @@ static void simdFilter13x13Blk( AlfClassifier **classifier, const CPelBuf &srcLu const __m128i mm11 = _mm_set1_epi8(1); const __m128i mm3 = _mm_set1_epi16(3); #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + const bool isIntraSlice = cs.slice->isIntra(); + const bool isSpsAdjust = cs.sps->getAlfLumaFixedFilterAdjust(); + const bool useCodingInfo = true; + + const bool useBounCondition = applyCodingInfo && !( !isSpsAdjust && isIntraSlice ) && useCodingInfo; + const bool useResiCondition = applyCodingInfo && (isSpsAdjust || !isSpsAdjust) && !isIntraSlice && useCodingInfo; + const int offsetClipValue = 1 << (clpRng.bd - 1); +#endif #if JVET_AE0139_ALF_IMPROVED_FIXFILTER const int srcBeforeDbStride = srcLumaBeforeDb.stride; const Pel *srcBeforeDb = srcLumaBeforeDb.buf + curBlk.y * srcBeforeDbStride + curBlk.x; @@ -4946,6 +6840,22 @@ static void simdFilter13x13Blk( AlfClassifier **classifier, const CPelBuf &srcLu mmClippingValues256 = 
_mm256_insertf128_si256(mmClippingValues256, mmClippingValues, 1); const __m256i mm11 = _mm256_set1_epi8(1); const __m256i mm3 = _mm256_set1_epi16(3); +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + __m256i mmClassIdxBsP, mmClassIdxResiP, mmClassIdxBsN, mmClassIdxResiN, mmClassIdxTmp; + __m256i mmOriOffset; + __m256i mmSignOffsetP, mmSignOffsetN; + __m256i mmAbsOffset; + __m256i mmAdjOffset; + __m256i mmZeroVector = _mm256_set1_epi16( 0 ); + __m256i mm01Vector = _mm256_set1_epi16( 1 ); + __m256i mm08Vector = _mm256_set1_epi16( 8 ); + __m256i mm16Vector = _mm256_set1_epi16( 16 ); + __m256i mmPOffsetClipVector = _mm256_set1_epi16( +offsetClipValue ); + __m256i mmNOffsetClipVector = _mm256_set1_epi16( -offsetClipValue ); + //Set Factor + __m256i mmBsFactor = isIntraSlice ? _mm256_set1_epi16( 4 ) : _mm256_set1_epi16( 3 ); + __m256i mmResiFactor = isIntraSlice ? _mm256_set1_epi16( 0 >> (!isSpsAdjust ? 1 : 0) ) : _mm256_set1_epi16( 3 >> (!isSpsAdjust ? 1 : 0) ); +#endif for (int i = 0; i < height; i += stepY) { #if JVET_Z0105_LOOP_FILTER_VIRTUAL_BOUNDARY @@ -4953,6 +6863,13 @@ static void simdFilter13x13Blk( AlfClassifier **classifier, const CPelBuf &srcLu #else const AlfClassifier *pClass = classifier[curBlk.y + i] + curBlk.x; #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + AlfClassifier *pClassCodingInfo = nullptr; + if( useBounCondition || useResiCondition ) + { + pClassCodingInfo = classifierCodingInfo[blkDst.y + i] + blkDst.x; + } +#endif for (int j = 0; j < width; j += stepX * 2) { @@ -5125,6 +7042,25 @@ static void simdFilter13x13Blk( AlfClassifier **classifier, const CPelBuf &srcLu params[29] = _mm256_unpackhi_epi64(_mm256_unpacklo_epi32(rawCoef[0][8], rawCoef[1][8]), _mm256_unpacklo_epi32(rawCoef[2][8], rawCoef[3][8])); params[30] = _mm256_unpacklo_epi64(_mm256_unpackhi_epi32(rawCoef[0][8], rawCoef[1][8]), _mm256_unpackhi_epi32(rawCoef[2][8], rawCoef[3][8])); params[31] = _mm256_unpackhi_epi64(_mm256_unpackhi_epi32(rawCoef[0][8], rawCoef[1][8]), 
_mm256_unpackhi_epi32(rawCoef[2][8], rawCoef[3][8])); +#endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + mmClassIdxBsP = _mm256_set1_epi16( 0 ); + mmClassIdxBsN = _mm256_set1_epi16( 0 ); + if( useBounCondition ) + { + mmClassIdxTmp = _mm256_loadu_si256( (const __m256i *) (pClassCodingInfo + j)); + mmClassIdxBsP = _mm256_srai_epi16( mmClassIdxTmp, 1 ); + mmClassIdxBsN = _mm256_sub_epi16( mm01Vector, mmClassIdxBsP ); + } + mmClassIdxResiP = _mm256_set1_epi16( 0 ); + mmClassIdxResiN = _mm256_set1_epi16( 0 ); + if( useResiCondition ) + { + mmClassIdxTmp = _mm256_loadu_si256( (const __m256i *) (pClassCodingInfo + j)); + mmClassIdxBsP = _mm256_srai_epi16( mmClassIdxTmp, 1 ); + mmClassIdxResiP = _mm256_sub_epi16( mmClassIdxTmp, _mm256_add_epi16( mmClassIdxBsP, mmClassIdxBsP ) ); + mmClassIdxResiN = _mm256_sub_epi16( mm01Vector, mmClassIdxResiP ); + } #endif for (int ii = 0; ii < stepY; ii++) { @@ -5269,6 +7205,65 @@ static void simdFilter13x13Blk( AlfClassifier **classifier, const CPelBuf &srcLu accumB = _mm256_srai_epi32(accumB, shift); accumA = _mm256_blend_epi16(accumA, _mm256_slli_si256(accumB, 2), 0xAA); +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + if( useBounCondition ) + { + accumA = _mm256_min_epi16( mmPOffsetClipVector, accumA ); + accumA = _mm256_max_epi16( mmNOffsetClipVector, accumA ); + //accumA is Ori Offset + mmOriOffset = accumA; + //Calc Sign + //P = 1, N = 0 + mmSignOffsetP = _mm256_abs_epi16( _mm256_cmpgt_epi16( mmOriOffset , mmZeroVector )); + //P = 0, N = 1 + mmSignOffsetN = _mm256_abs_epi16( _mm256_sub_epi16( mm01Vector, mmSignOffsetP )); + //Calc Abs Offset + mmAbsOffset = _mm256_abs_epi16( mmOriOffset ); + //BS based Adjustment + mmAdjOffset = _mm256_mullo_epi16( mmAbsOffset, _mm256_add_epi16( mm16Vector, mmBsFactor ) ); + mmAdjOffset = _mm256_add_epi16( mmAdjOffset, mm08Vector ); + mmAdjOffset = _mm256_srai_epi16( mmAdjOffset, 4 ); + + __m256i mmTmpAdj = _mm256_mullo_epi16( mmClassIdxBsP, mmAdjOffset ); + __m256i mmTmpOrg = _mm256_mullo_epi16( 
mmClassIdxBsN, mmAbsOffset ); + + __m256i mmTmpFin = _mm256_add_epi16( mmTmpAdj, mmTmpOrg ); + + __m256i mmTmpSignP = _mm256_mullo_epi16( mmSignOffsetP, mmTmpFin ); + __m256i mmTmpSignN = _mm256_sub_epi16( mmZeroVector, _mm256_mullo_epi16( mmSignOffsetN, mmTmpFin )); + + accumA = _mm256_add_epi16( mmTmpSignP, mmTmpSignN ); + } + + if( useResiCondition ) + { + accumA = _mm256_min_epi16( mmPOffsetClipVector, accumA ); + accumA = _mm256_max_epi16( mmNOffsetClipVector, accumA ); + //accumA is Ori Offset + mmOriOffset = accumA; + //Calc Sign + //P = 1, N = 0 + mmSignOffsetP = _mm256_abs_epi16( _mm256_cmpgt_epi16( mmOriOffset , mmZeroVector )); + //P = 0, N = 1 + mmSignOffsetN = _mm256_abs_epi16( _mm256_sub_epi16( mm01Vector, mmSignOffsetP )); + //Calc Abs Offset + mmAbsOffset = _mm256_abs_epi16( mmOriOffset ); + //Resi based Adjustment + mmAdjOffset = _mm256_mullo_epi16( mmAbsOffset, _mm256_add_epi16( mm16Vector, mmResiFactor ) ); + mmAdjOffset = _mm256_add_epi16( mmAdjOffset, mm08Vector ); + mmAdjOffset = _mm256_srai_epi16( mmAdjOffset, 4 ); + + __m256i mmTmpAdj = _mm256_mullo_epi16( mmClassIdxResiP, mmAdjOffset ); + __m256i mmTmpOrg = _mm256_mullo_epi16( mmClassIdxResiN, mmAbsOffset ); + + __m256i mmTmpFin = _mm256_add_epi16( mmTmpAdj, mmTmpOrg ); + + __m256i mmTmpSignP = _mm256_mullo_epi16( mmSignOffsetP, mmTmpFin ); + __m256i mmTmpSignN = _mm256_sub_epi16( mmZeroVector, _mm256_mullo_epi16( mmSignOffsetN, mmTmpFin )); + + accumA = _mm256_add_epi16( mmTmpSignP, mmTmpSignN ); + } +#endif accumA = _mm256_add_epi16(accumA, cur); accumA = _mm256_min_epi16(mmMax, _mm256_max_epi16(accumA, mmMin)); @@ -5301,6 +7296,22 @@ static void simdFilter13x13Blk( AlfClassifier **classifier, const CPelBuf &srcLu const __m128i mmClippingValues = _mm_loadl_epi64((const __m128i *)clippingValues); const __m128i mm11 = _mm_set1_epi8(1); const __m128i mm3 = _mm_set1_epi16(3); +#endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + __m128i mmClassIdxBsP, mmClassIdxResiP, mmClassIdxBsN,
mmClassIdxResiN, mmClassIdxTmp; + __m128i mmOriOffset; + __m128i mmSignOffsetP, mmSignOffsetN; + __m128i mmAbsOffset; + __m128i mmAdjOffset; + __m128i mmZeroVector = _mm_set1_epi16( 0 ); + __m128i mm01Vector = _mm_set1_epi16( 1 ); + __m128i mm08Vector = _mm_set1_epi16( 8 ); + __m128i mm16Vector = _mm_set1_epi16( 16 ); + __m128i mmPOffsetClipVector = _mm_set1_epi16( +offsetClipValue ); + __m128i mmNOffsetClipVector = _mm_set1_epi16( -offsetClipValue ); + //Set Factor + __m128i mmBsFactor = isIntraSlice ? _mm_set1_epi16( 4 ) : _mm_set1_epi16( 3 ); + __m128i mmResiFactor = isIntraSlice ? _mm_set1_epi16( 0 >> (!isSpsAdjust ? 1 : 0) ) : _mm_set1_epi16( 3 >> (!isSpsAdjust ? 1 : 0) ); #endif for (int i = 0; i < height; i += stepY) { @@ -5309,6 +7320,13 @@ static void simdFilter13x13Blk( AlfClassifier **classifier, const CPelBuf &srcLu #else const AlfClassifier *pClass = classifier[curBlk.y + i] + curBlk.x; #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + AlfClassifier *pClassCodingInfo = nullptr; + if( useBounCondition || useResiCondition ) + { + pClassCodingInfo = classifierCodingInfo[blkDst.y + i] + blkDst.x; + } +#endif for (int j = 0; j < width; j += stepX) { @@ -5414,6 +7432,25 @@ static void simdFilter13x13Blk( AlfClassifier **classifier, const CPelBuf &srcLu params[30] = _mm_unpacklo_epi64(_mm_unpackhi_epi32(rawCoef[0][8], rawCoef[1][8]), _mm_unpackhi_epi32(rawCoef[2][8], rawCoef[3][8])); params[31] = _mm_unpackhi_epi64(_mm_unpackhi_epi32(rawCoef[0][8], rawCoef[1][8]), _mm_unpackhi_epi32(rawCoef[2][8], rawCoef[3][8])); #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + mmClassIdxBsP = _mm_set1_epi16( 0 ); + mmClassIdxBsN = _mm_set1_epi16( 0 ); + if( useBounCondition ) + { + mmClassIdxTmp = _mm_loadu_si128( (const __m128i *) ( pClassCodingInfo + j ) ); + mmClassIdxBsP = _mm_srai_epi16( mmClassIdxTmp, 1 ); + mmClassIdxBsN = _mm_sub_epi16( mm01Vector, mmClassIdxBsP ); + } + mmClassIdxResiP = _mm_set1_epi16( 0 ); + mmClassIdxResiN = _mm_set1_epi16( 0 ); + if( 
useResiCondition ) + { + mmClassIdxTmp = _mm_loadu_si128( (const __m128i *) ( pClassCodingInfo + j ) ); + mmClassIdxBsP = _mm_srai_epi16( mmClassIdxTmp, 1 ); + mmClassIdxResiP = _mm_sub_epi16( mmClassIdxTmp , _mm_add_epi16( mmClassIdxBsP, mmClassIdxBsP) ); + mmClassIdxResiN = _mm_sub_epi16( mm01Vector, mmClassIdxResiP ); + } +#endif for (int ii = 0; ii < stepY; ii++) { @@ -5558,6 +7595,65 @@ static void simdFilter13x13Blk( AlfClassifier **classifier, const CPelBuf &srcLu accumB = _mm_srai_epi32(accumB, shift); accumA = _mm_blend_epi16(accumA, _mm_slli_si128(accumB, 2), 0xAA); +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + if( useBounCondition ) + { + accumA = _mm_min_epi16( mmPOffsetClipVector, accumA ); + accumA = _mm_max_epi16( mmNOffsetClipVector, accumA ); + //accumA is Ori Offset + mmOriOffset = accumA; + //Calc Sign + //P = 1, N = 0 + mmSignOffsetP = _mm_abs_epi16( _mm_cmpgt_epi16( mmOriOffset , mmZeroVector )); + //P = 0, N = 1 + mmSignOffsetN = _mm_abs_epi16( _mm_sub_epi16( mm01Vector, mmSignOffsetP )); + //Calc Abs Offset + mmAbsOffset = _mm_abs_epi16( mmOriOffset ); + //BS based Adjustment + mmAdjOffset = _mm_mullo_epi16( mmAbsOffset, _mm_add_epi16( mm16Vector, mmBsFactor ) ); + mmAdjOffset = _mm_add_epi16( mmAdjOffset, mm08Vector ); + mmAdjOffset = _mm_srai_epi16( mmAdjOffset, 4 ); + + __m128i mmTmpAdj = _mm_mullo_epi16( mmClassIdxBsP, mmAdjOffset ); + __m128i mmTmpOrg = _mm_mullo_epi16( mmClassIdxBsN, mmAbsOffset ); + + __m128i mmTmpFin = _mm_add_epi16( mmTmpAdj, mmTmpOrg ); + + __m128i mmTmpSignP = _mm_mullo_epi16( mmSignOffsetP, mmTmpFin ); + __m128i mmTmpSignN = _mm_sub_epi16( mmZeroVector, _mm_mullo_epi16( mmSignOffsetN, mmTmpFin )); + + accumA = _mm_add_epi16( mmTmpSignP, mmTmpSignN ); + } + + if( useResiCondition ) + { + accumA = _mm_min_epi16( mmPOffsetClipVector, accumA ); + accumA = _mm_max_epi16( mmNOffsetClipVector, accumA ); + //accumA is Ori Offset + mmOriOffset = accumA; + //Calc Sign + //P = 1, N = 0 + mmSignOffsetP = _mm_abs_epi16( 
_mm_cmpgt_epi16( mmOriOffset , mmZeroVector )); + //P = 0, N = 1 + mmSignOffsetN = _mm_abs_epi16( _mm_sub_epi16( mm01Vector, mmSignOffsetP )); + //Calc Abs Offset + mmAbsOffset = _mm_abs_epi16( mmOriOffset ); + //Resi based Adjustment + mmAdjOffset = _mm_mullo_epi16( mmAbsOffset, _mm_add_epi16( mm16Vector, mmResiFactor ) ); + mmAdjOffset = _mm_add_epi16( mmAdjOffset, mm08Vector ); + mmAdjOffset = _mm_srai_epi16( mmAdjOffset, 4 ); + + __m128i mmTmpAdj = _mm_mullo_epi16( mmClassIdxResiP, mmAdjOffset ); + __m128i mmTmpOrg = _mm_mullo_epi16( mmClassIdxResiN, mmAbsOffset ); + + __m128i mmTmpFin = _mm_add_epi16( mmTmpAdj, mmTmpOrg ); + + __m128i mmTmpSignP = _mm_mullo_epi16( mmSignOffsetP, mmTmpFin ); + __m128i mmTmpSignN = _mm_sub_epi16( mmZeroVector, _mm_mullo_epi16( mmSignOffsetN, mmTmpFin )); + + accumA = _mm_add_epi16( mmTmpSignP, mmTmpSignN ); + } +#endif accumA = _mm_add_epi16(accumA, cur); accumA = _mm_min_epi16(mmMax, _mm_max_epi16(accumA, mmMin)); @@ -5588,7 +7684,11 @@ static void simdFilter13x13Blk( AlfClassifier **classifier, const CPelBuf &srcLu #if JVET_AE0139_ALF_IMPROVED_FIXFILTER template<X86_VEXT vext> -static void simdFixFilter9x9Db9Blk(AlfClassifier **classifier, const CPelBuf &srcLuma, const Area& curBlk, const Area &blkDst, const CPelBuf &srcLumaBeforeDb, Pel ***fixedFilterResults, int picWidth, const int fixedFiltInd, const short classIndFixed[NUM_CLASSES_FIX], int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4]) +static void simdFixFilter9x9Db9Blk(AlfClassifier **classifier, const CPelBuf &srcLuma, const Area& curBlk, const Area &blkDst, const CPelBuf &srcLumaBeforeDb, Pel ***fixedFilterResults, int picWidth, const int fixedFiltInd, const short classIndFixed[NUM_CLASSES_FIX], int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4] +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , bool applyCodingInfo, CodingStructure &cs, AlfClassifier** classifierCodingInfo +#endif + ) { const int 
srcStride = srcLuma.stride; constexpr int shift = AdaptiveLoopFilter::m_NUM_BITS_FIXED_FILTER - 1; @@ -5613,6 +7713,15 @@ static void simdFixFilter9x9Db9Blk(AlfClassifier **classifier, const CPelBuf &sr const __m128i mm11 = _mm_set1_epi8(1); const __m128i mm3 = _mm_set1_epi16(3); #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + const bool isIntraSlice = cs.slice->isIntra(); + const bool isSpsAdjust = cs.sps->getAlfLumaFixedFilterAdjust(); + const bool useCodingInfo = true; + + const bool useBounCondition = applyCodingInfo && !( !isSpsAdjust && isIntraSlice ) && useCodingInfo; + const bool useResiCondition = applyCodingInfo && (isSpsAdjust || !isSpsAdjust) && !isIntraSlice && useCodingInfo; + const int offsetClipValue = 1 << ( clpRng.bd - 1 ); +#endif const int srcBeforeDbStride = srcLumaBeforeDb.stride; const Pel *srcBeforeDb = srcLumaBeforeDb.buf + curBlk.y * srcBeforeDbStride + curBlk.x; @@ -5630,6 +7739,22 @@ static void simdFixFilter9x9Db9Blk(AlfClassifier **classifier, const CPelBuf &sr mmClippingValues256 = _mm256_insertf128_si256(mmClippingValues256, mmClippingValues, 1); const __m256i mm11 = _mm256_set1_epi8(1); const __m256i mm3 = _mm256_set1_epi16(3); +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + __m256i mmClassIdxBsP, mmClassIdxResiP, mmClassIdxBsN, mmClassIdxResiN, mmClassIdxTmp; + __m256i mmOriOffset; + __m256i mmSignOffsetP, mmSignOffsetN; + __m256i mmAbsOffset; + __m256i mmAdjOffset; + __m256i mmZeroVector = _mm256_set1_epi16( 0 ); + __m256i mm01Vector = _mm256_set1_epi16( 1 ); + __m256i mm08Vector = _mm256_set1_epi16( 8 ); + __m256i mm16Vector = _mm256_set1_epi16( 16 ); + __m256i mmPOffsetClipVector = _mm256_set1_epi16( +offsetClipValue ); + __m256i mmNOffsetClipVector = _mm256_set1_epi16( -offsetClipValue ); + //Set Factor + __m256i mmBsFactor = isIntraSlice ? _mm256_set1_epi16( 4 ) : _mm256_set1_epi16( 3 ); + __m256i mmResiFactor = isIntraSlice ? _mm256_set1_epi16( 0 >> (!isSpsAdjust ? 1 : 0) ) : _mm256_set1_epi16( 3 >> (!isSpsAdjust ? 
1 : 0) ); +#endif for (int i = 0; i < height; i += stepY) { #if JVET_Z0105_LOOP_FILTER_VIRTUAL_BOUNDARY @@ -5637,6 +7762,13 @@ static void simdFixFilter9x9Db9Blk(AlfClassifier **classifier, const CPelBuf &sr #else const AlfClassifier *pClass = classifier[curBlk.y + i] + curBlk.x; #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + AlfClassifier *pClassCodingInfo = nullptr; + if( useBounCondition || useResiCondition ) + { + pClassCodingInfo = classifierCodingInfo[blkDst.y + i] + blkDst.x; + } +#endif for (int j = 0; j < width; j += stepX * 2) { @@ -5735,7 +7867,25 @@ static void simdFixFilter9x9Db9Blk(AlfClassifier **classifier, const CPelBuf &sr params[18] = _mm256_unpackhi_epi64(_mm256_unpacklo_epi32(rawCoef[0][5], rawCoef[1][5]), _mm256_unpacklo_epi32(rawCoef[2][5], rawCoef[3][5])); params[19] = _mm256_unpacklo_epi64(_mm256_unpackhi_epi32(rawCoef[0][5], rawCoef[1][5]), _mm256_unpackhi_epi32(rawCoef[2][5], rawCoef[3][5])); params[20] = _mm256_unpackhi_epi64(_mm256_unpackhi_epi32(rawCoef[0][5], rawCoef[1][5]), _mm256_unpackhi_epi32(rawCoef[2][5], rawCoef[3][5])); - +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + mmClassIdxBsP = _mm256_set1_epi16( 0 ); + mmClassIdxBsN = _mm256_set1_epi16( 0 ); + if( useBounCondition ) + { + mmClassIdxTmp = _mm256_loadu_si256( (const __m256i *) (pClassCodingInfo + j)); + mmClassIdxBsP = _mm256_srai_epi16( mmClassIdxTmp, 1 ); + mmClassIdxBsN = _mm256_sub_epi16( mm01Vector, mmClassIdxBsP ); + } + mmClassIdxResiP = _mm256_set1_epi16( 0 ); + mmClassIdxResiN = _mm256_set1_epi16( 0 ); + if( useResiCondition ) + { + mmClassIdxTmp = _mm256_loadu_si256( (const __m256i *) (pClassCodingInfo + j)); + mmClassIdxBsP = _mm256_srai_epi16( mmClassIdxTmp, 1 ); + mmClassIdxResiP = _mm256_sub_epi16( mmClassIdxTmp, _mm256_add_epi16( mmClassIdxBsP, mmClassIdxBsP ) ); + mmClassIdxResiN = _mm256_sub_epi16( mm01Vector, mmClassIdxResiP ); + } +#endif for (int ii = 0; ii < stepY; ii++) { const Pel *pImg0, *pImg1, *pImg2, *pImg3, *pImg4, *pImg5, *pImg6, *pImg7, 
*pImg8; @@ -5836,6 +7986,65 @@ static void simdFixFilter9x9Db9Blk(AlfClassifier **classifier, const CPelBuf &sr accumB = _mm256_srai_epi32(accumB, shift); accumA = _mm256_blend_epi16(accumA, _mm256_slli_si256(accumB, 2), 0xAA); +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + if( useBounCondition ) + { + accumA = _mm256_min_epi16( mmPOffsetClipVector, accumA ); + accumA = _mm256_max_epi16( mmNOffsetClipVector, accumA ); + //accumA is Ori Offset + mmOriOffset = accumA; + //Calc Sign + //P = 1, N = 0 + mmSignOffsetP = _mm256_abs_epi16( _mm256_cmpgt_epi16( mmOriOffset , mmZeroVector )); + //P = 0, N = 1 + mmSignOffsetN = _mm256_abs_epi16( _mm256_sub_epi16( mm01Vector, mmSignOffsetP )); + //Calc Abs Offset + mmAbsOffset = _mm256_abs_epi16( mmOriOffset ); + //BS based Adjustment + mmAdjOffset = _mm256_mullo_epi16( mmAbsOffset, _mm256_add_epi16( mm16Vector, mmBsFactor ) ); + mmAdjOffset = _mm256_add_epi16( mmAdjOffset, mm08Vector ); + mmAdjOffset = _mm256_srai_epi16( mmAdjOffset, 4 ); + + __m256i mmTmpAdj = _mm256_mullo_epi16( mmClassIdxBsP, mmAdjOffset ); + __m256i mmTmpOrg = _mm256_mullo_epi16( mmClassIdxBsN, mmAbsOffset ); + + __m256i mmTmpFin = _mm256_add_epi16( mmTmpAdj, mmTmpOrg ); + + __m256i mmTmpSignP = _mm256_mullo_epi16( mmSignOffsetP, mmTmpFin ); + __m256i mmTmpSignN = _mm256_sub_epi16( mmZeroVector, _mm256_mullo_epi16( mmSignOffsetN, mmTmpFin )); + + accumA = _mm256_add_epi16( mmTmpSignP, mmTmpSignN ); + } + + if( useResiCondition ) + { + accumA = _mm256_min_epi16( mmPOffsetClipVector, accumA ); + accumA = _mm256_max_epi16( mmNOffsetClipVector, accumA ); + //accumA is Ori Offset + mmOriOffset = accumA; + //Calc Sign + //P = 1, N = 0 + mmSignOffsetP = _mm256_abs_epi16( _mm256_cmpgt_epi16( mmOriOffset , mmZeroVector )); + //P = 0, N = 1 + mmSignOffsetN = _mm256_abs_epi16( _mm256_sub_epi16( mm01Vector, mmSignOffsetP )); + //Calc Abs Offset + mmAbsOffset = _mm256_abs_epi16( mmOriOffset ); + //Resi based Adjustment + mmAdjOffset = _mm256_mullo_epi16( mmAbsOffset, 
_mm256_add_epi16( mm16Vector, mmResiFactor ) ); + mmAdjOffset = _mm256_add_epi16( mmAdjOffset, mm08Vector ); + mmAdjOffset = _mm256_srai_epi16( mmAdjOffset, 4 ); + + __m256i mmTmpAdj = _mm256_mullo_epi16( mmClassIdxResiP, mmAdjOffset ); + __m256i mmTmpOrg = _mm256_mullo_epi16( mmClassIdxResiN, mmAbsOffset ); + + __m256i mmTmpFin = _mm256_add_epi16( mmTmpAdj, mmTmpOrg ); + + __m256i mmTmpSignP = _mm256_mullo_epi16( mmSignOffsetP, mmTmpFin ); + __m256i mmTmpSignN = _mm256_sub_epi16( mmZeroVector, _mm256_mullo_epi16( mmSignOffsetN, mmTmpFin )); + + accumA = _mm256_add_epi16( mmTmpSignP, mmTmpSignN ); + } +#endif accumA = _mm256_add_epi16(accumA, cur); accumA = _mm256_min_epi16(mmMax, _mm256_max_epi16(accumA, mmMin)); @@ -5866,6 +8075,22 @@ static void simdFixFilter9x9Db9Blk(AlfClassifier **classifier, const CPelBuf &sr const __m128i mmClippingValues = _mm_loadl_epi64((const __m128i *)clippingValues); const __m128i mm11 = _mm_set1_epi8(1); const __m128i mm3 = _mm_set1_epi16(3); +#endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + __m128i mmClassIdxBsP, mmClassIdxResiP, mmClassIdxBsN, mmClassIdxResiN, mmClassIdxTmp; + __m128i mmOriOffset; + __m128i mmSignOffsetP, mmSignOffsetN; + __m128i mmAbsOffset; + __m128i mmAdjOffset; + __m128i mmZeroVector = _mm_set1_epi16( 0 ); + __m128i mm01Vector = _mm_set1_epi16( 1 ); + __m128i mm08Vector = _mm_set1_epi16( 8 ); + __m128i mm16Vector = _mm_set1_epi16( 16 ); + __m128i mmPOffsetClipVector = _mm_set1_epi16( +offsetClipValue ); + __m128i mmNOffsetClipVector = _mm_set1_epi16( -offsetClipValue ); + //Set Factor + __m128i mmBsFactor = isIntraSlice ? _mm_set1_epi16( 4 ) : _mm_set1_epi16( 3 ); + __m128i mmResiFactor = isIntraSlice ? _mm_set1_epi16( 0 >> (!isSpsAdjust ? 1 : 0) ) : _mm_set1_epi16( 3 >> (!isSpsAdjust ? 
1 : 0) ); #endif for (int i = 0; i < height; i += stepY) { @@ -5874,6 +8099,13 @@ static void simdFixFilter9x9Db9Blk(AlfClassifier **classifier, const CPelBuf &sr #else const AlfClassifier *pClass = classifier[curBlk.y + i] + curBlk.x; #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + AlfClassifier *pClassCodingInfo = nullptr; + if( useBounCondition || useResiCondition ) + { + pClassCodingInfo = classifierCodingInfo[blkDst.y + i] + blkDst.x; + } +#endif for (int j = 0; j < width; j += stepX) { @@ -5936,6 +8168,25 @@ static void simdFixFilter9x9Db9Blk(AlfClassifier **classifier, const CPelBuf &sr params[19] = _mm_unpacklo_epi64(_mm_unpackhi_epi32(rawCoef[0][5], rawCoef[1][5]), _mm_unpackhi_epi32(rawCoef[2][5], rawCoef[3][5])); params[20] = _mm_unpackhi_epi64(_mm_unpackhi_epi32(rawCoef[0][5], rawCoef[1][5]), _mm_unpackhi_epi32(rawCoef[2][5], rawCoef[3][5])); +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + mmClassIdxBsP = _mm_set1_epi16( 0 ); + mmClassIdxBsN = _mm_set1_epi16( 0 ); + if( useBounCondition ) + { + mmClassIdxTmp = _mm_loadu_si128( (const __m128i *) (pClassCodingInfo + j)); + mmClassIdxBsP = _mm_srai_epi16( mmClassIdxTmp, 1 ); + mmClassIdxBsN = _mm_sub_epi16( mm01Vector, mmClassIdxBsP ); + } + mmClassIdxResiP = _mm_set1_epi16( 0 ); + mmClassIdxResiN = _mm_set1_epi16( 0 ); + if( useResiCondition ) + { + mmClassIdxTmp = _mm_loadu_si128( (const __m128i *) (pClassCodingInfo + j)); + mmClassIdxBsP = _mm_srai_epi16( mmClassIdxTmp, 1 ); + mmClassIdxResiP = _mm_sub_epi16( mmClassIdxTmp, _mm_add_epi16( mmClassIdxBsP, mmClassIdxBsP) ); + mmClassIdxResiN = _mm_sub_epi16( mm01Vector, mmClassIdxResiP ); + } +#endif for (int ii = 0; ii < stepY; ii++) { const Pel *pImg0, *pImg1, *pImg2, *pImg3, *pImg4, *pImg5, *pImg6, *pImg7, *pImg8; @@ -6036,6 +8287,65 @@ static void simdFixFilter9x9Db9Blk(AlfClassifier **classifier, const CPelBuf &sr accumB = _mm_srai_epi32(accumB, shift); accumA = _mm_blend_epi16(accumA, _mm_slli_si128(accumB, 2), 0xAA); +#if 
JVET_AJ0188_CODING_INFO_CLASSIFICATION + if( useBounCondition ) + { + accumA = _mm_min_epi16( mmPOffsetClipVector, accumA ); + accumA = _mm_max_epi16( mmNOffsetClipVector, accumA ); + //accumA is Ori Offset + mmOriOffset = accumA; + //Calc Sign + //P = 1, N = 0 + mmSignOffsetP = _mm_abs_epi16( _mm_cmpgt_epi16( mmOriOffset , mmZeroVector )); + //P = 0, N = 1 + mmSignOffsetN = _mm_abs_epi16( _mm_sub_epi16( mm01Vector, mmSignOffsetP )); + //Calc Abs Offset + mmAbsOffset = _mm_abs_epi16( mmOriOffset ); + //BS based Adjustment + mmAdjOffset = _mm_mullo_epi16( mmAbsOffset, _mm_add_epi16( mm16Vector, mmBsFactor ) ); + mmAdjOffset = _mm_add_epi16( mmAdjOffset, mm08Vector ); + mmAdjOffset = _mm_srai_epi16( mmAdjOffset, 4 ); + + __m128i mmTmpAdj = _mm_mullo_epi16( mmClassIdxBsP, mmAdjOffset ); + __m128i mmTmpOrg = _mm_mullo_epi16( mmClassIdxBsN, mmAbsOffset ); + + __m128i mmTmpFin = _mm_add_epi16( mmTmpAdj, mmTmpOrg ); + + __m128i mmTmpSignP = _mm_mullo_epi16( mmSignOffsetP, mmTmpFin ); + __m128i mmTmpSignN = _mm_sub_epi16( mmZeroVector, _mm_mullo_epi16( mmSignOffsetN, mmTmpFin )); + + accumA = _mm_add_epi16( mmTmpSignP, mmTmpSignN ); + } + + if( useResiCondition ) + { + accumA = _mm_min_epi16( mmPOffsetClipVector, accumA ); + accumA = _mm_max_epi16( mmNOffsetClipVector, accumA ); + //accumA is Ori Offset + mmOriOffset = accumA; + //Calc Sign + //P = 1, N = 0 + mmSignOffsetP = _mm_abs_epi16( _mm_cmpgt_epi16( mmOriOffset , mmZeroVector )); + //P = 0, N = 1 + mmSignOffsetN = _mm_abs_epi16( _mm_sub_epi16( mm01Vector, mmSignOffsetP )); + //Calc Abs Offset + mmAbsOffset = _mm_abs_epi16( mmOriOffset ); + //Resi based Adjustment + mmAdjOffset = _mm_mullo_epi16( mmAbsOffset, _mm_add_epi16( mm16Vector, mmResiFactor ) ); + mmAdjOffset = _mm_add_epi16( mmAdjOffset, mm08Vector ); + mmAdjOffset = _mm_srai_epi16( mmAdjOffset, 4 ); + + __m128i mmTmpAdj = _mm_mullo_epi16( mmClassIdxResiP, mmAdjOffset ); + __m128i mmTmpOrg = _mm_mullo_epi16( mmClassIdxResiN, mmAbsOffset ); + + __m128i 
mmTmpFin = _mm_add_epi16( mmTmpAdj, mmTmpOrg ); + + __m128i mmTmpSignP = _mm_mullo_epi16( mmSignOffsetP, mmTmpFin ); + __m128i mmTmpSignN = _mm_sub_epi16( mmZeroVector, _mm_mullo_epi16( mmSignOffsetN, mmTmpFin )); + + accumA = _mm_add_epi16( mmTmpSignP, mmTmpSignN ); + } +#endif accumA = _mm_add_epi16(accumA, cur); accumA = _mm_min_epi16(mmMax, _mm_max_epi16(accumA, mmMin)); @@ -6370,126 +8680,456 @@ static void simdDeriveVariance(const CPelBuf &srcLuma, const Area &blkDst, const __m128i xx4 = _mm_alignr_epi8(xx8, xx0, 8); __m128i xx6 = _mm_alignr_epi8(xx8, xx0, 12); - x0 = _mm_add_epi32(x0, y0); - s0 = _mm_add_epi32(s0, s2); + x0 = _mm_add_epi32(x0, y0); + s0 = _mm_add_epi32(s0, s2); + + __m128i x2 = _mm_alignr_epi8(s0, x0, 4); + __m128i x4 = _mm_alignr_epi8(s0, x0, 8); + __m128i x6 = _mm_alignr_epi8(s0, x0, 12); + + yy0 = _mm_add_epi32(xx0, xx2); + xx0 = _mm_add_epi32(xx4, xx6); + yy0 = _mm_add_epi32(yy0, xx8); + + y0 = _mm_add_epi32(x0, x2); + x4 = _mm_add_epi32(x4, x6); + y0 = _mm_add_epi32(y0, s0); + + __m128i sum2 = _mm_add_epi32(yy0, xx0); + __m128i sum = _mm_add_epi32(y0, x4); + + x0 = x8; + y0 = y8; + + _mm_storeu_si128((__m128i *) &variance[2][iOffset][jOffset], sum); + _mm_storeu_si128((__m128i *) &variance[3][iOffset][jOffset], sum2); + + if (i == 8) + { + x8 = _mm_loadu_si128((__m128i *)&variance[2][iOffset - 4][jOffset]); + y8 = _mm_loadu_si128((__m128i *)&variance[3][iOffset - 4][jOffset]); + x6 = _mm_loadu_si128((__m128i *)&variance[2][iOffset - 3][jOffset]); + __m128i y6 = _mm_loadu_si128((__m128i *)&variance[3][iOffset - 3][jOffset]); + x4 = _mm_loadu_si128((__m128i *)&variance[2][iOffset - 2][jOffset]); + __m128i y4 = _mm_loadu_si128((__m128i *)&variance[3][iOffset - 2][jOffset]); + x2 = _mm_loadu_si128((__m128i *)&variance[2][iOffset - 1][jOffset]); + __m128i y2 = _mm_loadu_si128((__m128i *)&variance[3][iOffset - 1][jOffset]); + + x8 = _mm_add_epi32(sum, x8); + y8 = _mm_add_epi32(sum2, y8); + + x4 = _mm_add_epi32(x6, x4); + y4 = 
_mm_add_epi32(y6, y4); + + x2 = _mm_add_epi32(x8, x2); + y2 = _mm_add_epi32(y8, y2); + + sum = _mm_add_epi32(x4, x2); + sum2 = _mm_add_epi32(y4, y2); + _mm_storeu_si128((__m128i *) &variance[0][iOffset - 4][jOffset], sum); + _mm_storeu_si128((__m128i *) &variance[1][iOffset - 4][jOffset], sum2); + + sum2 = _mm_mullo_epi32(sum2, n); + sum = _mm_mullo_epi32(sum, sum); + sum2 = _mm_add_epi32(sum2, o); + sum2 = _mm_sub_epi32(sum2, sum); + sum2 = _mm_srli_epi32(sum2, 3); + sum2 = _mm_mullo_epi32(sum2, m13); + sum2 = _mm_srli_epi32(sum2, 14); + _mm_storeu_si128((__m128i *) &variance[VARIANCE][iOffset - 4][jOffset], sum2); + } + else if (i > 8) + { + x8 = _mm_loadu_si128((__m128i *)&variance[2][iOffset - 5][jOffset]); + y8 = _mm_loadu_si128((__m128i *)&variance[3][iOffset - 5][jOffset]); + x6 = _mm_loadu_si128((__m128i *)&variance[0][iOffset - 5][jOffset]); + __m128i y6 = _mm_loadu_si128((__m128i *)&variance[1][iOffset - 5][jOffset]); + + x6 = _mm_sub_epi32(x6, x8); + y6 = _mm_sub_epi32(y6, y8); + + sum = _mm_add_epi32(x6, sum); + sum2 = _mm_add_epi32(y6, sum2); + _mm_storeu_si128((__m128i *) &variance[0][iOffset - 4][jOffset], sum); + _mm_storeu_si128((__m128i *) &variance[1][iOffset - 4][jOffset], sum2); + + sum2 = _mm_mullo_epi32(sum2, n); + sum = _mm_mullo_epi32(sum, sum); + sum2 = _mm_add_epi32(sum2, o); + sum2 = _mm_sub_epi32(sum2, sum); + sum2 = _mm_srli_epi32(sum2, 3); + sum2 = _mm_mullo_epi32(sum2, m13); + sum2 = _mm_srli_epi32(sum2, 14); + _mm_storeu_si128((__m128i *) &variance[VARIANCE][iOffset - 4][jOffset], sum2); + } + } + + } +#if USE_AVX2 + } +#endif +} +#endif + +#if JVET_AC0162_ALF_RESIDUAL_SAMPLES_INPUT +#if JVET_AE0139_ALF_IMPROVED_FIXFILTER +template<X86_VEXT vext> +static void simdFilterResi9x9Db9Blk(AlfClassifier **classifier, const CPelBuf &srcResiLuma, const Area &curBlk, const Area &blkDst, Pel ***fixedFilterResiResults, int picWidth, const int fixedFiltInd, const short classIndFixed[NUM_CLASSES_FIX], int fixedFiltQpInd, int dirWindSize, const 
ClpRng &clpRng, const Pel clippingValues[4] +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + , bool applyCodingInfo, CodingStructure &cs, AlfClassifier** classifierCodingInfo +#endif + ) +{ + const int srcStride = srcResiLuma.stride; + constexpr int shift = AdaptiveLoopFilter::m_NUM_BITS_FIXED_FILTER - 1; + constexpr int round = 1 << (shift - 1); + + const int width = curBlk.width; + const int height = curBlk.height; + + constexpr int stepX = 8; + constexpr int stepY = 2; + + const Pel *src = srcResiLuma.buf + curBlk.y * srcStride + curBlk.x; +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + const bool isIntraSlice = cs.slice->isIntra(); + const bool isSpsAdjust = cs.sps->getAlfLumaFixedFilterAdjust(); + const bool useCodingInfo = isSpsAdjust ? true : false; + const bool useBounCondition = applyCodingInfo && !( !isSpsAdjust && isIntraSlice ) && useCodingInfo; + const bool useResiCondition = applyCodingInfo && (isSpsAdjust || !isSpsAdjust) && !isIntraSlice && useCodingInfo; + const int offsetClipValue = 1 << ( clpRng.bd - 1 ); +#endif +#if !( USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION ) + const __m128i mmOffset = _mm_set1_epi32(round); +#endif + + const int clpRngmin = -clpRng.max; + const int clpRngmax = clpRng.max; +#if !( USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION ) + const __m128i mmMin = _mm_set1_epi16(clpRngmin); + const __m128i mmMax = _mm_set1_epi16(clpRngmax); + + const __m128i mmClippingValues = _mm_loadl_epi64((const __m128i *) clippingValues); + const __m128i mm11 = _mm_set1_epi8(1); + const __m128i mm3 = _mm_set1_epi16(3); +#endif + const std::array<std::array<short, FIX_FILTER_NUM_COEFF_DB_COMBINE_9_DB_9 + 1>, NUM_FIXED_FILTERS>& filterCoeffFixed = packedDataFixedFilters9Db9Combine[fixedFiltQpInd]; + const Pel zeros[8] = { 0 }; + +#if USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION + const bool use256BitSimd = vext >= AVX2 && blkDst.width % 16 == 0; + + if( use256BitSimd ) + { +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + __m256i 
mmClassIdxBsP, mmClassIdxResiP, mmClassIdxBsN, mmClassIdxResiN, mmClassIdxTmp; + __m256i mmOriOffset; + __m256i mmSignOffsetP, mmSignOffsetN; + __m256i mmAbsOffset; + __m256i mmAdjOffset; + __m256i mmZeroVector = _mm256_set1_epi16(0); + __m256i mm01Vector = _mm256_set1_epi16(1); + __m256i mm08Vector = _mm256_set1_epi16(8); + __m256i mm16Vector = _mm256_set1_epi16(16); + __m256i mmPOffsetClipVector = _mm256_set1_epi16(+offsetClipValue); + __m256i mmNOffsetClipVector = _mm256_set1_epi16(-offsetClipValue); + // Set Factor + __m256i mmBsFactor = isIntraSlice ? _mm256_set1_epi16( 4 ) : _mm256_set1_epi16( 3 ); + __m256i mmResiFactor = isIntraSlice ? _mm256_set1_epi16( 0 >> (!isSpsAdjust ? 1 : 0)) : _mm256_set1_epi16( 3 >> (!isSpsAdjust ? 1 : 0) ); +#endif + + const __m256i mmOffset = _mm256_set1_epi32(round); + + const int clpRngmin = -clpRng.max; + const int clpRngmax = clpRng.max; + const __m256i mmMin = _mm256_set1_epi16(clpRngmin); + const __m256i mmMax = _mm256_set1_epi16(clpRngmax); + + const __m128i mmClippingValues = _mm_loadl_epi64((const __m128i *) clippingValues); + __m256i mmClippingValues256 = _mm256_castsi128_si256(mmClippingValues); + mmClippingValues256 = _mm256_insertf128_si256(mmClippingValues256, mmClippingValues, 1); + const __m256i mm11 = _mm256_set1_epi8(1); + const __m256i mm3 = _mm256_set1_epi16(3); + const std::array<std::array<short, FIX_FILTER_NUM_COEFF_DB_COMBINE_9_DB_9 + 1>, NUM_FIXED_FILTERS> + &filterCoeffFixed = packedDataFixedFilters9Db9Combine[fixedFiltQpInd]; + const Pel zeros[16] = { 0 }; + + for (int i = 0; i < height; i += stepY) + { +#if JVET_Z0105_LOOP_FILTER_VIRTUAL_BOUNDARY + const AlfClassifier *pClass = classifier[blkDst.y + i] + blkDst.x; +#else + const AlfClassifier *pClass = classifier[curBlk.y + i] + curBlk.x; +#endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + AlfClassifier *pClassCodingInfo = nullptr; + if (useBounCondition || useResiCondition) + { + pClassCodingInfo = classifierCodingInfo[blkDst.y + i] + blkDst.x; + } 
+#endif + + for (int j = 0; j < width; j += stepX * 2) + { + __m256i params[11]; + __m256i rawCoef[4][3]; + for (int m = 0; m < 4; m++) + { + int transposeIdx0 = pClass[j + 2 * m] & 0x3; + const int filterIdx0 = classIndFixed[pClass[j + 2 * m] >> 2]; + + __m128i rawCoef00 = _mm_loadu_si128((const __m128i *) (filterCoeffFixed[filterIdx0].data())); + __m128i rawCoef01 = _mm_loadu_si128((const __m128i *) (filterCoeffFixed[filterIdx0].data() + 8)); + __m128i rawCoef02 = _mm_loadu_si128((const __m128i *) (filterCoeffFixed[filterIdx0].data() + 14)); + // transpose0 + if (transposeIdx0 != 0) + { + const __m128i s00 = _mm_loadu_si128((const __m128i *) shTab9Db9[transposeIdx0][0]); + const __m128i s01 = _mm_loadu_si128((const __m128i *) shTab9Db9[transposeIdx0][1]); + const __m128i s02 = _mm_loadu_si128((const __m128i *) shTab9Db9[transposeIdx0][2]); + + rawCoef00 = _mm_shuffle_epi8(rawCoef00, s00); + rawCoef01 = _mm_shuffle_epi8(rawCoef01, s01); + rawCoef02 = _mm_shuffle_epi8(rawCoef02, s02); + } + + int transposeIdx1 = pClass[j + 2 * m + 8] & 0x3; + const int filterIdx1 = classIndFixed[pClass[j + 2 * m + 8] >> 2]; + + __m128i rawCoef10 = _mm_loadu_si128((const __m128i *) (filterCoeffFixed[filterIdx1].data())); + __m128i rawCoef11 = _mm_loadu_si128((const __m128i *) (filterCoeffFixed[filterIdx1].data() + 8)); + __m128i rawCoef12 = _mm_loadu_si128((const __m128i *) (filterCoeffFixed[filterIdx1].data() + 14)); + // transpose1 + if (transposeIdx1 != 0) + { + const __m128i s10 = _mm_loadu_si128((const __m128i *) shTab9Db9[transposeIdx1][0]); + const __m128i s11 = _mm_loadu_si128((const __m128i *) shTab9Db9[transposeIdx1][1]); + const __m128i s12 = _mm_loadu_si128((const __m128i *) shTab9Db9[transposeIdx1][2]); + + rawCoef10 = _mm_shuffle_epi8(rawCoef10, s10); + rawCoef11 = _mm_shuffle_epi8(rawCoef11, s11); + rawCoef12 = _mm_shuffle_epi8(rawCoef12, s12); + } + + rawCoef[m][0] = _mm256_castsi128_si256(rawCoef00); + rawCoef[m][0] = _mm256_insertf128_si256(rawCoef[m][0], 
rawCoef10, 1); + rawCoef[m][1] = _mm256_castsi128_si256(rawCoef01); + rawCoef[m][1] = _mm256_insertf128_si256(rawCoef[m][1], rawCoef11, 1); + rawCoef[m][2] = _mm256_castsi128_si256(rawCoef02); + rawCoef[m][2] = _mm256_insertf128_si256(rawCoef[m][2], rawCoef12, 1); + } // for(m) + + params[0] = _mm256_unpacklo_epi64(_mm256_unpacklo_epi32(rawCoef[0][0], rawCoef[1][0]), _mm256_unpacklo_epi32(rawCoef[2][0], rawCoef[3][0])); + params[1] = _mm256_unpackhi_epi64(_mm256_unpacklo_epi32(rawCoef[0][0], rawCoef[1][0]), _mm256_unpacklo_epi32(rawCoef[2][0], rawCoef[3][0])); + params[2] = _mm256_unpacklo_epi64(_mm256_unpackhi_epi32(rawCoef[0][0], rawCoef[1][0]), _mm256_unpackhi_epi32(rawCoef[2][0], rawCoef[3][0])); + params[3] = _mm256_unpackhi_epi64(_mm256_unpackhi_epi32(rawCoef[0][0], rawCoef[1][0]), _mm256_unpackhi_epi32(rawCoef[2][0], rawCoef[3][0])); + + params[4] = _mm256_unpacklo_epi64(_mm256_unpacklo_epi32(rawCoef[0][1], rawCoef[1][1]), _mm256_unpacklo_epi32(rawCoef[2][1], rawCoef[3][1])); + params[5] = _mm256_unpackhi_epi64(_mm256_unpacklo_epi32(rawCoef[0][1], rawCoef[1][1]), _mm256_unpacklo_epi32(rawCoef[2][1], rawCoef[3][1])); + params[6] = _mm256_unpacklo_epi64(_mm256_unpackhi_epi32(rawCoef[0][1], rawCoef[1][1]), _mm256_unpackhi_epi32(rawCoef[2][1], rawCoef[3][1])); + + params[7] = _mm256_unpacklo_epi64(_mm256_unpacklo_epi32(rawCoef[0][2], rawCoef[1][2]), _mm256_unpacklo_epi32(rawCoef[2][2], rawCoef[3][2])); + params[8] = _mm256_unpackhi_epi64(_mm256_unpacklo_epi32(rawCoef[0][2], rawCoef[1][2]), _mm256_unpacklo_epi32(rawCoef[2][2], rawCoef[3][2])); + params[9] = _mm256_unpacklo_epi64(_mm256_unpackhi_epi32(rawCoef[0][2], rawCoef[1][2]), _mm256_unpackhi_epi32(rawCoef[2][2], rawCoef[3][2])); + params[10] = _mm256_unpackhi_epi64(_mm256_unpackhi_epi32(rawCoef[0][2], rawCoef[1][2]), _mm256_unpackhi_epi32(rawCoef[2][2], rawCoef[3][2])); +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + mmClassIdxBsP = _mm256_set1_epi16( 0 ); + mmClassIdxBsN = _mm256_set1_epi16( 0 ); + if 
(useBounCondition) + { + mmClassIdxTmp = _mm256_loadu_si256((const __m256i *) (pClassCodingInfo + j)); + mmClassIdxBsP = _mm256_srai_epi16(mmClassIdxTmp, 1); + mmClassIdxBsN = _mm256_sub_epi16(mm01Vector, mmClassIdxBsP); + } + mmClassIdxResiP = _mm256_set1_epi16( 0 ); + mmClassIdxResiN = _mm256_set1_epi16( 0 ); + if (useResiCondition) + { + mmClassIdxTmp = _mm256_loadu_si256((const __m256i *) (pClassCodingInfo + j)); + mmClassIdxBsP = _mm256_srai_epi16(mmClassIdxTmp, 1); + mmClassIdxResiP = _mm256_sub_epi16(mmClassIdxTmp, _mm256_add_epi16(mmClassIdxBsP, mmClassIdxBsP)); + mmClassIdxResiN = _mm256_sub_epi16(mm01Vector, mmClassIdxResiP); + } +#endif + for (int ii = 0; ii < stepY; ii++) + { + const Pel *pImg0, *pImg1, *pImg2, *pImg3, *pImg4, *pImg5, *pImg6, *pImg7, *pImg8; + pImg0 = src + j + ii * srcStride; + pImg1 = pImg0 + srcStride; + pImg2 = pImg0 - srcStride; + pImg3 = pImg1 + srcStride; + pImg4 = pImg2 - srcStride; + pImg5 = pImg3 + srcStride; + pImg6 = pImg4 - srcStride; + pImg7 = pImg5 + srcStride; + pImg8 = pImg6 - srcStride; + + __m256i cur = _mm256_loadu_si256((const __m256i *) pImg0); + __m256i accumA = mmOffset; + __m256i accumB = mmOffset; - __m128i x2 = _mm_alignr_epi8(s0, x0, 4); - __m128i x4 = _mm_alignr_epi8(s0, x0, 8); - __m128i x6 = _mm_alignr_epi8(s0, x0, 12); + auto process2coeffs = [&](const int i, const Pel *ptr0, const Pel *ptr1, const Pel *ptr2, const Pel *ptr3) + { + const __m256i val00 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) ptr0), cur); + const __m256i val10 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) ptr2), cur); + const __m256i val01 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) ptr1), cur); + const __m256i val11 = _mm256_sub_epi16(_mm256_loadu_si256((const __m256i *) ptr3), cur); - yy0 = _mm_add_epi32(xx0, xx2); - xx0 = _mm_add_epi32(xx4, xx6); - yy0 = _mm_add_epi32(yy0, xx8); + __m256i val01A = _mm256_blend_epi16(val00, _mm256_slli_si256(val10, 2), 0xAA); + __m256i val01B = 
_mm256_blend_epi16(_mm256_srli_si256(val00, 2), val10, 0xAA); + __m256i val01C = _mm256_blend_epi16(val01, _mm256_slli_si256(val11, 2), 0xAA); + __m256i val01D = _mm256_blend_epi16(_mm256_srli_si256(val01, 2), val11, 0xAA); - y0 = _mm_add_epi32(x0, x2); - x4 = _mm_add_epi32(x4, x6); - y0 = _mm_add_epi32(y0, s0); + __m256i mmClippingFixed = _mm256_and_si256(params[i], mm3); - __m128i sum2 = _mm_add_epi32(yy0, xx0); - __m128i sum = _mm_add_epi32(y0, x4); + __m256i mmClippingFixed2 = _mm256_packs_epi16(mmClippingFixed, mmClippingFixed); + mmClippingFixed2 = _mm256_add_epi8(mmClippingFixed2, mmClippingFixed2); + __m256i xmm2 = _mm256_add_epi8(mmClippingFixed2, mm11); + __m256i xmmA = _mm256_unpacklo_epi8(mmClippingFixed2, xmm2); + __m256i limit = _mm256_shuffle_epi8(mmClippingValues256, xmmA); - x0 = x8; - y0 = y8; + val01A = _mm256_min_epi16(val01A, limit); + val01B = _mm256_min_epi16(val01B, limit); + val01C = _mm256_min_epi16(val01C, limit); + val01D = _mm256_min_epi16(val01D, limit); - _mm_storeu_si128((__m128i *) &variance[2][iOffset][jOffset], sum); - _mm_storeu_si128((__m128i *) &variance[3][iOffset][jOffset], sum2); + limit = _mm256_sub_epi16(_mm256_setzero_si256(), limit); - if (i == 8) - { - x8 = _mm_loadu_si128((__m128i *)&variance[2][iOffset - 4][jOffset]); - y8 = _mm_loadu_si128((__m128i *)&variance[3][iOffset - 4][jOffset]); - x6 = _mm_loadu_si128((__m128i *)&variance[2][iOffset - 3][jOffset]); - __m128i y6 = _mm_loadu_si128((__m128i *)&variance[3][iOffset - 3][jOffset]); - x4 = _mm_loadu_si128((__m128i *)&variance[2][iOffset - 2][jOffset]); - __m128i y4 = _mm_loadu_si128((__m128i *)&variance[3][iOffset - 2][jOffset]); - x2 = _mm_loadu_si128((__m128i *)&variance[2][iOffset - 1][jOffset]); - __m128i y2 = _mm_loadu_si128((__m128i *)&variance[3][iOffset - 1][jOffset]); + val01A = _mm256_max_epi16(val01A, limit); + val01B = _mm256_max_epi16(val01B, limit); + val01C = _mm256_max_epi16(val01C, limit); + val01D = _mm256_max_epi16(val01D, limit); - x8 = 
_mm_add_epi32(sum, x8); - y8 = _mm_add_epi32(sum2, y8); + val01A = _mm256_add_epi16(val01A, val01C); + val01B = _mm256_add_epi16(val01B, val01D); - x4 = _mm_add_epi32(x6, x4); - y4 = _mm_add_epi32(y6, y4); + const __m256i coeff = _mm256_srai_epi16(params[i], 2); - x2 = _mm_add_epi32(x8, x2); - y2 = _mm_add_epi32(y8, y2); + accumA = _mm256_add_epi32(accumA, _mm256_madd_epi16(val01A, coeff)); + accumB = _mm256_add_epi32(accumB, _mm256_madd_epi16(val01B, coeff)); + }; - sum = _mm_add_epi32(x4, x2); - sum2 = _mm_add_epi32(y4, y2); - _mm_storeu_si128((__m128i *) &variance[0][iOffset - 4][jOffset], sum); - _mm_storeu_si128((__m128i *) &variance[1][iOffset - 4][jOffset], sum2); + process2coeffs(0, pImg8 + 0, pImg7 + 0, pImg6 - 1, pImg5 + 1); + process2coeffs(1, pImg4 - 2, pImg3 + 2, pImg2 - 3, pImg1 + 3); + process2coeffs(2, pImg0 - 4, pImg0 + 4, pImg6 + 1, pImg5 - 1); + process2coeffs(3, pImg4 + 2, pImg3 - 2, pImg2 + 3, pImg1 - 3); - sum2 = _mm_mullo_epi32(sum2, n); - sum = _mm_mullo_epi32(sum, sum); - sum2 = _mm_add_epi32(sum2, o); - sum2 = _mm_sub_epi32(sum2, sum); - sum2 = _mm_srli_epi32(sum2, 3); - sum2 = _mm_mullo_epi32(sum2, m13); - sum2 = _mm_srli_epi32(sum2, 14); - _mm_storeu_si128((__m128i *) &variance[VARIANCE][iOffset - 4][jOffset], sum2); - } - else if (i > 8) - { - x8 = _mm_loadu_si128((__m128i *)&variance[2][iOffset - 5][jOffset]); - y8 = _mm_loadu_si128((__m128i *)&variance[3][iOffset - 5][jOffset]); - x6 = _mm_loadu_si128((__m128i *)&variance[0][iOffset - 5][jOffset]); - __m128i y6 = _mm_loadu_si128((__m128i *)&variance[1][iOffset - 5][jOffset]); + process2coeffs(4, pImg6 + 0, pImg5 - 0, pImg4 - 1, pImg3 + 1); + process2coeffs(5, pImg2 - 2, pImg1 + 2, pImg0 - 3, pImg0 + 3); + process2coeffs(6, pImg4 + 1, pImg3 - 1, pImg2 + 2, pImg1 - 2); - x6 = _mm_sub_epi32(x6, x8); - y6 = _mm_sub_epi32(y6, y8); + process2coeffs(7, pImg4 + 0, pImg3 - 0, pImg2 - 1, pImg1 + 1); + process2coeffs(8, pImg0 - 2, pImg0 + 2, pImg2 + 1, pImg1 - 1); + process2coeffs(9, pImg2 + 0, 
pImg1 - 0, pImg0 - 1, pImg0 + 1); + process2coeffs(10, zeros, pImg0, pImg0, pImg0); - sum = _mm_add_epi32(x6, sum); - sum2 = _mm_add_epi32(y6, sum2); - _mm_storeu_si128((__m128i *) &variance[0][iOffset - 4][jOffset], sum); - _mm_storeu_si128((__m128i *) &variance[1][iOffset - 4][jOffset], sum2); + accumA = _mm256_srai_epi32(accumA, shift); + accumB = _mm256_srai_epi32(accumB, shift); - sum2 = _mm_mullo_epi32(sum2, n); - sum = _mm_mullo_epi32(sum, sum); - sum2 = _mm_add_epi32(sum2, o); - sum2 = _mm_sub_epi32(sum2, sum); - sum2 = _mm_srli_epi32(sum2, 3); - sum2 = _mm_mullo_epi32(sum2, m13); - sum2 = _mm_srli_epi32(sum2, 14); - _mm_storeu_si128((__m128i *) &variance[VARIANCE][iOffset - 4][jOffset], sum2); - } - } + accumA = _mm256_blend_epi16(accumA, _mm256_slli_si256(accumB, 2), 0xAA); +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + if (useBounCondition) + { + accumA = _mm256_min_epi16(mmPOffsetClipVector, accumA); + accumA = _mm256_max_epi16(mmNOffsetClipVector, accumA); + // accumA is Ori Offset + mmOriOffset = accumA; + // Calc Sign + // P = 1, N = 0 + mmSignOffsetP = _mm256_abs_epi16(_mm256_cmpgt_epi16(mmOriOffset, mmZeroVector)); + // P = 0, N = 1 + mmSignOffsetN = _mm256_abs_epi16(_mm256_sub_epi16(mm01Vector, mmSignOffsetP)); + // Calc Abs Offset + mmAbsOffset = _mm256_abs_epi16(mmOriOffset); + // BS based Adjustment + mmAdjOffset = _mm256_mullo_epi16(mmAbsOffset, _mm256_add_epi16(mm16Vector, mmBsFactor)); + mmAdjOffset = _mm256_add_epi16(mmAdjOffset, mm08Vector); + mmAdjOffset = _mm256_srai_epi16(mmAdjOffset, 4); + + __m256i mmTmpAdj = _mm256_mullo_epi16(mmClassIdxBsP, mmAdjOffset); + __m256i mmTmpOrg = _mm256_mullo_epi16(mmClassIdxBsN, mmAbsOffset); + + __m256i mmTmpFin = _mm256_add_epi16(mmTmpAdj, mmTmpOrg); + + __m256i mmTmpSignP = _mm256_mullo_epi16(mmSignOffsetP, mmTmpFin); + __m256i mmTmpSignN = _mm256_sub_epi16(mmZeroVector, _mm256_mullo_epi16(mmSignOffsetN, mmTmpFin)); + + accumA = _mm256_add_epi16(mmTmpSignP, mmTmpSignN); + } - } -#if USE_AVX2 - } 
-#endif -} + if (useResiCondition) + { + accumA = _mm256_min_epi16(mmPOffsetClipVector, accumA); + accumA = _mm256_max_epi16(mmNOffsetClipVector, accumA); + // accumA is Ori Offset + mmOriOffset = accumA; + // Calc Sign + // P = 1, N = 0 + mmSignOffsetP = _mm256_abs_epi16(_mm256_cmpgt_epi16(mmOriOffset, mmZeroVector)); + // P = 0, N = 1 + mmSignOffsetN = _mm256_abs_epi16(_mm256_sub_epi16(mm01Vector, mmSignOffsetP)); + // Calc Abs Offset + mmAbsOffset = _mm256_abs_epi16(mmOriOffset); + // Resi based Adjustment + mmAdjOffset = _mm256_mullo_epi16(mmAbsOffset, _mm256_add_epi16(mm16Vector, mmResiFactor)); + mmAdjOffset = _mm256_add_epi16(mmAdjOffset, mm08Vector); + mmAdjOffset = _mm256_srai_epi16(mmAdjOffset, 4); + + __m256i mmTmpAdj = _mm256_mullo_epi16(mmClassIdxResiP, mmAdjOffset); + __m256i mmTmpOrg = _mm256_mullo_epi16(mmClassIdxResiN, mmAbsOffset); + + __m256i mmTmpFin = _mm256_add_epi16(mmTmpAdj, mmTmpOrg); + + __m256i mmTmpSignP = _mm256_mullo_epi16(mmSignOffsetP, mmTmpFin); + __m256i mmTmpSignN = _mm256_sub_epi16(mmZeroVector, _mm256_mullo_epi16(mmSignOffsetN, mmTmpFin)); + + accumA = _mm256_add_epi16(mmTmpSignP, mmTmpSignN); + } #endif + accumA = _mm256_add_epi16(accumA, cur); + accumA = _mm256_min_epi16(mmMax, _mm256_max_epi16(accumA, mmMin)); -#if JVET_AC0162_ALF_RESIDUAL_SAMPLES_INPUT -#if JVET_AE0139_ALF_IMPROVED_FIXFILTER -template<X86_VEXT vext> -static void simdFilterResi9x9Db9Blk(AlfClassifier **classifier, const CPelBuf &srcResiLuma, const Area &curBlk, const Area &blkDst, Pel ***fixedFilterResiResults, int picWidth, const int fixedFiltInd, const short classIndFixed[NUM_CLASSES_FIX], int fixedFiltQpInd, int dirWindSize, const ClpRng &clpRng, const Pel clippingValues[4]) -{ - const int srcStride = srcResiLuma.stride; - constexpr int shift = AdaptiveLoopFilter::m_NUM_BITS_FIXED_FILTER - 1; - constexpr int round = 1 << (shift - 1); - - const int width = curBlk.width; - const int height = curBlk.height; - - constexpr int stepX = 8; - constexpr int stepY = 
2; +#if JVET_Z0105_LOOP_FILTER_VIRTUAL_BOUNDARY + _mm256_storeu_si256((__m256i *) (&(fixedFilterResiResults[fixedFiltInd][blkDst.y + i + ii][blkDst.x + j])), accumA); +#else + _mm256_storeu_si256((__m256i *) (&(fixedFilterResiResults[fixedFiltInd][curBlk.y + i + ii][curBlk.x + j])), accumA); +#endif + } // for (size_t ii = 0; ii < stepY; ii++) + } // for (size_t j = 0; j < width; j += stepX) + src += srcStride * stepY; + } - const Pel *src = srcResiLuma.buf + curBlk.y * srcStride + curBlk.x; + } + else + { const __m128i mmOffset = _mm_set1_epi32(round); - const int clpRngmin = -clpRng.max; - const int clpRngmax = clpRng.max; const __m128i mmMin = _mm_set1_epi16(clpRngmin); const __m128i mmMax = _mm_set1_epi16(clpRngmax); const __m128i mmClippingValues = _mm_loadl_epi64((const __m128i *) clippingValues); const __m128i mm11 = _mm_set1_epi8(1); const __m128i mm3 = _mm_set1_epi16(3); - const std::array<std::array<short, FIX_FILTER_NUM_COEFF_DB_COMBINE_9_DB_9 + 1>, NUM_FIXED_FILTERS>& filterCoeffFixed = packedDataFixedFilters9Db9Combine[fixedFiltQpInd]; - const Pel zeros[8] = { 0 }; +#endif //Use Avx2 Simd + +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + __m128i mmClassIdxBsP, mmClassIdxResiP, mmClassIdxBsN, mmClassIdxResiN, mmClassIdxTmp; + __m128i mmOriOffset; + __m128i mmSignOffsetP, mmSignOffsetN; + __m128i mmAbsOffset; + __m128i mmAdjOffset; + __m128i mmZeroVector = _mm_set1_epi16( 0 ); + __m128i mm01Vector = _mm_set1_epi16( 1 ); + __m128i mm08Vector = _mm_set1_epi16( 8 ); + __m128i mm16Vector = _mm_set1_epi16( 16 ); + __m128i mmPOffsetClipVector = _mm_set1_epi16( +offsetClipValue ); + __m128i mmNOffsetClipVector = _mm_set1_epi16( -offsetClipValue ); + //Set Factor + __m128i mmBsFactor = isIntraSlice ? _mm_set1_epi16( 4 ) : _mm_set1_epi16( 3 ); + __m128i mmResiFactor = isIntraSlice ? _mm_set1_epi16( 0 >> (!isSpsAdjust ? 1 : 0) ) : _mm_set1_epi16( 3 >> (!isSpsAdjust ? 
1 : 0) ); +#endif + for (int i = 0; i < height; i += stepY) { #if JVET_Z0105_LOOP_FILTER_VIRTUAL_BOUNDARY @@ -6497,6 +9137,13 @@ static void simdFilterResi9x9Db9Blk(AlfClassifier **classifier, const CPelBuf &s #else const AlfClassifier *pClass = classifier[curBlk.y + i] + curBlk.x; #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + AlfClassifier *pClassCodingInfo = nullptr; + if( useBounCondition || useResiCondition ) + { + pClassCodingInfo = classifierCodingInfo[blkDst.y + i] + blkDst.x; + } +#endif for (int j = 0; j < width; j += stepX) { @@ -6536,6 +9183,25 @@ static void simdFilterResi9x9Db9Blk(AlfClassifier **classifier, const CPelBuf &s params[8] = _mm_unpackhi_epi64(_mm_unpacklo_epi32(rawCoef[0][2], rawCoef[1][2]), _mm_unpacklo_epi32(rawCoef[2][2], rawCoef[3][2])); params[9] = _mm_unpacklo_epi64(_mm_unpackhi_epi32(rawCoef[0][2], rawCoef[1][2]), _mm_unpackhi_epi32(rawCoef[2][2], rawCoef[3][2])); params[10] = _mm_unpackhi_epi64(_mm_unpackhi_epi32(rawCoef[0][2], rawCoef[1][2]), _mm_unpackhi_epi32(rawCoef[2][2], rawCoef[3][2])); +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + mmClassIdxBsP = _mm_set1_epi16( 0 ); + mmClassIdxBsN = _mm_set1_epi16( 0 ); + if( useBounCondition ) + { + mmClassIdxTmp = _mm_loadu_si128( (const __m128i *) (pClassCodingInfo + j)); + mmClassIdxBsP = _mm_srai_epi16( mmClassIdxTmp, 1 ); + mmClassIdxBsN = _mm_sub_epi16( mm01Vector, mmClassIdxBsP ); + } + mmClassIdxResiP = _mm_set1_epi16( 0 ); + mmClassIdxResiN = _mm_set1_epi16( 0 ); + if( useResiCondition ) + { + mmClassIdxTmp = _mm_loadu_si128( (const __m128i *) (pClassCodingInfo + j)); + mmClassIdxBsP = _mm_srai_epi16( mmClassIdxTmp, 1 ); + mmClassIdxResiP = _mm_sub_epi16( mmClassIdxTmp, _mm_add_epi16( mmClassIdxBsP, mmClassIdxBsP) ); + mmClassIdxResiN = _mm_sub_epi16( mm01Vector, mmClassIdxResiP ); + } +#endif for (int ii = 0; ii < stepY; ii++) { const Pel *pImg0, *pImg1, *pImg2, *pImg3, *pImg4, *pImg5, *pImg6, *pImg7, *pImg8; @@ -6612,6 +9278,65 @@ static void 
simdFilterResi9x9Db9Blk(AlfClassifier **classifier, const CPelBuf &s accumB = _mm_srai_epi32(accumB, shift); accumA = _mm_blend_epi16(accumA, _mm_slli_si128(accumB, 2), 0xAA); +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + if( useBounCondition ) + { + accumA = _mm_min_epi16( mmPOffsetClipVector, accumA ); + accumA = _mm_max_epi16( mmNOffsetClipVector, accumA ); + //accumA is Ori Offset + mmOriOffset = accumA; + //Calc Sign + //P = 1, N = 0 + mmSignOffsetP = _mm_abs_epi16( _mm_cmpgt_epi16( mmOriOffset , mmZeroVector )); + //P = 0, N = 1 + mmSignOffsetN = _mm_abs_epi16( _mm_sub_epi16( mm01Vector, mmSignOffsetP )); + //Calc Abs Offset + mmAbsOffset = _mm_abs_epi16( mmOriOffset ); + //BS based Adjustment + mmAdjOffset = _mm_mullo_epi16( mmAbsOffset, _mm_add_epi16( mm16Vector, mmBsFactor ) ); + mmAdjOffset = _mm_add_epi16( mmAdjOffset, mm08Vector ); + mmAdjOffset = _mm_srai_epi16( mmAdjOffset, 4 ); + + __m128i mmTmpAdj = _mm_mullo_epi16( mmClassIdxBsP, mmAdjOffset ); + __m128i mmTmpOrg = _mm_mullo_epi16( mmClassIdxBsN, mmAbsOffset ); + + __m128i mmTmpFin = _mm_add_epi16( mmTmpAdj, mmTmpOrg ); + + __m128i mmTmpSignP = _mm_mullo_epi16( mmSignOffsetP, mmTmpFin ); + __m128i mmTmpSignN = _mm_sub_epi16( mmZeroVector, _mm_mullo_epi16( mmSignOffsetN, mmTmpFin )); + + accumA = _mm_add_epi16( mmTmpSignP, mmTmpSignN ); + } + + if( useResiCondition ) + { + accumA = _mm_min_epi16( mmPOffsetClipVector, accumA ); + accumA = _mm_max_epi16( mmNOffsetClipVector, accumA ); + //accumA is Ori Offset + mmOriOffset = accumA; + //Calc Sign + //P = 1, N = 0 + mmSignOffsetP = _mm_abs_epi16( _mm_cmpgt_epi16( mmOriOffset , mmZeroVector )); + //P = 0, N = 1 + mmSignOffsetN = _mm_abs_epi16( _mm_sub_epi16( mm01Vector, mmSignOffsetP )); + //Calc Abs Offset + mmAbsOffset = _mm_abs_epi16( mmOriOffset ); + //Resi based Adjustment + mmAdjOffset = _mm_mullo_epi16( mmAbsOffset, _mm_add_epi16( mm16Vector, mmResiFactor ) ); + mmAdjOffset = _mm_add_epi16( mmAdjOffset, mm08Vector); + mmAdjOffset = 
_mm_srai_epi16( mmAdjOffset, 4 ); + + __m128i mmTmpAdj = _mm_mullo_epi16( mmClassIdxResiP, mmAdjOffset ); + __m128i mmTmpOrg = _mm_mullo_epi16( mmClassIdxResiN, mmAbsOffset ); + + __m128i mmTmpFin = _mm_add_epi16( mmTmpAdj, mmTmpOrg ); + + __m128i mmTmpSignP = _mm_mullo_epi16( mmSignOffsetP, mmTmpFin ); + __m128i mmTmpSignN = _mm_sub_epi16( mmZeroVector, _mm_mullo_epi16( mmSignOffsetN, mmTmpFin )); + + accumA = _mm_add_epi16( mmTmpSignP, mmTmpSignN ); + } +#endif accumA = _mm_add_epi16(accumA, cur); accumA = _mm_min_epi16(mmMax, _mm_max_epi16(accumA, mmMin)); @@ -6624,6 +9349,9 @@ static void simdFilterResi9x9Db9Blk(AlfClassifier **classifier, const CPelBuf &s } // for (size_t j = 0; j < width; j += stepX) src += srcStride * stepY; } +#if USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION + }//Use 256 Bit Simd +#endif } #else template<X86_VEXT vext> @@ -7103,6 +9831,243 @@ static void simdCalcClass0(AlfClassifier **classifier, const Area &blkDst, const #endif const __m128i shift = _mm_cvtsi32_si128(9 + bitDepth - 4); const int multTab[] = { 5628, 1407, 624, 351, 225, 156 }; + +#if USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION + const bool use256BitSimd = vext >= AVX2 && blkDst.width % 16 == 0 ? true : false; + + if( use256BitSimd) + { +#if JVET_AG0157_ALF_CHROMA_FIXED_FILTER + const __m256i mult = _mm256_set1_epi32(multTab[dirWindSize % 10]); +#else + const __m256i mult = _mm256_set1_epi32(multTab[dirWindSize]); +#endif + const __m256i dirOff = _mm256_set1_epi32(noDir * (noDir + 1)); + const __m256i ones = _mm256_set1_epi32(1); + const __m256i zeros = _mm256_setzero_si256(); + const __m256i scale = _mm256_set1_epi32(192); + + int lapOffset = (dirWindSize == 1) ? 
2 : 0; + for (int i = 0; i < curBlk.height; i += 2) + { + int iOffset = (i >> 1) + lapOffset; + for (int j = 0; j < curBlk.width; j += 16) + { + int jOffset = (j >> 1) + lapOffset; +#if JVET_AE0139_ALF_IMPROVED_FIXFILTER + int iOffsetV = i >> 1; + int jOffsetV = j >> 1; +#endif + __m256i sumV = _mm256_loadu_si256((const __m256i *) &laplacian[VER][iOffset][jOffset]); // 8 32-bit values + __m256i sumH = _mm256_loadu_si256((const __m256i *) &laplacian[HOR][iOffset][jOffset]); + __m256i sumD0 = _mm256_loadu_si256((const __m256i *) &laplacian[DIAG0][iOffset][jOffset]); + __m256i sumD1 = _mm256_loadu_si256((const __m256i *) &laplacian[DIAG1][iOffset][jOffset]); + + // sum += sumV + sumH; + __m256i tempAct = _mm256_add_epi32(sumV, sumH); + __m256i activity = _mm256_mullo_epi32(tempAct, mult); + activity = _mm256_srl_epi32(activity, shift); + activity = _mm256_min_epi32(activity, scale); + + __m256i xmm2 = activity; + __m256i xmm0 = _mm256_setzero_si256(); + __m256i xmm15 = _mm256_cmpeq_epi32(xmm0, xmm0); + __m256i xmm1 = _mm256_srli_epi32(xmm15, 31); + __m256i xmm7 = _mm256_srli_epi32(xmm15, 29); +#if JVET_AE0139_ALF_IMPROVED_FIXFILTER + __m256i xmm8 = _mm256_srli_epi32(xmm15, 28); +#endif + __m256i xmm9 = _mm256_add_epi32(_mm256_slli_epi32(xmm7, 2), xmm1); + + __m256i LUT192 = _mm256_set_epi32(0x0C020A00, 0x0E040608, 0x0E040608, 0x0C020A00, 0x0C020A00, 0x0E040608, 0x0E040608, 0x0C020A00); + + xmm2 = _mm256_or_si256(xmm2, _mm256_srli_epi32(xmm2, 1)); + xmm2 = _mm256_or_si256(xmm2, _mm256_srli_epi32(xmm2, 2)); + xmm2 = _mm256_or_si256(xmm2, _mm256_srli_epi32(xmm2, 4)); + xmm2 = _mm256_mullo_epi16(xmm2, xmm9); + xmm2 = _mm256_and_si256(_mm256_srli_epi32(xmm2, 5), xmm7); + xmm2 = _mm256_shuffle_epi8(LUT192, xmm2); + + __m256i xmm4 = _mm256_xor_si256(activity, _mm256_srli_epi32(activity, 1)); +// xmm4 = _mm256_cmplt_epi32(xmm4, activity); + xmm4 = _mm256_cmpgt_epi32(xmm4, _mm256_sub_epi32( activity, _mm256_set1_epi32(1) ) ); + xmm4 = _mm256_add_epi32( _mm256_abs_epi32(xmm4), 
_mm256_set1_epi32( -1 ) ); + + xmm4 = _mm256_or_si256(_mm256_cmpeq_epi32(activity, xmm1), xmm4); + xmm4 = _mm256_and_si256(xmm4, xmm1); + + activity = _mm256_or_si256(xmm2, xmm4); + + __m256i hv1 = _mm256_max_epi32(sumV, sumH); + __m256i hv0 = _mm256_min_epi32(sumV, sumH); + + __m256i d1 = _mm256_max_epi32(sumD0, sumD1); + __m256i d0 = _mm256_min_epi32(sumD0, sumD1); + + // edgeStrengthHV, to optimize + __m256i hv0Two = _mm256_slli_epi32(hv0, 1); + __m256i hv0Eight = _mm256_slli_epi32(hv0, 3); + __m256i hv1Two = _mm256_slli_epi32(hv1, 1); + __m256i strength = _mm256_cmpgt_epi32(_mm256_slli_epi32(hv1, 2), _mm256_add_epi32(hv0, _mm256_slli_epi32(hv0, 2))); // 4, 5 + __m256i edgeStrengthHV = _mm256_and_si256(strength, ones); + + strength = _mm256_cmpgt_epi32(hv1Two, _mm256_add_epi32(hv0, hv0Two)); // 2, 3 + edgeStrengthHV = _mm256_add_epi32(edgeStrengthHV, _mm256_and_si256(strength, ones)); + + strength = _mm256_cmpgt_epi32(hv1, hv0Two); // 1, 2 + edgeStrengthHV = _mm256_add_epi32(edgeStrengthHV, _mm256_and_si256(strength, ones)); + + strength = _mm256_cmpgt_epi32(hv1, _mm256_add_epi32(hv0, hv0Two)); // 1, 3 + edgeStrengthHV = _mm256_add_epi32(edgeStrengthHV, _mm256_and_si256(strength, ones)); + + strength = _mm256_cmpgt_epi32(hv1Two, _mm256_add_epi32(hv0, hv0Eight)); // 2, 9 + edgeStrengthHV = _mm256_add_epi32(edgeStrengthHV, _mm256_and_si256(strength, ones)); + + strength = _mm256_cmpgt_epi32(hv1, hv0Eight); // 1, 8 + edgeStrengthHV = _mm256_add_epi32(edgeStrengthHV, _mm256_and_si256(strength, ones)); + + // edgeStrengthD, to optimize + __m256i d0Two = _mm256_slli_epi32(d0, 1); + __m256i d0Eight = _mm256_slli_epi32(d0, 3); + __m256i d1Two = _mm256_slli_epi32(d1, 1); + strength = _mm256_cmpgt_epi32(_mm256_slli_epi32(d1, 2), _mm256_add_epi32(d0, _mm256_slli_epi32(d0, 2))); // 4, 5 + __m256i edgeStrengthD = _mm256_and_si256(strength, ones); + + strength = _mm256_cmpgt_epi32(d1Two, _mm256_add_epi32(d0, d0Two)); // 2, 3 + edgeStrengthD = _mm256_add_epi32(edgeStrengthD, 
_mm256_and_si256(strength, ones)); + + strength = _mm256_cmpgt_epi32(d1, d0Two); // 1, 2 + edgeStrengthD = _mm256_add_epi32(edgeStrengthD, _mm256_and_si256(strength, ones)); + + strength = _mm256_cmpgt_epi32(d1, _mm256_add_epi32(d0, d0Two)); // 1, 3 + edgeStrengthD = _mm256_add_epi32(edgeStrengthD, _mm256_and_si256(strength, ones)); + + strength = _mm256_cmpgt_epi32(d1Two, _mm256_add_epi32(d0, d0Eight)); // 2, 9 + edgeStrengthD = _mm256_add_epi32(edgeStrengthD, _mm256_and_si256(strength, ones)); + + strength = _mm256_cmpgt_epi32(d1, d0Eight); // 1, 8 + edgeStrengthD = _mm256_add_epi32(edgeStrengthD, _mm256_and_si256(strength, ones)); + + const __m256i hv1Xd0e = _mm256_mul_epi32(hv1, d0); + const __m256i hv0Xd1e = _mm256_mul_epi32(hv0, d1); + const __m256i hv1Xd0o = _mm256_mul_epi32(_mm256_srli_si256(hv1, 4), _mm256_srli_si256(d0, 4)); + const __m256i hv0Xd1o = _mm256_mul_epi32(_mm256_srli_si256(hv0, 4), _mm256_srli_si256(d1, 4)); + + const __m256i xmme = _mm256_sub_epi64(hv0Xd1e, hv1Xd0e); + const __m256i xmmo = _mm256_sub_epi64(hv0Xd1o, hv1Xd0o); + + __m256i dirCondition = _mm256_srai_epi32(_mm256_blend_epi16(_mm256_srli_si256(xmme, 4), xmmo, 0xCC), 31); + + __m256i cx = _mm256_blendv_epi8(edgeStrengthHV, edgeStrengthD, dirCondition); // x + __m256i cy = _mm256_blendv_epi8(edgeStrengthD, edgeStrengthHV, dirCondition); // y + __m256i dirOffset = _mm256_blendv_epi8(_mm256_set1_epi32(28), zeros, dirCondition); + // direction = (y*(y+1))/2 + x + __m256i direction = _mm256_mullo_epi32(cy, cy); + direction = _mm256_add_epi32(direction, cy); + direction = _mm256_srli_epi32(direction, 1); + direction = _mm256_add_epi32(direction, cx); + direction = _mm256_andnot_si256(_mm256_cmpgt_epi32(cx, cy), direction); + direction = _mm256_add_epi32(direction, dirOffset); +#if JVET_AE0139_ALF_IMPROVED_FIXFILTER + __m256i sum2 = _mm256_loadu_si256((const __m256i *) &laplacian[VARIANCE][iOffsetV][jOffsetV]); + __m256i shiftLut = _mm256_set_m128i( _mm_loadu_si128( (const __m128i *) 
divShift2), _mm_loadu_si128( (const __m128i *) divShift2) ); + __m256i shiftVal = _mm256_shuffle_epi8(shiftLut, activity); + shiftVal = _mm256_add_epi32(shiftVal, xmm1); + shiftVal = _mm256_add_epi32(shiftVal, xmm1); + if (vext >= AVX2) + { + sum2 = _mm256_srlv_epi32(sum2, shiftVal); + } + else + { + __m128i sum2Tmp0 = _mm256_extracti128_si256(sum2, 0); + __m128i sum2Tmp1 = _mm256_extracti128_si256(sum2, 1); + __m128i shiftValTmp0 = _mm256_extracti128_si256(shiftVal, 0); + __m128i shiftValTmp1 = _mm256_extracti128_si256(shiftVal, 1); + + uint64_t tmpVal0[4]; + int32_t *pVal0 = (int32_t *) tmpVal0; + _mm_storeu_si128((__m128i *) tmpVal0, sum2Tmp0); + _mm_storeu_si128((__m128i *) (tmpVal0 + 2), shiftValTmp0); + pVal0[0] >>= pVal0[4]; + pVal0[1] >>= pVal0[5]; + pVal0[2] >>= pVal0[6]; + pVal0[3] >>= pVal0[7]; + + uint64_t tmpVal1[4]; + int32_t *pVal1 = (int32_t *) tmpVal1; + _mm_storeu_si128((__m128i *) tmpVal1, sum2Tmp1); + _mm_storeu_si128((__m128i *) (tmpVal1 + 2), shiftValTmp1); + pVal1[0] >>= pVal1[4]; + pVal1[1] >>= pVal1[5]; + pVal1[2] >>= pVal1[6]; + pVal1[3] >>= pVal1[7]; + + sum2 = _mm256_set_m128i( _mm_loadu_si128((const __m128i *) pVal1 ), _mm_loadu_si128((const __m128i *) pVal0) ); + } + + __m256i LUT0 = _mm256_set_m128i( _mm_loadu_si128((const __m128i *) sqrtSum), _mm_loadu_si128((const __m128i *) sqrtSum) ); + __m256i LUT1 = _mm256_set_m128i(_mm_loadu_si128((const __m128i *) &sqrtSum[16]), _mm_loadu_si128((const __m128i *) &sqrtSum[16]) ); + __m256i xmm16 = _mm256_set_epi32(16, 16, 16, 16, 16, 16, 16, 16); + __m256i xmm35 = _mm256_set_epi32(35, 35, 35, 35, 35, 35, 35, 35); + __m256i xmm48 = _mm256_set_epi32(48, 48, 48, 48, 48, 48, 48, 48); + + __m256i use1 = _mm256_cmpgt_epi32(sum2, xmm8); + + __m256i idx0 = _mm256_and_si256(sum2, xmm8); + __m256i idx1 = _mm256_sub_epi32(sum2, xmm16); + idx1 = _mm256_min_epi32(idx1, xmm8); + + idx0 = _mm256_shuffle_epi8(LUT0, idx0); + idx1 = _mm256_shuffle_epi8(LUT1, idx1); + + idx1 = _mm256_add_epi32(idx1, 
_mm256_slli_epi32(xmm1, 2)); + + idx0 = _mm256_andnot_si256(use1, idx0); + idx1 = _mm256_and_si256(use1, idx1); + idx0 = _mm256_add_epi32(idx0, idx1); + + xmm35 = _mm256_cmpgt_epi32(sum2, xmm35); + xmm48 = _mm256_cmpgt_epi32(sum2, xmm48); + + xmm35 = _mm256_and_si256(xmm35, xmm1); + xmm48 = _mm256_and_si256(xmm48, xmm1); + + xmm35 = _mm256_add_epi32(xmm35, xmm48); + + xmm2 = _mm256_add_epi32(idx0, xmm35); + xmm2 = _mm256_slli_epi32(xmm2, 4); + activity = _mm256_add_epi32(activity, xmm2); +#endif + __m256i classIdx = _mm256_mullo_epi32(dirOff, activity); + classIdx = _mm256_add_epi32(classIdx, direction); + + // transpose + __m256i dirTempHVMinus1 = _mm256_cmpgt_epi32(sumV, sumH); + __m256i dirTempDMinus1 = _mm256_cmpgt_epi32(sumD0, sumD1); + __m256i transposeIdx = _mm256_set1_epi32(3); + transposeIdx = _mm256_add_epi32(transposeIdx, dirTempHVMinus1); + transposeIdx = _mm256_add_epi32(transposeIdx, dirTempDMinus1); + transposeIdx = _mm256_add_epi32(transposeIdx, dirTempDMinus1); + + classIdx = _mm256_slli_epi16(classIdx, 2); + classIdx = _mm256_add_epi16(classIdx, transposeIdx); + classIdx = _mm256_shuffle_epi8(classIdx, _mm256_setr_epi8(0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13, 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13)); +#if JVET_Z0105_LOOP_FILTER_VIRTUAL_BOUNDARY + _mm256_storeu_si256((__m256i *) &classifier[blkDst.pos().y + i][blkDst.pos().x + j], classIdx); + _mm256_storeu_si256((__m256i *) &classifier[blkDst.pos().y + i + 1][blkDst.pos().x + j], classIdx); +#else + _mm256_storeu_si256((__m256i *) &classifier[curBlk.pos().y + i][curBlk.pos().x + j], classIdx); + _mm256_storeu_si256((__m256i *) &classifier[curBlk.pos().y + i + 1][curBlk.pos().x + j], classIdx); +#endif + + } // for (int j = 0; j < curBlk.width; j += 16) + } // for (int i = 0; i < curBlk.height; i += 2) + + } + else + { +#endif #if JVET_AG0157_ALF_CHROMA_FIXED_FILTER const __m128i mult = _mm_set1_epi32(multTab[dirWindSize % 10]); #else @@ -7312,8 +10277,14 @@ static void 
simdCalcClass0(AlfClassifier **classifier, const Area &blkDst, const }//for (int j = 0; j < curBlk.width; j += 8) }//for (int i = 0; i < curBlk.height; i += 2) +#if USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION + }// Use 256 Bit Simd + #endif } +#if USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION +template <X86_VEXT vext> +#endif static void simdCalcClass1(AlfClassifier **classifier, const Area &blkDst, const Area &curBlk, int dirWindSize, int classDir, int noDir, int noAct, int bitDepth, int subBlkSize, int mappingDir[NUM_DIR_FIX][NUM_DIR_FIX], uint32_t **laplacian[NUM_DIRECTIONS]) { const __m128i shift = _mm_cvtsi32_si128(9 + bitDepth); @@ -7322,6 +10293,82 @@ static void simdCalcClass1(AlfClassifier **classifier, const Area &blkDst, const #else const int multTab[] = { 5628, 1407, 624, 351, 225, 156 }; #endif + +#if USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION + const bool use256BitSimd = vext >= AVX2 && curBlk.width % 16 == 0 ? true : false; + if( use256BitSimd ) + { + const __m256i mult = _mm256_set1_epi32(multTab[dirWindSize]); + const __m256i scale = _mm256_set1_epi32(15); + + for (int i = 0; i < curBlk.height; i += 2) + { + int iOffset = i >> 1; + for (int j = 0; j < curBlk.width; j += 16) + { + int jOffset = j >> 1; + __m256i sumV = _mm256_loadu_si256((const __m256i *) &laplacian[VER][iOffset][jOffset]); // 8 32-bit values + __m256i sumH = _mm256_loadu_si256((const __m256i *) &laplacian[HOR][iOffset][jOffset]); + __m256i sumD0 = _mm256_loadu_si256((const __m256i *) &laplacian[DIAG0][iOffset][jOffset]); + __m256i sumD1 = _mm256_loadu_si256((const __m256i *) &laplacian[DIAG1][iOffset][jOffset]); + + // sum += sumV + sumH; + __m256i tempAct = _mm256_add_epi32(sumV, sumH); + __m256i activity = _mm256_mullo_epi32(tempAct, mult); + activity = _mm256_srl_epi32(activity, shift); + activity = _mm256_min_epi32(activity, scale); + __m256i classIdx = _mm256_shuffle_epi8(_mm256_setr_epi8(0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 0, 1, 2, 2, 2, 2, 2, 
3, 3, 3, 3, 3, 3, 3, 3, 4), activity); + classIdx = _mm256_add_epi32(classIdx, _mm256_slli_epi32(classIdx, 2)); // activity * 5 + + __m256i hv1 = _mm256_max_epi32(sumV, sumH); + __m256i hv0 = _mm256_min_epi32(sumV, sumH); + + __m256i d1 = _mm256_max_epi32(sumD0, sumD1); + __m256i d0 = _mm256_min_epi32(sumD0, sumD1); + + const __m256i hv1Xd0e = _mm256_mul_epi32(hv1, d0); + const __m256i hv0Xd1e = _mm256_mul_epi32(hv0, d1); + const __m256i hv1Xd0o = _mm256_mul_epi32(_mm256_srli_si256(hv1, 4), _mm256_srli_si256(d0, 4)); + const __m256i hv0Xd1o = _mm256_mul_epi32(_mm256_srli_si256(hv0, 4), _mm256_srli_si256(d1, 4)); + + const __m256i xmme = _mm256_sub_epi64(hv1Xd0e, hv0Xd1e); + const __m256i xmmo = _mm256_sub_epi64(hv1Xd0o, hv0Xd1o); + + __m256i dirCondition = _mm256_srai_epi32(_mm256_blend_epi16(_mm256_srli_si256(xmme, 4), xmmo, 0xCC), 31); + + __m256i hvd1 = _mm256_blendv_epi8(hv1, d1, dirCondition); + __m256i hvd0 = _mm256_blendv_epi8(hv0, d0, dirCondition); + __m256i strength1 = _mm256_cmpgt_epi32(hvd1, _mm256_add_epi32(hvd0, hvd0)); + __m256i strength2 = _mm256_cmpgt_epi32(_mm256_add_epi32(hvd1, hvd1), _mm256_add_epi32(hvd0, _mm256_slli_epi32(hvd0, 3))); + __m256i offset = _mm256_and_si256(strength1, _mm256_set1_epi32(1)); + __m256i direction = _mm256_add_epi32(offset, _mm256_and_si256(strength2, _mm256_set1_epi32(1))); + direction = _mm256_add_epi32(direction, _mm256_andnot_si256(dirCondition, _mm256_set1_epi32(2))); + direction = _mm256_and_si256(direction, strength1); + classIdx = _mm256_add_epi32(direction, classIdx); + + // transpose + __m256i dirTempHVMinus1 = _mm256_cmpgt_epi32(sumV, sumH); + __m256i dirTempDMinus1 = _mm256_cmpgt_epi32(sumD0, sumD1); + __m256i transposeIdx = _mm256_set1_epi32(3); + transposeIdx = _mm256_add_epi32(transposeIdx, dirTempHVMinus1); + transposeIdx = _mm256_add_epi32(transposeIdx, dirTempDMinus1); + transposeIdx = _mm256_add_epi32(transposeIdx, dirTempDMinus1); + classIdx = _mm256_slli_epi16(classIdx, 2); + classIdx = 
_mm256_add_epi16(classIdx, transposeIdx); + classIdx = _mm256_shuffle_epi8(classIdx, _mm256_setr_epi8(0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13, 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13)); +#if JVET_Z0105_LOOP_FILTER_VIRTUAL_BOUNDARY + _mm256_storeu_si256((__m256i *) &classifier[blkDst.pos().y + i][blkDst.pos().x + j], classIdx); + _mm256_storeu_si256((__m256i *) &classifier[blkDst.pos().y + i + 1][blkDst.pos().x + j], classIdx); +#else + _mm256_storeu_si256((__m256i *) &classifier[curBlk.pos().y + i][curBlk.pos().x + j], classIdx); + _mm256_storeu_si256((__m256i *) &classifier[curBlk.pos().y + i + 1][curBlk.pos().x + j], classIdx); +#endif + } // for (int j = 0; j < curBlk.width; j += 16) + } // for (int i = 0; i < curBlk.height; i += 2) + } + else + { +#endif const __m128i mult = _mm_set1_epi32(multTab[dirWindSize]); const __m128i scale = _mm_set1_epi32(15); @@ -7389,6 +10436,9 @@ static void simdCalcClass1(AlfClassifier **classifier, const Area &blkDst, const #endif }//for (int j = 0; j < curBlk.width; j += 8) }//for (int i = 0; i < curBlk.height; i += 2) +#if USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION + }//Use 256 Bit Simd +#endif } #endif @@ -7438,7 +10488,11 @@ void AdaptiveLoopFilter::_initAdaptiveLoopFilterX86() #else m_calcClass0 = simdCalcClass0; #endif +#if USE_AVX2 && JVET_AJ0188_CODING_INFO_CLASSIFICATION + m_calcClass1 = simdCalcClass1<vext>; +#else m_calcClass1 = simdCalcClass1; +#endif for( int i = 0; i < NUM_SETS_FIXED_FILTERS; i++ ) { diff --git a/source/Lib/DecoderLib/DecLib.cpp b/source/Lib/DecoderLib/DecLib.cpp index 82223c1df013107d5a3b10cd4e7d70537795293e..955a7d0b25d7df026bbceda2439ca720cd3ccdfe 100644 --- a/source/Lib/DecoderLib/DecLib.cpp +++ b/source/Lib/DecoderLib/DecLib.cpp @@ -815,6 +815,9 @@ void DecLib::executeLoopFilters() m_cALF.copyResiData(cs); } #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + m_cALF.callCodingInfoBuf( cs ).fill( 0 ); +#endif // deblocking filter #if DB_PARAM_TID @@ -842,8 +845,12 
@@ void DecLib::executeLoopFilters() } #endif - +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + PelUnitBuf codingInfoBuf = m_cALF.callCodingInfoBuf( cs ); + m_cLoopFilter.loopFilterPic( cs, codingInfoBuf, true ); +#else m_cLoopFilter.loopFilterPic( cs ); +#endif #if !MULTI_PASS_DMVR CS::setRefinedMotionField(cs); diff --git a/source/Lib/DecoderLib/VLCReader.cpp b/source/Lib/DecoderLib/VLCReader.cpp index f974db44538cd4898d9ee7e5f2a9a49e3424a7e4..5e9c09f5826f1e7e90d226605facac4490df15aa 100644 --- a/source/Lib/DecoderLib/VLCReader.cpp +++ b/source/Lib/DecoderLib/VLCReader.cpp @@ -2182,6 +2182,16 @@ void HLSyntaxReader::parseSPS(SPS* pcSPS) { pcSPS->setAlfPrecisionFlag( false ); } +#endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + if( pcSPS->getALFEnabledFlag() ) + { + READ_FLAG( uiCode, "sps_alf_luma_fixed_filter_adjust" ); pcSPS->setAlfLumaFixedFilterAdjust( uiCode ? true : false ); + } + else + { + pcSPS->setAlfLumaFixedFilterAdjust( false ); + } #endif if (pcSPS->getALFEnabledFlag() && pcSPS->getChromaFormatIdc() != CHROMA_400) { diff --git a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp index 944ae6daf30388244b9151cea55754bd3cabd6ec..38e77f597248ba1a5d4820adb075179d5200580d 100644 --- a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp +++ b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp @@ -1421,6 +1421,9 @@ void EncAdaptiveLoopFilter::ALFProcess( CodingStructure& cs, const double *lambd #endif const CPelBuf &resiLuma = resiYuv.get(COMPONENT_Y); #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + PelUnitBuf recYuvCodingInfo = m_tempBufCodingInfo.getBuf( cs.area ); +#endif #if JVET_AI0166_CCALF_CHROMA_SAO_INPUT m_tempBufSAO.copyFrom(cs.getRecoBuf()); PelUnitBuf recYuvSAO = m_tempBufSAO.getBuf(cs.area); @@ -1483,7 +1486,23 @@ void EncAdaptiveLoopFilter::ALFProcess( CodingStructure& cs, const double *lambd buf.extendBorderPel( MAX_ALF_PADDING_SIZE ); #endif buf = buf.subBuf( UnitArea( cs.area.chromaFormat, Area( 
clipL ? 0 : MAX_ALF_PADDING_SIZE, clipT ? 0 : MAX_ALF_PADDING_SIZE, w, h ) ) ); +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + PelUnitBuf bufCodingInfo = m_tempBufCodingInfo2.subBuf( UnitArea( CHROMA_400, Area( 0, 0, wBuf, hBuf ) ) ); + bufCodingInfo.copyFrom( recYuvCodingInfo.subBuf( UnitArea( CHROMA_400, Area( xStart - ( clipL ? 0 : MAX_ALF_PADDING_SIZE ), yStart - ( clipT ? 0 : MAX_ALF_PADDING_SIZE ), wBuf, hBuf ) ) ) ); + // pad top-left unavailable samples for raster slice + if( xStart == xPos && yStart == yPos && ( rasterSliceAlfPad & 1 ) ) + { + bufCodingInfo.padBorderPel( MAX_ALF_PADDING_SIZE, 1 ); + } + // pad bottom-right unavailable samples for raster slice + if( xEnd == xPos + width && yEnd == yPos + height && ( rasterSliceAlfPad & 2 ) ) + { + bufCodingInfo.padBorderPel( MAX_ALF_PADDING_SIZE, 2 ); + } + mirroredPaddingForAlf(cs, bufCodingInfo, MAX_ALF_PADDING_SIZE, true, false); + bufCodingInfo = bufCodingInfo.subBuf( UnitArea( CHROMA_400, Area( clipL ? 0 : MAX_ALF_PADDING_SIZE, clipT ? 
0 : MAX_ALF_PADDING_SIZE, w, h ) ) ); +#endif #if JVET_AC0162_ALF_RESIDUAL_SAMPLES_INPUT PelUnitBuf bufResi = m_tempBufResi2.subBuf(UnitArea(cs.area.chromaFormat, Area(0, 0, wBuf, hBuf))); bufResi.copyFrom(resiYuv.subBuf( @@ -1543,6 +1562,9 @@ void EncAdaptiveLoopFilter::ALFProcess( CodingStructure& cs, const double *lambd const Area blkDstChroma( xStart >> scaleX, yStart >> scaleY, w >> scaleX, h >> scaleY ); deriveFixedFilterChroma( m_classifier, buf, bufDb, blkDstChroma, blkSrcChroma, cs, -1, MAX_NUM_COMPONENT ); #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + calcAlfLumaCodingInfoBlk(cs, m_classifierCodingInfo[0], blkDst, blkSrc, buf.get(COMPONENT_Y), 2, 2, m_inputBitDepth[CHANNEL_TYPE_LUMA], bufResi.get(COMPONENT_Y), m_laplacian[0], bufCodingInfo.get(COMPONENT_Y) ); +#endif #if JVET_X0071_ALF_BAND_CLASSIFIER deriveClassification( m_classifier, buf.get( COMPONENT_Y ), #if JVET_AC0162_ALF_RESIDUAL_SAMPLES_INPUT @@ -1577,6 +1599,9 @@ void EncAdaptiveLoopFilter::ALFProcess( CodingStructure& cs, const double *lambd Area blkChroma(xPos >> scaleX, yPos >> scaleY, width >> scaleX, height >> scaleY); deriveFixedFilterChroma( m_classifier, recYuv, recYuvBeforeDb, blkChroma, blkChroma, cs, -1, MAX_NUM_COMPONENT ); #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + calcAlfLumaCodingInfoBlk(cs, m_classifierCodingInfo[0], blk, blk, recYuv.get(COMPONENT_Y), 2, 2, m_inputBitDepth[CHANNEL_TYPE_LUMA], resiYuv.get(COMPONENT_Y), m_laplacian[0], recYuvCodingInfo.get(COMPONENT_Y) ); +#endif #if JVET_X0071_ALF_BAND_CLASSIFIER deriveClassification( m_classifier, recLuma, #if JVET_AC0162_ALF_RESIDUAL_SAMPLES_INPUT diff --git a/source/Lib/EncoderLib/EncCfg.h b/source/Lib/EncoderLib/EncCfg.h index 616369020a22ab7bf9eddaa436d81378558df2ae..3d709252d5ae1ee795e3e449c0a1688c51c48457 100644 --- a/source/Lib/EncoderLib/EncCfg.h +++ b/source/Lib/EncoderLib/EncCfg.h @@ -712,6 +712,9 @@ protected: #endif #if JVET_AH0057_CCALF_COEFF_PRECISION bool m_ccalfPrecision; +#endif +#if 
JVET_AJ0188_CODING_INFO_CLASSIFICATION + bool m_alfLumaFixedFilterAdjust; #endif bool m_bTestSAODisableAtPictureLevel; double m_saoEncodingRate; // When non-0 SAO early picture termination is enabled for luma and chroma @@ -2389,6 +2392,10 @@ public: #if JVET_AH0057_CCALF_COEFF_PRECISION void setUseCCALFPrecision(bool b) { m_ccalfPrecision = b; } bool getUseCCALFPrecision() const { return m_ccalfPrecision; } +#endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + void setAlfLumaFixedFilterAdjust(bool b) { m_alfLumaFixedFilterAdjust = b; } + bool getAlfLumaFixedFilterAdjust() const { return m_alfLumaFixedFilterAdjust; } #endif void setTestSAODisableAtPictureLevel (bool bVal) { m_bTestSAODisableAtPictureLevel = bVal; } bool getTestSAODisableAtPictureLevel ( ) const { return m_bTestSAODisableAtPictureLevel; } diff --git a/source/Lib/EncoderLib/EncCu.cpp b/source/Lib/EncoderLib/EncCu.cpp index 4b27948fbd737786ae9a2356c6f0043142b9d13e..4f9d962143d3015177a148924f8d24406a4fe4ee 100644 --- a/source/Lib/EncoderLib/EncCu.cpp +++ b/source/Lib/EncoderLib/EncCu.cpp @@ -23992,13 +23992,23 @@ void EncCu::xCalDebCost( CodingStructure &cs, Partitioner &partitioner, bool cal if ( leftEdgeAvai ) { m_pcLoopFilter->resetFilterLengths(); +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + //No Impact on OBMC Buffer + m_pcLoopFilter->xDeblockCU( *cu, EDGE_VER, false, m_tempWoOBMCBuffer ); +#else m_pcLoopFilter->xDeblockCU( *cu, EDGE_VER ); +#endif } if (topEdgeAvai) { m_pcLoopFilter->resetFilterLengths(); +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + //No Impact on OBMC Buffer + m_pcLoopFilter->xDeblockCU( *cu, EDGE_HOR, false, m_tempWoOBMCBuffer ); +#else m_pcLoopFilter->xDeblockCU( *cu, EDGE_HOR ); +#endif } //update current CU SSE diff --git a/source/Lib/EncoderLib/EncGOP.cpp b/source/Lib/EncoderLib/EncGOP.cpp index 1d8c0cd314828a9799ab8705417a21dcea192484..f23b366bd9612471502010ea8b55c086d0615f5c 100644 --- a/source/Lib/EncoderLib/EncGOP.cpp +++ b/source/Lib/EncoderLib/EncGOP.cpp @@ -3771,7 
+3771,9 @@ void EncGOP::compressGOP(int iPOCLast, int iNumPicRcvd, PicList &rcListPic, std: m_pcALF->copyResiData(cs); } #endif - +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + m_pcALF->callCodingInfoBuf( cs ).fill( 0 ); +#endif // create SAO object based on the picture size if( pcSlice->getSPS()->getSAOEnabledFlag() #if JVET_W0066_CCSAO @@ -3862,7 +3864,12 @@ void EncGOP::compressGOP(int iPOCLast, int iNumPicRcvd, PicList &rcListPic, std: } #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + PelUnitBuf codingInfoBuf = m_pcALF->callCodingInfoBuf( cs ); + m_pcLoopFilter->loopFilterPic( cs, codingInfoBuf, true ); +#else m_pcLoopFilter->loopFilterPic( cs ); +#endif #if !MULTI_PASS_DMVR CS::setRefinedMotionField(cs); @@ -5128,7 +5135,12 @@ uint64_t EncGOP::preLoopFilterPicAndCalcDist( Picture* pcPic ) } #endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + PelUnitBuf codingInfoBuf = m_pcALF->callCodingInfoBuf( cs ); + m_pcLoopFilter->loopFilterPic( cs, codingInfoBuf, false ); +#else m_pcLoopFilter->loopFilterPic( cs ); +#endif const CPelUnitBuf picOrg = pcPic->getRecoBuf(); const CPelUnitBuf picRec = cs.getRecoBuf(); diff --git a/source/Lib/EncoderLib/EncLib.cpp b/source/Lib/EncoderLib/EncLib.cpp index d3209d48c5d0393c47f9c28c089ee38f79578fb2..5a1cad6daf4be6e33ccbe0b45db5e89383cbad44 100644 --- a/source/Lib/EncoderLib/EncLib.cpp +++ b/source/Lib/EncoderLib/EncLib.cpp @@ -2225,6 +2225,9 @@ void EncLib::xInitSPS( SPS& sps ) #endif #if JVET_AH0057_CCALF_COEFF_PRECISION sps.setCCALFPrecisionFlag( m_ccalfPrecision ); +#endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + sps.setAlfLumaFixedFilterAdjust( m_intraPeriod < 0 ? 
false : true ); #endif sps.setJointCbCrEnabledFlag( m_JointCbCrMode ); sps.setMaxTLayers( m_maxTempLayer ); diff --git a/source/Lib/EncoderLib/VLCWriter.cpp b/source/Lib/EncoderLib/VLCWriter.cpp index bdb2d71d23654628a9d1001ee3451c316293773d..11bb3fbeb9a5785d121f84e6cb6e2a8d8faf6b51 100644 --- a/source/Lib/EncoderLib/VLCWriter.cpp +++ b/source/Lib/EncoderLib/VLCWriter.cpp @@ -1344,6 +1344,12 @@ void HLSWriter::codeSPS( const SPS* pcSPS ) } #endif } +#endif +#if JVET_AJ0188_CODING_INFO_CLASSIFICATION + if( pcSPS->getALFEnabledFlag() ) + { + WRITE_FLAG( pcSPS->getAlfLumaFixedFilterAdjust(), "sps_alf_luma_fixed_filter_adjust" ); + } #endif if (pcSPS->getALFEnabledFlag() && pcSPS->getChromaFormatIdc() != CHROMA_400) {