diff --git a/source/App/DecoderApp/decmain.cpp b/source/App/DecoderApp/decmain.cpp index c8a6e3bd7070cdcbd867a95d3f39c27cca6739e1..0a664aa461019162caa0c5c7d1d09092bce5eb47 100644 --- a/source/App/DecoderApp/decmain.cpp +++ b/source/App/DecoderApp/decmain.cpp @@ -54,7 +54,7 @@ int main(int argc, char* argv[]) // print information fprintf( stdout, "\n" ); - fprintf( stdout, "VVCSoftware: VTM Decoder Version %s ", VTM_VERSION ); + fprintf( stdout, "VVCSoftware: ECM Decoder Version %s (VTM-%s) ", ECM_VERSION, VTM_VERSION ); fprintf( stdout, NVM_ONOS ); fprintf( stdout, NVM_COMPILEDBY ); fprintf( stdout, NVM_BITS ); diff --git a/source/App/EncoderApp/EncApp.cpp b/source/App/EncoderApp/EncApp.cpp index 02bedf8fa1bb0f68541d7a1267d166976744de50..378561a7a65918bc4935c41dab9ab3711c3d642f 100644 --- a/source/App/EncoderApp/EncApp.cpp +++ b/source/App/EncoderApp/EncApp.cpp @@ -788,9 +788,9 @@ void EncApp::xInitLibCfg() m_cEncLib.setUseWrapAround ( m_wrapAround ); m_cEncLib.setWrapAroundOffset ( m_wrapAroundOffset ); -#if IDCC_TPM_JEM - m_cEncLib.setUseIntraTMP(m_IntraTMP); - m_cEncLib.setIntraTMPMaxSize(m_IntraTMP_MaxSize); +#if JVET_V0130_INTRA_TMP + m_cEncLib.setUseIntraTMP ( m_intraTMP ); + m_cEncLib.setIntraTMPMaxSize ( m_intraTmpMaxSize ); #endif #if JVET_V0094_BILATERAL_FILTER m_cEncLib.setUseBIF ( m_BIF ); diff --git a/source/App/EncoderApp/EncAppCfg.cpp b/source/App/EncoderApp/EncAppCfg.cpp index 21890e532d76457d352cbe6d6015d70421bf7758..33be266cc7f211f1f44df7dd5c67d63811482e8e 100644 --- a/source/App/EncoderApp/EncAppCfg.cpp +++ b/source/App/EncoderApp/EncAppCfg.cpp @@ -1044,9 +1044,9 @@ bool EncAppCfg::parseCfg( int argc, char* argv[] ) ("AdditionalInterHypRefFrames", m_maxNumAddHypRefFrames, 4, "max. 
number of ref frames for additional inter hypotheseis") ("AdditionalInterHypTries", m_addHypTries, 1, "number of tries for additional inter prediction hypotheseis") #endif -#if IDCC_TPM_JEM - ("IntraTMP", m_IntraTMP, false, "intra Template Matching (0: off, 1:on) [default: on]") - ("IntraTMPMaxSize", m_IntraTMP_MaxSize, 64u, "intra Template Matching max CU size [default: 64]") +#if JVET_V0130_INTRA_TMP + ("IntraTMP", m_intraTMP, false, "intra Template Matching (0: off, 1:on) [default: on]") + ("IntraTMPMaxSize", m_intraTmpMaxSize, 64u, "intra Template Matching max CU size [default: 64]") #endif #if JVET_V0094_BILATERAL_FILTER ("BIF", m_BIF, true, "bilateral filter (0: off, 1:on) [default: on]") @@ -4176,12 +4176,12 @@ void EncAppCfg::xPrintParameter() } #endif } -#if IDCC_TPM_JEM - msg(DETAILS, "Intra TMP: %d\n", m_IntraTMP); - msg(DETAILS, "Max CU size of TMP: %d\n", m_IntraTMP_MaxSize); +#if INTRA_TEMPLATE_MATCHING + msg(DETAILS, "Intra TMP: %d\n", m_intraTMP); + msg(DETAILS, "Max CU size of TMP: %d\n", m_intraTmpMaxSize); msg(DETAILS, "dynamic search range with fixed comparison per pixel: \n"); - msg(DETAILS, " searchRangeWidth = %d*Width \n", IDCC_SearchRangeMultFactor); - msg(DETAILS, " searchRangeHeight = %d*Heigh \n", IDCC_SearchRangeMultFactor); + msg(DETAILS, " searchRangeWidth = %d*Width \n", TMP_SEARCH_RANGE_MULT_FACTOR ); + msg(DETAILS, " searchRangeHeight = %d*Heigh \n", TMP_SEARCH_RANGE_MULT_FACTOR ); #endif msg( DETAILS, "Max Num Merge Candidates : %d\n", m_maxNumMergeCand ); @@ -4316,9 +4316,9 @@ void EncAppCfg::xPrintParameter() { msg( VERBOSE, "WrapAroundOffset:%d ", m_wrapAroundOffset ); } -#if IDCC_TPM_JEM - msg( VERBOSE, "IntraTMP:%d ", m_IntraTMP); - msg( VERBOSE, "IntraTMP_MaxSize:%d ", m_IntraTMP_MaxSize); +#if INTRA_TEMPLATE_MATCHING + msg( VERBOSE, "IntraTMP:%d ", m_intraTMP); + msg( VERBOSE, "IntraTmpMaxSize:%d ", m_intraTmpMaxSize); #endif #if JVET_V0094_BILATERAL_FILTER msg( VERBOSE, "BIF:%d ", m_BIF); diff --git 
a/source/App/EncoderApp/EncAppCfg.h b/source/App/EncoderApp/EncAppCfg.h index a035b4d4c277f54a8360ef341a66000d81015815..83b5967d6cfb4d41bb9ed786457ce56db66ab722 100644 --- a/source/App/EncoderApp/EncAppCfg.h +++ b/source/App/EncoderApp/EncAppCfg.h @@ -411,9 +411,9 @@ protected: int m_maxNumAddHypRefFrames; ///< max. number of ref frames for additional inter hypotheseis int m_addHypTries; ///< max. number of tries for additional inter hypotheseis #endif -#if IDCC_TPM_JEM - bool m_IntraTMP; ///< intra Template Matching - unsigned m_IntraTMP_MaxSize; ///< max CU size for which intra TMP is allowed +#if JVET_V0130_INTRA_TMP + bool m_intraTMP; ///< intra Template Matching + unsigned m_intraTmpMaxSize; ///< max CU size for which intra TMP is allowed #endif #if JVET_V0094_BILATERAL_FILTER bool m_BIF; ///< bilateral filter diff --git a/source/App/EncoderApp/encmain.cpp b/source/App/EncoderApp/encmain.cpp index 0dfefebecaeb29ce3ebcea6a9e55c2bbe2672ab9..44a68dc8f4bbf7432d8622841ed5abd2db83a58b 100644 --- a/source/App/EncoderApp/encmain.cpp +++ b/source/App/EncoderApp/encmain.cpp @@ -85,7 +85,7 @@ int main(int argc, char* argv[]) { // print information fprintf( stdout, "\n" ); - fprintf( stdout, "VVCSoftware: VTM Encoder Version %s ", VTM_VERSION ); + fprintf( stdout, "VVCSoftware: ECM Encoder Version %s (VTM-%s) ", ECM_VERSION, VTM_VERSION ); fprintf( stdout, NVM_ONOS ); fprintf( stdout, NVM_COMPILEDBY ); fprintf( stdout, NVM_BITS ); diff --git a/source/Lib/CommonLib/BilateralFilter.cpp b/source/Lib/CommonLib/BilateralFilter.cpp index f5b7f0f3151c1666053530dbdf09ab11980649b2..38058a4da2d79cee6a2c2678f08320ee32b93a45 100755 --- a/source/Lib/CommonLib/BilateralFilter.cpp +++ b/source/Lib/CommonLib/BilateralFilter.cpp @@ -50,6 +50,13 @@ BilateralFilter::BilateralFilter() { + m_bilateralFilterDiamond5x5 = blockBilateralFilterDiamond5x5; + +#if ENABLE_SIMD_BILATERAL_FILTER +#ifdef TARGET_SIMD_X86 + initBilateralFilterX86(); +#endif +#endif } BilateralFilter::~BilateralFilter() @@ 
-64,292 +71,193 @@ void BilateralFilter::destroy() { } -void BilateralFilter::simdFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO) +const char* BilateralFilter::getFilterLutParameters( const int size, const PredMode predMode, const int32_t qp, int& bfac ) { - int pad = 2; - int padwidth = iWidthExtSIMD; - - __m128i center, left, right, up, down, lu, ld, ru, rd, diffabs, four, fifteen, lut, acc, temp, round_add, clipmin, clipmax, inputVals; - __m128i ll, rr, uu, dd; - - four = _mm_set1_epi16(4); - fifteen = _mm_set1_epi16(15); - round_add = _mm_set1_epi16(bif_round_add); - clipmin = _mm_set1_epi16(clpRng.min); - clipmax = _mm_set1_epi16(clpRng.max); + if( size <= 4 ) + { + bfac = 3; + } + else if( size >= 16 ) + { + bfac = 1; + } + else + { + bfac = 2; + } - lut = _mm_loadu_si128((__m128i*)(LUTrowPtr)); - acc = _mm_set1_epi32(0); - - // Copy back parameters - Pel *tempBlockPtr = (short*)blkFilt + (((padwidth+4) << 1) + 2); - int tempBlockStride = padwidth+4; - - - for (int col = 0; col < uiWidth; col += 8) + if( predMode == MODE_INTER ) { - for (int row = 0; row < uiHeight; row++) + if( size <= 4 ) { - acc = _mm_set1_epi32(0); - int16_t *point = &block[(row + pad)*padwidth + pad + col]; - - center = _mm_loadu_si128((__m128i*)(point)); - - //load neighbours - left = _mm_loadu_si128((__m128i*)(point - 1)); - right = _mm_loadu_si128((__m128i*)(point + 1)); - up = _mm_loadu_si128((__m128i*)(point - padwidth)); - down = _mm_loadu_si128((__m128i*)(point + padwidth)); - - lu = _mm_loadu_si128((__m128i*)(point - 1 - padwidth)); - ld = _mm_loadu_si128((__m128i*)(point - 1 + padwidth)); - ru = _mm_loadu_si128((__m128i*)(point + 1 - padwidth)); - rd = _mm_loadu_si128((__m128i*)(point + 1 + padwidth)); - - ll = _mm_loadu_si128((__m128i*)(point - 2)); - rr = _mm_loadu_si128((__m128i*)(point + 2)); - uu = 
_mm_loadu_si128((__m128i*)(point - 2*padwidth)); - dd = _mm_loadu_si128((__m128i*)(point + 2*padwidth)); - - //calculate diffs - left = _mm_sub_epi16(left, center); - right = _mm_sub_epi16(right, center); - up = _mm_sub_epi16(up, center); - down = _mm_sub_epi16(down, center); - - lu = _mm_sub_epi16(lu, center); - ld = _mm_sub_epi16(ld, center); - ru = _mm_sub_epi16(ru, center); - rd = _mm_sub_epi16(rd, center); - - ll = _mm_sub_epi16(ll, center); - rr = _mm_sub_epi16(rr, center); - uu = _mm_sub_epi16(uu, center); - dd = _mm_sub_epi16(dd, center); - - //LEFT! - //calculate abs - diffabs = _mm_abs_epi16(left); //abs - diffabs = _mm_add_epi16(diffabs, four); //+4 - diffabs = _mm_srai_epi16(diffabs, 3); //>>3 - diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) - diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 - diffabs = _mm_shuffle_epi8(lut, diffabs);//lut - diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit - diffabs = _mm_sign_epi16(diffabs, left);//fix sign! - acc = _mm_add_epi16(diffabs, acc); //add to acc - //RIGHT! - //calculate abs - diffabs = _mm_abs_epi16(right); //abs - diffabs = _mm_add_epi16(diffabs, four); //+4 - diffabs = _mm_srai_epi16(diffabs, 3); //>>3 - diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) - diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 - diffabs = _mm_shuffle_epi8(lut, diffabs);//lut - diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit - diffabs = _mm_sign_epi16(diffabs, right);//fix sign! - acc = _mm_add_epi16(diffabs, acc); //add to acc - //UP! - //calculate abs - diffabs = _mm_abs_epi16(up); //abs - diffabs = _mm_add_epi16(diffabs, four); //+4 - diffabs = _mm_srai_epi16(diffabs, 3); //>>3 - diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) - diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 - diffabs = _mm_shuffle_epi8(lut, diffabs);//lut - diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit - diffabs = _mm_sign_epi16(diffabs, up);//fix sign! 
- acc = _mm_add_epi16(diffabs, acc); //add to acc - - //DOWN! - //calculate abs - diffabs = _mm_abs_epi16(down); //abs - diffabs = _mm_add_epi16(diffabs, four); //+4 - diffabs = _mm_srai_epi16(diffabs, 3); //>>3 - diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) - diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 - diffabs = _mm_shuffle_epi8(lut, diffabs);//lut - diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit - diffabs = _mm_sign_epi16(diffabs, down);//fix sign! - acc = _mm_add_epi16(diffabs, acc); //add to acc - - //lu! - //calculate abs - diffabs = _mm_abs_epi16(lu); //abs - diffabs = _mm_add_epi16(diffabs, four); //+4 - diffabs = _mm_srai_epi16(diffabs, 3); //>>3 - diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) - diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 - diffabs = _mm_shuffle_epi8(lut, diffabs);//lut - diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit - diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! - diffabs = _mm_sign_epi16(diffabs, lu);//fix sign! - acc = _mm_add_epi16(diffabs, acc); //add to acc - //ld! - //calculate abs - diffabs = _mm_abs_epi16(ld); //abs - diffabs = _mm_add_epi16(diffabs, four); //+4 - diffabs = _mm_srai_epi16(diffabs, 3); //>>3 - diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) - diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 - diffabs = _mm_shuffle_epi8(lut, diffabs);//lut - diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit - diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! - diffabs = _mm_sign_epi16(diffabs, ld);//fix sign! - acc = _mm_add_epi16(diffabs, acc); //add to acc - //ru! 
- //calculate abs - diffabs = _mm_abs_epi16(ru); //abs - diffabs = _mm_add_epi16(diffabs, four); //+4 - diffabs = _mm_srai_epi16(diffabs, 3); //>>3 - diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) - diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 - diffabs = _mm_shuffle_epi8(lut, diffabs);//lut - diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit - diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! - diffabs = _mm_sign_epi16(diffabs, ru);//fix sign! - acc = _mm_add_epi16(diffabs, acc); //add to acc - //rd! - //calculate abs - diffabs = _mm_abs_epi16(rd); //abs - diffabs = _mm_add_epi16(diffabs, four); //+4 - diffabs = _mm_srai_epi16(diffabs, 3); //>>3 - diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) - diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 - diffabs = _mm_shuffle_epi8(lut, diffabs);//lut - diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit - diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! - diffabs = _mm_sign_epi16(diffabs, rd);//fix sign! - acc = _mm_add_epi16(diffabs, acc); //add to acc - - //ll! - //calculate abs - diffabs = _mm_abs_epi16(ll); //abs - diffabs = _mm_add_epi16(diffabs, four); //+4 - diffabs = _mm_srai_epi16(diffabs, 3); //>>3 - diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) - diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 - diffabs = _mm_shuffle_epi8(lut, diffabs);//lut - diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit - diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! - diffabs = _mm_sign_epi16(diffabs, ll);//fix sign! - acc = _mm_add_epi16(diffabs, acc); //add to acc - //rr! 
- //calculate abs - diffabs = _mm_abs_epi16(rr); //abs - diffabs = _mm_add_epi16(diffabs, four); //+4 - diffabs = _mm_srai_epi16(diffabs, 3); //>>3 - diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) - diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 - diffabs = _mm_shuffle_epi8(lut, diffabs);//lut - diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit - diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! - diffabs = _mm_sign_epi16(diffabs, rr);//fix sign! - acc = _mm_add_epi16(diffabs, acc); //add to acc - //uu! - //calculate abs - diffabs = _mm_abs_epi16(uu); //abs - diffabs = _mm_add_epi16(diffabs, four); //+4 - diffabs = _mm_srai_epi16(diffabs, 3); //>>3 - diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) - diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 - diffabs = _mm_shuffle_epi8(lut, diffabs);//lut - diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit - diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! - diffabs = _mm_sign_epi16(diffabs, uu);//fix sign! - acc = _mm_add_epi16(diffabs, acc); //add to acc - //dd! - //calculate abs - diffabs = _mm_abs_epi16(dd); //abs - diffabs = _mm_add_epi16(diffabs, four); //+4 - diffabs = _mm_srai_epi16(diffabs, 3); //>>3 - diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) - diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 - diffabs = _mm_shuffle_epi8(lut, diffabs);//lut - diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit - diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! - diffabs = _mm_sign_epi16(diffabs, dd);//fix sign! 
- acc = _mm_add_epi16(diffabs, acc); //add to acc - - if (bfac == 2) - { - acc = _mm_slli_epi16(acc, 1); // Shift left to get 2* - } - else if (bfac == 3) - { - temp = _mm_slli_epi16(acc, 1); // Multiply by two by shifting left - acc = _mm_add_epi16(acc, temp); // Add original value to get 3* - } - - // Add 16 and shift 5 - acc = _mm_add_epi16(acc, round_add); - acc = _mm_srai_epi16(acc, bif_round_shift); - - // Instead we add our input values to the delta - if(isRDO) - { - acc = _mm_add_epi16(acc, center); - } - else - { - int16_t *recpoint = &recPtr[row * recStride + col]; - inputVals = _mm_loadu_si128((__m128i*)(recpoint)); - acc = _mm_add_epi16(acc, inputVals); - } - - // Clip - acc = _mm_max_epi16(acc, clipmin); - acc = _mm_min_epi16(acc, clipmax); - - _mm_store_si128((__m128i*)(blkFilt + (row + pad) * (padwidth + 4) + col + pad), acc); + bfac = 2; + } + else if( size >= 16 ) + { + bfac = 1; + } + else + { + bfac = 2; } } - - // Copy back from tempbufFilter to recBuf - int onerow = uiWidth * sizeof(Pel); - for(uint32_t yy = 0; yy < uiHeight; yy++) + + int sqp = qp; + + if( sqp < 17 ) + { + sqp = 17; + } + + if( sqp > 42 ) { - std::memcpy(recPtr, tempBlockPtr, onerow); - recPtr += recStride; - tempBlockPtr += tempBlockStride; + sqp = 42; } + + return m_wBIF[sqp - 17]; } -void BilateralFilter::blockBilateralFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO) +void BilateralFilter::blockBilateralFilterDiamond5x5( uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO, const char* LUTrowPtr ) { int pad = 2; -#ifdef TARGET_SIMD_X86 - if ((uiWidth >= 8) || (!isRDO && (uiWidth >= 4))) + int padwidth = iWidthExtSIMD; + int downbuffer[64]; + int 
downleftbuffer[65]; + int downrightbuffer[2][65]; + int Shift, sg0, v0, idx, w0; + Shift = sizeof( int ) * 8 - 1; + downbuffer[0] = 0; + + for( int x = 0; x < uiWidth; x++ ) { - simdFilterDiamond5x5(uiWidth, uiHeight, block, blkFilt, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bif_round_add, bif_round_shift, isRDO); + int pixel = block[(-1 + pad)*padwidth + x + pad]; + int below = block[(-1 + pad + 1)*padwidth + x + pad]; + int diff = below - pixel; + sg0 = diff >> Shift; + v0 = (diff + sg0) ^ sg0; + v0 = (v0 + 4) >> 3; + idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift)); + w0 = LUTrowPtr[idx]; + int mod = (w0 + sg0) ^ sg0; + downbuffer[x] = mod; + + int belowright = block[(-1 + pad + 1)*padwidth + x + pad + 1]; + diff = belowright - pixel; + sg0 = diff >> Shift; + v0 = (diff + sg0) ^ sg0; + v0 = (v0 + 4) >> 3; + idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift)); + w0 = LUTrowPtr[idx] >> 1; + mod = (w0 + sg0) ^ sg0; + downrightbuffer[1][x + 1] = mod; + + int belowleft = block[(-1 + pad + 1)*padwidth + x + pad - 1]; + diff = belowleft - pixel; + sg0 = diff >> Shift; + v0 = (diff + sg0) ^ sg0; + v0 = (v0 + 4) >> 3; + idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift)); + w0 = LUTrowPtr[idx] >> 1; + mod = (w0 + sg0) ^ sg0; + downleftbuffer[x] = mod; } - else -#endif + int width = uiWidth; + for( int y = 0; y < uiHeight; y++ ) { - - int padwidth = iWidthExtSIMD; - int downbuffer[64]; - int downleftbuffer[65]; - int downrightbuffer[2][65]; - int Shift, sg0, v0, idx, w0; - Shift = sizeof(int) * 8 - 1; - downbuffer[0] = 0; - - for (int x = 0; x < uiWidth; x++) - { - int pixel = block[(-1 + pad)*padwidth + x + pad]; - int below = block[(-1 + pad + 1)*padwidth + x + pad]; - int diff = below - pixel; + int diff; + + int16_t *rowStart = &block[(y + pad)*padwidth + pad]; + + int pixel = rowStart[-1]; + + int right = rowStart[0]; + diff = right - pixel; + sg0 = diff >> Shift; + v0 = (diff + sg0) ^ sg0; + v0 = (v0 + 4) >> 3; + idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift)); + w0 = LUTrowPtr[idx]; 
+ int mod = (w0 + sg0) ^ sg0; + int rightmod = mod; + + pixel = rowStart[-padwidth - 1]; + int belowright = right; + diff = belowright - pixel; + sg0 = diff >> Shift; + v0 = (diff + sg0) ^ sg0; + v0 = (v0 + 4) >> 3; + idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift)); + w0 = LUTrowPtr[idx] >> 1; + mod = (w0 + sg0) ^ sg0; + downrightbuffer[(y + 1) % 2][0] = mod; + + pixel = rowStart[-padwidth + width]; + int belowleft = rowStart[width - 1]; + diff = belowleft - pixel; + sg0 = diff >> Shift; + v0 = (diff + sg0) ^ sg0; + v0 = (v0 + 4) >> 3; + idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift)); + w0 = LUTrowPtr[idx] >> 1; + mod = (w0 + sg0) ^ sg0; + downleftbuffer[width] = mod; + + for( int x = 0; x < uiWidth; x++ ) + { + pixel = rowStart[x]; + int modsum = 0; + + int abovemod = -downbuffer[x]; + modsum += abovemod; + + int leftmod = -rightmod; + modsum += leftmod; + + right = rowStart[x + 1]; + diff = right - pixel; sg0 = diff >> Shift; v0 = (diff + sg0) ^ sg0; v0 = (v0 + 4) >> 3; idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift)); w0 = LUTrowPtr[idx]; - int mod = (w0 + sg0) ^ sg0; + mod = (w0 + sg0) ^ sg0; + + modsum += mod; + rightmod = mod; + + int below = rowStart[x + padwidth]; + diff = below - pixel; + sg0 = diff >> Shift; + v0 = (diff + sg0) ^ sg0; + v0 = (v0 + 4) >> 3; + idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift)); + w0 = LUTrowPtr[idx]; + mod = (w0 + sg0) ^ sg0; + modsum += mod; downbuffer[x] = mod; - - int belowright = block[(-1 + pad + 1)*padwidth + x + pad + 1]; + + int aboverightmod = -downleftbuffer[x + 1]; + // modsum += ((int16_t)((uint16_t)((aboverightmod) >> 1))); + modsum += aboverightmod; + + int aboveleftmod = -downrightbuffer[(y + 1) % 2][x]; + // modsum += ((int16_t)((uint16_t)((aboveleftmod) >> 1))); + modsum += aboveleftmod; + + int belowleft = rowStart[x + padwidth - 1]; + diff = belowleft - pixel; + sg0 = diff >> Shift; + v0 = (diff + sg0) ^ sg0; + v0 = (v0 + 4) >> 3; + idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift)); + w0 = LUTrowPtr[idx] >> 1; + mod = (w0 + sg0) 
^ sg0; + // modsum += ((int16_t)((uint16_t)((mod) >> 1))); + modsum += mod; + downleftbuffer[x] = mod; + + int belowright = rowStart[x + padwidth + 1]; diff = belowright - pixel; sg0 = diff >> Shift; v0 = (diff + sg0) ^ sg0; @@ -357,207 +265,87 @@ void BilateralFilter::blockBilateralFilterDiamond5x5(uint32_t uiWidth, uint32_t idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift)); w0 = LUTrowPtr[idx] >> 1; mod = (w0 + sg0) ^ sg0; - downrightbuffer[1][x + 1] = mod; - - int belowleft = block[(-1 + pad + 1)*padwidth + x + pad - 1]; - diff = belowleft - pixel; + //modsum += ((int16_t)((uint16_t)((mod) >> 1))); + modsum += mod; + downrightbuffer[y % 2][x + 1] = mod; + + // For samples two pixels out, we do not reuse previously calculated + // values even though that is possible. Doing so would likely increase + // speed when SIMD is turned off. + + int above = rowStart[x - 2 * padwidth]; + diff = above - pixel; sg0 = diff >> Shift; v0 = (diff + sg0) ^ sg0; v0 = (v0 + 4) >> 3; idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift)); w0 = LUTrowPtr[idx] >> 1; mod = (w0 + sg0) ^ sg0; - downleftbuffer[x] = mod; - } - int width = uiWidth; - for (int y = 0; y < uiHeight; y++) - { - int diff; - - int16_t *rowStart = &block[(y + pad)*padwidth + pad]; - - int pixel = rowStart[-1]; - - int right = rowStart[0]; - diff = right - pixel; + modsum += mod; + + below = rowStart[x + 2 * padwidth]; + diff = below - pixel; sg0 = diff >> Shift; v0 = (diff + sg0) ^ sg0; v0 = (v0 + 4) >> 3; idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift)); - w0 = LUTrowPtr[idx]; - int mod = (w0 + sg0) ^ sg0; - int rightmod = mod; - - pixel = rowStart[-padwidth - 1]; - int belowright = right; - diff = belowright - pixel; + w0 = LUTrowPtr[idx] >> 1; + mod = (w0 + sg0) ^ sg0; + modsum += mod; + + int left = rowStart[x - 2]; + diff = left - pixel; sg0 = diff >> Shift; v0 = (diff + sg0) ^ sg0; v0 = (v0 + 4) >> 3; idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift)); w0 = LUTrowPtr[idx] >> 1; mod = (w0 + sg0) ^ sg0; - downrightbuffer[(y + 1) % 2][0] = 
mod; - - pixel = rowStart[-padwidth + width]; - int belowleft = rowStart[width - 1]; - diff = belowleft - pixel; + modsum += mod; + + right = rowStart[x + 2]; + diff = right - pixel; sg0 = diff >> Shift; v0 = (diff + sg0) ^ sg0; v0 = (v0 + 4) >> 3; idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift)); w0 = LUTrowPtr[idx] >> 1; mod = (w0 + sg0) ^ sg0; - downleftbuffer[width] = mod; - - - for (int x = 0; x < uiWidth; x++) - { - - pixel = rowStart[x]; - - int modsum = 0; - - - int abovemod = -downbuffer[x]; - modsum += abovemod; - - int leftmod = -rightmod; - modsum += leftmod; - - right = rowStart[x + 1]; - diff = right - pixel; - sg0 = diff >> Shift; - v0 = (diff + sg0) ^ sg0; - v0 = (v0 + 4) >> 3; - idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift)); - w0 = LUTrowPtr[idx]; - mod = (w0 + sg0) ^ sg0; - - modsum += mod; - rightmod = mod; - - int below = rowStart[x + padwidth]; - diff = below - pixel; - sg0 = diff >> Shift; - v0 = (diff + sg0) ^ sg0; - v0 = (v0 + 4) >> 3; - idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift)); - w0 = LUTrowPtr[idx]; - mod = (w0 + sg0) ^ sg0; - modsum += mod; - downbuffer[x] = mod; - - int aboverightmod = -downleftbuffer[x + 1]; - // modsum += ((int16_t)((uint16_t)((aboverightmod) >> 1))); - modsum += aboverightmod; - - int aboveleftmod = -downrightbuffer[(y + 1) % 2][x]; - // modsum += ((int16_t)((uint16_t)((aboveleftmod) >> 1))); - modsum += aboveleftmod; - - int belowleft = rowStart[x + padwidth - 1]; - diff = belowleft - pixel; - sg0 = diff >> Shift; - v0 = (diff + sg0) ^ sg0; - v0 = (v0 + 4) >> 3; - idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift)); - w0 = LUTrowPtr[idx] >> 1; - mod = (w0 + sg0) ^ sg0; - // modsum += ((int16_t)((uint16_t)((mod) >> 1))); - modsum += mod; - downleftbuffer[x] = mod; - - int belowright = rowStart[x + padwidth + 1]; - diff = belowright - pixel; - sg0 = diff >> Shift; - v0 = (diff + sg0) ^ sg0; - v0 = (v0 + 4) >> 3; - idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift)); - w0 = LUTrowPtr[idx] >> 1; - mod = (w0 + sg0) ^ sg0; - //modsum += 
((int16_t)((uint16_t)((mod) >> 1))); - modsum += mod; - downrightbuffer[y % 2][x + 1] = mod; - - // For samples two pixels out, we do not reuse previously calculated - // values even though that is possible. Doing so would likely increase - // speed when SIMD is turned off. - - int above = rowStart[x - 2*padwidth]; - diff = above - pixel; - sg0 = diff >> Shift; - v0 = (diff + sg0) ^ sg0; - v0 = (v0 + 4) >> 3; - idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift)); - w0 = LUTrowPtr[idx] >> 1; - mod = (w0 + sg0) ^ sg0; - modsum += mod; - - below = rowStart[x + 2*padwidth]; - diff = below - pixel; - sg0 = diff >> Shift; - v0 = (diff + sg0) ^ sg0; - v0 = (v0 + 4) >> 3; - idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift)); - w0 = LUTrowPtr[idx] >> 1; - mod = (w0 + sg0) ^ sg0; - modsum += mod; - - int left = rowStart[x - 2]; - diff = left - pixel; - sg0 = diff >> Shift; - v0 = (diff + sg0) ^ sg0; - v0 = (v0 + 4) >> 3; - idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift)); - w0 = LUTrowPtr[idx] >> 1; - mod = (w0 + sg0) ^ sg0; - modsum += mod; - - right = rowStart[x + 2]; - diff = right - pixel; - sg0 = diff >> Shift; - v0 = (diff + sg0) ^ sg0; - v0 = (v0 + 4) >> 3; - idx = 15 + ((v0 - 15)&((v0 - 15) >> Shift)); - w0 = LUTrowPtr[idx] >> 1; - mod = (w0 + sg0) ^ sg0; - modsum += mod; - - blkFilt[(y + pad)*(padwidth+4) + x + pad] = ((int16_t)((uint16_t)((modsum*bfac + bif_round_add) >> bif_round_shift))); - } + modsum += mod; + + blkFilt[(y + pad)*(padwidth + 4) + x + pad] = (( int16_t ) (( uint16_t ) ((modsum*bfac + bif_round_add) >> bif_round_shift))); } + } - // Copy back - Pel *tempBlockPtr = (short*)blkFilt + (((padwidth+4) << 1) + 2); - int tempBlockStride = padwidth+4; - if(isRDO) + // Copy back + Pel *tempBlockPtr = ( short* ) blkFilt + (((padwidth + 4) << 1) + 2); + int tempBlockStride = padwidth + 4; + if( isRDO ) + { + Pel *srcBlockPtr = ( short* ) block + (((padwidth) << 1) + 2); + int srcBlockStride = padwidth; + for( uint32_t yy = 0; yy < uiHeight; yy++ ) { - Pel *srcBlockPtr = 
(short*)block + (((padwidth) << 1) + 2); - int srcBlockStride = padwidth; - for(uint32_t yy = 0; yy < uiHeight; yy++) + for( uint32_t xx = 0; xx < uiWidth; xx++ ) { - for(uint32_t xx = 0; xx < uiWidth; xx++) - { - recPtr[xx] = ClipPel(srcBlockPtr[xx] + tempBlockPtr[xx], clpRng); - } - recPtr += recStride; - tempBlockPtr += tempBlockStride; - srcBlockPtr += srcBlockStride; + recPtr[xx] = ClipPel( srcBlockPtr[xx] + tempBlockPtr[xx], clpRng ); } + recPtr += recStride; + tempBlockPtr += tempBlockStride; + srcBlockPtr += srcBlockStride; } - else + } + else + { + for( uint32_t yy = 0; yy < uiHeight; yy++ ) { - for(uint32_t yy = 0; yy < uiHeight; yy++) + for( uint32_t xx = 0; xx < uiWidth; xx++ ) { - for(uint32_t xx = 0; xx < uiWidth; xx++) - { - // new result = old result (which is SAO-treated already) + diff due to bilateral filtering - recPtr[xx] = ClipPel<int>(recPtr[xx] + tempBlockPtr[xx], clpRng); - } - recPtr += recStride; - tempBlockPtr += tempBlockStride; + // new result = old result (which is SAO-treated already) + diff due to bilateral filtering + recPtr[xx] = ClipPel<int>( recPtr[xx] + tempBlockPtr[xx], clpRng ); } + recPtr += recStride; + tempBlockPtr += tempBlockStride; } } } @@ -568,34 +356,10 @@ void BilateralFilter::bilateralFilterRDOdiamond5x5(PelBuf& resiBuf, const CPelBu const unsigned uiHeight = predBuf.height; int bfac = 1; - - int size = std::min(uiWidth, uiHeight); - if (size <= 4) - bfac = 3; - else if (size >= 16) - bfac = 1; - else - bfac = 2; - if (currTU.cu->predMode == MODE_INTER) - { - if (size <= 4) - bfac = 2; - else if (size >= 16) - bfac = 1; - else - bfac = 2; - } - - qp = qp + currTU.cs->pps->getBIFQPOffset(); int bif_round_add = (BIF_ROUND_ADD) >> (currTU.cs->pps->getBIFStrength()); - int bif_round_shift = (BIF_ROUND_SHIFT) - (currTU.cs->pps->getBIFStrength()); - - int sqp = qp; - if(sqp<17) - sqp = 17; - if(sqp>42) - sqp = 42; - LUTrowPtr = wBIF[sqp-17]; + int bif_round_shift = ( BIF_ROUND_SHIFT ) -(currTU.cs->pps->getBIFStrength()); 
+ + const char* LUTrowPtr = getFilterLutParameters( std::min( uiWidth, uiHeight ), currTU.cu->predMode, qp + currTU.cs->pps->getBIFQPOffset(), bfac ); const unsigned uiPredStride = predBuf.stride; const unsigned uiStrideRes = resiBuf.stride; @@ -612,7 +376,6 @@ void BilateralFilter::bilateralFilterRDOdiamond5x5(PelBuf& resiBuf, const CPelBu Pel *piRecoTemp = piReco; // Reco = Pred + Resi - Pel *tempBlockPtr; uint32_t uiWidthExt = uiWidth + (NUMBER_PADDED_SAMPLES << 1); @@ -702,9 +465,13 @@ void BilateralFilter::bilateralFilterRDOdiamond5x5(PelBuf& resiBuf, const CPelBu { // copy 4 pixels one line above block from block to blockx + 3 std::copy(piRecIPred - (uiRecIPredStride)+blockx, piRecIPred - (uiRecIPredStride)+blockx + 1, tempblock + 2 + uiWidthExt + blockx); - if(doReshape) - for(int xx = 0; xx < 1; xx++) - tempblock[2+uiWidthExt+blockx+xx] = pLUT[tempblock[2+uiWidthExt+blockx+xx]]; + if( doReshape ) + { + for( int xx = 0; xx < 1; xx++ ) + { + tempblock[2 + uiWidthExt + blockx + xx] = pLUT[tempblock[2 + uiWidthExt + blockx + xx]]; + } + } } } else if (subTuHor) @@ -716,13 +483,23 @@ void BilateralFilter::bilateralFilterRDOdiamond5x5(PelBuf& resiBuf, const CPelBu const Pel *earlierPel = earlierHalfBuf.buf + (currTU.prev->lheight() - 1)*earlierStride; std::copy(earlierPel, earlierPel + area.width, tempblock + 2 + uiWidthExt); - if(doReshape) - for(int xx = 0; xx < area.width; xx++) - tempblock[2+uiWidthExt+xx] = pLUT[tempblock[2+uiWidthExt+xx]]; + if( doReshape ) + { + for( int xx = 0; xx < area.width; xx++ ) + { + tempblock[2 + uiWidthExt + xx] = pLUT[tempblock[2 + uiWidthExt + xx]]; + } + } + std::copy(earlierPel - earlierStride, earlierPel - earlierStride + area.width, tempblock + 2); - if(doReshape) - for(int xx = 0; xx < area.width; xx++) - tempblock[2+xx] = pLUT[tempblock[2+xx]]; + + if( doReshape ) + { + for( int xx = 0; xx < area.width; xx++ ) + { + tempblock[2 + xx] = pLUT[tempblock[2 + xx]]; + } + } } // left column if (leftAvailable) @@ -767,7 +544,7 
@@ void BilateralFilter::bilateralFilterRDOdiamond5x5(PelBuf& resiBuf, const CPelBu std::copy(tempblock + uiWidthExt, tempblock + uiWidthExt + uiWidthExt, tempblock); std::copy(tempblock + uiWidthExt*(uiHeightExt-2), tempblock + uiWidthExt*(uiHeightExt-2) + uiWidthExt, tempblock + uiWidthExt*(uiHeightExt-1)); - blockBilateralFilterDiamond5x5(uiWidth, uiHeight, tempblock, tempblockFiltered, clpRng, piReco, uiRecStride, uiWidth + 4, bfac, bif_round_add, bif_round_shift, true); + m_bilateralFilterDiamond5x5(uiWidth, uiHeight, tempblock, tempblockFiltered, clpRng, piReco, uiRecStride, uiWidth + 4, bfac, bif_round_add, bif_round_shift, true, LUTrowPtr ); if (!useReco) { @@ -802,36 +579,9 @@ void BilateralFilter::bilateralFilterDiamond5x5(const CPelUnitBuf& src, PelUnitB int recStride = rec.get(COMPONENT_Y).stride; Pel *recPtr = rec.get(COMPONENT_Y).bufAt(compArea); - - int size = std::min(uiWidth, uiHeight); - int bfac = 1; - - if (size <= 4) - bfac = 3; - else if (size >= 16) - bfac = 1; - else - bfac = 2; - if (currTU.cu->predMode == MODE_INTER) - { - if (size <= 4) - bfac = 2; - else if (size >= 16) - bfac = 1; - else - bfac = 2; - } - - // Offset qp before deciding on LUT: - qp = qp + currTU.cs->pps->getBIFQPOffset(); - - int sqp = qp; - if(sqp<17) - sqp = 17; - if(sqp>42) - sqp = 42; - LUTrowPtr = wBIF[sqp-17]; + int bfac = 1; + const char* LUTrowPtr = getFilterLutParameters( std::min( uiWidth, uiHeight ), currTU.cu->predMode, qp + currTU.cs->pps->getBIFQPOffset(), bfac ); int bif_round_add = (BIF_ROUND_ADD) >> (currTU.cs->pps->getBIFStrength()); int bif_round_shift = (BIF_ROUND_SHIFT) - (currTU.cs->pps->getBIFStrength()); @@ -846,8 +596,10 @@ void BilateralFilter::bilateralFilterDiamond5x5(const CPelUnitBuf& src, PelUnitB uint32_t uiHeightExt = uiHeight + (NUMBER_PADDED_SAMPLES << 1); int iWidthExtSIMD = uiWidthExt; - if(uiWidth < 8) + if( uiWidth < 8 ) + { iWidthExtSIMD = 8 + (NUMBER_PADDED_SAMPLES << 1); + } Pel *tempBlockPtr; @@ -902,7 +654,7 @@ void 
BilateralFilter::bilateralFilterDiamond5x5(const CPelUnitBuf& src, PelUnitB { std::memcpy(tempBlockPtr, srcPtr, (uiWidthExt) * sizeof(Pel)); } - return blockBilateralFilterDiamond5x5(uiWidth, uiHeight, tempblock, tempblockFiltered, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bif_round_add, bif_round_shift, false); + return m_bilateralFilterDiamond5x5(uiWidth, uiHeight, tempblock, tempblockFiltered, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bif_round_add, bif_round_shift, false, LUTrowPtr ); } else { @@ -1034,7 +786,7 @@ void BilateralFilter::bilateralFilterDiamond5x5(const CPelUnitBuf& src, PelUnitB } } - blockBilateralFilterDiamond5x5(uiWidth, uiHeight, tempblock, tempblockFiltered, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bif_round_add, bif_round_shift, false); + m_bilateralFilterDiamond5x5(uiWidth, uiHeight, tempblock, tempblockFiltered, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bif_round_add, bif_round_shift, false, LUTrowPtr ); } void BilateralFilter::clipNotBilaterallyFilteredBlocks(const CPelUnitBuf& src, PelUnitBuf& rec, const ClpRng& clpRng, TransformUnit & currTU) @@ -1225,10 +977,14 @@ void BilateralFilter::bilateralFilterPicRDOperCTU(CodingStructure& cs, PelUnitBu rec.copyFrom(src); } - if (bifParams.frmOn == 0) - std::fill(bifParams.ctuOn.begin(), bifParams.ctuOn.end(), 0); - else if (bifParams.allCtuOn) - std::fill(bifParams.ctuOn.begin(), bifParams.ctuOn.end(), 1); + if( bifParams.frmOn == 0 ) + { + std::fill( bifParams.ctuOn.begin(), bifParams.ctuOn.end(), 0 ); + } + else if( bifParams.allCtuOn ) + { + std::fill( bifParams.ctuOn.begin(), bifParams.ctuOn.end(), 1 ); + } } #endif diff --git a/source/Lib/CommonLib/BilateralFilter.h b/source/Lib/CommonLib/BilateralFilter.h index 0504013e76fec51da5db3ac5c9ff019188456647..4cbd41be7cb19150bafba652de4e71334c4ec209 100755 --- a/source/Lib/CommonLib/BilateralFilter.h +++ b/source/Lib/CommonLib/BilateralFilter.h @@ -64,12 +64,10 @@ private: // = 2313 128-bit words which has been rounded 
up to 2320 above. short *tempblockFiltered = &tempblockFilteredTemp[-2]; - void blockBilateralFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO); - void simdFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO); - - char *LUTrowPtr; + void (*m_bilateralFilterDiamond5x5)( uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO, const char* LUTrowPtr); + static void blockBilateralFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO, const char* LUTrowPtr ); - char wBIF[26][16] = { + char m_wBIF[26][16] = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { 0, 2, 2, 2, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, }, @@ -112,6 +110,19 @@ public: void bilateralFilterDiamond5x5(const CPelUnitBuf& src, PelUnitBuf& rec, int32_t qp, const ClpRng& clpRng, TransformUnit & currTU); void clipNotBilaterallyFilteredBlocks(const CPelUnitBuf& src, PelUnitBuf& rec, const ClpRng& clpRng, TransformUnit & currTU); + const char* getFilterLutParameters( const int size, const PredMode predMode, const int qp, int& bfac ); + +#if ENABLE_SIMD_BILATERAL_FILTER +#ifdef TARGET_SIMD_X86 + template<X86_VEXT vext> + static void simdFilterDiamond5x5( uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int 
bif_round_add, int bif_round_shift, bool isRDO, const char* LUTrowPtr ); + + void initBilateralFilterX86(); + template <X86_VEXT vext> + void _initBilateralFilterX86(); +#endif +#endif + }; #endif diff --git a/source/Lib/CommonLib/CommonDef.h b/source/Lib/CommonLib/CommonDef.h index 7210ce71c7abed003447fe9083e549086a3dfcc7..7a26296aafdd575c98bcc7e5a92fffb67e651a08 100644 --- a/source/Lib/CommonLib/CommonDef.h +++ b/source/Lib/CommonLib/CommonDef.h @@ -477,11 +477,6 @@ static const int ALF_VB_POS_ABOVE_CTUROW_CHMA = 2; static const int MAX_ENCODER_DEBLOCKING_QUALITY_LAYERS = 8 ; #endif -#if IDCC_TPM_JEM -static const int USE_MORE_BLOCKSIZE_DEPTH_MAX = IDCC_TMP_MaxSize_Depth - 1; -static const int INIT_THRESHOULD_SHIFTBITS = 2; ///< (default 2) Early skip threshold for checking distance. -#endif - #if SHARP_LUMA_DELTA_QP static const uint32_t LUMA_LEVEL_TO_DQP_LUT_MAXSIZE = 1024; ///< max LUT size for QP offset based on luma @@ -955,4 +950,12 @@ static const int MAX_FILTER_LENGTH_FIXED = 13; static const int FIX_FILTER_NUM_COEFF = 42; #endif +#if JVET_V0130_INTRA_TMP +static const int TMP_TEMPLATE_SIZE = 4; // must be multiple of 4 for SIMD +static const int TMP_MAXSIZE_DEPTH = 6; // should be log2(TMP_TEMPLATE_SIZE): keep as 6 to avoid any error +static const int USE_MORE_BLOCKSIZE_DEPTH_MAX = TMP_MAXSIZE_DEPTH - 1; +static const int INIT_THRESHOULD_SHIFTBITS = 2; ///< (default 2) Early skip threshold for checking distance. 
+static const int TMP_SEARCH_RANGE_MULT_FACTOR = 5; +#endif + #endif // end of #ifndef __COMMONDEF__ diff --git a/source/Lib/CommonLib/ContextModelling.cpp b/source/Lib/CommonLib/ContextModelling.cpp index 7855d636f03d74734a51f7f92acb5460e63cabfc..895e739d4da85b93e85c63642f9cebff728cca06 100644 --- a/source/Lib/CommonLib/ContextModelling.cpp +++ b/source/Lib/CommonLib/ContextModelling.cpp @@ -778,17 +778,17 @@ void MergeCtx::setMmvdMergeCandiInfo(PredictionUnit& pu, int candIdx) PU::restrictBiPredMergeCandsOne(pu); } -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP unsigned DeriveCtx::CtxTmpFlag(const CodingUnit& cu) { const CodingStructure* cs = cu.cs; unsigned ctxId = 0; const CodingUnit* cuLeft = cs->getCURestricted(cu.lumaPos().offset(-1, 0), cu, CH_L); - ctxId = (cuLeft && cuLeft->TmpFlag) ? 1 : 0; + ctxId = (cuLeft && cuLeft->tmpFlag) ? 1 : 0; const CodingUnit* cuAbove = cs->getCURestricted(cu.lumaPos().offset(0, -1), cu, CH_L); - ctxId += (cuAbove && cuAbove->TmpFlag) ? 1 : 0; + ctxId += (cuAbove && cuAbove->tmpFlag) ? 1 : 0; ctxId = (cu.lwidth() > 2 * cu.lheight() || cu.lheight() > 2 * cu.lwidth()) ? 
3 : ctxId; diff --git a/source/Lib/CommonLib/ContextModelling.h b/source/Lib/CommonLib/ContextModelling.h index 6e3ad4616ecfce51ec1eacf7418eb2608fce7406..834e03af34a3c377352cdbcc5d6afa0f90870959 100644 --- a/source/Lib/CommonLib/ContextModelling.h +++ b/source/Lib/CommonLib/ContextModelling.h @@ -611,7 +611,7 @@ unsigned CtxAffineFlag( const CodingUnit& cu ); unsigned CtxPredModeFlag( const CodingUnit& cu ); unsigned CtxIBCFlag(const CodingUnit& cu); unsigned CtxMipFlag ( const CodingUnit& cu ); -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP unsigned CtxTmpFlag(const CodingUnit& cu); #endif unsigned CtxPltCopyFlag( const unsigned prevRunType, const unsigned dist ); diff --git a/source/Lib/CommonLib/Contexts.cpp b/source/Lib/CommonLib/Contexts.cpp index 5b61e1b97c98302457599341a7218b96a4771802..dfce0eb5fa9ac8c61ce60203397cadb392f9f0dd 100644 --- a/source/Lib/CommonLib/Contexts.cpp +++ b/source/Lib/CommonLib/Contexts.cpp @@ -1000,15 +1000,15 @@ const CtxSet ContextSetCfg::MipFlag = ContextSetCfg::addCtxSet { 9, 9, 8, 6 }, { 10, 10, 9, 6 } }); -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP const CtxSet ContextSetCfg::TmpFlag = ContextSetCfg::addCtxSet ({ { CNU, CNU, CNU, CNU, }, { CNU, CNU, CNU, CNU, }, { CNU, CNU, CNU, CNU, }, - { DWS, DWS, DWS, DWS, }, - { DWS, DWS, DWS, DWS, }, - { DWS, DWS, DWS, DWS, }, + { DWS, DWS, DWS, DWS, }, + { DWS, DWS, DWS, DWS, }, + { DWS, DWS, DWS, DWS, }, }); #endif @@ -2089,13 +2089,13 @@ const CtxSet ContextSetCfg::MipFlag = ContextSetCfg::addCtxSet { 33, 49, 50, 25, }, { 9, 10, 9, 6, }, }); -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP const CtxSet ContextSetCfg::TmpFlag = ContextSetCfg::addCtxSet ({ { CNU, CNU, CNU, CNU, }, { CNU, CNU, CNU, CNU, }, { CNU, CNU, CNU, CNU, }, - { DWS, DWS, DWS, DWS, }, + { DWS, DWS, DWS, DWS, }, }); #endif diff --git a/source/Lib/CommonLib/Contexts.h b/source/Lib/CommonLib/Contexts.h index 0506ecf1983adfc686a63adacecea0f47646e06f..8993e11bbff8dccf4930e823bf6bb63ff935030c 100644 --- 
a/source/Lib/CommonLib/Contexts.h +++ b/source/Lib/CommonLib/Contexts.h @@ -245,7 +245,7 @@ public: static const CtxSet CclmModeIdx; static const CtxSet IntraChromaPredMode; static const CtxSet MipFlag; -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP static const CtxSet TmpFlag; #endif #if MMLM diff --git a/source/Lib/CommonLib/IntraPrediction.cpp b/source/Lib/CommonLib/IntraPrediction.cpp index 4df14069b127922b9e232a2631d6b7e8356ddf01..bc5c898ed0d09a8e270f0db2f8d5176041b8e4dc 100644 --- a/source/Lib/CommonLib/IntraPrediction.cpp +++ b/source/Lib/CommonLib/IntraPrediction.cpp @@ -686,7 +686,7 @@ void IntraPrediction::initPredIntraParams(const PredictionUnit & pu, const CompA if( sps.getSpsRangeExtension().getIntraSmoothingDisabledFlag() || !isLuma( chType ) || useISP -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP || PU::isTmp(pu, chType) #endif || PU::isMIP( pu, chType ) @@ -1394,7 +1394,7 @@ void IntraPrediction::initIntraPatternChTypeISP(const CodingUnit& cu, const Comp } } -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP bool IntraPrediction::isRefTemplateAvailable(CodingUnit& cu, CompArea& area) { const ChannelType chType = toChannelType(area.compID); @@ -1402,14 +1402,12 @@ bool IntraPrediction::isRefTemplateAvailable(CodingUnit& cu, CompArea& area) const SPS& sps = *cs.sps; const PreCalcValues& pcv = *cs.pcv; - const int tuWidth = area.width; const int tuHeight = area.height; const int predSize = m_topRefLength; const int predHSize = m_leftRefLength; //const int predStride = predSize; - const int unitWidth = pcv.minCUWidth >> getComponentScaleX(area.compID, sps.getChromaFormatIdc()); const int unitHeight = pcv.minCUHeight >> getComponentScaleY(area.compID, sps.getChromaFormatIdc()); @@ -1421,8 +1419,10 @@ bool IntraPrediction::isRefTemplateAvailable(CodingUnit& cu, CompArea& area) const int numAboveRightUnits = totalAboveUnits - numAboveUnits; const int numLeftBelowUnits = totalLeftUnits - numLeftUnits; - if (numAboveUnits <= 0 || numLeftUnits <= 0 || numAboveRightUnits 
<= 0 || numLeftBelowUnits <= 0) - return false; + if( numAboveUnits <= 0 || numLeftUnits <= 0 || numAboveRightUnits <= 0 || numLeftBelowUnits <= 0 ) + { + return false; + } // ----- Step 1: analyze neighborhood ----- const Position posLT = area; diff --git a/source/Lib/CommonLib/IntraPrediction.h b/source/Lib/CommonLib/IntraPrediction.h index 5d7ae3173634dd702b00ccf6248c3021e7e5f51e..95a98729e8b67700dfec4de8d5f5f4ad4614de51 100644 --- a/source/Lib/CommonLib/IntraPrediction.h +++ b/source/Lib/CommonLib/IntraPrediction.h @@ -144,7 +144,7 @@ protected: void xPredIntraBDPCM ( const CPelBuf &pSrc, PelBuf &pDst, const uint32_t dirMode, const ClpRng& clpRng ); Pel xGetPredValDc ( const CPelBuf &pSrc, const Size &dstSize ); -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP bool isRefTemplateAvailable(CodingUnit& cu, CompArea& area); #endif diff --git a/source/Lib/CommonLib/Slice.cpp b/source/Lib/CommonLib/Slice.cpp index 94ef8a9abcb4733eb8b16024af158ac1cfdbf6a5..934d14c39ac50bc84650f081b54be54510cfe0e7 100644 --- a/source/Lib/CommonLib/Slice.cpp +++ b/source/Lib/CommonLib/Slice.cpp @@ -3113,9 +3113,9 @@ SPS::SPS() #if ENABLE_DIMD , m_dimd ( false ) #endif -#if IDCC_TPM_JEM -, m_IntraTMP ( false ) -, m_IntraTMP_MaxSize ( 64 ) +#if JVET_V0130_INTRA_TMP +, m_intraTMP ( false ) +, m_intraTmpMaxSize ( 64 ) #endif #if ENABLE_OBMC , m_OBMC ( false ) diff --git a/source/Lib/CommonLib/Slice.h b/source/Lib/CommonLib/Slice.h index ec74b504c951a6289b65f4852c511a86dc6f5efe..f53244b1dbbedad876acfc6f5385cf719faa4399 100644 --- a/source/Lib/CommonLib/Slice.h +++ b/source/Lib/CommonLib/Slice.h @@ -1651,9 +1651,9 @@ private: #if ENABLE_DIMD bool m_dimd; #endif -#if IDCC_TPM_JEM - bool m_IntraTMP; ///< intra Template Matching - unsigned m_IntraTMP_MaxSize; ///< max CU size for which intra TMP is allowed +#if JVET_V0130_INTRA_TMP + bool m_intraTMP; ///< intra Template Matching + unsigned m_intraTmpMaxSize; ///< max CU size for which intra TMP is allowed #endif #if ENABLE_OBMC bool m_OBMC; @@ 
-2077,11 +2077,11 @@ void setCCALFEnabledFlag( bool b ) void setUseDimd ( bool b ) { m_dimd = b; } bool getUseDimd () const { return m_dimd; } #endif -#if IDCC_TPM_JEM - void setUseIntraTMP(bool b) { m_IntraTMP = b; } - bool getUseIntraTMP() const { return m_IntraTMP; } - void setIntraTMPMaxSize(unsigned n) { m_IntraTMP_MaxSize = n; } - unsigned getIntraTMPMaxSize() const { return m_IntraTMP_MaxSize; } +#if JVET_V0130_INTRA_TMP + void setUseIntraTMP (bool b) { m_intraTMP = b; } + bool getUseIntraTMP () const { return m_intraTMP; } + void setIntraTMPMaxSize (unsigned n) { m_intraTmpMaxSize = n; } + unsigned getIntraTMPMaxSize () const { return m_intraTmpMaxSize; } #endif #if ENABLE_OBMC void setUseOBMC ( bool b ) { m_OBMC = b; } diff --git a/source/Lib/CommonLib/TrQuant.cpp b/source/Lib/CommonLib/TrQuant.cpp index c74ab4c431e67373d95234b32e93db2c78fd177f..5889375a2e2f21adafaf099111dfe9810d04b936 100644 --- a/source/Lib/CommonLib/TrQuant.cpp +++ b/source/Lib/CommonLib/TrQuant.cpp @@ -55,16 +55,14 @@ #include "CommonLib/CodingStatistics.h" #endif -#if IDCC_TMP_SIMD +#if ENABLE_SIMD_TMP #include "CommonDefX86.h" #endif -#if IDCC_TPM_JEM - +#if JVET_V0130_INTRA_TMP unsigned int g_uiDepth2Width[5] = { 4, 8, 16, 32, 64 }; #endif - struct coeffGroupRDStats { int iNNZbeforePos0; @@ -197,8 +195,8 @@ TrQuant::TrQuant() : m_quant( nullptr ) m_fwdICT[-2] = fwdTransformCbCr<-2>; m_fwdICT[ 3] = fwdTransformCbCr< 3>; m_fwdICT[-3] = fwdTransformCbCr<-3>; -#if IDCC_TPM_JEM - m_pppTarPatch = NULL; +#if JVET_V0130_INTRA_TMP + m_pppTarPatch = NULL; #endif } } @@ -210,17 +208,15 @@ TrQuant::~TrQuant() delete m_quant; m_quant = nullptr; } -#if IDCC_TPM_JEM -#endif -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP if (m_pppTarPatch != NULL) { for (unsigned int uiDepth = 0; uiDepth < USE_MORE_BLOCKSIZE_DEPTH_MAX; uiDepth++) { unsigned int blkSize = g_uiDepth2Width[uiDepth]; - unsigned int patchSize = blkSize + IDCC_TemplateSize; + unsigned int patchSize = blkSize + TMP_TEMPLATE_SIZE; for 
(unsigned int uiRow = 0; uiRow < patchSize; uiRow++) { if (m_pppTarPatch[uiDepth][uiRow] != NULL) @@ -275,7 +271,7 @@ void TrQuant::init( const Quant* otherQuant, } -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP unsigned int blkSize; if (m_pppTarPatch == NULL) @@ -285,7 +281,7 @@ void TrQuant::init( const Quant* otherQuant, { blkSize = g_uiDepth2Width[uiDepth]; - unsigned int patchSize = blkSize + IDCC_TemplateSize; + unsigned int patchSize = blkSize + TMP_TEMPLATE_SIZE; m_pppTarPatch[uiDepth] = new Pel * [patchSize]; for (unsigned int uiRow = 0; uiRow < patchSize; uiRow++) { @@ -328,6 +324,9 @@ void TrQuant::init( const Quant* otherQuant, #if ENABLE_SIMD_SIGN_PREDICTION m_computeSAD = xComputeSAD; #endif +#if INTRA_TEMPLATE_MATCHING + m_calcTemplateDiff = calcTemplateDiff; +#endif #if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT #ifdef TARGET_SIMD_X86 @@ -435,32 +434,30 @@ void TrQuant::invLfnstNxN( int* src, int* dst, const uint32_t mode, const uint32 } } -#if IDCC_TPM_JEM -void insertNode(DistType diff, int& iXOffset, int& iYOffset, DistType& pDiff, int& pX, int& pY, short& pId, unsigned int& setId) +#if JVET_V0130_INTRA_TMP +void insertNode(int diff, int& iXOffset, int& iYOffset, int& pDiff, int& pX, int& pY, short& pId, unsigned int& setId) { pDiff = diff; pX = iXOffset; pY = iYOffset; pId = setId; } -#if IDCC_TPM_JEM + void clipMvIntraConstraint(CodingUnit* pcCU, int regionId, int& iHorMin, int& iHorMax, int& iVerMin, int& iVerMax, unsigned int uiTemplateSize, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int iCurrY, int iCurrX, int offsetLCUY, int offsetLCUX) { - int SearchRange_Height, SearchRange_Width; - - SearchRange_Width = IDCC_SearchRangeMultFactor * uiBlkWidth; - SearchRange_Height = IDCC_SearchRangeMultFactor * uiBlkHeight; - int iMvShift = 0; + int searchRangeWidth = TMP_SEARCH_RANGE_MULT_FACTOR * uiBlkWidth; + int searchRangeHeight = TMP_SEARCH_RANGE_MULT_FACTOR * uiBlkHeight; + int iMvShift = 0; int iTemplateSize = uiTemplateSize; int 
iBlkWidth = uiBlkWidth; int iBlkHeight = uiBlkHeight; if (regionId == 0) //above outside LCU { - iHorMax = std::min((iCurrX + SearchRange_Width) << iMvShift, (int)((pcCU->cs->sps->getMaxPicWidthInLumaSamples() - iBlkWidth) << iMvShift)); - iHorMin = std::max((iTemplateSize) << iMvShift, (iCurrX - SearchRange_Width) << iMvShift); + iHorMax = std::min((iCurrX + searchRangeWidth) << iMvShift, (int)((pcCU->cs->sps->getMaxPicWidthInLumaSamples() - iBlkWidth) << iMvShift)); + iHorMin = std::max((iTemplateSize) << iMvShift, (iCurrX - searchRangeWidth) << iMvShift); iVerMax = (iCurrY - iBlkHeight - offsetLCUY) << iMvShift; - iVerMin = std::max(((iTemplateSize) << iMvShift), ((iCurrY - SearchRange_Height) << iMvShift)); + iVerMin = std::max(((iTemplateSize) << iMvShift), ((iCurrY - searchRangeHeight) << iMvShift)); iHorMin = iHorMin - iCurrX; iHorMax = iHorMax - iCurrX; @@ -470,7 +467,7 @@ void clipMvIntraConstraint(CodingUnit* pcCU, int regionId, int& iHorMin, int& iH else if (regionId == 1) //left outside LCU { iHorMax = (iCurrX - offsetLCUX - iBlkWidth) << iMvShift; - iHorMin = std::max((iTemplateSize) << iMvShift, (iCurrX - SearchRange_Width) << iMvShift); + iHorMin = std::max((iTemplateSize) << iMvShift, (iCurrX - searchRangeWidth) << iMvShift); iVerMin = std::max((iTemplateSize) << iMvShift, (iCurrY - iBlkHeight - offsetLCUY) << iMvShift); iVerMax = (iCurrY) << iMvShift; @@ -482,7 +479,7 @@ void clipMvIntraConstraint(CodingUnit* pcCU, int regionId, int& iHorMin, int& iH } else if (regionId == 2) //left outside LCU (can reach the bottom row of LCU) { - iHorMin = std::max((iTemplateSize) << iMvShift, (iCurrX - SearchRange_Width) << iMvShift); + iHorMin = std::max((iTemplateSize) << iMvShift, (iCurrX - searchRangeWidth) << iMvShift); iHorMax = (iCurrX - offsetLCUX - iBlkWidth) << iMvShift; iVerMin = (iCurrY + 1) << iMvShift; iVerMax = std::min(pcCU->cs->sps->getMaxPicHeightInLumaSamples() - iBlkHeight, (iCurrY - offsetLCUY + pcCU->cs->sps->getCTUSize() - iBlkHeight) << 
iMvShift); @@ -493,10 +490,7 @@ void clipMvIntraConstraint(CodingUnit* pcCU, int regionId, int& iHorMin, int& iH iVerMin = iVerMin - iCurrY; } } -#endif -#endif -#if IDCC_TPM_JEM TempLibFast::TempLibFast() { } @@ -504,12 +498,10 @@ TempLibFast::TempLibFast() TempLibFast::~TempLibFast() { } -#endif -#if IDCC_TPM_JEM void TempLibFast::initTemplateDiff(unsigned int uiPatchWidth, unsigned int uiPatchHeight, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int bitDepth) { - DistType maxValue = ((1 << bitDepth) >> (INIT_THRESHOULD_SHIFTBITS)) * (uiPatchHeight * uiPatchWidth - uiBlkHeight * uiBlkWidth); + int maxValue = ((1 << bitDepth) >> (INIT_THRESHOULD_SHIFTBITS)) * (uiPatchHeight * uiPatchWidth - uiBlkHeight * uiBlkWidth); m_diffMax = maxValue; { m_pDiff = maxValue; @@ -519,8 +511,8 @@ void TempLibFast::initTemplateDiff(unsigned int uiPatchWidth, unsigned int uiPat void TrQuant::getTargetTemplate(CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight) { const ComponentID compID = COMPONENT_Y; - unsigned int uiPatchWidth = uiBlkWidth + IDCC_TemplateSize; - unsigned int uiPatchHeight = uiBlkHeight + IDCC_TemplateSize; + unsigned int uiPatchWidth = uiBlkWidth + TMP_TEMPLATE_SIZE; + unsigned int uiPatchHeight = uiBlkHeight + TMP_TEMPLATE_SIZE; unsigned int uiTarDepth = floorLog2(std::max(uiBlkHeight, uiBlkWidth)) - 2; Pel** tarPatch = m_pppTarPatch[uiTarDepth]; CompArea area = pcCU->blocks[compID]; @@ -528,13 +520,11 @@ void TrQuant::getTargetTemplate(CodingUnit* pcCU, unsigned int uiBlkWidth, unsig unsigned int uiPicStride = pcCU->cs->picture->getRecoBuf(compID).stride; unsigned int uiY, uiX; - - //fill template //up-left & up Pel* tarTemp; - Pel* pCurrTemp = pCurrStart - IDCC_TemplateSize * uiPicStride - IDCC_TemplateSize; - for (uiY = 0; uiY < IDCC_TemplateSize; uiY++) + Pel* pCurrTemp = pCurrStart - TMP_TEMPLATE_SIZE * uiPicStride - TMP_TEMPLATE_SIZE; + for (uiY = 0; uiY < TMP_TEMPLATE_SIZE; uiY++) { tarTemp = tarPatch[uiY]; for (uiX = 0; uiX < 
uiPatchWidth; uiX++) @@ -544,10 +534,10 @@ void TrQuant::getTargetTemplate(CodingUnit* pcCU, unsigned int uiBlkWidth, unsig pCurrTemp += uiPicStride; } //left - for (uiY = IDCC_TemplateSize; uiY < uiPatchHeight; uiY++) + for (uiY = TMP_TEMPLATE_SIZE; uiY < uiPatchHeight; uiY++) { tarTemp = tarPatch[uiY]; - for (uiX = 0; uiX < IDCC_TemplateSize; uiX++) + for (uiX = 0; uiX < TMP_TEMPLATE_SIZE; uiX++) { tarTemp[uiX] = pCurrTemp[uiX]; } @@ -559,8 +549,8 @@ void TrQuant::candidateSearchIntra(CodingUnit* pcCU, unsigned int uiBlkWidth, un { const ComponentID compID = COMPONENT_Y; const int channelBitDepth = pcCU->cs->sps->getBitDepth(toChannelType(compID)); - unsigned int uiPatchWidth = uiBlkWidth + IDCC_TemplateSize; - unsigned int uiPatchHeight = uiBlkHeight + IDCC_TemplateSize; + unsigned int uiPatchWidth = uiBlkWidth + TMP_TEMPLATE_SIZE; + unsigned int uiPatchHeight = uiBlkHeight + TMP_TEMPLATE_SIZE; unsigned int uiTarDepth = floorLog2(std::max(uiBlkWidth, uiBlkHeight)) - 2; Pel** tarPatch = getTargetPatch(uiTarDepth); //Initialize the library for saving the best candidates @@ -568,25 +558,29 @@ void TrQuant::candidateSearchIntra(CodingUnit* pcCU, unsigned int uiBlkWidth, un short setId = 0; //record the reference picture. 
searchCandidateFromOnePicIntra(pcCU, tarPatch, uiPatchWidth, uiPatchHeight, setId); //count collected candidate number - DistType pDiff = m_tempLibFast.getDiff(); - DistType maxDiff = m_tempLibFast.getDiffMax(); + int pDiff = m_tempLibFast.getDiff(); + int maxDiff = m_tempLibFast.getDiffMax(); - if (pDiff < maxDiff) - m_uiVaildCandiNum = 1; - else - m_uiVaildCandiNum = 0; + if( pDiff < maxDiff ) + { + m_uiVaildCandiNum = 1; + } + else + { + m_uiVaildCandiNum = 0; + } } void TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, unsigned int setId) { const ComponentID compID = COMPONENT_Y; - unsigned int uiBlkWidth = uiPatchWidth - IDCC_TemplateSize; - unsigned int uiBlkHeight = uiPatchHeight - IDCC_TemplateSize; + unsigned int uiBlkWidth = uiPatchWidth - TMP_TEMPLATE_SIZE; + unsigned int uiBlkHeight = uiPatchHeight - TMP_TEMPLATE_SIZE; int pX = m_tempLibFast.getX(); int pY = m_tempLibFast.getY(); - DistType pDiff = m_tempLibFast.getDiff(); + int pDiff = m_tempLibFast.getDiff(); short pId = m_tempLibFast.getId(); CompArea area = pcCU->blocks[compID]; int refStride = pcCU->cs->picture->getRecoBuf(compID).stride; @@ -594,9 +588,7 @@ void TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch, Pel* ref = pcCU->cs->picture->getRecoBuf(area).buf; setRefPicUsed(ref); //facilitate the access of each candidate point - setStride(refStride); - Mv cTmpMvPred; cTmpMvPred.setZero(); @@ -614,27 +606,23 @@ void TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch, int iYOffset, iXOffset; - DistType diff; + int diff; Pel* refCurr; - -#define REGION_NUM 3 - int mvYMins[REGION_NUM]; - int mvYMaxs[REGION_NUM]; - int mvXMins[REGION_NUM]; - int mvXMaxs[REGION_NUM]; - int regionNum = REGION_NUM; + const int regionNum = 3; + int mvYMins[regionNum]; + int mvYMaxs[regionNum]; + int mvXMins[regionNum]; + int mvXMaxs[regionNum]; int regionId = 0; - //1. 
check the near pixels within LCU //above pixels in LCU - int iTemplateSize = IDCC_TemplateSize; + int iTemplateSize = TMP_TEMPLATE_SIZE; int iBlkWidth = uiBlkWidth; int iBlkHeight = uiBlkHeight; regionId = 0; int iMvShift = 0; - int iVerMin = std::max(((iTemplateSize) << iMvShift), (iCurrY - offsetLCUY - iBlkHeight + 1) << iMvShift); int iVerMax = (iCurrY - iBlkHeight) << iMvShift; @@ -646,8 +634,6 @@ void TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch, mvYMins[regionId] = iVerMin - iCurrY; mvYMaxs[regionId] = iVerMax - iCurrY; - - //check within CTU pixels for (regionId = 0; regionId < 1; regionId++) { @@ -659,12 +645,13 @@ void TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch, { continue; } + for (iYOffset = mvYMax; iYOffset >= mvYMin; iYOffset--) { for (iXOffset = mvXMax; iXOffset >= mvXMin; iXOffset--) { refCurr = ref + iYOffset * refStride + iXOffset; - diff = calcTemplateDiff(refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, pDiff); + diff = m_calcTemplateDiff(refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, pDiff); if (diff < (pDiff)) { insertNode(diff, iXOffset, iYOffset, pDiff, pX, pY, pId, setId); @@ -680,8 +667,9 @@ void TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch, //2. 
check the pixels outside CTU for (regionId = 0; regionId < regionNum; regionId++) {// this function fills in the range the template matching for pixels outside the current CTU - clipMvIntraConstraint(pcCU, regionId, mvXMins[regionId], mvXMaxs[regionId], mvYMins[regionId], mvYMaxs[regionId], IDCC_TemplateSize, uiBlkWidth, uiBlkHeight, iCurrY, iCurrX, offsetLCUY, offsetLCUX); + clipMvIntraConstraint(pcCU, regionId, mvXMins[regionId], mvXMaxs[regionId], mvYMins[regionId], mvYMaxs[regionId], TMP_TEMPLATE_SIZE, uiBlkWidth, uiBlkHeight, iCurrY, iCurrX, offsetLCUY, offsetLCUX); } + for (regionId = 0; regionId < regionNum; regionId++) { int mvYMin = mvYMins[regionId]; @@ -697,11 +685,13 @@ void TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch, for (iXOffset = mvXMax; iXOffset >= mvXMin; iXOffset--) { refCurr = ref + iYOffset * refStride + iXOffset; - diff = calcTemplateDiff(refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, pDiff); + diff = m_calcTemplateDiff(refCurr, refStride, tarPatch, uiPatchWidth, uiPatchHeight, pDiff); + if (diff < (pDiff)) { insertNode(diff, iXOffset, iYOffset, pDiff, pX, pY, pId, setId); } + if (pDiff == 0) { regionId = regionNum; @@ -709,6 +699,7 @@ void TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch, } } } + m_tempLibFast.m_pX = pX; m_tempLibFast.m_pY = pY; m_tempLibFast.m_pDiff = pDiff; @@ -717,8 +708,8 @@ void TrQuant::searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch, bool TrQuant::generateTMPrediction(Pel* piPred, unsigned int uiStride, unsigned int uiBlkWidth, unsigned int uiBlkHeight, int& foundCandiNum) { bool bSucceedFlag = true; - unsigned int uiPatchWidth = uiBlkWidth + IDCC_TemplateSize; - unsigned int uiPatchHeight = uiBlkHeight + IDCC_TemplateSize; + unsigned int uiPatchWidth = uiBlkWidth + TMP_TEMPLATE_SIZE; + unsigned int uiPatchHeight = uiBlkHeight + TMP_TEMPLATE_SIZE; foundCandiNum = m_uiVaildCandiNum; if (foundCandiNum < 1) @@ -732,8 +723,8 @@ bool 
TrQuant::generateTMPrediction(Pel* piPred, unsigned int uiStride, unsigned int picStride = getStride(); int iOffsetY, iOffsetX; Pel* refTarget; - unsigned int uiHeight = uiPatchHeight - IDCC_TemplateSize; - unsigned int uiWidth = uiPatchWidth - IDCC_TemplateSize; + unsigned int uiHeight = uiPatchHeight - TMP_TEMPLATE_SIZE; + unsigned int uiWidth = uiPatchWidth - TMP_TEMPLATE_SIZE; //the data center: we use the prediction block as the center now. //collect the candidates @@ -755,128 +746,46 @@ bool TrQuant::generateTMPrediction(Pel* piPred, unsigned int uiStride, unsigned return bSucceedFlag; } -DistType TrQuant::calcTemplateDiff(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, DistType iMax) +int TrQuant::calcTemplateDiff(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax) { - DistType iDiffSum = 0; - int iY; - Pel* refPatchRow = ref - IDCC_TemplateSize * uiStride - IDCC_TemplateSize; - Pel* tarPatchRow; - - uint32_t uiSum; - // horizontal difference - for (iY = 0; iY < IDCC_TemplateSize; iY++) - { - tarPatchRow = tarPatch[iY]; - const short* pSrc1 = (const short*)tarPatchRow; - const short* pSrc2 = (const short*)refPatchRow; - - // SIMD difference - //int iRows = uiPatchHeight; - int iCols = uiPatchWidth; - if ((iCols & 7) == 0) - { - // Do with step of 8 - __m128i vzero = _mm_setzero_si128(); - __m128i vsum32 = vzero; - //for (int iY = 0; iY < iRows; iY += iSubStep) - { - __m128i vsum16 = vzero; - for (int iX = 0; iX < iCols; iX += 8) - { - __m128i vsrc1 = _mm_loadu_si128((const __m128i*)(&pSrc1[iX])); - __m128i vsrc2 = _mm_lddqu_si128((const __m128i*)(&pSrc2[iX])); - vsum16 = _mm_add_epi16(vsum16, _mm_abs_epi16(_mm_sub_epi16(vsrc1, vsrc2))); - } - __m128i vsumtemp = _mm_add_epi32(_mm_unpacklo_epi16(vsum16, vzero), _mm_unpackhi_epi16(vsum16, vzero)); - vsum32 = _mm_add_epi32(vsum32, vsumtemp); - //pSrc1 += iStrideSrc1; - //pSrc2 += iStrideSrc2; - } 
- vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e)); // 01001110 - vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1)); // 10110001 - uiSum = _mm_cvtsi128_si32(vsum32); - } - else - { - // Do with step of 4 - __m128i vzero = _mm_setzero_si128(); - __m128i vsum32 = vzero; - //for (int iY = 0; iY < iRows; iY += iSubStep) - { - __m128i vsum16 = vzero; - for (int iX = 0; iX < iCols; iX += 4) - { - __m128i vsrc1 = _mm_loadl_epi64((const __m128i*) & pSrc1[iX]); - __m128i vsrc2 = _mm_loadl_epi64((const __m128i*) & pSrc2[iX]); - vsum16 = _mm_add_epi16(vsum16, _mm_abs_epi16(_mm_sub_epi16(vsrc1, vsrc2))); - } - __m128i vsumtemp = _mm_add_epi32(_mm_unpacklo_epi16(vsum16, vzero), _mm_unpackhi_epi16(vsum16, vzero)); - vsum32 = _mm_add_epi32(vsum32, vsumtemp); - //pSrc1 += iStrideSrc1; - //pSrc2 += iStrideSrc2; - } - vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e)); // 01001110 - vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1)); // 10110001 - uiSum = _mm_cvtsi128_si32(vsum32); - } - iDiffSum += uiSum; - - if (iDiffSum > iMax) //for speeding up - { - return iDiffSum; - } - // update location - refPatchRow += uiStride; - } + int iDiffSum = 0; + Pel* refPatchRow = ref - TMP_TEMPLATE_SIZE * uiStride - TMP_TEMPLATE_SIZE; + Pel* tarPatchRow; - // vertical difference - int iCols = IDCC_TemplateSize; - for (iY = IDCC_TemplateSize; iY < uiPatchHeight; iY++) - { - tarPatchRow = tarPatch[iY]; - const short* pSrc1 = (const short*)tarPatchRow; - const short* pSrc2 = (const short*)refPatchRow ; - - // SIMD difference - - // Do with step of 4 - __m128i vzero = _mm_setzero_si128(); - __m128i vsum32 = vzero; - //for (int iY = 0; iY < iRows; iY += iSubStep) - { - __m128i vsum16 = vzero; - for (int iX = 0; iX < iCols; iX += 4) - { - __m128i vsrc1 = _mm_loadl_epi64((const __m128i*) & pSrc1[iX]); - __m128i vsrc2 = _mm_loadl_epi64((const __m128i*) & pSrc2[iX]); - vsum16 = _mm_add_epi16(vsum16, _mm_abs_epi16(_mm_sub_epi16(vsrc1, vsrc2))); - } - 
__m128i vsumtemp = _mm_add_epi32(_mm_unpacklo_epi16(vsum16, vzero), _mm_unpackhi_epi16(vsum16, vzero)); - vsum32 = _mm_add_epi32(vsum32, vsumtemp); - //pSrc1 += iStrideSrc1; - //pSrc2 += iStrideSrc2; - } - vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e)); // 01001110 - vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1)); // 10110001 - uiSum = _mm_cvtsi128_si32(vsum32); + // horizontal difference + for( int iY = 0; iY < TMP_TEMPLATE_SIZE; iY++ ) + { + tarPatchRow = tarPatch[iY]; + for( int iX = 0; iX < uiPatchWidth; iX++ ) + { + iDiffSum += abs( refPatchRow[iX] - tarPatchRow[iX] ); + } + if( iDiffSum > iMax ) //for speeding up + { + return iDiffSum; + } + refPatchRow += uiStride; + } - iDiffSum += uiSum; + // vertical difference + for( int iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++ ) + { + tarPatchRow = tarPatch[iY]; + for( int iX = 0; iX < TMP_TEMPLATE_SIZE; iX++ ) + { + iDiffSum += abs( refPatchRow[iX] - tarPatchRow[iX] ); + } + if( iDiffSum > iMax ) //for speeding up + { + return iDiffSum; + } + refPatchRow += uiStride; + } - if (iDiffSum > iMax) //for speeding up - { - return iDiffSum; - } - // update location - refPatchRow += uiStride; - } - - return iDiffSum; - + return iDiffSum; } #endif - - uint32_t TrQuant::getLFNSTIntraMode( int wideAngPredMode ) { uint32_t intraMode; @@ -930,11 +839,11 @@ void TrQuant::xInvLfnst( const TransformUnit &tu, const ComponentID compID ) { intraMode = PLANAR_IDX; } -#if IDCC_TPM_JEM - if (PU::isTmp(*tu.cs->getPU(area.pos(), toChannelType(compID)), toChannelType(compID))) - { - intraMode = PLANAR_IDX; - } +#if JVET_V0130_INTRA_TMP + if( PU::isTmp( *tu.cs->getPU( area.pos(), toChannelType( compID ) ), toChannelType( compID ) ) ) + { + intraMode = PLANAR_IDX; + } #endif CHECK( intraMode >= NUM_INTRA_MODE - 1, "Invalid intra mode" ); @@ -1076,11 +985,11 @@ void TrQuant::xFwdLfnst( const TransformUnit &tu, const ComponentID compID, cons { intraMode = PLANAR_IDX; } -#if IDCC_TPM_JEM - if 
(PU::isTmp(*tu.cs->getPU(area.pos(), toChannelType(compID)), toChannelType(compID))) - { - intraMode = PLANAR_IDX; - } +#if JVET_V0130_INTRA_TMP + if( PU::isTmp( *tu.cs->getPU( area.pos(), toChannelType( compID ) ), toChannelType( compID ) ) ) + { + intraMode = PLANAR_IDX; + } #endif CHECK( intraMode >= NUM_INTRA_MODE - 1, "Invalid intra mode" ); @@ -1319,8 +1228,8 @@ void TrQuant::getTrTypes(const TransformUnit tu, const ComponentID compID, int & return; } -#if IDCC_TPM_JEM - if (isImplicitMTS || isISP || tu.cu->TmpFlag) +#if JVET_V0130_INTRA_TMP + if (isImplicitMTS || isISP || tu.cu->tmpFlag) #else if (isImplicitMTS || isISP) #endif diff --git a/source/Lib/CommonLib/TrQuant.h b/source/Lib/CommonLib/TrQuant.h index f116d83e256f8ab45ccf75ef0296a4a6ca14bcf1..8556825e4e00241ab087347b8a5609876dfea999 100644 --- a/source/Lib/CommonLib/TrQuant.h +++ b/source/Lib/CommonLib/TrQuant.h @@ -57,10 +57,7 @@ typedef void InvTrans(const TCoeff*, TCoeff*, int, int, int, int, const TCoeff, -#if IDCC_TPM_JEM - - -#define MAX_1DTRANS_LEN (1 << (((USE_MORE_BLOCKSIZE_DEPTH_MAX) + 1) << 1)) ///< 4x4 = 16, 8x8 = 64, 16x16=256, 32x32 = 1024 +#if JVET_V0130_INTRA_TMP extern unsigned int g_uiDepth2Width[5]; extern unsigned int g_uiDepth2MaxCandiNum[5]; @@ -71,13 +68,13 @@ public: int m_pY; //offset Y int m_pXInteger; //offset X for integer pixel search int m_pYInteger; //offset Y for integer pixel search - DistType m_pDiffInteger; + int m_pDiffInteger; int getXInteger() { return m_pXInteger; } int getYInteger() { return m_pYInteger; } - DistType getDiffInteger() { return m_pDiffInteger; } + int getDiffInteger() { return m_pDiffInteger; } short m_pIdInteger; //frame id short getIdInteger() { return m_pIdInteger; } - DistType m_pDiff; //mse + int m_pDiff; //mse short m_pId; //frame id @@ -86,7 +83,7 @@ public: //void init(); int getX() { return m_pX; } int getY() { return m_pY; } - DistType getDiff() { return m_pDiff; } + int getDiff() { return m_pDiff; } short getId() { return m_pId; } 
/*void initDiff(unsigned int uiPatchSize, int bitDepth); void initDiff(unsigned int uiPatchSize, int bitDepth, int iCandiNumber);*/ @@ -132,13 +129,14 @@ public: void fwdLfnstNxN( int* src, int* dst, const uint32_t mode, const uint32_t index, const uint32_t size, int zeroOutSize ); void invLfnstNxN( int* src, int* dst, const uint32_t mode, const uint32_t index, const uint32_t size, int zeroOutSize ); #endif -#if IDCC_TPM_JEM - DistType calcTemplateDiff(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, DistType iMax); - Pel** getTargetPatch(unsigned int uiDepth) { return m_pppTarPatch[uiDepth]; } - Pel* getRefPicUsed() { return m_refPicUsed; } - void setRefPicUsed(Pel* ref) { m_refPicUsed = ref; } - unsigned int getStride() { return m_uiPicStride; } - void setStride(unsigned int uiPicStride) { m_uiPicStride = uiPicStride; } +#if JVET_V0130_INTRA_TMP + int ( *m_calcTemplateDiff )(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax); + static int calcTemplateDiff(Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax); + Pel** getTargetPatch(unsigned int uiDepth) { return m_pppTarPatch[uiDepth]; } + Pel* getRefPicUsed() { return m_refPicUsed; } + void setRefPicUsed(Pel* ref) { m_refPicUsed = ref; } + unsigned int getStride() { return m_uiPicStride; } + void setStride(unsigned int uiPicStride) { m_uiPicStride = uiPicStride; } void searchCandidateFromOnePicIntra(CodingUnit* pcCU, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, unsigned int setId); void candidateSearchIntra(CodingUnit* pcCU, unsigned int uiBlkWidth, unsigned int uiBlkHeight); @@ -199,14 +197,14 @@ public: protected: TCoeff m_tempCoeff[MAX_TB_SIZEY * MAX_TB_SIZEY]; -#if IDCC_TPM_JEM - int m_uiPartLibSize; - TempLibFast m_tempLibFast; - Pel* m_refPicUsed; - Picture* m_refPicBuf; +#if JVET_V0130_INTRA_TMP + int 
m_uiPartLibSize; + TempLibFast m_tempLibFast; + Pel* m_refPicUsed; + Picture* m_refPicBuf; unsigned int m_uiPicStride; unsigned int m_uiVaildCandiNum; - Pel*** m_pppTarPatch; + Pel*** m_pppTarPatch; #endif #if SIGN_PREDICTION Pel m_tempSignPredResid[SIGN_PRED_MAX_BS * SIGN_PRED_MAX_BS * 2]{0}; @@ -282,7 +280,7 @@ private: static void fastInverseTransform_SIMD( const TCoeff *coeff, TCoeff *block, int shift, int line, int iSkipLine, int iSkipLine2, const TCoeff outputMinimum, const TCoeff outputMaximum ); #endif -#if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT +#if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT || ENABLE_SIMD_TMP #ifdef TARGET_SIMD_X86 void initTrQuantX86(); template <X86_VEXT vext> diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h index 67363bc713e5b5816e9dff58f580e9e4cea20e5a..9cb01a37576d9d26be8fbad57ed503f2b3f6a32d 100644 --- a/source/Lib/CommonLib/TypeDef.h +++ b/source/Lib/CommonLib/TypeDef.h @@ -1,4 +1,4 @@ -/* The copyright in this software is being made available under the BSD +/* The copyright in this software is being made available under the BSD * License, included below. This software may be subject to other third party * and contributor rights, including patent rights, and no such rights are * granted under this license. 
@@ -86,25 +86,6 @@ #define INTRA_RM_SMALL_BLOCK_SIZE_CONSTRAINTS 1 // Enable 2xN and Nx2 block by removing SCIPU constraints #define CCLM_LATENCY_RESTRICTION_RMV 1 // remove the latency between luma and chroma restriction of CCLM #define LMS_LINEAR_MODEL 1 // LMS for parameters derivation of CCLM and MMLM mode, Remove constraint in derivation of neighbouring samples -#define IDCC_TPM_JEM 1 // template matching prediction as implemented in JEM-7.2 - -#if IDCC_TPM_JEM - -#define IDCC_TMP_SIMD 1 - -#define IDCC_SearchRangeMultFactor 5 - -#if IDCC_TMP_SIMD -#define IDCC_TemplateSize 4 // must be multiple of 4 for SIMD -#else -#define IDCC_TemplateSize 4 -#endif - -#define IDCC_TMP_MaxSize_Depth 6 // should be log2(IDCC_TMP_MaxSize): keep as 6 to avoid any error - -typedef int DistType; -#endif - //-- inter #define CIIP_RM_BLOCK_SIZE_CONSTRAINTS 1 // Remove the 64x64 restriction and enable 8x4/4x8 block for CIIP @@ -129,6 +110,7 @@ typedef int DistType; #define SECONDARY_MPM 1 // Primary MPM and Secondary MPM: Add neighbouring modes into MPMs from positions AR, BL, AL, derived modes #define ENABLE_DIMD 1 // Decoder side intra mode derivation #define JVET_V0087_DIMD_NO_ISP ENABLE_DIMD // disallow combination of DIMD and ISP +#define JVET_V0130_INTRA_TMP 1 // JVET-V0130: template matching prediction // Inter #define CIIP_PDPC 1 // apply pdpc to megre prediction as a new CIIP mode (CIIP_PDPC) additional to CIIP mode @@ -167,6 +149,12 @@ typedef int DistType; #if SIGN_PREDICTION #define ENABLE_SIMD_SIGN_PREDICTION 1 #endif +#if JVET_V0130_INTRA_TMP +#define ENABLE_SIMD_TMP 1 +#endif +#if JVET_V0094_BILATERAL_FILTER +#define ENABLE_SIMD_BILATERAL_FILTER 1 +#endif #endif // tools diff --git a/source/Lib/CommonLib/Unit.cpp b/source/Lib/CommonLib/Unit.cpp index c281acf6ffe27554228e07829e54c7becf1f8a15..0d5f7f2a3ca47b9c20ed0a550dc52c9a4388ad19 100644 --- a/source/Lib/CommonLib/Unit.cpp +++ b/source/Lib/CommonLib/Unit.cpp @@ -303,8 +303,8 @@ CodingUnit& CodingUnit::operator=( 
const CodingUnit& other ) smvdMode = other.smvdMode; ispMode = other.ispMode; mipFlag = other.mipFlag; -#if IDCC_TPM_JEM - TmpFlag = other.TmpFlag; +#if JVET_V0130_INTRA_TMP + tmpFlag = other.tmpFlag; #endif #if INTER_LIC LICFlag = other.LICFlag; @@ -387,8 +387,8 @@ void CodingUnit::initData() smvdMode = 0; ispMode = 0; mipFlag = false; -#if IDCC_TPM_JEM - TmpFlag = false; +#if JVET_V0130_INTRA_TMP + tmpFlag = false; #endif #if INTER_LIC LICFlag = false; diff --git a/source/Lib/CommonLib/Unit.h b/source/Lib/CommonLib/Unit.h index e24b8c58bc8be25c0646fcbe72bf6af3eb96bd8a..31fd09ac0aff9313a9fc4a67fee1363dc61d2db2 100644 --- a/source/Lib/CommonLib/Unit.h +++ b/source/Lib/CommonLib/Unit.h @@ -334,8 +334,8 @@ struct CodingUnit : public UnitArea uint8_t BcwIdx; int8_t refIdxBi[2]; bool mipFlag; -#if IDCC_TPM_JEM - bool TmpFlag; +#if JVET_V0130_INTRA_TMP + bool tmpFlag; #endif #if INTER_LIC bool LICFlag; diff --git a/source/Lib/CommonLib/UnitTools.cpp b/source/Lib/CommonLib/UnitTools.cpp index 9f6b464441c3aac5f52ae109540b67491cf1415b..ce87e544860d11b5b58c14eb9b268e068afcf705 100644 --- a/source/Lib/CommonLib/UnitTools.cpp +++ b/source/Lib/CommonLib/UnitTools.cpp @@ -333,7 +333,7 @@ uint32_t CU::getCtuAddr( const CodingUnit &cu ) { return getCtuAddr( cu.blocks[cu.chType].lumaPos(), *cu.cs->pcv ); } -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP Position CU::getCtuXYAddr(const CodingUnit& cu) { return Position((cu.blocks[cu.chType].lumaPos().x >> cu.cs->pcv->maxCUWidthLog2) << cu.cs->pcv->maxCUWidthLog2, (cu.blocks[cu.chType].lumaPos().y >> cu.cs->pcv->maxCUHeightLog2) << cu.cs->pcv->maxCUHeightLog2); @@ -944,10 +944,10 @@ bool PU::isMIP(const PredictionUnit &pu, const ChannelType &chType) return isDMChromaMIP(pu) && (pu.intraDir[CHANNEL_TYPE_CHROMA] == DM_CHROMA_IDX); } } -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP bool PU::isTmp(const PredictionUnit& pu, const ChannelType& chType) { - return (chType == CHANNEL_TYPE_LUMA && pu.cu->TmpFlag); + return (chType == 
CHANNEL_TYPE_LUMA && pu.cu->tmpFlag); } #endif bool PU::isDMChromaMIP(const PredictionUnit &pu) @@ -961,7 +961,7 @@ bool PU::isDMChromaMIP(const PredictionUnit &pu) uint32_t PU::getIntraDirLuma( const PredictionUnit &pu ) { -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP if (isMIP(pu) || isTmp(pu)) #else if (isMIP(pu)) @@ -4984,8 +4984,8 @@ bool CU::isMTSAllowed(const CodingUnit &cu, const ComponentID compID) mtsAllowed &= cuWidth <= maxSize && cuHeight <= maxSize; mtsAllowed &= !cu.ispMode; mtsAllowed &= !cu.sbtInfo; -#if IDCC_TMP_ImplicitMTS - mtsAllowed &= !cu.TmpFlag; +#if JVET_V0130_INTRA_TMP + mtsAllowed &= !cu.tmpFlag; #endif mtsAllowed &= !(cu.bdpcmMode && cuWidth <= tsMaxSize && cuHeight <= tsMaxSize); return mtsAllowed; @@ -5321,8 +5321,8 @@ bool allowLfnstWithMip(const Size& block) } return false; } -#if IDCC_TPM_JEM -bool allowLfnstWithTpm() +#if JVET_V0130_INTRA_TMP +bool allowLfnstWithTmp() { return true; } diff --git a/source/Lib/CommonLib/UnitTools.h b/source/Lib/CommonLib/UnitTools.h index a9c47df9814bcf6a67ce72f5f47f0f8dae446d50..ab2c96231fb0bfa826aa208253437a3eac1656c5 100644 --- a/source/Lib/CommonLib/UnitTools.h +++ b/source/Lib/CommonLib/UnitTools.h @@ -70,8 +70,8 @@ namespace CU bool isSameSubPic (const CodingUnit &cu, const CodingUnit &cu2); bool isLastSubCUOfCtu (const CodingUnit &cu); uint32_t getCtuAddr (const CodingUnit &cu); -#if IDCC_TPM_JEM - Position getCtuXYAddr(const CodingUnit& cu); +#if JVET_V0130_INTRA_TMP + Position getCtuXYAddr (const CodingUnit& cu); #endif int predictQP (const CodingUnit& cu, const int prevQP ); @@ -141,7 +141,7 @@ namespace PU int getIntraMPMs(const PredictionUnit &pu, unsigned *mpm, const ChannelType &channelType = CHANNEL_TYPE_LUMA); #endif bool isMIP (const PredictionUnit &pu, const ChannelType &chType = CHANNEL_TYPE_LUMA); -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP bool isTmp(const PredictionUnit& pu, const ChannelType& chType = CHANNEL_TYPE_LUMA); #endif bool isDMChromaMIP (const PredictionUnit &pu);
@@ -274,8 +274,8 @@ uint32_t getCtuAddr (const Position& pos, const PreCalcValues &pcv); int getNumModesMip (const Size& block); int getMipSizeId (const Size& block); bool allowLfnstWithMip(const Size& block); -#if IDCC_TPM_JEM -bool allowLfnstWithTpm(); +#if JVET_V0130_INTRA_TMP +bool allowLfnstWithTmp(); #endif template<typename T, size_t N> diff --git a/source/Lib/CommonLib/dtrace_blockstatistics.cpp b/source/Lib/CommonLib/dtrace_blockstatistics.cpp index a5e071fbcb35f0572326c2367eaedf59a7de3cd4..2ec1079052779c5c57c9d79f01fa7c9e6bde5b2c 100644 --- a/source/Lib/CommonLib/dtrace_blockstatistics.cpp +++ b/source/Lib/CommonLib/dtrace_blockstatistics.cpp @@ -884,8 +884,8 @@ void writeAllData(const CodingStructure& cs, const UnitArea& ctuArea) if(chType == CHANNEL_TYPE_LUMA) { DTRACE_BLOCK_SCALAR(g_trace_ctx, D_BLOCK_STATISTICS_ALL, cu, GetBlockStatisticName(BlockStatistic::MIPFlag), cu.mipFlag); -#if IDCC_TPM_JEM - DTRACE_BLOCK_SCALAR(g_trace_ctx, D_BLOCK_STATISTICS_ALL, cu, GetBlockStatisticName(BlockStatistic::TmpFlag), cu.TmpFlag); +#if JVET_V0130_INTRA_TMP + DTRACE_BLOCK_SCALAR(g_trace_ctx, D_BLOCK_STATISTICS_ALL, cu, GetBlockStatisticName(BlockStatistic::TmpFlag), cu.tmpFlag); #endif DTRACE_BLOCK_SCALAR(g_trace_ctx, D_BLOCK_STATISTICS_ALL, cu, GetBlockStatisticName(BlockStatistic::ISPMode), cu.ispMode); } diff --git a/source/Lib/CommonLib/dtrace_blockstatistics.h b/source/Lib/CommonLib/dtrace_blockstatistics.h index a416227b6dbfe3abf96f6b3f34b7ae77b0fcd397..fe3032dd9323b1abe8443f40833b807eaf6ac2a9 100644 --- a/source/Lib/CommonLib/dtrace_blockstatistics.h +++ b/source/Lib/CommonLib/dtrace_blockstatistics.h @@ -78,7 +78,7 @@ enum class BlockStatistic { Chroma_IntraMode, MultiRefIdx, MIPFlag, -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP TmpFlag, #endif ISPMode, @@ -173,7 +173,7 @@ static const std::map<BlockStatistic, std::tuple<std::string, BlockStatisticType { BlockStatistic::JointCbCr, std::tuple<std::string, BlockStatisticType, std::string>{"JointCbCr", 
BlockStatisticType::Flag, ""}}, { BlockStatistic::MIPFlag, std::tuple<std::string, BlockStatisticType, std::string>{"MIPFlag", BlockStatisticType::Flag, ""}}, -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP { BlockStatistic::TmpFlag, std::tuple<std::string, BlockStatisticType, std::string>{"TmpFlag", BlockStatisticType::Flag, ""}}, #endif { BlockStatistic::ISPMode, std::tuple<std::string, BlockStatisticType, std::string>{"ISPMode", BlockStatisticType::Integer, "[0, " + std::to_string(NUM_INTRA_SUBPARTITIONS_MODES) + "]"}}, diff --git a/source/Lib/CommonLib/version.h b/source/Lib/CommonLib/version.h index 250f2cd9fd501badc15831fc0c5ffc13aeb723e1..70ad90a766d6e3ce8d2c18d989ca90082b7ce57a 100644 --- a/source/Lib/CommonLib/version.h +++ b/source/Lib/CommonLib/version.h @@ -1,3 +1,4 @@ #if ! defined( VTM_VERSION ) #define VTM_VERSION "10.0" +#define ECM_VERSION "0.0" #endif diff --git a/source/Lib/CommonLib/x86/BilateralFilterX86.h b/source/Lib/CommonLib/x86/BilateralFilterX86.h new file mode 100644 index 0000000000000000000000000000000000000000..c0ff50f957dc3f99f8e0c257b9402817955fc418 --- /dev/null +++ b/source/Lib/CommonLib/x86/BilateralFilterX86.h @@ -0,0 +1,311 @@ +/* The copyright in this software is being made available under the BSD + * License, included below. This software may be subject to other third party + * and contributor rights, including patent rights, and no such rights are + * granted under this license. + * + * Copyright (c) 2010-2021, ITU/ISO/IEC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * * Neither the name of the ITU/ISO/IEC nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#pragma once + +#include "CommonDefX86.h" +#include "../BilateralFilter.h" + +#ifdef TARGET_SIMD_X86 +#if defined _MSC_VER +#include <tmmintrin.h> +#else +#include <x86intrin.h> +#endif + +#if ENABLE_SIMD_BILATERAL_FILTER +template<X86_VEXT vext> +void BilateralFilter::simdFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO, const char* LUTrowPtr ) +{ + if( uiWidth < 4 || ( uiWidth < 8 && isRDO ) ) + { + return blockBilateralFilterDiamond5x5(uiWidth, uiHeight, block, blkFilt, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bif_round_add, bif_round_shift, isRDO, LUTrowPtr ); + } + + int pad = 2; + int padwidth = iWidthExtSIMD; + + __m128i center, left, right, up, down, lu, ld, ru, rd, diffabs, four, fifteen, lut, acc, temp, round_add, clipmin, clipmax, inputVals; + __m128i ll, rr, uu, dd; + + four = _mm_set1_epi16(4); + fifteen = _mm_set1_epi16(15); + round_add = _mm_set1_epi16(bif_round_add); + clipmin = _mm_set1_epi16(clpRng.min); + clipmax = _mm_set1_epi16(clpRng.max); + + lut = _mm_loadu_si128((__m128i*)(LUTrowPtr)); + acc = _mm_set1_epi32(0); + + // Copy back parameters + Pel *tempBlockPtr = (short*)blkFilt + (((padwidth+4) << 1) + 2); + int tempBlockStride = padwidth+4; + + for (int col = 0; col < uiWidth; col += 8) + { + for (int row = 0; row < uiHeight; row++) + { + acc = _mm_set1_epi32(0); + int16_t *point = &block[(row + pad)*padwidth + pad + col]; + + center = _mm_loadu_si128((__m128i*)(point)); + + //load neighbours + left = _mm_loadu_si128((__m128i*)(point - 1)); + right = _mm_loadu_si128((__m128i*)(point + 1)); + up = _mm_loadu_si128((__m128i*)(point - padwidth)); + down = _mm_loadu_si128((__m128i*)(point + padwidth)); + + lu = _mm_loadu_si128((__m128i*)(point - 1 - padwidth)); + ld = _mm_loadu_si128((__m128i*)(point - 1 + padwidth)); + ru = _mm_loadu_si128((__m128i*)(point + 1 - 
padwidth)); + rd = _mm_loadu_si128((__m128i*)(point + 1 + padwidth)); + + ll = _mm_loadu_si128((__m128i*)(point - 2)); + rr = _mm_loadu_si128((__m128i*)(point + 2)); + uu = _mm_loadu_si128((__m128i*)(point - 2*padwidth)); + dd = _mm_loadu_si128((__m128i*)(point + 2*padwidth)); + + //calculate diffs + left = _mm_sub_epi16(left, center); + right = _mm_sub_epi16(right, center); + up = _mm_sub_epi16(up, center); + down = _mm_sub_epi16(down, center); + + lu = _mm_sub_epi16(lu, center); + ld = _mm_sub_epi16(ld, center); + ru = _mm_sub_epi16(ru, center); + rd = _mm_sub_epi16(rd, center); + + ll = _mm_sub_epi16(ll, center); + rr = _mm_sub_epi16(rr, center); + uu = _mm_sub_epi16(uu, center); + dd = _mm_sub_epi16(dd, center); + + //LEFT! + //calculate abs + diffabs = _mm_abs_epi16(left); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_sign_epi16(diffabs, left);//fix sign! + acc = _mm_add_epi16(diffabs, acc); //add to acc + //RIGHT! + //calculate abs + diffabs = _mm_abs_epi16(right); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_sign_epi16(diffabs, right);//fix sign! + acc = _mm_add_epi16(diffabs, acc); //add to acc + //UP! 
+ //calculate abs + diffabs = _mm_abs_epi16(up); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_sign_epi16(diffabs, up);//fix sign! + acc = _mm_add_epi16(diffabs, acc); //add to acc + + //DOWN! + //calculate abs + diffabs = _mm_abs_epi16(down); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_sign_epi16(diffabs, down);//fix sign! + acc = _mm_add_epi16(diffabs, acc); //add to acc + + //lu! + //calculate abs + diffabs = _mm_abs_epi16(lu); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! + diffabs = _mm_sign_epi16(diffabs, lu);//fix sign! + acc = _mm_add_epi16(diffabs, acc); //add to acc + //ld! + //calculate abs + diffabs = _mm_abs_epi16(ld); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! + diffabs = _mm_sign_epi16(diffabs, ld);//fix sign! 
+ acc = _mm_add_epi16(diffabs, acc); //add to acc + //ru! + //calculate abs + diffabs = _mm_abs_epi16(ru); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! + diffabs = _mm_sign_epi16(diffabs, ru);//fix sign! + acc = _mm_add_epi16(diffabs, acc); //add to acc + //rd! + //calculate abs + diffabs = _mm_abs_epi16(rd); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! + diffabs = _mm_sign_epi16(diffabs, rd);//fix sign! + acc = _mm_add_epi16(diffabs, acc); //add to acc + + //ll! + //calculate abs + diffabs = _mm_abs_epi16(ll); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! + diffabs = _mm_sign_epi16(diffabs, ll);//fix sign! + acc = _mm_add_epi16(diffabs, acc); //add to acc + //rr! 
+ //calculate abs + diffabs = _mm_abs_epi16(rr); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! + diffabs = _mm_sign_epi16(diffabs, rr);//fix sign! + acc = _mm_add_epi16(diffabs, acc); //add to acc + //uu! + //calculate abs + diffabs = _mm_abs_epi16(uu); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! + diffabs = _mm_sign_epi16(diffabs, uu);//fix sign! + acc = _mm_add_epi16(diffabs, acc); //add to acc + //dd! + //calculate abs + diffabs = _mm_abs_epi16(dd); //abs + diffabs = _mm_add_epi16(diffabs, four); //+4 + diffabs = _mm_srai_epi16(diffabs, 3); //>>3 + diffabs = _mm_min_epi16(diffabs, fifteen); //min(x,15) + diffabs = _mm_packus_epi16(diffabs, diffabs); //convert to 8 + diffabs = _mm_shuffle_epi8(lut, diffabs);//lut + diffabs = _mm_cvtepi8_epi16(diffabs);//back to 16-bit + diffabs = _mm_srai_epi16(diffabs, 1);//diagonal shift! + diffabs = _mm_sign_epi16(diffabs, dd);//fix sign! 
+ acc = _mm_add_epi16(diffabs, acc); //add to acc + + if (bfac == 2) + { + acc = _mm_slli_epi16(acc, 1); // Shift left to get 2* + } + else if (bfac == 3) + { + temp = _mm_slli_epi16(acc, 1); // Multiply by two by shifting left + acc = _mm_add_epi16(acc, temp); // Add original value to get 3* + } + + // Add bif_round_add, then shift right by bif_round_shift + acc = _mm_add_epi16(acc, round_add); + acc = _mm_srai_epi16(acc, bif_round_shift); + + // Instead we add our input values to the delta + if(isRDO) + { + acc = _mm_add_epi16(acc, center); + } + else + { + int16_t *recpoint = &recPtr[row * recStride + col]; + inputVals = _mm_loadu_si128((__m128i*)(recpoint)); + acc = _mm_add_epi16(acc, inputVals); + } + + // Clip + acc = _mm_max_epi16(acc, clipmin); + acc = _mm_min_epi16(acc, clipmax); + + _mm_storeu_si128((__m128i*)(blkFilt + (row + pad) * (padwidth + 4) + col + pad), acc); + } + } + + // Copy back from tempbufFilter to recBuf + int onerow = uiWidth * sizeof(Pel); + for(uint32_t yy = 0; yy < uiHeight; yy++) + { + std::memcpy(recPtr, tempBlockPtr, onerow); + recPtr += recStride; + tempBlockPtr += tempBlockStride; + } +} + +template <X86_VEXT vext> +void BilateralFilter::_initBilateralFilterX86() +{ + m_bilateralFilterDiamond5x5 = simdFilterDiamond5x5<vext>; +} + +template void BilateralFilter::_initBilateralFilterX86<SIMDX86>(); +#endif +#endif // TARGET_SIMD_X86 diff --git a/source/Lib/CommonLib/x86/InitX86.cpp b/source/Lib/CommonLib/x86/InitX86.cpp index 9511379d34cd6fb40cc6a9159316acba912bfa7e..b8e30fbdd77ecd10595385509d689aeff69652ba 100644 --- a/source/Lib/CommonLib/x86/InitX86.cpp +++ b/source/Lib/CommonLib/x86/InitX86.cpp @@ -50,9 +50,12 @@ #include "CommonLib/IbcHashMap.h" -#if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT +#if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT || ENABLE_SIMD_TMP #include "CommonLib/TrQuant.h" #endif +#if ENABLE_SIMD_BILATERAL_FILTER +#include "CommonLib/BilateralFilter.h" +#endif #ifdef TARGET_SIMD_X86 @@ -190,7 +193,7 @@ void
IbcHashMap::initIbcHashMapX86() } #endif -#if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT +#if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT || ENABLE_SIMD_TMP void TrQuant::initTrQuantX86() { auto vext = read_x86_extension_flags(); @@ -213,5 +216,28 @@ void TrQuant::initTrQuantX86() } #endif +#if ENABLE_SIMD_BILATERAL_FILTER +void BilateralFilter::initBilateralFilterX86() +{ + auto vext = read_x86_extension_flags(); + switch( vext ) + { + case AVX512: + case AVX2: + _initBilateralFilterX86<AVX2>(); + break; + case AVX: + _initBilateralFilterX86<AVX>(); + break; + case SSE42: + case SSE41: + _initBilateralFilterX86<SSE41>(); + break; + default: + break; + } +} +#endif + #endif diff --git a/source/Lib/CommonLib/x86/TrQuantX86.h b/source/Lib/CommonLib/x86/TrQuantX86.h index 5239521da29f73107d0f9773c23f0da7039316f2..5f7238fc60ea6aa9385a797399af4a7ccb3b524f 100644 --- a/source/Lib/CommonLib/x86/TrQuantX86.h +++ b/source/Lib/CommonLib/x86/TrQuantX86.h @@ -410,7 +410,129 @@ uint32_t computeSAD_SIMD( const Pel* ref, const Pel* cur, const int size ) } #endif -#if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT +#if ENABLE_SIMD_TMP +template< X86_VEXT vext > +int calcTemplateDiffSIMD( Pel* ref, unsigned int uiStride, Pel** tarPatch, unsigned int uiPatchWidth, unsigned int uiPatchHeight, int iMax ) +{ + int iDiffSum = 0; + int iY; + Pel* refPatchRow = ref - TMP_TEMPLATE_SIZE * uiStride - TMP_TEMPLATE_SIZE; + Pel* tarPatchRow; + uint32_t uiSum; + + // horizontal difference + for( iY = 0; iY < TMP_TEMPLATE_SIZE; iY++ ) + { + tarPatchRow = tarPatch[iY]; + const short* pSrc1 = ( const short* ) tarPatchRow; + const short* pSrc2 = ( const short* ) refPatchRow; + + // SIMD difference + //int iRows = uiPatchHeight; + int iCols = uiPatchWidth; + if( (iCols & 7) == 0 ) + { + // Do with step of 8 + __m128i vzero = _mm_setzero_si128(); + __m128i vsum32 = vzero; + //for (int iY = 0; iY < iRows; iY += iSubStep) + { + __m128i vsum16 = vzero; + for( int iX = 0; iX < iCols; iX += 
8 ) + { + __m128i vsrc1 = _mm_loadu_si128( (const __m128i*)(&pSrc1[iX]) ); + __m128i vsrc2 = _mm_lddqu_si128( (const __m128i*)(&pSrc2[iX]) ); + vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) ); + } + __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vzero ), _mm_unpackhi_epi16( vsum16, vzero ) ); + vsum32 = _mm_add_epi32( vsum32, vsumtemp ); + //pSrc1 += iStrideSrc1; + //pSrc2 += iStrideSrc2; + } + vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0x4e ) ); // 01001110 + vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0xb1 ) ); // 10110001 + uiSum = _mm_cvtsi128_si32( vsum32 ); + } + else + { + // Do with step of 4 + __m128i vzero = _mm_setzero_si128(); + __m128i vsum32 = vzero; + //for (int iY = 0; iY < iRows; iY += iSubStep) + { + __m128i vsum16 = vzero; + for( int iX = 0; iX < iCols; iX += 4 ) + { + __m128i vsrc1 = _mm_loadl_epi64( (const __m128i*) & pSrc1[iX] ); + __m128i vsrc2 = _mm_loadl_epi64( (const __m128i*) & pSrc2[iX] ); + vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) ); + } + __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vzero ), _mm_unpackhi_epi16( vsum16, vzero ) ); + vsum32 = _mm_add_epi32( vsum32, vsumtemp ); + //pSrc1 += iStrideSrc1; + //pSrc2 += iStrideSrc2; + } + vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0x4e ) ); // 01001110 + vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0xb1 ) ); // 10110001 + uiSum = _mm_cvtsi128_si32( vsum32 ); + } + iDiffSum += uiSum; + + if( iDiffSum > iMax ) //for speeding up + { + return iDiffSum; + } + // update location + refPatchRow += uiStride; + } + + // vertical difference + int iCols = TMP_TEMPLATE_SIZE; + + for( iY = TMP_TEMPLATE_SIZE; iY < uiPatchHeight; iY++ ) + { + tarPatchRow = tarPatch[iY]; + const short* pSrc1 = ( const short* ) tarPatchRow; + const short* pSrc2 = ( const short* ) refPatchRow; + + // SIMD difference + + // Do with step of 4 + __m128i vzero = 
_mm_setzero_si128(); + __m128i vsum32 = vzero; + //for (int iY = 0; iY < iRows; iY += iSubStep) + { + __m128i vsum16 = vzero; + for( int iX = 0; iX < iCols; iX += 4 ) + { + __m128i vsrc1 = _mm_loadl_epi64( (const __m128i*) & pSrc1[iX] ); + __m128i vsrc2 = _mm_loadl_epi64( (const __m128i*) & pSrc2[iX] ); + vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) ); + } + __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vzero ), _mm_unpackhi_epi16( vsum16, vzero ) ); + vsum32 = _mm_add_epi32( vsum32, vsumtemp ); + //pSrc1 += iStrideSrc1; + //pSrc2 += iStrideSrc2; + } + vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0x4e ) ); // 01001110 + vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0xb1 ) ); // 10110001 + uiSum = _mm_cvtsi128_si32( vsum32 ); + + iDiffSum += uiSum; + + if( iDiffSum > iMax ) //for speeding up + { + return iDiffSum; + } + // update location + refPatchRow += uiStride; + } + + return iDiffSum; +} +#endif + +#if ENABLE_SIMD_SIGN_PREDICTION || TRANSFORM_SIMD_OPT || ENABLE_SIMD_TMP template <X86_VEXT vext> void TrQuant::_initTrQuantX86() { @@ -545,6 +667,10 @@ void TrQuant::_initTrQuantX86() fastInvTrans[2][5] = fastInverseTransform_SIMD<DST7, 64>; #endif #endif + +#if ENABLE_SIMD_TMP + m_calcTemplateDiff = calcTemplateDiffSIMD<vext>; +#endif } template void TrQuant::_initTrQuantX86<SIMDX86>(); diff --git a/source/Lib/CommonLib/x86/avx/BilateralFilter_avx.cpp b/source/Lib/CommonLib/x86/avx/BilateralFilter_avx.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bb696a1fef690da0fc3fc6830bdcb7189a3a11a3 --- /dev/null +++ b/source/Lib/CommonLib/x86/avx/BilateralFilter_avx.cpp @@ -0,0 +1 @@ +#include "../BilateralFilterX86.h" diff --git a/source/Lib/CommonLib/x86/avx2/BilateralFilter_avx2.cpp b/source/Lib/CommonLib/x86/avx2/BilateralFilter_avx2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bb696a1fef690da0fc3fc6830bdcb7189a3a11a3 --- /dev/null +++ 
b/source/Lib/CommonLib/x86/avx2/BilateralFilter_avx2.cpp @@ -0,0 +1 @@ +#include "../BilateralFilterX86.h" diff --git a/source/Lib/CommonLib/x86/sse41/BilateralFilter_sse41.cpp b/source/Lib/CommonLib/x86/sse41/BilateralFilter_sse41.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bb696a1fef690da0fc3fc6830bdcb7189a3a11a3 --- /dev/null +++ b/source/Lib/CommonLib/x86/sse41/BilateralFilter_sse41.cpp @@ -0,0 +1 @@ +#include "../BilateralFilterX86.h" diff --git a/source/Lib/CommonLib/x86/sse42/BilateralFilater_sse42.cpp b/source/Lib/CommonLib/x86/sse42/BilateralFilater_sse42.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bb696a1fef690da0fc3fc6830bdcb7189a3a11a3 --- /dev/null +++ b/source/Lib/CommonLib/x86/sse42/BilateralFilater_sse42.cpp @@ -0,0 +1 @@ +#include "../BilateralFilterX86.h" diff --git a/source/Lib/DecoderLib/CABACReader.cpp b/source/Lib/DecoderLib/CABACReader.cpp index 10f7f72b23ee698e39a7cea978eaf391eabd11e2..0539150a948c6a7aab9dad25b7c846d7787f8e4a 100644 --- a/source/Lib/DecoderLib/CABACReader.cpp +++ b/source/Lib/DecoderLib/CABACReader.cpp @@ -1606,16 +1606,20 @@ void CABACReader::intra_luma_pred_modes( CodingUnit &cu ) cu.firstPU->intraDir[0] = cu.bdpcmMode == 2? VER_IDX : HOR_IDX; return; } -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP int TMP_MaxSize=cu.cs->sps->getIntraTMPMaxSize(); if (cu.lwidth() <= TMP_MaxSize && cu.lheight() <= TMP_MaxSize) { - Tmp_Flag(cu); - if (cu.TmpFlag) - return; + tmp_flag(cu); + if( cu.tmpFlag ) + { + return; + } } else - cu.TmpFlag = 0; + { + cu.tmpFlag = 0; + } #endif mip_flag(cu); if (cu.mipFlag) @@ -3862,10 +3866,10 @@ void CABACReader::residual_lfnst_mode( CodingUnit& cu, CUCtx& cuCtx ) int chIdx = cu.isSepTree() && cu.chType == CHANNEL_TYPE_CHROMA ? 
1 : 0; #endif if ((cu.ispMode && !CU::canUseLfnstWithISP(cu, cu.chType)) -#if IDCC_TPM_JEM - || (cu.cs->sps->getUseLFNST() && CU::isIntra(cu) && ((cu.mipFlag && !allowLfnstWithMip(cu.firstPU->lumaSize())) || (cu.TmpFlag && !allowLfnstWithTpm()))) +#if JVET_V0130_INTRA_TMP + || (cu.cs->sps->getUseLFNST() && CU::isIntra(cu) && ((cu.mipFlag && !allowLfnstWithMip(cu.firstPU->lumaSize())) || (cu.tmpFlag && !allowLfnstWithTmp()))) #else - || (cu.cs->sps->getUseLFNST() && CU::isIntra(cu) && cu.mipFlag && !allowLfnstWithMip(cu.firstPU->lumaSize())) + || (cu.cs->sps->getUseLFNST() && CU::isIntra(cu) && cu.mipFlag && !allowLfnstWithMip(cu.firstPU->lumaSize())) #endif #if INTRA_RM_SMALL_BLOCK_SIZE_CONSTRAINTS || (CS::isDualITree(*cu.cs) && cu.chType == CHANNEL_TYPE_CHROMA && std::min(cu.blocks[1].width, cu.blocks[1].height) < 4) @@ -4506,8 +4510,8 @@ unsigned CABACReader::code_unary_fixed( unsigned ctxId, unsigned unary_max, unsi } return idx; } -#if IDCC_TPM_JEM -void CABACReader::Tmp_Flag(CodingUnit& cu) +#if JVET_V0130_INTRA_TMP +void CABACReader::tmp_flag(CodingUnit& cu) { RExt__DECODER_DEBUG_BIT_STATISTICS_CREATE_SET(STATS__CABAC_BITS__OTHER); @@ -4518,13 +4522,13 @@ void CABACReader::Tmp_Flag(CodingUnit& cu) if( !cu.cs->sps->getUseIntraTMP() ) { - cu.TmpFlag = false; + cu.tmpFlag = false; return; } unsigned ctxId = DeriveCtx::CtxTmpFlag(cu); - cu.TmpFlag = m_BinDecoder.decodeBin(Ctx::TmpFlag(ctxId)); - DTRACE(g_trace_ctx, D_SYNTAX, "Tmp_Flag() pos=(%d,%d) mode=%d\n", cu.lumaPos().x, cu.lumaPos().y, cu.TmpFlag ? 1 : 0); + cu.tmpFlag = m_BinDecoder.decodeBin(Ctx::TmpFlag(ctxId)); + DTRACE(g_trace_ctx, D_SYNTAX, "tmp_flag() pos=(%d,%d) mode=%d\n", cu.lumaPos().x, cu.lumaPos().y, cu.tmpFlag ? 
1 : 0); } #endif void CABACReader::mip_flag( CodingUnit& cu ) diff --git a/source/Lib/DecoderLib/CABACReader.h b/source/Lib/DecoderLib/CABACReader.h index 5513be42e0602be439420806bd6e1531a62a82cb..59607c833b83baa691603e354f34bc03f1ea175e 100644 --- a/source/Lib/DecoderLib/CABACReader.h +++ b/source/Lib/DecoderLib/CABACReader.h @@ -107,8 +107,8 @@ public: void adaptive_color_transform(CodingUnit& cu); void sbt_mode ( CodingUnit& cu ); void end_of_ctu ( CodingUnit& cu, CUCtx& cuCtx ); -#if IDCC_TPM_JEM - void Tmp_Flag(CodingUnit& cu); +#if JVET_V0130_INTRA_TMP + void tmp_flag ( CodingUnit& cu ); #endif void mip_flag ( CodingUnit& cu ); void mip_pred_modes ( CodingUnit& cu ); diff --git a/source/Lib/DecoderLib/DecCu.cpp b/source/Lib/DecoderLib/DecCu.cpp index b20d81b6efc70f8061829531de46d87bc5ba5fec..19f5affab59e9e932206a0c7ec8a21a8304935e8 100644 --- a/source/Lib/DecoderLib/DecCu.cpp +++ b/source/Lib/DecoderLib/DecCu.cpp @@ -307,7 +307,7 @@ void DecCu::xIntraRecBlk( TransformUnit& tu, const ComponentID compID ) } else { -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP if (PU::isTmp(pu, chType)) { int foundCandiNum; @@ -512,11 +512,11 @@ void DecCu::xIntraRecACTBlk(TransformUnit& tu) PelBuf piPred = cs.getPredBuf(area); m_pcIntraPred->initIntraPatternChType(*tu.cu, area); -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP if (PU::isTmp(pu, chType)) { int foundCandiNum; - const unsigned int uiStride = cs.picture->getRecoBuf(COMPONENT_Y).stride; + const unsigned int uiStride = cs.picture->getRecoBuf(COMPONENT_Y).stride; m_pcTrQuant->getTargetTemplate(tu.cu, pu.lwidth(), pu.lheight()); m_pcTrQuant->candidateSearchIntra(tu.cu, pu.lwidth(), pu.lheight()); m_pcTrQuant->generateTMPrediction(piPred.buf, uiStride, pu.lwidth(), pu.lheight(), foundCandiNum); diff --git a/source/Lib/DecoderLib/VLCReader.cpp b/source/Lib/DecoderLib/VLCReader.cpp index d8a529556478f0dfcd2abd49c7bc895579a9e39b..b41bffd482821ebc8174ffe69f8abc181fe6215b 100644 --- a/source/Lib/DecoderLib/VLCReader.cpp +++ 
b/source/Lib/DecoderLib/VLCReader.cpp @@ -2260,7 +2260,7 @@ void HLSyntaxReader::parseSPS(SPS* pcSPS) #if ENABLE_DIMD READ_FLAG(uiCode, "sps_dimd_enabled_flag"); pcSPS->setUseDimd(uiCode != 0); #endif -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP READ_FLAG(uiCode, "sps_intraTMP_enabled_flag"); pcSPS->setUseIntraTMP( uiCode != 0 ); if(pcSPS->getUseIntraTMP()) { diff --git a/source/Lib/EncoderLib/CABACWriter.cpp b/source/Lib/EncoderLib/CABACWriter.cpp index 20225abd53984d871c9e6aedd1a193a17597ee7f..391f28a2fbac914005f1b5bf19360e80dc8806a9 100644 --- a/source/Lib/EncoderLib/CABACWriter.cpp +++ b/source/Lib/EncoderLib/CABACWriter.cpp @@ -1200,13 +1200,15 @@ void CABACWriter::intra_luma_pred_modes( const CodingUnit& cu ) cu.firstPU->intraDir[0] = cu.bdpcmMode == 2? VER_IDX : HOR_IDX; return; } -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP int TMP_MaxSize=cu.cs->sps->getIntraTMPMaxSize(); if (cu.lwidth() <= TMP_MaxSize && cu.lheight() <= TMP_MaxSize) { - Tmp_Flag(cu); - if (cu.TmpFlag) - return; + tmp_flag(cu); + if( cu.tmpFlag ) + { + return; + } } #endif mip_flag(cu); @@ -1389,15 +1391,17 @@ void CABACWriter::intra_luma_pred_mode( const PredictionUnit& pu ) { if( pu.cu->bdpcmMode ) return; -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP // check if sufficient search range is available //bool bCheck = pu.cu-> int TMP_MaxSize=pu.cu->cs->sps->getIntraTMPMaxSize(); if (pu.cu->lwidth() <= TMP_MaxSize && pu.cu->lheight() <= TMP_MaxSize) { - Tmp_Flag(*pu.cu); - if (pu.cu->TmpFlag) - return; + tmp_flag(*pu.cu); + if( pu.cu->tmpFlag ) + { + return; + } } #endif mip_flag(*pu.cu); @@ -3592,10 +3596,10 @@ void CABACWriter::residual_lfnst_mode( const CodingUnit& cu, CUCtx& cuCtx ) int chIdx = cu.isSepTree() && cu.chType == CHANNEL_TYPE_CHROMA ? 
1 : 0; #endif if( ( cu.ispMode && !CU::canUseLfnstWithISP( cu, cu.chType ) ) || -#if IDCC_TPM_JEM - (cu.cs->sps->getUseLFNST() && CU::isIntra(cu) && ((cu.mipFlag && !allowLfnstWithMip(cu.firstPU->lumaSize())) || (cu.TmpFlag && !allowLfnstWithTpm()))) || +#if JVET_V0130_INTRA_TMP + (cu.cs->sps->getUseLFNST() && CU::isIntra(cu) && ((cu.mipFlag && !allowLfnstWithMip(cu.firstPU->lumaSize())) || (cu.tmpFlag && !allowLfnstWithTmp()))) || #else - (cu.cs->sps->getUseLFNST() && CU::isIntra(cu) && cu.mipFlag && !allowLfnstWithMip(cu.firstPU->lumaSize())) || + (cu.cs->sps->getUseLFNST() && CU::isIntra(cu) && cu.mipFlag && !allowLfnstWithMip(cu.firstPU->lumaSize())) || #endif #if INTRA_RM_SMALL_BLOCK_SIZE_CONSTRAINTS (CS::isDualITree(*cu.cs) && cu.chType == CHANNEL_TYPE_CHROMA && std::min(cu.blocks[1].width, cu.blocks[1].height) < 4) @@ -4222,8 +4226,8 @@ void CABACWriter::code_unary_fixed( unsigned symbol, unsigned ctxId, unsigned un } } -#if IDCC_TPM_JEM -void CABACWriter::Tmp_Flag(const CodingUnit& cu) +#if JVET_V0130_INTRA_TMP +void CABACWriter::tmp_flag(const CodingUnit& cu) { if (!cu.Y().valid()) { @@ -4236,8 +4240,8 @@ void CABACWriter::Tmp_Flag(const CodingUnit& cu) } unsigned ctxId = DeriveCtx::CtxTmpFlag(cu); - m_BinEncoder.encodeBin(cu.TmpFlag, Ctx::TmpFlag(ctxId)); - DTRACE(g_trace_ctx, D_SYNTAX, "Tmp_Flag() pos=(%d,%d) mode=%d\n", cu.lumaPos().x, cu.lumaPos().y, cu.TmpFlag ? 1 : 0); + m_BinEncoder.encodeBin(cu.tmpFlag, Ctx::TmpFlag(ctxId)); + DTRACE(g_trace_ctx, D_SYNTAX, "tmp_flag() pos=(%d,%d) mode=%d\n", cu.lumaPos().x, cu.lumaPos().y, cu.tmpFlag ? 
1 : 0); } #endif diff --git a/source/Lib/EncoderLib/CABACWriter.h b/source/Lib/EncoderLib/CABACWriter.h index 6fd5d20e5c5551d20001297e47eae3fdb0fd6643..afbf6ed7c1a280ea67cfcaadaabca11236e8cdb6 100644 --- a/source/Lib/EncoderLib/CABACWriter.h +++ b/source/Lib/EncoderLib/CABACWriter.h @@ -116,8 +116,8 @@ public: void adaptive_color_transform(const CodingUnit& cu); void sbt_mode ( const CodingUnit& cu ); void end_of_ctu ( const CodingUnit& cu, CUCtx& cuCtx ); -#if IDCC_TPM_JEM - void Tmp_Flag(const CodingUnit& cu); +#if JVET_V0130_INTRA_TMP + void tmp_flag ( const CodingUnit& cu ); #endif void mip_flag ( const CodingUnit& cu ); void mip_pred_modes ( const CodingUnit& cu ); diff --git a/source/Lib/EncoderLib/EncCfg.h b/source/Lib/EncoderLib/EncCfg.h index 55d4aaef581fad47017085d9a45b98f31ef67cf5..a0740d6c0da4a2a4250e539130baa9fa9b919772 100644 --- a/source/Lib/EncoderLib/EncCfg.h +++ b/source/Lib/EncoderLib/EncCfg.h @@ -806,9 +806,9 @@ protected: bool m_alf; ///< Adaptive Loop Filter -#if IDCC_TPM_JEM - bool m_IntraTMP; ///< intra Template Matching - unsigned m_IntraTMP_MaxSize; ///< max CU size for which intra TMP is allowed +#if JVET_V0130_INTRA_TMP + bool m_intraTMP; ///< intra Template Matching + unsigned m_intraTmpMaxSize; ///< max CU size for which intra TMP is allowed #endif #if JVET_V0094_BILATERAL_FILTER bool m_BIF; @@ -1287,11 +1287,11 @@ public: bool getUseWrapAround () const { return m_wrapAround; } void setWrapAroundOffset ( unsigned u ) { m_wrapAroundOffset = u; } unsigned getWrapAroundOffset () const { return m_wrapAroundOffset; } -#if IDCC_TPM_JEM - void setUseIntraTMP(bool b) { m_IntraTMP = b; } - bool getUseIntraTMP() { return m_IntraTMP; } - void setIntraTMPMaxSize(unsigned n) { m_IntraTMP_MaxSize = n; } - unsigned getIntraTMPMaxSize() { return m_IntraTMP_MaxSize; } +#if JVET_V0130_INTRA_TMP + void setUseIntraTMP (bool b) { m_intraTMP = b; } + bool getUseIntraTMP() const { return m_intraTMP; } + void setIntraTMPMaxSize (unsigned n) { 
m_intraTmpMaxSize = n; } + unsigned getIntraTMPMaxSize() const { return m_intraTmpMaxSize; } #endif #if JVET_V0094_BILATERAL_FILTER void setUseBIF ( bool b ) { m_BIF = b; } diff --git a/source/Lib/EncoderLib/EncCu.cpp b/source/Lib/EncoderLib/EncCu.cpp index 24f7f15146b2d4f0d3587cf2b2870839a6913383..59dbb6c05e0ec253279e2b27f0db3d0cb8cbef52 100644 --- a/source/Lib/EncoderLib/EncCu.cpp +++ b/source/Lib/EncoderLib/EncCu.cpp @@ -2008,8 +2008,8 @@ bool EncCu::xCheckRDCostIntra(CodingStructure *&tempCS, CodingStructure *&bestCS m_modeCtrl->setISPMode(cu.ispMode); m_modeCtrl->setISPLfnstIdx(cu.lfnstIdx); m_modeCtrl->setMIPFlagISPPass(cu.mipFlag); -#if IDCC_TPM_JEM - m_modeCtrl->setTPMFlagISPPass(cu.TmpFlag); +#if JVET_V0130_INTRA_TMP + m_modeCtrl->setTPMFlagISPPass(cu.tmpFlag); #endif m_modeCtrl->setBestISPIntraModeRelCU(cu.ispMode ? PU::getFinalIntraMode(*cu.firstPU, CHANNEL_TYPE_LUMA) : UINT8_MAX); m_modeCtrl->setBestDCT2NonISPCostRelCU(m_modeCtrl->getMtsFirstPassNoIspCost()); @@ -3881,8 +3881,8 @@ void EncCu::xCheckRDCostMergeGeo2Nx2N(CodingStructure *&tempCS, CodingStructure cu.mmvdSkip = false; cu.skip = false; cu.mipFlag = false; -#if IDCC_TPM_JEM - cu.TmpFlag = false; +#if JVET_V0130_INTRA_TMP + cu.tmpFlag = false; #endif cu.bdpcmMode = 0; @@ -4109,8 +4109,8 @@ void EncCu::xCheckRDCostMergeGeo2Nx2N(CodingStructure *&tempCS, CodingStructure cu.mmvdSkip = false; cu.skip = false; cu.mipFlag = false; -#if IDCC_TPM_JEM - cu.TmpFlag = false; +#if JVET_V0130_INTRA_TMP + cu.tmpFlag = false; #endif cu.bdpcmMode = 0; PredictionUnit &pu = tempCS->addPU(cu, pm.chType); @@ -4808,8 +4808,8 @@ void EncCu::xCheckSATDCostGeoMerge(CodingStructure *&tempCS, CodingUnit &cu, Pre cu.mmvdSkip = false; cu.skip = false; cu.mipFlag = false; -#if IDCC_TPM_JEM - cu.TmpFlag = false; +#if JVET_V0130_INTRA_TMP + cu.tmpFlag = false; #endif cu.bdpcmMode = 0; pu.mergeFlag = true; @@ -7677,8 +7677,6 @@ void EncCu::xCheckRDCostInterMultiHyp2Nx2N(CodingStructure *&tempCS, CodingStruc 
std::stable_sort(mhResults.begin(), mhResults.end(), RDCostComp); // actual testing with "true" RD costs - if (std::min((int)mhResults.size(), m_pcEncCfg->getAddHypTries()) > 1) - printf("nbTries=%d\n", std::min((int)mhResults.size(), m_pcEncCfg->getAddHypTries())); for (int i = 0; i < std::min((int)mhResults.size(), m_pcEncCfg->getAddHypTries()); ++i) { tempCS->initStructData(encTestMode.qp); diff --git a/source/Lib/EncoderLib/EncLib.cpp b/source/Lib/EncoderLib/EncLib.cpp index d8c7306aed0e4a034ebd6b4e227375c665733f32..b76d81cf34849a886353165abf4603baf1914d66 100644 --- a/source/Lib/EncoderLib/EncLib.cpp +++ b/source/Lib/EncoderLib/EncLib.cpp @@ -1429,9 +1429,9 @@ void EncLib::xInitSPS( SPS& sps ) sps.setNumAddHypWeights(m_numAddHypWeights); sps.setMaxNumAddHypRefFrames(m_maxNumAddHypRefFrames); #endif -#if IDCC_TPM_JEM - sps.setUseIntraTMP(m_IntraTMP); - sps.setIntraTMPMaxSize(m_IntraTMP_MaxSize); +#if JVET_V0130_INTRA_TMP + sps.setUseIntraTMP(m_intraTMP); + sps.setIntraTMPMaxSize(m_intraTmpMaxSize); #endif // ADD_NEW_TOOL : (encoder lib) set tool enabling flags and associated parameters here sps.setUseISP ( m_ISP ); diff --git a/source/Lib/EncoderLib/EncModeCtrl.cpp b/source/Lib/EncoderLib/EncModeCtrl.cpp index 67b0825473d1807c127628dfe14551828a02688c..007b25ea102de18be0060bb5673c422cff62630a 100644 --- a/source/Lib/EncoderLib/EncModeCtrl.cpp +++ b/source/Lib/EncoderLib/EncModeCtrl.cpp @@ -2209,8 +2209,8 @@ bool EncModeCtrlMTnoRQT::tryMode( const EncTestMode& encTestmode, const CodingSt int bit4 = cuECtx.ispLfnstIdx == 2; int bit5 = cuECtx.mipFlag; int bit6 = cuECtx.bestCostIsp < cuECtx.bestNonDCT2Cost * 0.95; -#if IDCC_TPM_JEM - int bit7 = cuECtx.TmpFlag; +#if JVET_V0130_INTRA_TMP + int bit7 = cuECtx.tmpFlag; #endif int val = (bit0) | @@ -2220,8 +2220,8 @@ bool EncModeCtrlMTnoRQT::tryMode( const EncTestMode& encTestmode, const CodingSt (bit4 << 4) | (bit5 << 5) | (bit6 << 6) | -#if IDCC_TPM_JEM - (bit7 << 7) | +#if JVET_V0130_INTRA_TMP + (bit7 << 7) | #endif ( 
cuECtx.bestPredModeDCT2 << 9 ); relatedCU.ispPredModeVal = val; diff --git a/source/Lib/EncoderLib/EncModeCtrl.h b/source/Lib/EncoderLib/EncModeCtrl.h index 5b62615ea0b94bb1cf4ee74eebe77560cf92ad82..528ddb8da068916047865280d04109ada77ef12c 100644 --- a/source/Lib/EncoderLib/EncModeCtrl.h +++ b/source/Lib/EncoderLib/EncModeCtrl.h @@ -238,8 +238,8 @@ struct ComprCUCtx ( MAX_DOUBLE ) , bestISPIntraMode ( UINT8_MAX ) -#if IDCC_TPM_JEM - , TmpFlag(false) +#if JVET_V0130_INTRA_TMP + , tmpFlag (false) #endif , mipFlag ( false ) , ispMode ( NOT_INTRA_SUBPARTITIONS ) @@ -286,8 +286,8 @@ struct ComprCUCtx double bestDCT2NonISPCost; double bestNonDCT2Cost; uint8_t bestISPIntraMode; -#if IDCC_TPM_JEM - bool TmpFlag; +#if JVET_V0130_INTRA_TMP + bool tmpFlag; #endif bool mipFlag; uint8_t ispMode; @@ -396,8 +396,8 @@ public: void setBestNonDCT2Cost ( double val ) { m_ComprCUCtxList.back().bestNonDCT2Cost = val; } uint8_t getBestISPIntraModeRelCU () const { return m_ComprCUCtxList.back().bestISPIntraMode; } void setBestISPIntraModeRelCU ( uint8_t val ) { m_ComprCUCtxList.back().bestISPIntraMode = val; } -#if IDCC_TPM_JEM - void setTPMFlagISPPass(bool val) { m_ComprCUCtxList.back().TmpFlag = val; } +#if JVET_V0130_INTRA_TMP + void setTPMFlagISPPass (bool val) { m_ComprCUCtxList.back().tmpFlag = val; } #endif void setMIPFlagISPPass ( bool val ) { m_ComprCUCtxList.back().mipFlag = val; } void setISPMode ( uint8_t val ) { m_ComprCUCtxList.back().ispMode = val; } diff --git a/source/Lib/EncoderLib/IntraSearch.cpp b/source/Lib/EncoderLib/IntraSearch.cpp index 033ecbab76951161f7ef7acad5885c14694b0540..46474299c2704bb447f3af87fb6398c124173acd 100644 --- a/source/Lib/EncoderLib/IntraSearch.cpp +++ b/source/Lib/EncoderLib/IntraSearch.cpp @@ -394,7 +394,7 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c const TempCtx ctxStart ( m_CtxCache, m_CABACEstimator->getCtx() ); const TempCtx ctxStartMipFlag ( m_CtxCache, SubCtx( Ctx::MipFlag, 
m_CABACEstimator->getCtx() ) ); -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP const TempCtx ctxStartTpmFlag(m_CtxCache, SubCtx(Ctx::TmpFlag, m_CABACEstimator->getCtx())); #endif const TempCtx ctxStartIspMode ( m_CtxCache, SubCtx( Ctx::ISPMode, m_CABACEstimator->getCtx() ) ); @@ -494,9 +494,9 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c const bool mipAllowed = sps.getUseMIP() && isLuma(partitioner.chType) && ((cu.lfnstIdx == 0) || allowLfnstWithMip(cu.firstPU->lumaSize())); const bool testMip = mipAllowed && !(cu.lwidth() > (8 * cu.lheight()) || cu.lheight() > (8 * cu.lwidth())); const bool supportedMipBlkSize = pu.lwidth() <= MIP_MAX_WIDTH && pu.lheight() <= MIP_MAX_HEIGHT; -#if IDCC_TPM_JEM - const bool tpmAllowed = sps.getUseIntraTMP() && isLuma(partitioner.chType) && ((cu.lfnstIdx == 0) || allowLfnstWithTpm()); - const bool testTpm = tpmAllowed && (cu.lwidth() <= sps.getIntraTMPMaxSize() && cu.lheight() <= sps.getIntraTMPMaxSize()); +#if JVET_V0130_INTRA_TMP + const bool tpmAllowed = sps.getUseIntraTMP() && isLuma(partitioner.chType) && ((cu.lfnstIdx == 0) || allowLfnstWithTmp()); + const bool testTpm = tpmAllowed && (cu.lwidth() <= sps.getIntraTMPMaxSize() && cu.lheight() <= sps.getIntraTMPMaxSize()); #endif static_vector<ModeInfo, FAST_UDI_MAX_RDMODE_NUM> uiRdModeList; @@ -573,19 +573,20 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c ? std::max(numModesForFullRD, floorLog2(std::min(pu.lwidth(), pu.lheight())) - 1) : numModesForFullRD; } -#if IDCC_TPM_JEM - if (testTpm) - numModesForFullRD += 1; // testing tpm - const int numHadCand = (testMip ? 2 : 1) * 3 + testTpm; +#if JVET_V0130_INTRA_TMP + if( testTpm ) + { + numModesForFullRD += 1; // testing tpm + } + const int numHadCand = (testMip ? 2 : 1) * 3 + testTpm; + + cu.tmpFlag = false; #else const int numHadCand = (testMip ? 
2 : 1) * 3; #endif //*** Derive (regular) candidates using Hadamard cu.mipFlag = false; -#if IDCC_TPM_JEM - cu.TmpFlag = false; -#endif //===== init pattern for luma prediction ===== initIntraPatternChType(cu, pu.Y(), true); @@ -616,8 +617,8 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c minSadHad += std::min(distParamSad.distFunc(distParamSad) * 2, distParamHad.distFunc(distParamHad)); // NB xFracModeBitsIntra will not affect the mode for chroma that may have already been pre-estimated. -#if IDCC_TPM_JEM - m_CABACEstimator->getCtx() = SubCtx(Ctx::TmpFlag, ctxStartTpmFlag); +#if JVET_V0130_INTRA_TMP + m_CABACEstimator->getCtx() = SubCtx( Ctx::TmpFlag, ctxStartTpmFlag ); #endif m_CABACEstimator->getCtx() = SubCtx( Ctx::MipFlag, ctxStartMipFlag ); m_CABACEstimator->getCtx() = SubCtx( Ctx::ISPMode, ctxStartIspMode ); @@ -693,8 +694,8 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c // NB xFracModeBitsIntra will not affect the mode for chroma that may have already been // pre-estimated. -#if IDCC_TPM_JEM - m_CABACEstimator->getCtx() = SubCtx(Ctx::TmpFlag, ctxStartTpmFlag); +#if JVET_V0130_INTRA_TMP + m_CABACEstimator->getCtx() = SubCtx( Ctx::TmpFlag, ctxStartTpmFlag ); #endif m_CABACEstimator->getCtx() = SubCtx(Ctx::MipFlag, ctxStartMipFlag); m_CABACEstimator->getCtx() = SubCtx(Ctx::ISPMode, ctxStartIspMode); @@ -761,8 +762,8 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c std::min(distParamSad.distFunc(distParamSad) * 2, distParamHad.distFunc(distParamHad)); // NB xFracModeBitsIntra will not affect the mode for chroma that may have already been pre-estimated. 
-#if IDCC_TPM_JEM - m_CABACEstimator->getCtx() = SubCtx(Ctx::TmpFlag, ctxStartTpmFlag); +#if JVET_V0130_INTRA_TMP + m_CABACEstimator->getCtx() = SubCtx( Ctx::TmpFlag, ctxStartTpmFlag ); #endif m_CABACEstimator->getCtx() = SubCtx(Ctx::MipFlag, ctxStartMipFlag); m_CABACEstimator->getCtx() = SubCtx(Ctx::ISPMode, ctxStartIspMode); @@ -806,47 +807,41 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c m_dSavedHadListLFNST.resize(3); LFNSTSaveFlag = false; } -#if IDCC_TPM_JEM - // derive TPM candidate using hadamard - if (testTpm) - { - cu.TmpFlag = true; - cu.mipFlag = false; - pu.multiRefIdx = 0; - - - - int foundCandiNum = 0; - bool bsuccessfull = 0; - CodingUnit cu_cpy = cu; - - if (isRefTemplateAvailable(cu_cpy, cu_cpy.blocks[COMPONENT_Y])) - { - m_pcTrQuant->getTargetTemplate(&cu_cpy, pu.lwidth(), pu.lheight()); - m_pcTrQuant->candidateSearchIntra(&cu_cpy, pu.lwidth(), pu.lheight()); - bsuccessfull = m_pcTrQuant->generateTMPrediction(piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum); - } - if (bsuccessfull && foundCandiNum >= 1) - { - - Distortion minSadHad = - std::min(distParamSad.distFunc(distParamSad) * 2, distParamHad.distFunc(distParamHad)); - - m_CABACEstimator->getCtx() = SubCtx(Ctx::TmpFlag, ctxStartTpmFlag); - - uint64_t fracModeBits = xFracModeBitsIntra(pu, 0, CHANNEL_TYPE_LUMA); - - double cost = double(minSadHad) + double(fracModeBits) * sqrtLambdaForFirstPass; - DTRACE(g_trace_ctx, D_INTRA_COST, "IntraTPM: %u, %llu, %f (%d)\n", minSadHad, fracModeBits, cost, - 0); - - updateCandList(ModeInfo(0, 0, 0, NOT_INTRA_SUBPARTITIONS, 0, 1), cost, uiRdModeList, - CandCostList, numModesForFullRD); - updateCandList(ModeInfo(0, 0, 0, NOT_INTRA_SUBPARTITIONS, 0, 1), - 0.8 * double(minSadHad), uiHadModeList, CandHadList, numHadCand); - } - - } +#if JVET_V0130_INTRA_TMP + // derive TPM candidate using hadamard + if( testTpm ) + { + cu.tmpFlag = true; + cu.mipFlag = false; + pu.multiRefIdx = 0; + + int foundCandiNum = 
0; + bool bsuccessfull = 0; + CodingUnit cu_cpy = cu; + + if( isRefTemplateAvailable( cu_cpy, cu_cpy.blocks[COMPONENT_Y] ) ) + { + m_pcTrQuant->getTargetTemplate( &cu_cpy, pu.lwidth(), pu.lheight() ); + m_pcTrQuant->candidateSearchIntra( &cu_cpy, pu.lwidth(), pu.lheight() ); + bsuccessfull = m_pcTrQuant->generateTMPrediction( piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum ); + } + if( bsuccessfull && foundCandiNum >= 1 ) + { + + Distortion minSadHad = + std::min( distParamSad.distFunc( distParamSad ) * 2, distParamHad.distFunc( distParamHad ) ); + + m_CABACEstimator->getCtx() = SubCtx( Ctx::TmpFlag, ctxStartTpmFlag ); + + uint64_t fracModeBits = xFracModeBitsIntra( pu, 0, CHANNEL_TYPE_LUMA ); + + double cost = double( minSadHad ) + double( fracModeBits ) * sqrtLambdaForFirstPass; + DTRACE( g_trace_ctx, D_INTRA_COST, "IntraTPM: %u, %llu, %f (%d)\n", minSadHad, fracModeBits, cost, 0 ); + + updateCandList( ModeInfo( 0, 0, 0, NOT_INTRA_SUBPARTITIONS, 0, 1 ), cost, uiRdModeList, CandCostList, numModesForFullRD ); + updateCandList( ModeInfo( 0, 0, 0, NOT_INTRA_SUBPARTITIONS, 0, 1 ), 0.8 * double( minSadHad ), uiHadModeList, CandHadList, numHadCand ); + } + } #endif //*** Derive MIP candidates using Hadamard if (testMip && !supportedMipBlkSize) @@ -866,8 +861,8 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c } else if (testMip) { -#if IDCC_TPM_JEM - cu.TmpFlag = 0; +#if JVET_V0130_INTRA_TMP + cu.tmpFlag = 0; #endif cu.mipFlag = true; pu.multiRefIdx = 0; @@ -1097,8 +1092,8 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c cs.interHad = 0; //===== reset context models ===== -#if IDCC_TPM_JEM - m_CABACEstimator->getCtx() = SubCtx(Ctx::TmpFlag, ctxStartTpmFlag); +#if JVET_V0130_INTRA_TMP + m_CABACEstimator->getCtx() = SubCtx( Ctx::TmpFlag, ctxStartTpmFlag ); #endif m_CABACEstimator->getCtx() = SubCtx(Ctx::MipFlag, ctxStartMipFlag); m_CABACEstimator->getCtx() = SubCtx(Ctx::ISPMode, 
ctxStartIspMode); @@ -1204,8 +1199,8 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c cu.dimd = true; } #endif -#if IDCC_TPM_JEM - cu.TmpFlag = uiOrgMode.tpmFlg; +#if JVET_V0130_INTRA_TMP + cu.tmpFlag = uiOrgMode.tmpFlag; #endif cu.mipFlag = uiOrgMode.mipFlg; pu.mipTransposedFlag = uiOrgMode.mipTrFlg; @@ -1218,10 +1213,10 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c CHECK(cu.ispMode && cu.mipFlag, "Error: combination of ISP and MIP not supported"); CHECK(cu.ispMode && pu.multiRefIdx, "Error: combination of ISP and MRL not supported"); CHECK(cu.ispMode&& cu.colorTransform, "Error: combination of ISP and ACT not supported"); -#if IDCC_TPM_JEM - CHECK(cu.mipFlag&& cu.TmpFlag, "Error: combination of MIP and TPM not supported"); - CHECK(cu.TmpFlag&& cu.ispMode, "Error: combination of TPM and ISP not supported"); - CHECK(cu.TmpFlag&& pu.multiRefIdx, "Error: combination of TPM and MRL not supported"); +#if JVET_V0130_INTRA_TMP + CHECK( cu.mipFlag && cu.tmpFlag, "Error: combination of MIP and TPM not supported" ); + CHECK( cu.tmpFlag && cu.ispMode, "Error: combination of TPM and ISP not supported" ); + CHECK( cu.tmpFlag && pu.multiRefIdx, "Error: combination of TPM and MRL not supported" ); #endif #if ENABLE_DIMD && JVET_V0087_DIMD_NO_ISP CHECK(cu.ispMode && cu.dimd, "Error: combination of ISP and DIMD not supported"); @@ -1265,14 +1260,14 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c uiBestPUMode.ispMod, mtsCheckRangeFlag, mtsFirstCheckId, mtsLastCheckId, moreProbMTSIdxFirst); } } -#if IDCC_TPM_JEM - if (!cu.ispMode && !cu.mtsFlag && !cu.lfnstIdx && !cu.bdpcmMode && !pu.multiRefIdx && !cu.mipFlag && !cu.TmpFlag && testISP) +#if JVET_V0130_INTRA_TMP + if( !cu.ispMode && !cu.mtsFlag && !cu.lfnstIdx && !cu.bdpcmMode && !pu.multiRefIdx && !cu.mipFlag && !cu.tmpFlag && testISP ) #else if (!cu.ispMode && !cu.mtsFlag && !cu.lfnstIdx && !cu.bdpcmMode && !pu.multiRefIdx 
&& !cu.mipFlag && testISP) #endif { -#if IDCC_TPM_JEM - m_regIntraRDListWithCosts.push_back(ModeInfoWithCost(cu.mipFlag, pu.mipTransposedFlag, pu.multiRefIdx, cu.ispMode, uiOrgMode.modeId, cu.TmpFlag, csTemp->cost)); +#if JVET_V0130_INTRA_TMP + m_regIntraRDListWithCosts.push_back( ModeInfoWithCost( cu.mipFlag, pu.mipTransposedFlag, pu.multiRefIdx, cu.ispMode, uiOrgMode.modeId, cu.tmpFlag, csTemp->cost ) ); #else m_regIntraRDListWithCosts.push_back( ModeInfoWithCost( cu.mipFlag, pu.mipTransposedFlag, pu.multiRefIdx, cu.ispMode, uiOrgMode.modeId, csTemp->cost ) ); #endif @@ -1290,10 +1285,10 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c { m_modeCostStore[lfnstIdx][mode] = tmpValidReturn ? csTemp->cost : (MAX_DOUBLE / 2.0); //(MAX_DOUBLE / 2.0) ?? } -#if IDCC_TPM_JEM - DTRACE(g_trace_ctx, D_INTRA_COST, "IntraCost T [x=%d,y=%d,w=%d,h=%d] %f (%d,%d,%d,%d,%d,%d,%d) \n", cu.blocks[0].x, - cu.blocks[0].y, (int)width, (int)height, csTemp->cost, uiOrgMode.modeId, uiOrgMode.ispMod, - pu.multiRefIdx, cu.TmpFlag, cu.mipFlag, cu.lfnstIdx, cu.mtsFlag); +#if JVET_V0130_INTRA_TMP + DTRACE( g_trace_ctx, D_INTRA_COST, "IntraCost T [x=%d,y=%d,w=%d,h=%d] %f (%d,%d,%d,%d,%d,%d,%d) \n", cu.blocks[0].x, + cu.blocks[0].y, ( int ) width, ( int ) height, csTemp->cost, uiOrgMode.modeId, uiOrgMode.ispMod, + pu.multiRefIdx, cu.tmpFlag, cu.mipFlag, cu.lfnstIdx, cu.mtsFlag ); #else DTRACE(g_trace_ctx, D_INTRA_COST, "IntraCost T [x=%d,y=%d,w=%d,h=%d] %f (%d,%d,%d,%d,%d,%d) \n", cu.blocks[0].x, cu.blocks[0].y, (int) width, (int) height, csTemp->cost, uiOrgMode.modeId, uiOrgMode.ispMod, @@ -1395,8 +1390,8 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, c if( validReturn ) { //=== update PU data ==== -#if IDCC_TPM_JEM - cu.TmpFlag = uiBestPUMode.tpmFlg; +#if JVET_V0130_INTRA_TMP + cu.tmpFlag = uiBestPUMode.tmpFlag; #endif cu.mipFlag = uiBestPUMode.mipFlg; pu.mipTransposedFlag = uiBestPUMode.mipTrFlg; @@ -3352,16 +3347,16 @@ void 
IntraSearch::xIntraCodingTUBlock(TransformUnit &tu, const ComponentID &comp } else { -#if IDCC_TPM_JEM - if (PU::isTmp(pu, chType)) - { - int foundCandiNum; - m_pcTrQuant->getTargetTemplate(tu.cu, pu.lwidth(), pu.lheight()); - m_pcTrQuant->candidateSearchIntra(tu.cu, pu.lwidth(), pu.lheight()); - m_pcTrQuant->generateTMPrediction(piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum); - assert(foundCandiNum >= 1); - } - else if (PU::isMIP(pu, chType)) +#if JVET_V0130_INTRA_TMP + if( PU::isTmp( pu, chType ) ) + { + int foundCandiNum; + m_pcTrQuant->getTargetTemplate( tu.cu, pu.lwidth(), pu.lheight() ); + m_pcTrQuant->candidateSearchIntra( tu.cu, pu.lwidth(), pu.lheight() ); + m_pcTrQuant->generateTMPrediction( piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum ); + CHECK( foundCandiNum < 1, "" ); + } + else if( PU::isMIP( pu, chType ) ) #else if( PU::isMIP( pu, chType ) ) #endif @@ -4586,17 +4581,17 @@ bool IntraSearch::xRecurIntraCodingACTQT(CodingStructure &cs, Partitioner &parti PelBuf piResi = resiBuf.bufs[compID]; initIntraPatternChType(*tu.cu, area); -#if IDCC_TPM_JEM - if (PU::isTmp(pu, chType)) - { - int foundCandiNum; - m_pcTrQuant->getTargetTemplate(pu.cu, pu.lwidth(), pu.lheight()); - m_pcTrQuant->candidateSearchIntra(pu.cu, pu.lwidth(), pu.lheight()); - m_pcTrQuant->generateTMPrediction(piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum); - assert(foundCandiNum >= 1); - - } - else if (PU::isMIP(pu, chType)) +#if JVET_V0130_INTRA_TMP + if( PU::isTmp( pu, chType ) ) + { + int foundCandiNum; + m_pcTrQuant->getTargetTemplate( pu.cu, pu.lwidth(), pu.lheight() ); + m_pcTrQuant->candidateSearchIntra( pu.cu, pu.lwidth(), pu.lheight() ); + m_pcTrQuant->generateTMPrediction( piPred.buf, piPred.stride, pu.lwidth(), pu.lheight(), foundCandiNum ); + CHECK( foundCandiNum < 1, "" ); + + } + else if( PU::isMIP( pu, chType ) ) #else if (PU::isMIP(pu, chType)) #endif diff --git a/source/Lib/EncoderLib/IntraSearch.h 
b/source/Lib/EncoderLib/IntraSearch.h index 615408ac82f72010b877b5d9c8161ffd28b0b124..31b813dabccf8007b977a5b4d48c8ed6f2e78a89 100644 --- a/source/Lib/EncoderLib/IntraSearch.h +++ b/source/Lib/EncoderLib/IntraSearch.h @@ -221,13 +221,12 @@ private: int mRefId; // PU::multiRefIdx uint8_t ispMod; // CU::ispMode uint32_t modeId; // PU::intraDir[CHANNEL_TYPE_LUMA] -#if IDCC_TPM_JEM - bool tpmFlg; // CU::TmpFlag -#endif -#if IDCC_TPM_JEM - ModeInfo() : mipFlg(false), mipTrFlg(false), mRefId(0), ispMod(NOT_INTRA_SUBPARTITIONS), modeId(0), tpmFlg(0) {} - ModeInfo(const bool mipf, const bool miptf, const int mrid, const uint8_t ispm, const uint32_t mode, const bool tpmf = 0) : mipFlg(mipf), mipTrFlg(miptf), mRefId(mrid), ispMod(ispm), modeId(mode), tpmFlg(tpmf) {} - bool operator==(const ModeInfo cmp) const { return (mipFlg == cmp.mipFlg && mipTrFlg == cmp.mipTrFlg && mRefId == cmp.mRefId && ispMod == cmp.ispMod && modeId == cmp.modeId && tpmFlg == cmp.tpmFlg); } +#if JVET_V0130_INTRA_TMP + bool tmpFlag; // CU::tmpFlag + + ModeInfo() : mipFlg(false), mipTrFlg(false), mRefId(0), ispMod(NOT_INTRA_SUBPARTITIONS), modeId(0), tmpFlag(0) {} + ModeInfo(const bool mipf, const bool miptf, const int mrid, const uint8_t ispm, const uint32_t mode, const bool tpmf = 0) : mipFlg(mipf), mipTrFlg(miptf), mRefId(mrid), ispMod(ispm), modeId(mode), tmpFlag(tpmf) {} + bool operator==(const ModeInfo cmp) const { return (mipFlg == cmp.mipFlg && mipTrFlg == cmp.mipTrFlg && mRefId == cmp.mRefId && ispMod == cmp.ispMod && modeId == cmp.modeId && tmpFlag == cmp.tmpFlag); } #else ModeInfo() : mipFlg(false), mipTrFlg(false), mRefId(0), ispMod(NOT_INTRA_SUBPARTITIONS), modeId(0) {} ModeInfo(const bool mipf, const bool miptf, const int mrid, const uint8_t ispm, const uint32_t mode) : mipFlg(mipf), mipTrFlg(miptf), mRefId(mrid), ispMod(ispm), modeId(mode) {} @@ -238,9 +237,9 @@ private: { double rdCost; ModeInfoWithCost() : ModeInfo(), rdCost(MAX_DOUBLE) {} -#if IDCC_TPM_JEM - ModeInfoWithCost(const 
bool mipf, const bool miptf, const int mrid, const uint8_t ispm, const uint32_t mode, const bool tpmf, double cost) : ModeInfo(mipf, miptf, mrid, ispm, mode, tpmf), rdCost(cost) {} - bool operator==(const ModeInfoWithCost cmp) const { return (mipFlg == cmp.mipFlg && mipTrFlg == cmp.mipTrFlg && mRefId == cmp.mRefId && ispMod == cmp.ispMod && modeId == cmp.modeId && tpmFlg == cmp.tpmFlg && rdCost == cmp.rdCost); } +#if JVET_V0130_INTRA_TMP + ModeInfoWithCost(const bool mipf, const bool miptf, const int mrid, const uint8_t ispm, const uint32_t mode, const bool tpmf, double cost) : ModeInfo(mipf, miptf, mrid, ispm, mode, tpmf), rdCost(cost) {} + bool operator==(const ModeInfoWithCost cmp) const { return (mipFlg == cmp.mipFlg && mipTrFlg == cmp.mipTrFlg && mRefId == cmp.mRefId && ispMod == cmp.ispMod && modeId == cmp.modeId && tmpFlag == cmp.tmpFlag && rdCost == cmp.rdCost); } #else ModeInfoWithCost(const bool mipf, const bool miptf, const int mrid, const uint8_t ispm, const uint32_t mode, double cost) : ModeInfo(mipf, miptf, mrid, ispm, mode), rdCost(cost) {} bool operator==(const ModeInfoWithCost cmp) const { return (mipFlg == cmp.mipFlg && mipTrFlg == cmp.mipTrFlg && mRefId == cmp.mRefId && ispMod == cmp.ispMod && modeId == cmp.modeId && rdCost == cmp.rdCost); } diff --git a/source/Lib/EncoderLib/VLCWriter.cpp b/source/Lib/EncoderLib/VLCWriter.cpp index 4d123fa1f371870762637556c002c8b8fdde1232..cec7da12760889e3721451dbd17fe3b50b56380e 100644 --- a/source/Lib/EncoderLib/VLCWriter.cpp +++ b/source/Lib/EncoderLib/VLCWriter.cpp @@ -1384,11 +1384,11 @@ void HLSWriter::codeSPS( const SPS* pcSPS ) #if ENABLE_DIMD WRITE_FLAG( pcSPS->getUseDimd() ? 1 : 0, "sps_dimd_enabled_flag"); #endif -#if IDCC_TPM_JEM +#if JVET_V0130_INTRA_TMP WRITE_FLAG( pcSPS->getUseIntraTMP() ? 
1 : 0, "sps_intraTMP_enabled_flag"); if(pcSPS->getUseIntraTMP()) { - WRITE_UVLC(floorLog2(pcSPS->getIntraTMPMaxSize()), "sps_log2_intra_tmp_max_size"); + WRITE_UVLC(floorLog2(pcSPS->getIntraTMPMaxSize()), "sps_log2_intra_tmp_max_size"); } #endif