diff --git a/source/Lib/CommonLib/InterpolationFilter.cpp b/source/Lib/CommonLib/InterpolationFilter.cpp index 9ff4a501217b7a83e4d85589292e0633d1078b6d..1c7ea90e2db56433da449942c32d0715aecb02b3 100644 --- a/source/Lib/CommonLib/InterpolationFilter.cpp +++ b/source/Lib/CommonLib/InterpolationFilter.cpp @@ -698,14 +698,11 @@ void InterpolationFilter::xWeightedTriangleBlk( const PredictionUnit &pu, const const int32_t shiftWeighted = std::max<int>(2, (IF_INTERNAL_PREC - clipbd)) + log2WeightBase; const int32_t offsetWeighted = (1 << (shiftWeighted - 1)) + (IF_INTERNAL_OFFS << log2WeightBase); #if JVET_P0530_TPM_WEIGHT_ALIGN - int32_t logSubWidthC = getChannelTypeScaleX(CHANNEL_TYPE_CHROMA, pu.chromaFormat); - int32_t logSubHeightC = getChannelTypeScaleY(CHANNEL_TYPE_CHROMA, pu.chromaFormat); + int32_t stepX = 1 << getComponentScaleX(compIdx, pu.chromaFormat); + int32_t stepY = 1 << getComponentScaleY(compIdx, pu.chromaFormat); - int32_t stepX = (compIdx == 0) ? 1 : (1 << logSubWidthC); - int32_t stepY = (compIdx == 0) ? 1 : (1 << logSubHeightC); - - int32_t widthY = (compIdx == 0) ? width : (width << logSubWidthC); - int32_t heightY = (compIdx == 0) ? height : (height << logSubHeightC); + int32_t widthY = width << getComponentScaleX(compIdx, pu.chromaFormat); + int32_t heightY = height << getComponentScaleY(compIdx, pu.chromaFormat); int32_t ratioWH = (widthY > heightY) ? (widthY / heightY) : 1; int32_t ratioHW = (widthY > heightY) ? 1 : (heightY / widthY); @@ -737,10 +734,8 @@ void InterpolationFilter::xWeightedTriangleBlk( const PredictionUnit &pu, const } for (tmpY = ratioHW; tmpY > 0; tmpY -= stepY) { - for (x = 0; x < weightedStartPos; x++) + for (x = 0; x < weightedStartPos; x += stepX) { - if (x % stepX != 0) - continue; #else for( y = 0; y < height; y+= ratioHW ) { @@ -789,10 +784,9 @@ void InterpolationFilter::xWeightedTriangleBlk( const PredictionUnit &pu, const } #if JVET_P0530_TPM_WEIGHT_ALIGN - for (x = weightedEndPos + 1; x < widthY; x++) + int32_t start = ((weightedEndPos + 1) % stepX != 0) ? (weightedEndPos + 2) : (weightedEndPos + 1); + for (x = start; x < widthY; x += stepX) { - if (x % stepX != 0) - continue; #else for( x = weightedEndPos + 1; x < width; x++ ) { diff --git a/source/Lib/CommonLib/Rom.cpp b/source/Lib/CommonLib/Rom.cpp index dd7a0331ba42f4b591f9bc5e9d74e2bbf5491c84..38c8963c9dcc270c97ae3f7aa14f784d34665967 100644 --- a/source/Lib/CommonLib/Rom.cpp +++ b/source/Lib/CommonLib/Rom.cpp @@ -430,33 +430,25 @@ void initROM() const int nCbR = (nCbW > nCbH) ? nCbW / nCbH : nCbH / nCbW; // let SIMD can read at least 64-bit when at last row +#if JVET_P0530_TPM_WEIGHT_ALIGN + g_triangleWeights[0][idxH][idxW] = new int16_t[nCbH * nCbW + 4]; + g_triangleWeights[1][idxH][idxW] = new int16_t[nCbH * nCbW + 4]; +#else g_triangleWeights[0][0][idxH][idxW] = new int16_t[nCbH * nCbW + 4]; g_triangleWeights[0][1][idxH][idxW] = new int16_t[nCbH * nCbW + 4]; g_triangleWeights[1][0][idxH][idxW] = new int16_t[nCbH * nCbW + 4]; g_triangleWeights[1][1][idxH][idxW] = new int16_t[nCbH * nCbW + 4]; -#if JVET_P0530_TPM_WEIGHT_ALIGN - g_triangleWeights[2][0][idxH][idxW] = new int16_t[nCbH * nCbW + 4]; - g_triangleWeights[2][1][idxH][idxW] = new int16_t[nCbH * nCbW + 4]; - g_triangleWeights[3][0][idxH][idxW] = new int16_t[nCbH * nCbW + 4]; - g_triangleWeights[3][1][idxH][idxW] = new int16_t[nCbH * nCbW + 4]; - int nCbR_422 = (nCbW * 2 > nCbH) ? (nCbW * 2) / nCbH : nCbH / (nCbW * 2); #endif for (int y = 0; y < nCbH; y++) { for (int x = 0; x < nCbW; x++) { - g_triangleWeights[0][0][idxH][idxW][y*nCbW + x] = (nCbW > nCbH) ? Clip3(0, 8, x / nCbR - y + 4) : Clip3(0, 8, x - y / nCbR + 4); - g_triangleWeights[0][1][idxH][idxW][y*nCbW + x] = (nCbW > nCbH) ? Clip3(0, 8, nCbH - 1 - x / nCbR - y + 4) : Clip3(0, 8, nCbW - 1 - x - y / nCbR + 4); #if JVET_P0530_TPM_WEIGHT_ALIGN - g_triangleWeights[CHROMA_420][0][idxH][idxW][y*nCbW + x] = (nCbW > nCbH) ? Clip3(0, 8, (x * 2) / nCbR - (y * 2) + 4) : Clip3(0, 8, (x * 2) - (y * 2) / nCbR + 4); - g_triangleWeights[CHROMA_420][1][idxH][idxW][y*nCbW + x] = (nCbW > nCbH) ? Clip3(0, 8, nCbH * 2 - 1 - (x * 2) / nCbR - (y * 2) + 4) : Clip3(0, 8, nCbW * 2 - 1 - (x * 2) - (y * 2) / nCbR + 4); - - g_triangleWeights[CHROMA_422][0][idxH][idxW][y*nCbW + x] = (nCbW * 2 > nCbH) ? Clip3(0, 8, (x * 2) / nCbR_422 - y + 4) : Clip3(0, 8, (x * 2) - y / nCbR_422 + 4); - g_triangleWeights[CHROMA_422][1][idxH][idxW][y*nCbW + x] = (nCbW * 2 > nCbH) ? Clip3(0, 8, nCbH - 1 - (x * 2) / nCbR_422 - y + 4) : Clip3(0, 8, nCbW * 2 - 1 - (x * 2) - y / nCbR_422 + 4); - - g_triangleWeights[CHROMA_444][0][idxH][idxW][y*nCbW + x] = g_triangleWeights[0][0][idxH][idxW][y*nCbW + x]; - g_triangleWeights[CHROMA_444][1][idxH][idxW][y*nCbW + x] = g_triangleWeights[0][1][idxH][idxW][y*nCbW + x]; + g_triangleWeights[0][idxH][idxW][y*nCbW + x] = (nCbW > nCbH) ? Clip3(0, 8, x / nCbR - y + 4) : Clip3(0, 8, x - y / nCbR + 4); + g_triangleWeights[1][idxH][idxW][y*nCbW + x] = (nCbW > nCbH) ? Clip3(0, 8, nCbH - 1 - x / nCbR - y + 4) : Clip3(0, 8, nCbW - 1 - x - y / nCbR + 4); #else + g_triangleWeights[0][0][idxH][idxW][y*nCbW + x] = (nCbW > nCbH) ? Clip3(0, 8, x / nCbR - y + 4) : Clip3(0, 8, x - y / nCbR + 4); + g_triangleWeights[0][1][idxH][idxW][y*nCbW + x] = (nCbW > nCbH) ? Clip3(0, 8, nCbH - 1 - x / nCbR - y + 4) : Clip3(0, 8, nCbW - 1 - x - y / nCbR + 4); g_triangleWeights[1][0][idxH][idxW][y*nCbW + x] = (nCbW > nCbH) ? Clip3(0, 4, x / nCbR - y + 2) * 2 : Clip3(0, 4, x - y / nCbR + 2) * 2; g_triangleWeights[1][1][idxH][idxW][y*nCbW + x] = (nCbW > nCbH) ? Clip3(0, 4, nCbH - 1 - x / nCbR - y + 2) * 2 : Clip3(0, 4, nCbW - 1 - x - y / nCbR + 2) * 2; #endif @@ -495,6 +487,12 @@ void destroyROM() { for (int idxW = 0; idxW < MAX_CU_DEPTH - MIN_CU_LOG2 + 2; ++idxW) { +#if JVET_P0530_TPM_WEIGHT_ALIGN + delete[] g_triangleWeights[0][idxH][idxW]; + delete[] g_triangleWeights[1][idxH][idxW]; + g_triangleWeights[0][idxH][idxW] = nullptr; + g_triangleWeights[1][idxH][idxW] = nullptr; +#else delete[] g_triangleWeights[0][0][idxH][idxW]; delete[] g_triangleWeights[0][1][idxH][idxW]; delete[] g_triangleWeights[1][0][idxH][idxW]; @@ -503,15 +501,6 @@ void destroyROM() g_triangleWeights[0][1][idxH][idxW] = nullptr; g_triangleWeights[1][0][idxH][idxW] = nullptr; g_triangleWeights[1][1][idxH][idxW] = nullptr; -#if JVET_P0530_TPM_WEIGHT_ALIGN - delete[] g_triangleWeights[2][0][idxH][idxW]; - delete[] g_triangleWeights[2][1][idxH][idxW]; - delete[] g_triangleWeights[3][0][idxH][idxW]; - delete[] g_triangleWeights[3][1][idxH][idxW]; - g_triangleWeights[2][0][idxH][idxW] = nullptr; - g_triangleWeights[2][1][idxH][idxW] = nullptr; - g_triangleWeights[3][0][idxH][idxW] = nullptr; - g_triangleWeights[3][1][idxH][idxW] = nullptr; #endif } } @@ -756,7 +745,7 @@ const uint32_t g_scalingListSizeX[SCALING_LIST_SIZE_NUM] = { 1, 2, 4, 8, 16, uint8_t g_triangleMvStorage[TRIANGLE_DIR_NUM][MAX_CU_DEPTH - MIN_CU_LOG2 + 1][MAX_CU_DEPTH - MIN_CU_LOG2 + 1][MAX_CU_SIZE >> MIN_CU_LOG2][MAX_CU_SIZE >> MIN_CU_LOG2]; #if JVET_P0530_TPM_WEIGHT_ALIGN -int16_t *g_triangleWeights[4][TRIANGLE_DIR_NUM][MAX_CU_DEPTH - MIN_CU_LOG2 + 2][MAX_CU_DEPTH - MIN_CU_LOG2 + 2]; +int16_t *g_triangleWeights[TRIANGLE_DIR_NUM][MAX_CU_DEPTH - MIN_CU_LOG2 + 2][MAX_CU_DEPTH - MIN_CU_LOG2 + 2]; #else int16_t *g_triangleWeights[2][TRIANGLE_DIR_NUM][MAX_CU_DEPTH - MIN_CU_LOG2 + 2][MAX_CU_DEPTH - MIN_CU_LOG2 + 2]; #endif diff --git a/source/Lib/CommonLib/Rom.h b/source/Lib/CommonLib/Rom.h index af8b3d519d7458e1facf71320552afec5183a2e3..fe144b2dfe8dde685fb8d40bf447019d4dc3b66b 100644 --- a/source/Lib/CommonLib/Rom.h +++ b/source/Lib/CommonLib/Rom.h @@ -212,7 +212,7 @@ constexpr uint8_t g_tbMax[257] = { 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, extern uint8_t g_triangleMvStorage[TRIANGLE_DIR_NUM][MAX_CU_DEPTH - MIN_CU_LOG2 + 1][MAX_CU_DEPTH - MIN_CU_LOG2 + 1][MAX_CU_SIZE >> MIN_CU_LOG2][MAX_CU_SIZE >> MIN_CU_LOG2]; // 7-tap/3-tap, direction, 2/4/8/16/32/64/128 #if JVET_P0530_TPM_WEIGHT_ALIGN -extern int16_t *g_triangleWeights[4][TRIANGLE_DIR_NUM][MAX_CU_DEPTH - MIN_CU_LOG2 + 2][MAX_CU_DEPTH - MIN_CU_LOG2 + 2]; +extern int16_t *g_triangleWeights[TRIANGLE_DIR_NUM][MAX_CU_DEPTH - MIN_CU_LOG2 + 2][MAX_CU_DEPTH - MIN_CU_LOG2 + 2]; #else extern int16_t *g_triangleWeights[2][TRIANGLE_DIR_NUM][MAX_CU_DEPTH - MIN_CU_LOG2 + 2][MAX_CU_DEPTH - MIN_CU_LOG2 + 2]; #endif diff --git a/source/Lib/CommonLib/x86/InterpolationFilterX86.h b/source/Lib/CommonLib/x86/InterpolationFilterX86.h index d78580bbc1a7a6f34080625b7c7030a03276df20..7d61dab311e66b6cd8f9a12a5d10cfd264ef4dd5 100644 --- a/source/Lib/CommonLib/x86/InterpolationFilterX86.h +++ b/source/Lib/CommonLib/x86/InterpolationFilterX86.h @@ -1227,15 +1227,22 @@ void xWeightedTriangleBlk_SSE(const PredictionUnit &pu, const uint32_t width, co int32_t strideSrc0 = predSrc0.get(compIdx).stride; int32_t strideSrc1 = predSrc1.get(compIdx).stride; +#if JVET_P0530_TPM_WEIGHT_ALIGN + int32_t chromaScaleX = getComponentScaleX(compIdx, pu.chromaFormat); + int32_t chromaScaleY = getComponentScaleY(compIdx, pu.chromaFormat); + int8_t log2WidthY = floorLog2(width << chromaScaleX) - 1; + int8_t log2HeightY = floorLog2(height << chromaScaleY) - 1; +#else int8_t log2Width = floorLog2(width) - 1; int8_t log2Height = floorLog2(height) - 1; +#endif const char log2WeightBase = 3; const ClpRng clpRng = pu.cu->slice->clpRngs().comp[compIdx]; const int32_t shiftWeighted = std::max<int>(2, (IF_INTERNAL_PREC - clpRng.bd)) + log2WeightBase; const int32_t offsetWeighted = (1 << (shiftWeighted - 1)) + (IF_INTERNAL_OFFS << log2WeightBase); #if JVET_P0530_TPM_WEIGHT_ALIGN - int wIdx = (compIdx == COMPONENT_Y) ? 0 : pu.cs->sps->getChromaFormatIdc(); - int16_t *weight = g_triangleWeights[wIdx][splitDir][log2Height][log2Width]; + int16_t *weight = g_triangleWeights[splitDir][log2HeightY][log2WidthY]; + int16_t stepY = width << (chromaScaleX + chromaScaleY); #else const bool longWeight = (compIdx == COMPONENT_Y); const bool shortWeight = !longWeight; @@ -1251,13 +1258,23 @@ void xWeightedTriangleBlk_SSE(const PredictionUnit &pu, const uint32_t width, co if (width == 2) { +#if JVET_P0530_TPM_WEIGHT_ALIGN + const __m128i mask = _mm_set_epi16( (short) 0x8080, (short) 0x8080, (short) 0x8080, (short) 0x8080, (short) 0x8080, (short) 0x8080, 0x0504, 0x0100 ); +#endif for (int y = 0; y < height; y++) { __m128i s0 = _mm_cvtsi32_si128(*(uint32_t *) src0); __m128i s1 = _mm_cvtsi32_si128(*(uint32_t *) src1); - __m128i w0 = _mm_cvtsi32_si128(*(uint32_t *) weight); +#if JVET_P0530_TPM_WEIGHT_ALIGN + __m128i w0 = _mm_loadl_epi64((__m128i *) (weight)); + if (chromaScaleX == 1) + { + w0 = _mm_shuffle_epi8(w0, mask); + } +#else + __m128i w0 = _mm_cvtsi32_si128(*(uint32_t *)weight); +#endif __m128i w1 = _mm_sub_epi16(mmEight, w0); - s0 = _mm_unpacklo_epi16(s0, s1); w0 = _mm_unpacklo_epi16(w0, w1); s0 = _mm_add_epi32(_mm_madd_epi16(s0, w0), mmOffset); @@ -1269,16 +1286,31 @@ void xWeightedTriangleBlk_SSE(const PredictionUnit &pu, const uint32_t width, co dst += strideDst; src0 += strideSrc0; src1 += strideSrc1; +#if JVET_P0530_TPM_WEIGHT_ALIGN + weight += stepY; +#else weight += 2; +#endif } } else if(width == 4) { +#if JVET_P0530_TPM_WEIGHT_ALIGN + const __m128i mask = _mm_set_epi16( (short) 0x8080, (short) 0x8080, (short) 0x8080, (short) 0x8080, 0x0D0C, 0x0908, 0x0504, 0x0100 ); +#endif for (int y = 0; y < height; y++) { __m128i s0 = _mm_loadl_epi64((__m128i *) (src0)); __m128i s1 = _mm_loadl_epi64((__m128i *) (src1)); +#if JVET_P0530_TPM_WEIGHT_ALIGN + __m128i w0 = _mm_loadu_si128((__m128i *) (weight)); + if (chromaScaleX == 1) + { + w0 = _mm_shuffle_epi8(w0, mask); + } +#else __m128i w0 = _mm_loadl_epi64((__m128i *) (weight)); +#endif __m128i w1 = _mm_sub_epi16(mmEight, w0); s0 = _mm_unpacklo_epi16(s0, s1); w0 = _mm_unpacklo_epi16(w0, w1); @@ -1290,18 +1322,38 @@ void xWeightedTriangleBlk_SSE(const PredictionUnit &pu, const uint32_t width, co dst += strideDst; src0 += strideSrc0; src1 += strideSrc1; +#if JVET_P0530_TPM_WEIGHT_ALIGN + weight += stepY; +#else weight += 4; +#endif } } else { +#if JVET_P0530_TPM_WEIGHT_ALIGN + const __m128i mask1 = _mm_set_epi16( 0x0D0C, 0x0908, 0x0504, 0x0100, (short) 0x8080, (short) 0x8080, (short) 0x8080, (short) 0x8080 ); + const __m128i mask2 = _mm_set_epi16( (short) 0x8080, (short) 0x8080, (short) 0x8080, (short) 0x8080, 0x0D0C, 0x0908, 0x0504, 0x0100 ); +#endif for (int y = 0; y < height; y++) { for (int x = 0; x < width; x += 8) { __m128i s0 = _mm_loadu_si128((__m128i *) (src0 + x)); __m128i s1 = _mm_loadu_si128((__m128i *) (src1 + x)); + +#if JVET_P0530_TPM_WEIGHT_ALIGN + __m128i w0 = _mm_loadu_si128((__m128i *) (weight + (x << chromaScaleX))); + if (chromaScaleX == 1) + { + __m128i w01 = _mm_loadu_si128((__m128i *) (weight + (x << chromaScaleX) + 8)); + w0 = _mm_shuffle_epi8(w0, mask1); + w01 = _mm_shuffle_epi8(w01, mask2); + w0 = _mm_alignr_epi8(w01, w0, 8); + } +#else __m128i w0 = _mm_loadu_si128((__m128i *) (weight + x)); +#endif __m128i w1 = _mm_sub_epi16(mmEight, w0); __m128i s0tmp = _mm_unpacklo_epi16(s0, s1); @@ -1321,7 +1373,11 @@ void xWeightedTriangleBlk_SSE(const PredictionUnit &pu, const uint32_t width, co dst += strideDst; src0 += strideSrc0; src1 += strideSrc1; +#if JVET_P0530_TPM_WEIGHT_ALIGN + weight += stepY; +#else weight += width; +#endif } } }