From 7bcd59fde7f9c10acdc8c95f397dd0f0e10e877a Mon Sep 17 00:00:00 2001
From: zhipin <zhipin.deng@bytedance.com>
Date: Fri, 25 Oct 2019 14:55:49 +0800
Subject: [PATCH] Clean up the code; results are a bit-exact match with the previous submission

---
 source/Lib/CommonLib/InterpolationFilter.cpp  | 20 ++----
 source/Lib/CommonLib/Rom.cpp                  | 41 +++++-------
 source/Lib/CommonLib/Rom.h                    |  2 +-
 .../CommonLib/x86/InterpolationFilterX86.h    | 64 +++++++++++++++++--
 4 files changed, 83 insertions(+), 44 deletions(-)

diff --git a/source/Lib/CommonLib/InterpolationFilter.cpp b/source/Lib/CommonLib/InterpolationFilter.cpp
index 9ff4a5012..1c7ea90e2 100644
--- a/source/Lib/CommonLib/InterpolationFilter.cpp
+++ b/source/Lib/CommonLib/InterpolationFilter.cpp
@@ -698,14 +698,11 @@ void InterpolationFilter::xWeightedTriangleBlk( const PredictionUnit &pu, const
   const int32_t shiftWeighted     = std::max<int>(2, (IF_INTERNAL_PREC - clipbd)) + log2WeightBase;
   const int32_t offsetWeighted    = (1 << (shiftWeighted - 1)) + (IF_INTERNAL_OFFS << log2WeightBase);
 #if JVET_P0530_TPM_WEIGHT_ALIGN
-  int32_t logSubWidthC = getChannelTypeScaleX(CHANNEL_TYPE_CHROMA, pu.chromaFormat);
-  int32_t logSubHeightC = getChannelTypeScaleY(CHANNEL_TYPE_CHROMA, pu.chromaFormat);
+  int32_t stepX = 1 << getComponentScaleX(compIdx, pu.chromaFormat);
+  int32_t stepY = 1 << getComponentScaleY(compIdx, pu.chromaFormat);
 
-  int32_t stepX = (compIdx == 0) ? 1 : (1 << logSubWidthC);
-  int32_t stepY = (compIdx == 0) ? 1 : (1 << logSubHeightC);
-
-  int32_t widthY = (compIdx == 0) ? width : (width << logSubWidthC);
-  int32_t heightY = (compIdx == 0) ? height : (height << logSubHeightC);
+  int32_t widthY = width << getComponentScaleX(compIdx, pu.chromaFormat);
+  int32_t heightY = height << getComponentScaleY(compIdx, pu.chromaFormat);
 
   int32_t ratioWH = (widthY > heightY) ? (widthY / heightY) : 1;
   int32_t ratioHW = (widthY > heightY) ? 1 : (heightY / widthY);
@@ -737,10 +734,8 @@ void InterpolationFilter::xWeightedTriangleBlk( const PredictionUnit &pu, const
     }
     for (tmpY = ratioHW; tmpY > 0; tmpY -= stepY)
     {
-      for (x = 0; x < weightedStartPos; x++)
+      for (x = 0; x < weightedStartPos; x += stepX)
       {
-        if (x % stepX != 0)
-          continue;
 #else
   for( y = 0; y < height; y+= ratioHW )
   {
@@ -789,10 +784,9 @@ void InterpolationFilter::xWeightedTriangleBlk( const PredictionUnit &pu, const
       }
 
 #if JVET_P0530_TPM_WEIGHT_ALIGN
-      for (x = weightedEndPos + 1; x < widthY; x++)
+      int32_t start = ((weightedEndPos + 1) % stepX != 0) ? (weightedEndPos + 2) : (weightedEndPos + 1);
+      for (x = start; x < widthY; x += stepX)
       {
-        if (x % stepX != 0)
-          continue;
 #else
       for( x = weightedEndPos + 1; x < width; x++ )
       {
diff --git a/source/Lib/CommonLib/Rom.cpp b/source/Lib/CommonLib/Rom.cpp
index dd7a0331b..38c8963c9 100644
--- a/source/Lib/CommonLib/Rom.cpp
+++ b/source/Lib/CommonLib/Rom.cpp
@@ -430,33 +430,25 @@ void initROM()
       const int nCbR = (nCbW > nCbH) ? nCbW / nCbH : nCbH / nCbW;
 
       // let SIMD can read at least 64-bit when at last row
+#if JVET_P0530_TPM_WEIGHT_ALIGN
+      g_triangleWeights[0][idxH][idxW] = new int16_t[nCbH * nCbW + 4];
+      g_triangleWeights[1][idxH][idxW] = new int16_t[nCbH * nCbW + 4];
+#else
       g_triangleWeights[0][0][idxH][idxW] = new int16_t[nCbH * nCbW + 4];
       g_triangleWeights[0][1][idxH][idxW] = new int16_t[nCbH * nCbW + 4];
       g_triangleWeights[1][0][idxH][idxW] = new int16_t[nCbH * nCbW + 4];
       g_triangleWeights[1][1][idxH][idxW] = new int16_t[nCbH * nCbW + 4];
-#if JVET_P0530_TPM_WEIGHT_ALIGN
-      g_triangleWeights[2][0][idxH][idxW] = new int16_t[nCbH * nCbW + 4];
-      g_triangleWeights[2][1][idxH][idxW] = new int16_t[nCbH * nCbW + 4];
-      g_triangleWeights[3][0][idxH][idxW] = new int16_t[nCbH * nCbW + 4];
-      g_triangleWeights[3][1][idxH][idxW] = new int16_t[nCbH * nCbW + 4];
-      int nCbR_422 = (nCbW * 2 > nCbH) ? (nCbW * 2) / nCbH : nCbH / (nCbW * 2);
 #endif
       for (int y = 0; y < nCbH; y++)
       {
         for (int x = 0; x < nCbW; x++)
         {
-          g_triangleWeights[0][0][idxH][idxW][y*nCbW + x] = (nCbW > nCbH) ? Clip3(0, 8, x / nCbR - y + 4) : Clip3(0, 8, x - y / nCbR + 4);
-          g_triangleWeights[0][1][idxH][idxW][y*nCbW + x] = (nCbW > nCbH) ? Clip3(0, 8, nCbH - 1 - x / nCbR - y + 4) : Clip3(0, 8, nCbW - 1 - x - y / nCbR + 4);
 #if JVET_P0530_TPM_WEIGHT_ALIGN
-          g_triangleWeights[CHROMA_420][0][idxH][idxW][y*nCbW + x] = (nCbW > nCbH) ? Clip3(0, 8, (x * 2) / nCbR - (y * 2) + 4) : Clip3(0, 8, (x * 2) - (y * 2) / nCbR + 4);
-          g_triangleWeights[CHROMA_420][1][idxH][idxW][y*nCbW + x] = (nCbW > nCbH) ? Clip3(0, 8, nCbH * 2 - 1 - (x * 2) / nCbR - (y * 2) + 4) : Clip3(0, 8, nCbW * 2 - 1 - (x * 2) - (y * 2) / nCbR + 4);
-
-          g_triangleWeights[CHROMA_422][0][idxH][idxW][y*nCbW + x] = (nCbW * 2 > nCbH) ? Clip3(0, 8, (x * 2) / nCbR_422 - y + 4) : Clip3(0, 8, (x * 2) - y / nCbR_422 + 4);
-          g_triangleWeights[CHROMA_422][1][idxH][idxW][y*nCbW + x] = (nCbW * 2 > nCbH) ? Clip3(0, 8, nCbH - 1 - (x * 2) / nCbR_422 - y + 4) : Clip3(0, 8, nCbW * 2 - 1 - (x * 2) - y / nCbR_422 + 4);
-
-          g_triangleWeights[CHROMA_444][0][idxH][idxW][y*nCbW + x] = g_triangleWeights[0][0][idxH][idxW][y*nCbW + x];
-          g_triangleWeights[CHROMA_444][1][idxH][idxW][y*nCbW + x] = g_triangleWeights[0][1][idxH][idxW][y*nCbW + x];
+          g_triangleWeights[0][idxH][idxW][y*nCbW + x] = (nCbW > nCbH) ? Clip3(0, 8, x / nCbR - y + 4) : Clip3(0, 8, x - y / nCbR + 4);
+          g_triangleWeights[1][idxH][idxW][y*nCbW + x] = (nCbW > nCbH) ? Clip3(0, 8, nCbH - 1 - x / nCbR - y + 4) : Clip3(0, 8, nCbW - 1 - x - y / nCbR + 4);
 #else
+          g_triangleWeights[0][0][idxH][idxW][y*nCbW + x] = (nCbW > nCbH) ? Clip3(0, 8, x / nCbR - y + 4) : Clip3(0, 8, x - y / nCbR + 4);
+          g_triangleWeights[0][1][idxH][idxW][y*nCbW + x] = (nCbW > nCbH) ? Clip3(0, 8, nCbH - 1 - x / nCbR - y + 4) : Clip3(0, 8, nCbW - 1 - x - y / nCbR + 4);
           g_triangleWeights[1][0][idxH][idxW][y*nCbW + x] = (nCbW > nCbH) ? Clip3(0, 4, x / nCbR - y + 2) * 2 : Clip3(0, 4, x - y / nCbR + 2) * 2;
           g_triangleWeights[1][1][idxH][idxW][y*nCbW + x] = (nCbW > nCbH) ? Clip3(0, 4, nCbH - 1 - x / nCbR - y + 2) * 2 : Clip3(0, 4, nCbW - 1 - x - y / nCbR + 2) * 2;
 #endif
@@ -495,6 +487,12 @@ void destroyROM()
   {
     for (int idxW = 0; idxW < MAX_CU_DEPTH - MIN_CU_LOG2 + 2; ++idxW)
     {
+#if JVET_P0530_TPM_WEIGHT_ALIGN
+      delete[] g_triangleWeights[0][idxH][idxW];
+      delete[] g_triangleWeights[1][idxH][idxW];
+      g_triangleWeights[0][idxH][idxW] = nullptr;
+      g_triangleWeights[1][idxH][idxW] = nullptr;
+#else
       delete[] g_triangleWeights[0][0][idxH][idxW];
       delete[] g_triangleWeights[0][1][idxH][idxW];
       delete[] g_triangleWeights[1][0][idxH][idxW];
@@ -503,15 +501,6 @@ void destroyROM()
       g_triangleWeights[0][1][idxH][idxW] = nullptr;
       g_triangleWeights[1][0][idxH][idxW] = nullptr;
       g_triangleWeights[1][1][idxH][idxW] = nullptr;
-#if JVET_P0530_TPM_WEIGHT_ALIGN
-      delete[] g_triangleWeights[2][0][idxH][idxW];
-      delete[] g_triangleWeights[2][1][idxH][idxW];
-      delete[] g_triangleWeights[3][0][idxH][idxW];
-      delete[] g_triangleWeights[3][1][idxH][idxW];
-      g_triangleWeights[2][0][idxH][idxW] = nullptr;
-      g_triangleWeights[2][1][idxH][idxW] = nullptr;
-      g_triangleWeights[3][0][idxH][idxW] = nullptr;
-      g_triangleWeights[3][1][idxH][idxW] = nullptr;
 #endif
     }
   }
@@ -756,7 +745,7 @@ const uint32_t g_scalingListSizeX[SCALING_LIST_SIZE_NUM] = { 1, 2,  4,  8,  16,
 
 uint8_t g_triangleMvStorage[TRIANGLE_DIR_NUM][MAX_CU_DEPTH - MIN_CU_LOG2 + 1][MAX_CU_DEPTH - MIN_CU_LOG2 + 1][MAX_CU_SIZE >> MIN_CU_LOG2][MAX_CU_SIZE >> MIN_CU_LOG2];
 #if JVET_P0530_TPM_WEIGHT_ALIGN
-int16_t *g_triangleWeights[4][TRIANGLE_DIR_NUM][MAX_CU_DEPTH - MIN_CU_LOG2 + 2][MAX_CU_DEPTH - MIN_CU_LOG2 + 2];
+int16_t *g_triangleWeights[TRIANGLE_DIR_NUM][MAX_CU_DEPTH - MIN_CU_LOG2 + 2][MAX_CU_DEPTH - MIN_CU_LOG2 + 2];
 #else
 int16_t *g_triangleWeights[2][TRIANGLE_DIR_NUM][MAX_CU_DEPTH - MIN_CU_LOG2 + 2][MAX_CU_DEPTH - MIN_CU_LOG2 + 2];
 #endif
diff --git a/source/Lib/CommonLib/Rom.h b/source/Lib/CommonLib/Rom.h
index af8b3d519..fe144b2df 100644
--- a/source/Lib/CommonLib/Rom.h
+++ b/source/Lib/CommonLib/Rom.h
@@ -212,7 +212,7 @@ constexpr uint8_t g_tbMax[257] = { 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
 extern       uint8_t g_triangleMvStorage[TRIANGLE_DIR_NUM][MAX_CU_DEPTH - MIN_CU_LOG2 + 1][MAX_CU_DEPTH - MIN_CU_LOG2 + 1][MAX_CU_SIZE >> MIN_CU_LOG2][MAX_CU_SIZE >> MIN_CU_LOG2];
 // 7-tap/3-tap, direction, 2/4/8/16/32/64/128
 #if JVET_P0530_TPM_WEIGHT_ALIGN
-extern int16_t *g_triangleWeights[4][TRIANGLE_DIR_NUM][MAX_CU_DEPTH - MIN_CU_LOG2 + 2][MAX_CU_DEPTH - MIN_CU_LOG2 + 2];
+extern int16_t *g_triangleWeights[TRIANGLE_DIR_NUM][MAX_CU_DEPTH - MIN_CU_LOG2 + 2][MAX_CU_DEPTH - MIN_CU_LOG2 + 2];
 #else
 extern int16_t *g_triangleWeights[2][TRIANGLE_DIR_NUM][MAX_CU_DEPTH - MIN_CU_LOG2 + 2][MAX_CU_DEPTH - MIN_CU_LOG2 + 2];
 #endif
diff --git a/source/Lib/CommonLib/x86/InterpolationFilterX86.h b/source/Lib/CommonLib/x86/InterpolationFilterX86.h
index d78580bbc..7d61dab31 100644
--- a/source/Lib/CommonLib/x86/InterpolationFilterX86.h
+++ b/source/Lib/CommonLib/x86/InterpolationFilterX86.h
@@ -1227,15 +1227,22 @@ void xWeightedTriangleBlk_SSE(const PredictionUnit &pu, const uint32_t width, co
   int32_t strideSrc0 = predSrc0.get(compIdx).stride;
   int32_t strideSrc1 = predSrc1.get(compIdx).stride;
 
+#if JVET_P0530_TPM_WEIGHT_ALIGN
+  int32_t chromaScaleX = getComponentScaleX(compIdx, pu.chromaFormat);
+  int32_t chromaScaleY = getComponentScaleY(compIdx, pu.chromaFormat);
+  int8_t log2WidthY = floorLog2(width << chromaScaleX) - 1;
+  int8_t log2HeightY = floorLog2(height << chromaScaleY) - 1;
+#else
   int8_t log2Width = floorLog2(width) - 1;
   int8_t log2Height = floorLog2(height) - 1;
+#endif
   const char    log2WeightBase = 3;
   const ClpRng  clpRng = pu.cu->slice->clpRngs().comp[compIdx];
   const int32_t shiftWeighted = std::max<int>(2, (IF_INTERNAL_PREC - clpRng.bd)) + log2WeightBase;
   const int32_t offsetWeighted = (1 << (shiftWeighted - 1)) + (IF_INTERNAL_OFFS << log2WeightBase);
 #if JVET_P0530_TPM_WEIGHT_ALIGN
-  int wIdx = (compIdx == COMPONENT_Y) ? 0 : pu.cs->sps->getChromaFormatIdc();
-  int16_t *weight = g_triangleWeights[wIdx][splitDir][log2Height][log2Width];
+  int16_t *weight = g_triangleWeights[splitDir][log2HeightY][log2WidthY];
+  int16_t stepY = width << (chromaScaleX + chromaScaleY);
 #else
   const bool    longWeight = (compIdx == COMPONENT_Y);
   const bool    shortWeight = !longWeight;
@@ -1251,13 +1258,23 @@ void xWeightedTriangleBlk_SSE(const PredictionUnit &pu, const uint32_t width, co
 
   if (width == 2)
   {
+#if JVET_P0530_TPM_WEIGHT_ALIGN
+    const __m128i mask = _mm_set_epi16( (short) 0x8080, (short) 0x8080, (short) 0x8080, (short) 0x8080, (short) 0x8080, (short) 0x8080, 0x0504, 0x0100 );
+#endif
     for (int y = 0; y < height; y++)
     {
       __m128i s0 = _mm_cvtsi32_si128(*(uint32_t *) src0);
       __m128i s1 = _mm_cvtsi32_si128(*(uint32_t *) src1);
-      __m128i w0 = _mm_cvtsi32_si128(*(uint32_t *) weight);
+#if JVET_P0530_TPM_WEIGHT_ALIGN
+      __m128i w0 = _mm_loadl_epi64((__m128i *) (weight));
+      if (chromaScaleX == 1)
+      {
+        w0 = _mm_shuffle_epi8(w0, mask);
+      }
+#else
+      __m128i w0 = _mm_cvtsi32_si128(*(uint32_t *)weight);
+#endif
       __m128i w1 = _mm_sub_epi16(mmEight, w0);
-
       s0 = _mm_unpacklo_epi16(s0, s1);
       w0 = _mm_unpacklo_epi16(w0, w1);
       s0 = _mm_add_epi32(_mm_madd_epi16(s0, w0), mmOffset);
@@ -1269,16 +1286,31 @@ void xWeightedTriangleBlk_SSE(const PredictionUnit &pu, const uint32_t width, co
       dst += strideDst;
       src0 += strideSrc0;
       src1 += strideSrc1;
+#if JVET_P0530_TPM_WEIGHT_ALIGN
+      weight += stepY;
+#else
       weight += 2;
+#endif
     }
   }
   else if(width == 4)
   {
+#if JVET_P0530_TPM_WEIGHT_ALIGN
+    const __m128i mask = _mm_set_epi16( (short) 0x8080, (short) 0x8080, (short) 0x8080, (short) 0x8080, 0x0D0C, 0x0908, 0x0504, 0x0100 );
+#endif
     for (int y = 0; y < height; y++)
     {
       __m128i s0 = _mm_loadl_epi64((__m128i *) (src0));
       __m128i s1 = _mm_loadl_epi64((__m128i *) (src1));
+#if JVET_P0530_TPM_WEIGHT_ALIGN
+      __m128i w0 = _mm_loadu_si128((__m128i *) (weight));
+      if (chromaScaleX == 1)
+      {
+        w0 = _mm_shuffle_epi8(w0, mask);
+      }
+#else
       __m128i w0 = _mm_loadl_epi64((__m128i *) (weight));
+#endif
       __m128i w1 = _mm_sub_epi16(mmEight, w0);
       s0 = _mm_unpacklo_epi16(s0, s1);
       w0 = _mm_unpacklo_epi16(w0, w1);
@@ -1290,18 +1322,38 @@ void xWeightedTriangleBlk_SSE(const PredictionUnit &pu, const uint32_t width, co
       dst += strideDst;
       src0 += strideSrc0;
       src1 += strideSrc1;
+#if JVET_P0530_TPM_WEIGHT_ALIGN
+      weight += stepY;
+#else
       weight += 4;
+#endif
     }
   }
   else
   {
+#if JVET_P0530_TPM_WEIGHT_ALIGN
+    const __m128i mask1 = _mm_set_epi16( 0x0D0C, 0x0908, 0x0504, 0x0100, (short) 0x8080, (short) 0x8080, (short) 0x8080, (short) 0x8080 );
+    const __m128i mask2 = _mm_set_epi16( (short) 0x8080, (short) 0x8080, (short) 0x8080, (short) 0x8080, 0x0D0C, 0x0908, 0x0504, 0x0100 );
+#endif
     for (int y = 0; y < height; y++)
     {
       for (int x = 0; x < width; x += 8)
       {
         __m128i s0 = _mm_loadu_si128((__m128i *) (src0 + x));
         __m128i s1 = _mm_loadu_si128((__m128i *) (src1 + x));
+        
+#if JVET_P0530_TPM_WEIGHT_ALIGN
+        __m128i w0 = _mm_loadu_si128((__m128i *) (weight + (x << chromaScaleX)));
+        if (chromaScaleX == 1)
+        {
+          __m128i w01 = _mm_loadu_si128((__m128i *) (weight + (x << chromaScaleX) + 8));
+          w0 = _mm_shuffle_epi8(w0, mask1);
+          w01 = _mm_shuffle_epi8(w01, mask2);
+          w0 = _mm_alignr_epi8(w01, w0, 8);
+        }
+#else
         __m128i w0 = _mm_loadu_si128((__m128i *) (weight + x));
+#endif
         __m128i w1 = _mm_sub_epi16(mmEight, w0);
 
         __m128i s0tmp = _mm_unpacklo_epi16(s0, s1);
@@ -1321,7 +1373,11 @@ void xWeightedTriangleBlk_SSE(const PredictionUnit &pu, const uint32_t width, co
       dst += strideDst;
       src0 += strideSrc0;
       src1 += strideSrc1;
+#if JVET_P0530_TPM_WEIGHT_ALIGN
+      weight += stepY;
+#else
       weight += width;
+#endif
     }
   }
 }
-- 
GitLab