From fd1a941b563f285499d82eed806e751cb645557a Mon Sep 17 00:00:00 2001
From: Jacob Strom <jacob.strom@ericsson.com>
Date: Sun, 16 May 2021 22:38:18 +0200
Subject: [PATCH] Further cleanup of bilateral filter code.

Renamed source/Lib/CommonLib/x86/sse42/BilateralFilater_sse42.cpp -> source/Lib/CommonLib/x86/sse42/BilateralFilter_sse42.cpp
Changed the SIMD version of the code so that it also accepts blocks of width 4.
---
 source/Lib/CommonLib/BilateralFilter.cpp      | 52 +++++++++++--------
 source/Lib/CommonLib/x86/BilateralFilterX86.h |  3 +-
 ...er_sse42.cpp => BilateralFilter_sse42.cpp} |  0
 3 files changed, 31 insertions(+), 24 deletions(-)
 rename source/Lib/CommonLib/x86/sse42/{BilateralFilater_sse42.cpp => BilateralFilter_sse42.cpp} (100%)

diff --git a/source/Lib/CommonLib/BilateralFilter.cpp b/source/Lib/CommonLib/BilateralFilter.cpp
index 38058a4da..beb6d9b05 100755
--- a/source/Lib/CommonLib/BilateralFilter.cpp
+++ b/source/Lib/CommonLib/BilateralFilter.cpp
@@ -381,8 +381,14 @@ void BilateralFilter::bilateralFilterRDOdiamond5x5(PelBuf& resiBuf, const CPelBu
   uint32_t   uiWidthExt = uiWidth + (NUMBER_PADDED_SAMPLES << 1);
   uint32_t   uiHeightExt = uiHeight + (NUMBER_PADDED_SAMPLES << 1);
   
-  memset(tempblock, 0, uiWidthExt*uiHeightExt * sizeof(short));
-  tempBlockPtr = tempblock + (NUMBER_PADDED_SAMPLES)* uiWidthExt + NUMBER_PADDED_SAMPLES;
+  int iWidthExtSIMD = uiWidthExt;
+  if( uiWidth < 8 )
+  {
+    iWidthExtSIMD = 8 + (NUMBER_PADDED_SAMPLES << 1);
+  }
+  
+  memset(tempblock, 0, iWidthExtSIMD*uiHeightExt * sizeof(short));
+  tempBlockPtr = tempblock + (NUMBER_PADDED_SAMPLES)* iWidthExtSIMD + NUMBER_PADDED_SAMPLES;
   
   //// Clip and move block to temporary block
   if (useReco)
@@ -391,7 +397,7 @@ void BilateralFilter::bilateralFilterRDOdiamond5x5(PelBuf& resiBuf, const CPelBu
     {
       std::memcpy(tempBlockPtr, piReco, uiWidth * sizeof(Pel));
       piReco += uiRecStride;
-      tempBlockPtr += uiWidthExt;
+      tempBlockPtr += iWidthExtSIMD;
     }
     piReco = piRecoTemp;
   }
@@ -406,7 +412,7 @@ void BilateralFilter::bilateralFilterRDOdiamond5x5(PelBuf& resiBuf, const CPelBu
       piPred += uiPredStride;
       piResi += uiStrideRes;
       piReco += uiRecStride;
-      tempBlockPtr += uiWidthExt;
+      tempBlockPtr += iWidthExtSIMD;
     }
   }
   
@@ -423,13 +429,13 @@ void BilateralFilter::bilateralFilterRDOdiamond5x5(PelBuf& resiBuf, const CPelBu
   // if they pass the test.
   for (int yy = 1; yy< uiHeightExt -1 ; yy++)
   {
-    tempblock[yy*uiWidthExt + 1] = tempblock[yy*uiWidthExt + 2];
-    tempblock[yy*uiWidthExt + uiWidthExt - 2] = tempblock[yy*uiWidthExt + uiWidthExt - 3];
+    tempblock[yy*iWidthExtSIMD + 1] = tempblock[yy*iWidthExtSIMD + 2];
+    tempblock[yy*iWidthExtSIMD + uiWidthExt - 2] = tempblock[yy*iWidthExtSIMD + uiWidthExt - 3];
   }
   for (int xx = 1; xx< uiWidthExt - 1; xx++)
   {
-    tempblock[1 * uiWidthExt + xx] = tempblock[2 * uiWidthExt + xx];
-    tempblock[(uiHeightExt - 2)*uiWidthExt + xx] = tempblock[(uiHeightExt - 3)*uiWidthExt + xx];
+    tempblock[1 * iWidthExtSIMD + xx] = tempblock[2 * iWidthExtSIMD + xx];
+    tempblock[(uiHeightExt - 2)*iWidthExtSIMD + xx] = tempblock[(uiHeightExt - 3)*iWidthExtSIMD + xx];
   }
   
   bool subTuVer = currTU.lx() > currTU.cu->lx() ? true : false;
@@ -450,11 +456,11 @@ void BilateralFilter::bilateralFilterRDOdiamond5x5(PelBuf& resiBuf, const CPelBu
     if (topAvailable && leftAvailable)
     {
       // top left pixels
-      tempblock[uiWidthExt + 1] = *(piRecIPred - (uiRecIPredStride)-1);
+      tempblock[iWidthExtSIMD + 1] = *(piRecIPred - (uiRecIPredStride)-1);
       // Reshape copied pixels if necessary.
       if(doReshape)
       {
-        tempblock[uiWidthExt + 1] = pLUT[tempblock[uiWidthExt + 1]];
+        tempblock[iWidthExtSIMD + 1] = pLUT[tempblock[iWidthExtSIMD + 1]];
       }
     }
     
@@ -464,12 +470,12 @@ void BilateralFilter::bilateralFilterRDOdiamond5x5(PelBuf& resiBuf, const CPelBu
       for (int blockx = 0; blockx < area.width; blockx += 1)
       {
         // copy 4 pixels one line above block from block to blockx + 3
-        std::copy(piRecIPred - (uiRecIPredStride)+blockx, piRecIPred - (uiRecIPredStride)+blockx + 1, tempblock + 2 + uiWidthExt + blockx);
+        std::copy(piRecIPred - (uiRecIPredStride)+blockx, piRecIPred - (uiRecIPredStride)+blockx + 1, tempblock + 2 + iWidthExtSIMD + blockx);
         if( doReshape )
         {
           for( int xx = 0; xx < 1; xx++ )
           {
-            tempblock[2 + uiWidthExt + blockx + xx] = pLUT[tempblock[2 + uiWidthExt + blockx + xx]];
+            tempblock[2 + iWidthExtSIMD + blockx + xx] = pLUT[tempblock[2 + iWidthExtSIMD + blockx + xx]];
           }
         }
       }
@@ -482,12 +488,12 @@ void BilateralFilter::bilateralFilterRDOdiamond5x5(PelBuf& resiBuf, const CPelBu
       const unsigned earlierStride = earlierHalfBuf.stride;
       const Pel *earlierPel = earlierHalfBuf.buf + (currTU.prev->lheight() - 1)*earlierStride;
       
-      std::copy(earlierPel, earlierPel + area.width, tempblock + 2 + uiWidthExt);
+      std::copy(earlierPel, earlierPel + area.width, tempblock + 2 + iWidthExtSIMD);
       if( doReshape )
       {
         for( int xx = 0; xx < area.width; xx++ )
         {
-          tempblock[2 + uiWidthExt + xx] = pLUT[tempblock[2 + uiWidthExt + xx]];
+          tempblock[2 + iWidthExtSIMD + xx] = pLUT[tempblock[2 + iWidthExtSIMD + xx]];
         }
       }
 
@@ -506,10 +512,10 @@ void BilateralFilter::bilateralFilterRDOdiamond5x5(PelBuf& resiBuf, const CPelBu
     {
       for (int blocky = 0; blocky < area.height; blocky += 1)
       {
-        tempblock[(uiWidthExt << 1) + (blocky + 0) * uiWidthExt + 1] = *(piRecIPred + (blocky + 0)*uiRecIPredStride - 1); // 1 pel out
+        tempblock[(iWidthExtSIMD << 1) + (blocky + 0) * iWidthExtSIMD + 1] = *(piRecIPred + (blocky + 0)*uiRecIPredStride - 1); // 1 pel out
         if(doReshape)
         {
-          tempblock[(uiWidthExt << 1) + (blocky + 0) * uiWidthExt + 1] = pLUT[tempblock[(uiWidthExt << 1) + (blocky + 0) * uiWidthExt + 1]];
+          tempblock[(iWidthExtSIMD << 1) + (blocky + 0) * iWidthExtSIMD + 1] = pLUT[tempblock[(iWidthExtSIMD << 1) + (blocky + 0) * iWidthExtSIMD + 1]];
         }
       }
     }
@@ -523,13 +529,13 @@ void BilateralFilter::bilateralFilterRDOdiamond5x5(PelBuf& resiBuf, const CPelBu
       
       for (int yy = 0; yy < currTU.lheight(); yy++)
       {
-        tempblock[(uiWidthExt << 1) + yy * uiWidthExt + 1] = *(earlierPel + yy*earlierStride + 0);
+        tempblock[(iWidthExtSIMD << 1) + yy * iWidthExtSIMD + 1] = *(earlierPel + yy*earlierStride + 0);
       }
       if(doReshape)
       {
         for (int yy = 0; yy < currTU.lheight(); yy++)
         {
-          tempblock[(uiWidthExt << 1) + yy * uiWidthExt + 1] = pLUT[tempblock[(uiWidthExt << 1) + yy * uiWidthExt + 1]];
+          tempblock[(iWidthExtSIMD << 1) + yy * iWidthExtSIMD + 1] = pLUT[tempblock[(iWidthExtSIMD << 1) + yy * iWidthExtSIMD + 1]];
         }
       }
     }
@@ -538,13 +544,13 @@ void BilateralFilter::bilateralFilterRDOdiamond5x5(PelBuf& resiBuf, const CPelBu
   // Sloppy copying of outer layer
   for(int yy = 0; yy < uiHeight+2; yy++)
   {
-    tempblock[uiWidthExt + yy*uiWidthExt] = tempblock[uiWidthExt + yy*uiWidthExt + 1];
-    tempblock[(uiWidthExt<<1) - 1 + yy*uiWidthExt] = tempblock[(uiWidthExt<<1) - 2 + yy*uiWidthExt];
+    tempblock[iWidthExtSIMD + yy*iWidthExtSIMD] = tempblock[iWidthExtSIMD + yy*iWidthExtSIMD + 1];
+    tempblock[iWidthExtSIMD + uiWidthExt - 1 + yy*iWidthExtSIMD] = tempblock[iWidthExtSIMD + uiWidthExt - 2 + yy*iWidthExtSIMD];
   }
-  std::copy(tempblock  + uiWidthExt, tempblock + uiWidthExt + uiWidthExt, tempblock);
-  std::copy(tempblock  + uiWidthExt*(uiHeightExt-2), tempblock  + uiWidthExt*(uiHeightExt-2) + uiWidthExt, tempblock + uiWidthExt*(uiHeightExt-1));
+  std::copy(tempblock + iWidthExtSIMD, tempblock + iWidthExtSIMD + uiWidthExt, tempblock);
+  std::copy(tempblock  + iWidthExtSIMD*(uiHeightExt-2), tempblock  + iWidthExtSIMD*(uiHeightExt-2) + uiWidthExt, tempblock + iWidthExtSIMD*(uiHeightExt-1));
 
-  m_bilateralFilterDiamond5x5(uiWidth, uiHeight, tempblock, tempblockFiltered, clpRng, piReco, uiRecStride, uiWidth + 4, bfac, bif_round_add, bif_round_shift, true, LUTrowPtr );
+  m_bilateralFilterDiamond5x5(uiWidth, uiHeight, tempblock, tempblockFiltered, clpRng, piReco, uiRecStride, iWidthExtSIMD, bfac, bif_round_add, bif_round_shift, true, LUTrowPtr );
 
   if (!useReco)
   {
diff --git a/source/Lib/CommonLib/x86/BilateralFilterX86.h b/source/Lib/CommonLib/x86/BilateralFilterX86.h
index c0ff50f95..7b429788c 100644
--- a/source/Lib/CommonLib/x86/BilateralFilterX86.h
+++ b/source/Lib/CommonLib/x86/BilateralFilterX86.h
@@ -47,7 +47,8 @@
 template<X86_VEXT vext>
 void BilateralFilter::simdFilterDiamond5x5(uint32_t uiWidth, uint32_t uiHeight, int16_t block[], int16_t blkFilt[], const ClpRng& clpRng, Pel* recPtr, int recStride, int iWidthExtSIMD, int bfac, int bif_round_add, int bif_round_shift, bool isRDO, const char* LUTrowPtr )
 {
-  if( uiWidth < 4 || ( uiWidth < 8 && isRDO ) )
+  //if( uiWidth < 4 || ( uiWidth < 8 && isRDO ) )
+  if( uiWidth < 4 )
   {
     return blockBilateralFilterDiamond5x5(uiWidth, uiHeight, block, blkFilt, clpRng, recPtr, recStride, iWidthExtSIMD, bfac, bif_round_add, bif_round_shift, isRDO, LUTrowPtr );
   }
diff --git a/source/Lib/CommonLib/x86/sse42/BilateralFilater_sse42.cpp b/source/Lib/CommonLib/x86/sse42/BilateralFilter_sse42.cpp
similarity index 100%
rename from source/Lib/CommonLib/x86/sse42/BilateralFilater_sse42.cpp
rename to source/Lib/CommonLib/x86/sse42/BilateralFilter_sse42.cpp
-- 
GitLab