Commit 911dd6e9 authored by Xiang Li

Merge branch 'weighted-triangle-simd' into 'master'

JVET-O0280: SIMD implementation for weighted sample prediction process of triangle prediction mode

See merge request !639
parents c27af72c 019dc2ce
Pipeline #1851 passed with stage
......@@ -1397,21 +1397,37 @@ void InterPrediction::weightedTriangleBlk( PredictionUnit &pu, const bool splitD
{
// Blends the two triangle-mode predictions into predDst for the requested
// channel type, issuing one call per colour component.
// When JVET_O0280_SIMD_TRIANGLE_WEIGHTING is enabled the work is routed through
// m_if.weightedTriangleBlk (presumably the InterpolationFilter dispatcher, which
// may select a SIMD kernel -- confirm against the class declaration); otherwise
// the local scalar xWeightedTriangleBlk is called directly.
if( channel == CHANNEL_TYPE_LUMA )
{
// Luma only: a single call using the luma block dimensions.
#if JVET_O0280_SIMD_TRIANGLE_WEIGHTING
m_if.weightedTriangleBlk( pu, pu.lumaSize().width, pu.lumaSize().height, COMPONENT_Y, splitDir, predDst, predSrc0, predSrc1 );
#else
xWeightedTriangleBlk( pu, pu.lumaSize().width, pu.lumaSize().height, COMPONENT_Y, splitDir, predDst, predSrc0, predSrc1 );
#endif
}
else if( channel == CHANNEL_TYPE_CHROMA )
{
// Chroma only: Cb and Cr are blended separately, both with the chroma block dimensions.
#if JVET_O0280_SIMD_TRIANGLE_WEIGHTING
m_if.weightedTriangleBlk( pu, pu.chromaSize().width, pu.chromaSize().height, COMPONENT_Cb, splitDir, predDst, predSrc0, predSrc1 );
m_if.weightedTriangleBlk( pu, pu.chromaSize().width, pu.chromaSize().height, COMPONENT_Cr, splitDir, predDst, predSrc0, predSrc1 );
#else
xWeightedTriangleBlk( pu, pu.chromaSize().width, pu.chromaSize().height, COMPONENT_Cb, splitDir, predDst, predSrc0, predSrc1 );
xWeightedTriangleBlk( pu, pu.chromaSize().width, pu.chromaSize().height, COMPONENT_Cr, splitDir, predDst, predSrc0, predSrc1 );
#endif
}
else
{
// Any other channel value: process all three components (Y, then Cb, then Cr).
#if JVET_O0280_SIMD_TRIANGLE_WEIGHTING
m_if.weightedTriangleBlk( pu, pu.lumaSize().width, pu.lumaSize().height, COMPONENT_Y, splitDir, predDst, predSrc0, predSrc1 );
m_if.weightedTriangleBlk( pu, pu.chromaSize().width, pu.chromaSize().height, COMPONENT_Cb, splitDir, predDst, predSrc0, predSrc1 );
m_if.weightedTriangleBlk( pu, pu.chromaSize().width, pu.chromaSize().height, COMPONENT_Cr, splitDir, predDst, predSrc0, predSrc1 );
#else
xWeightedTriangleBlk( pu, pu.lumaSize().width, pu.lumaSize().height, COMPONENT_Y, splitDir, predDst, predSrc0, predSrc1 );
xWeightedTriangleBlk( pu, pu.chromaSize().width, pu.chromaSize().height, COMPONENT_Cb, splitDir, predDst, predSrc0, predSrc1 );
xWeightedTriangleBlk( pu, pu.chromaSize().width, pu.chromaSize().height, COMPONENT_Cr, splitDir, predDst, predSrc0, predSrc1 );
#endif
}
}
#if !JVET_O0280_SIMD_TRIANGLE_WEIGHTING
void InterPrediction::xWeightedTriangleBlk( const PredictionUnit &pu, const uint32_t width, const uint32_t height, const ComponentID compIdx, const bool splitDir, PelUnitBuf& predDst, PelUnitBuf& predSrc0, PelUnitBuf& predSrc1 )
{
Pel* dst = predDst .get(compIdx).buf;
......@@ -1486,6 +1502,7 @@ void InterPrediction::xWeightedTriangleBlk( const PredictionUnit &pu, const uint
weightedEndPos += weightedPosoffset;
}
}
#endif
void InterPrediction::xPrefetchPad(PredictionUnit& pu, PelUnitBuf &pcPad, RefPicList refId)
{
......
......@@ -210,6 +210,9 @@ InterpolationFilter::InterpolationFilter()
m_filterCopy[1][0] = filterCopy<true, false>;
m_filterCopy[1][1] = filterCopy<true, true>;
#if JVET_O0280_SIMD_TRIANGLE_WEIGHTING
m_weightedTriangleBlk = xWeightedTriangleBlk;
#endif
}
......@@ -657,6 +660,88 @@ void InterpolationFilter::filterVer( const ComponentID compID, Pel const *src, i
}
}
#if JVET_O0280_SIMD_TRIANGLE_WEIGHTING
/**
 * \brief Scalar weighted blend of two triangle-mode predictions for one colour component.
 *
 * Every output row is written as up to three consecutive runs: samples left of the
 * blending band come from one prediction only, samples inside the band are a weighted
 * mix of both predictions (the two weights always sum to 8), and samples right of the
 * band come from the other prediction only. The band is shifted horizontally by
 * ratioWH samples after every ratioHW rows.
 *
 * \param pu        prediction unit; only the clipping range of its slice is read here
 * \param width     block width in samples for component compIdx
 * \param height    block height in samples for component compIdx
 * \param compIdx   component to process; COMPONENT_Y uses a 7-step band, all others a 3-step band
 * \param splitDir  split direction; selects where the band starts, which way it moves
 *                  and which source fills each side of it
 * \param predDst   destination buffer (written)
 * \param predSrc0  first prediction (read)
 * \param predSrc1  second prediction (read)
 */
void InterpolationFilter::xWeightedTriangleBlk( const PredictionUnit &pu, const uint32_t width, const uint32_t height, const ComponentID compIdx, const bool splitDir, PelUnitBuf& predDst, PelUnitBuf& predSrc0, PelUnitBuf& predSrc1 )
{
Pel* dst = predDst .get(compIdx).buf;
Pel* src0 = predSrc0.get(compIdx).buf;
Pel* src1 = predSrc1.get(compIdx).buf;
// The pointers are advanced sample by sample inside a row, so only the remainder
// (stride - width) has to be added at the end of each row.
int32_t strideDst = predDst .get(compIdx).stride - width;
int32_t strideSrc0 = predSrc0.get(compIdx).stride - width;
int32_t strideSrc1 = predSrc1.get(compIdx).stride - width;
// Blending weights are expressed in eighths (weight + (8 - weight) == 1 << 3).
const char log2WeightBase = 3;
const ClpRng clipRng = pu.cu->slice->clpRngs().comp[compIdx];
const int32_t clipbd = clipRng.bd;
// Shift/rounding offset for samples copied from a single source
// (presumably converts the intermediate prediction precision back to the output bit depth -- confirm).
const int32_t shiftDefault = std::max<int>(2, (IF_INTERNAL_PREC - clipbd));
const int32_t offsetDefault = (1<<(shiftDefault-1)) + IF_INTERNAL_OFFS;
// Same conversion for blended samples, with log2WeightBase extra bits to divide out the weight sum.
const int32_t shiftWeighted = std::max<int>(2, (IF_INTERNAL_PREC - clipbd)) + log2WeightBase;
const int32_t offsetWeighted = (1 << (shiftWeighted - 1)) + (IF_INTERNAL_OFFS << log2WeightBase);
// Aspect ratio of the block: one of the two values is always 1.
const int32_t ratioWH = (width > height) ? (width / height) : 1;
const int32_t ratioHW = (width > height) ? 1 : (height / width);
const bool longWeight = (compIdx == COMPONENT_Y);
// Number of weight steps across the band.
const int32_t weightedLength = longWeight ? 7 : 3;
// Horizontal extent of the band in the first row. The start may be negative and the
// end may lie beyond width - 1; both are clamped per row further down.
int32_t weightedStartPos = ( splitDir == 0 ) ? ( 0 - (weightedLength >> 1) * ratioWH ) : ( width - ((weightedLength + 1) >> 1) * ratioWH );
int32_t weightedEndPos = weightedStartPos + weightedLength * ratioWH - 1;
// Per-step horizontal movement of the band: to the right for splitDir == 0, to the left otherwise.
int32_t weightedPosoffset = ( splitDir == 0 ) ? ratioWH : -ratioWH;
Pel tmpPelWeighted;
int32_t weightIdx;
int32_t x, y, tmpX, tmpY, tmpWeightedStart, tmpWeightedEnd;
// Outer loop: one iteration per band position, covering ratioHW rows each.
for( y = 0; y < height; y+= ratioHW )
{
for( tmpY = ratioHW; tmpY > 0; tmpY-- )
{
// Run 1: samples left of the band are taken from a single source.
for( x = 0; x < weightedStartPos; x++ )
{
*dst++ = ClipPel( rightShift( (splitDir == 0 ? *src1 : *src0) + offsetDefault, shiftDefault), clipRng );
src0++;
src1++;
}
// Run 2: the part of the band that lies inside the block.
tmpWeightedStart = std::max((int32_t)0, weightedStartPos);
tmpWeightedEnd = std::min(weightedEndPos, (int32_t)(width - 1));
weightIdx = 1;
if( weightedStartPos < 0 )
{
// The band starts left of the block: skip the weight steps that fall outside.
weightIdx += abs(weightedStartPos) / ratioWH;
}
for( x = tmpWeightedStart; x <= tmpWeightedEnd; x+= ratioWH )
{
// Each weight step is applied to ratioWH consecutive samples.
for( tmpX = ratioWH; tmpX > 0; tmpX-- )
{
// Weight applied to src0, clipped to [1, 7]; the short (non-luma) band uses every second weight.
tmpPelWeighted = Clip3( 1, 7, longWeight ? weightIdx : (weightIdx * 2));
// For splitDir != 0 the weight ramp is mirrored.
tmpPelWeighted = splitDir ? ( 8 - tmpPelWeighted ) : tmpPelWeighted;
*dst++ = ClipPel( rightShift( (tmpPelWeighted*(*src0++) + ((8 - tmpPelWeighted) * (*src1++)) + offsetWeighted), shiftWeighted ), clipRng );
}
weightIdx ++;
}
// Run 3: samples right of the band are taken from the other single source.
for( x = weightedEndPos + 1; x < width; x++ )
{
*dst++ = ClipPel( rightShift( (splitDir == 0 ? *src0 : *src1) + offsetDefault, shiftDefault ), clipRng );
src0++;
src1++;
}
// Advance all three pointers to the start of the next row.
dst += strideDst;
src0 += strideSrc0;
src1 += strideSrc1;
}
// Move the band for the next group of rows.
weightedStartPos += weightedPosoffset;
weightedEndPos += weightedPosoffset;
}
}
/**
 * \brief Public entry point for the triangle-mode weighted blend.
 *
 * Forwards every argument unchanged to the kernel that m_weightedTriangleBlk
 * currently points at.
 */
void InterpolationFilter::weightedTriangleBlk(const PredictionUnit &pu, const uint32_t width, const uint32_t height, const ComponentID compIdx, const bool splitDir, PelUnitBuf& predDst, PelUnitBuf& predSrc0, PelUnitBuf& predSrc1)
{
  (*m_weightedTriangleBlk)( pu,
                            width, height,
                            compIdx, splitDir,
                            predDst, predSrc0, predSrc1 );
}
#endif
/**
* \brief turn on SIMD functions
*
......
......@@ -72,6 +72,10 @@ public:
template<int N>
void filterVer(const ClpRng& clpRng, Pel const* src, int srcStride, Pel *dst, int dstStride, int width, int height, bool isFirst, bool isLast, TFilterCoeff const *coeff, bool biMCForDMVR);
#if JVET_O0280_SIMD_TRIANGLE_WEIGHTING
// Scalar reference kernel for the triangle-mode weighted blend; the constructor
// installs it as the default target of m_weightedTriangleBlk.
static void xWeightedTriangleBlk(const PredictionUnit &pu, const uint32_t width, const uint32_t height, const ComponentID compIdx, const bool splitDir, PelUnitBuf& predDst, PelUnitBuf& predSrc0, PelUnitBuf& predSrc1);
// Entry point used by callers: forwards to the kernel currently stored in m_weightedTriangleBlk.
void weightedTriangleBlk(const PredictionUnit &pu, const uint32_t width, const uint32_t height, const ComponentID compIdx, const bool splitDir, PelUnitBuf& predDst, PelUnitBuf& predSrc0, PelUnitBuf& predSrc1);
#endif
protected:
#if JVET_J0090_MEMORY_BANDWITH_MEASURE
static CacheModel* m_cacheModel;
......@@ -82,6 +86,9 @@ public:
void( *m_filterHor[3][2][2] )( const ClpRng& clpRng, Pel const *src, int srcStride, Pel *dst, int dstStride, int width, int height, TFilterCoeff const *coeff, bool biMCForDMVR);
void( *m_filterVer[3][2][2] )( const ClpRng& clpRng, Pel const *src, int srcStride, Pel *dst, int dstStride, int width, int height, TFilterCoeff const *coeff, bool biMCForDMVR);
void( *m_filterCopy[2][2] ) ( const ClpRng& clpRng, Pel const *src, int srcStride, Pel *dst, int dstStride, int width, int height, bool biMCForDMVR);
#if JVET_O0280_SIMD_TRIANGLE_WEIGHTING
// Kernel used for the triangle-mode weighted blend. Set to the scalar
// xWeightedTriangleBlk by the constructor and replaced by the SSE kernel in
// _initInterpolationFilterX86.
void( *m_weightedTriangleBlk )(const PredictionUnit &pu, const uint32_t width, const uint32_t height, const ComponentID compIdx, const bool splitDir, PelUnitBuf& predDst, PelUnitBuf& predSrc0, PelUnitBuf& predSrc1);
#endif
void initInterpolationFilter( bool enable );
#ifdef TARGET_SIMD_X86
......
......@@ -401,6 +401,35 @@ void initROM()
}
}
}
#if JVET_O0280_SIMD_TRIANGLE_WEIGHTING
// Precompute the per-sample triangle blending weights (the weight, in eighths,
// applied to the first prediction) for every block size 2^(idx + 1), for both
// split directions and both band lengths: [0] = 7-step band (luma),
// [1] = 3-step band (chroma).
for (int idxH = 0; idxH < MAX_CU_DEPTH - MIN_CU_LOG2 + 2; ++idxH)
{
  for (int idxW = 0; idxW < MAX_CU_DEPTH - MIN_CU_LOG2 + 2; ++idxW)
  {
    const int  nCbH   = 1 << (idxH + 1);
    const int  nCbW   = 1 << (idxW + 1);
    const int  nCbR   = (nCbW > nCbH) ? nCbW / nCbH : nCbH / nCbW;
    const bool isWide = nCbW > nCbH;

    // The SIMD kernel always loads at least 64 bits (four weights) per row, so it
    // reads past the end of the table on the last row of narrow blocks. Four
    // padding entries keep that read inside the allocation, and the arrays are
    // value-initialised so the padding holds zeros rather than indeterminate values.
    const int numEntries = nCbH * nCbW + 4;
    g_triangleWeights[0][0][idxH][idxW] = new int16_t[numEntries]();
    g_triangleWeights[0][1][idxH][idxW] = new int16_t[numEntries]();
    g_triangleWeights[1][0][idxH][idxW] = new int16_t[numEntries]();
    g_triangleWeights[1][1][idxH][idxW] = new int16_t[numEntries]();

    for (int y = 0; y < nCbH; y++)
    {
      for (int x = 0; x < nCbW; x++)
      {
        // Signed position of the sample relative to the split line, measured
        // along the shorter block dimension, for split direction 0 ...
        const int pos0 = isWide ? (x / nCbR - y) : (x - y / nCbR);
        // ... and the horizontally mirrored position for split direction 1.
        const int pos1 = isWide ? (nCbH - 1 - x / nCbR - y) : (nCbW - 1 - x - y / nCbR);
        const int idx  = y * nCbW + x;

        // 7-step band: weights 0..8 in steps of 1.
        g_triangleWeights[0][0][idxH][idxW][idx] = Clip3(0, 8, pos0 + 4);
        g_triangleWeights[0][1][idxH][idxW][idx] = Clip3(0, 8, pos1 + 4);
        // 3-step band: weights 0..8 in steps of 2.
        g_triangleWeights[1][0][idxH][idxW][idx] = Clip3(0, 4, pos0 + 2) * 2;
        g_triangleWeights[1][1][idxH][idxW][idx] = Clip3(0, 4, pos1 + 2) * 2;
      }
    }
  }
}
#endif
}
void destroyROM()
......@@ -425,6 +454,23 @@ void destroyROM()
delete gp_sizeIdxInfo;
gp_sizeIdxInfo = nullptr;
#if JVET_O0280_SIMD_TRIANGLE_WEIGHTING
// Free every triangle weight table created in initROM and clear the slot so a
// stale pointer can never be dereferenced or deleted twice.
for (int idxH = 0; idxH < MAX_CU_DEPTH - MIN_CU_LOG2 + 2; ++idxH)
{
  for (int idxW = 0; idxW < MAX_CU_DEPTH - MIN_CU_LOG2 + 2; ++idxW)
  {
    for (int tap = 0; tap < 2; ++tap)
    {
      for (int dir = 0; dir < 2; ++dir)
      {
        int16_t *&table = g_triangleWeights[tap][dir][idxH][idxW];
        delete[] table;
        table = nullptr;
      }
    }
  }
}
#endif
}
// ====================================================================================================================
......@@ -700,5 +746,8 @@ const uint32_t g_scalingListSizeX[SCALING_LIST_SIZE_NUM] = { 1, 2, 4, 8, 16,
uint8_t g_triangleMvStorage[TRIANGLE_DIR_NUM][MAX_CU_DEPTH - MIN_CU_LOG2 + 1][MAX_CU_DEPTH - MIN_CU_LOG2 + 1][MAX_CU_SIZE >> MIN_CU_LOG2][MAX_CU_SIZE >> MIN_CU_LOG2];
#if JVET_O0280_SIMD_TRIANGLE_WEIGHTING
int16_t *g_triangleWeights[2][TRIANGLE_DIR_NUM][MAX_CU_DEPTH - MIN_CU_LOG2 + 2][MAX_CU_DEPTH - MIN_CU_LOG2 + 2];
#endif
//! \}
......@@ -215,6 +215,10 @@ constexpr uint8_t g_tbMax[257] = { 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
//! \}
extern uint8_t g_triangleMvStorage[TRIANGLE_DIR_NUM][MAX_CU_DEPTH - MIN_CU_LOG2 + 1][MAX_CU_DEPTH - MIN_CU_LOG2 + 1][MAX_CU_SIZE >> MIN_CU_LOG2][MAX_CU_SIZE >> MIN_CU_LOG2];
#if JVET_O0280_SIMD_TRIANGLE_WEIGHTING
// indexed as [0: 7-tap (luma) / 1: 3-tap (chroma)][split direction][log2(height) - 1][log2(width) - 1], for block sizes 2/4/8/16/32/64/128
extern int16_t *g_triangleWeights[2][TRIANGLE_DIR_NUM][MAX_CU_DEPTH - MIN_CU_LOG2 + 2][MAX_CU_DEPTH - MIN_CU_LOG2 + 2];
#endif
extern bool g_mctsDecCheckEnabled;
......
......@@ -137,6 +137,7 @@
#define JVET_O0280_SIMD_TRIANGLE_WEIGHTING 1 // JVET-O0280: SIMD implementation for weighted sample prediction process of triangle prediction mode
#define FIX_DB_MAX_TRANSFORM_SIZE 1
......
......@@ -1217,6 +1217,110 @@ static void simdFilter( const ClpRng& clpRng, Pel const *src, int srcStride, Pel
}
}
#if JVET_O0280_SIMD_TRIANGLE_WEIGHTING
template< X86_VEXT vext >
void xWeightedTriangleBlk_SSE(const PredictionUnit &pu, const uint32_t width, const uint32_t height, const ComponentID compIdx, const bool splitDir, PelUnitBuf& predDst, PelUnitBuf& predSrc0, PelUnitBuf& predSrc1)
{
Pel* dst = predDst.get(compIdx).buf;
Pel* src0 = predSrc0.get(compIdx).buf;
Pel* src1 = predSrc1.get(compIdx).buf;
int32_t strideDst = predDst.get(compIdx).stride;
int32_t strideSrc0 = predSrc0.get(compIdx).stride;
int32_t strideSrc1 = predSrc1.get(compIdx).stride;
int8_t log2Width = g_aucLog2[width] - 1;
int8_t log2Height = g_aucLog2[height] - 1;
const char log2WeightBase = 3;
const ClpRng clpRng = pu.cu->slice->clpRngs().comp[compIdx];
const int32_t shiftWeighted = std::max<int>(2, (IF_INTERNAL_PREC - clpRng.bd)) + log2WeightBase;
const int32_t offsetWeighted = (1 << (shiftWeighted - 1)) + (IF_INTERNAL_OFFS << log2WeightBase);
const bool longWeight = (compIdx == COMPONENT_Y);
const bool shortWeight = !longWeight;
int16_t *weight = g_triangleWeights[shortWeight][splitDir][log2Height][log2Width];
const __m128i mmEight = _mm_set1_epi16(8);
const __m128i mmOffset = _mm_set1_epi32(offsetWeighted);
const __m128i mmShift = _mm_cvtsi32_si128(shiftWeighted);
const __m128i mmMin = _mm_set1_epi16(clpRng.min);
const __m128i mmMax = _mm_set1_epi16(clpRng.max);
if (width == 2)
{
for (int y = 0; y < height; y++)
{
__m128i s0 = _mm_loadl_epi64((__m128i *) (src0));
__m128i s1 = _mm_loadl_epi64((__m128i *) (src1));
__m128i w0 = _mm_loadl_epi64((__m128i *) (weight));
__m128i w1 = _mm_sub_epi16(mmEight, w0);
s0 = _mm_unpacklo_epi16(s0, s1);
w0 = _mm_unpacklo_epi16(w0, w1);
s0 = _mm_add_epi32(_mm_madd_epi16(s0, w0), mmOffset);
s0 = _mm_sra_epi32(s0, mmShift);
s0 = _mm_packs_epi32(s0, s0);
s0 = _mm_min_epi16(mmMax, _mm_max_epi16(s0, mmMin));
*(int*)(dst) = _mm_cvtsi128_si32(s0);
dst += strideDst;
src0 += strideSrc0;
src1 += strideSrc1;
weight += 2;
}
}
else if(width == 4)
{
for (int y = 0; y < height; y++)
{
__m128i s0 = _mm_loadl_epi64((__m128i *) (src0));
__m128i s1 = _mm_loadl_epi64((__m128i *) (src1));
__m128i w0 = _mm_loadl_epi64((__m128i *) (weight));
__m128i w1 = _mm_sub_epi16(mmEight, w0);
s0 = _mm_unpacklo_epi16(s0, s1);
w0 = _mm_unpacklo_epi16(w0, w1);
s0 = _mm_add_epi32(_mm_madd_epi16(s0, w0), mmOffset);
s0 = _mm_sra_epi32(s0, mmShift);
s0 = _mm_packs_epi32(s0, s0);
s0 = _mm_min_epi16(mmMax, _mm_max_epi16(s0, mmMin));
_mm_storel_epi64((__m128i *) (dst), s0);
dst += strideDst;
src0 += strideSrc0;
src1 += strideSrc1;
weight += 4;
}
}
else
{
for (int y = 0; y < height; y++)
{
for (int x = 0; x < width; x += 8)
{
__m128i s0 = _mm_loadu_si128((__m128i *) (src0 + x));
__m128i s1 = _mm_loadu_si128((__m128i *) (src1 + x));
__m128i w0 = _mm_loadu_si128((__m128i *) (weight + x));
__m128i w1 = _mm_sub_epi16(mmEight, w0);
__m128i s0tmp = _mm_unpacklo_epi16(s0, s1);
__m128i w0tmp = _mm_unpacklo_epi16(w0, w1);
s0tmp = _mm_add_epi32(_mm_madd_epi16(s0tmp, w0tmp), mmOffset);
s0tmp = _mm_sra_epi32(s0tmp, mmShift);
s0 = _mm_unpackhi_epi16(s0, s1);
w0 = _mm_unpackhi_epi16(w0, w1);
s0 = _mm_add_epi32(_mm_madd_epi16(s0, w0), mmOffset);
s0 = _mm_sra_epi32(s0, mmShift);
s0 = _mm_packs_epi32(s0tmp, s0);
s0 = _mm_min_epi16(mmMax, _mm_max_epi16(s0, mmMin));
_mm_storeu_si128((__m128i *) (dst + x), s0);
}
dst += strideDst;
src0 += strideSrc0;
src1 += strideSrc1;
weight += width;
}
}
}
#endif
template <X86_VEXT vext>
void InterpolationFilter::_initInterpolationFilterX86()
{
......@@ -1255,6 +1359,10 @@ void InterpolationFilter::_initInterpolationFilterX86()
m_filterCopy[0][1] = simdFilterCopy<vext, false, true>;
m_filterCopy[1][0] = simdFilterCopy<vext, true, false>;
m_filterCopy[1][1] = simdFilterCopy<vext, true, true>;
#if JVET_O0280_SIMD_TRIANGLE_WEIGHTING
m_weightedTriangleBlk = xWeightedTriangleBlk_SSE<vext>;
#endif
}
template void InterpolationFilter::_initInterpolationFilterX86<SIMDX86>();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment