avfilter/x86/vf_idetdsp: add AVX2 and AVX512 implementations

The only thing that changes slightly is the horizontal sum at the end.
This commit is contained in:
Niklas Haas
2025-09-15 17:47:39 +02:00
committed by Niklas Haas
parent 4c067d0778
commit 843920d5d6
2 changed files with 42 additions and 6 deletions

View File

@@ -39,7 +39,7 @@ SECTION .text
paddd %1, %2
%endmacro
%macro IDET_FILTER_LINE_16BIT 1 ; %1=increment (4 or 8 words)
%macro IDET_FILTER_LINE_16BIT 0
cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
xor indexq, indexq
%define m_zero m1
@@ -54,7 +54,7 @@ cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
psubusw m5, m2, m3 ; ba
movu m4, [cq + indexq * 2] ; C
add indexq, %1
add indexq, mmsize >> 1
psubusw m3, m2 ; ab
CMP indexd, widthd
@@ -67,13 +67,23 @@ cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
paddd m_sum, m5
jl .loop_16bit
%if mmsize > 32
vextracti64x4 ym1, m0, 1
paddq ym0, ym1
%endif
HADDD m_sum, m2
movd eax, m_sum
RET
%endmacro
INIT_XMM sse2
IDET_FILTER_LINE_16BIT 8
IDET_FILTER_LINE_16BIT
; AVX2 variant: INIT_YMM selects 256-bit ymm registers (mmsize = 32), so the
; loop's `add indexq, mmsize >> 1` advances 16 words per iteration, matching
; FUNC_MAIN_DECL_16bit(avx2, 16) on the C side.
INIT_YMM avx2
IDET_FILTER_LINE_16BIT
; AVX-512 (Ice Lake) variant: INIT_ZMM selects 512-bit zmm registers
; (mmsize = 64), i.e. 32 words per iteration, matching
; FUNC_MAIN_DECL_16bit(avx512icl, 32). This is the only instantiation for
; which the `%if mmsize > 32` zmm -> ymm fold ahead of HADDD is assembled.
INIT_ZMM avx512icl
IDET_FILTER_LINE_16BIT
;******************************************************************************
; SSE2 8-bit implementation that does 16-bytes at a time:
@@ -106,11 +116,25 @@ cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total
jl .sse2_loop
paddq m0, m1
movhlps m1, m0
paddq m0, m1
movd eax, m0
%if mmsize > 32
vextracti64x4 ym1, m0, 1
paddq ym0, ym1
%endif
%if mmsize > 16
vextracti128 xm1, ym0, 1
paddq xm0, xm1
%endif
movhlps xm1, xm0
paddq xm0, xm1
movd eax, xm0
RET
%endmacro
; Instantiate the 8-bit line filter once per instruction set. The INIT_* macro
; preceding each expansion selects the vector register width (mmsize) that the
; macro body is assembled for.
; SSE2: xmm registers (mmsize = 16); neither wide-register fold is assembled.
INIT_XMM sse2
IDET_FILTER_LINE
; AVX2: ymm registers (mmsize = 32); enables the `%if mmsize > 16`
; ymm -> xmm fold in the final horizontal sum.
INIT_YMM avx2
IDET_FILTER_LINE
; AVX-512 (Ice Lake): zmm registers (mmsize = 64); additionally enables the
; `%if mmsize > 32` zmm -> ymm fold.
INIT_ZMM avx512icl
IDET_FILTER_LINE

View File

@@ -62,6 +62,12 @@ static int idet_filter_line_16bit_##KIND(const uint8_t *a, const uint8_t *b, \
/* Instantiate the C-side wrappers for each assembly variant (8-bit and 16-bit
 * per instruction set).
 * NOTE(review): the second argument appears to be the number of samples one
 * SIMD iteration consumes (bytes for the 8-bit functions, 16-bit words for the
 * _16bit ones, i.e. 16/32/64-byte vectors) -- confirm against the
 * FUNC_MAIN_DECL / FUNC_MAIN_DECL_16bit definitions, which are not shown here. */
FUNC_MAIN_DECL(sse2, 16)
FUNC_MAIN_DECL_16bit(sse2, 8)
FUNC_MAIN_DECL(avx2, 32)
FUNC_MAIN_DECL_16bit(avx2, 16)
FUNC_MAIN_DECL(avx512icl, 64)
FUNC_MAIN_DECL_16bit(avx512icl, 32)
#endif
av_cold void ff_idet_dsp_init_x86(IDETDSPContext *dsp, int depth)
{
@@ -71,5 +77,11 @@ av_cold void ff_idet_dsp_init_x86(IDETDSPContext *dsp, int depth)
if (EXTERNAL_SSE2(cpu_flags)) {
dsp->filter_line = depth > 8 ? idet_filter_line_16bit_sse2 : idet_filter_line_sse2;
}
if (EXTERNAL_AVX2(cpu_flags)) {
dsp->filter_line = depth > 8 ? idet_filter_line_16bit_avx2 : idet_filter_line_avx2;
}
if (EXTERNAL_AVX512ICL(cpu_flags)) {
dsp->filter_line = depth > 8 ? idet_filter_line_16bit_avx512icl : idet_filter_line_avx512icl;
}
#endif // HAVE_X86ASM
}