avcodec/x86/cavs_qpel: Add SSE2 vertical motion compensation

This is not based on the MMXEXT one, because the latter is quite
suboptimal: Motion vector types mc01 and mc03 (vertical motion vectors
with a remainder of one quarter or three quarters) use different neighboring
lines for interpolation: mc01 uses two lines above and two lines below,
mc03 one line above and three lines below. The MMXEXT code uses
a common macro for all of them and therefore reads six lines
before it processes them (even reading lines which are not used
at all), leading to severe register pressure.

Another difference to the old code is that the positive and negative
parts of the sum to calculate are accumulated separately and
the subtraction is performed with unsigned saturation, so
that one can avoid biasing the sum.

The fact that the mc01 and mc03 filter coefficients are mirrors
of each other has been exploited to reduce mc01 to mc03.

But of course the most important difference between
this code and the MMXEXT one is that XMM registers allow
processing eight words at a time, ideal for 8x8 subblocks,
whereas the MMXEXT code processes them in 4x8 or 4x16 blocks.

Benchmarks:
avg_cavs_qpel_pixels_tab[0][4]_c:                      917.0 ( 1.00x)
avg_cavs_qpel_pixels_tab[0][4]_mmxext:                 222.0 ( 4.13x)
avg_cavs_qpel_pixels_tab[0][4]_sse2:                    89.0 (10.31x)
avg_cavs_qpel_pixels_tab[0][12]_c:                     885.7 ( 1.00x)
avg_cavs_qpel_pixels_tab[0][12]_mmxext:                223.2 ( 3.97x)
avg_cavs_qpel_pixels_tab[0][12]_sse2:                   88.5 (10.01x)
avg_cavs_qpel_pixels_tab[1][4]_c:                      222.4 ( 1.00x)
avg_cavs_qpel_pixels_tab[1][4]_mmxext:                  57.2 ( 3.89x)
avg_cavs_qpel_pixels_tab[1][4]_sse2:                    23.3 ( 9.55x)
avg_cavs_qpel_pixels_tab[1][12]_c:                     216.0 ( 1.00x)
avg_cavs_qpel_pixels_tab[1][12]_mmxext:                 57.4 ( 3.76x)
avg_cavs_qpel_pixels_tab[1][12]_sse2:                   22.6 ( 9.56x)
put_cavs_qpel_pixels_tab[0][4]_c:                      750.9 ( 1.00x)
put_cavs_qpel_pixels_tab[0][4]_mmxext:                 210.4 ( 3.57x)
put_cavs_qpel_pixels_tab[0][4]_sse2:                    84.2 ( 8.92x)
put_cavs_qpel_pixels_tab[0][12]_c:                     731.6 ( 1.00x)
put_cavs_qpel_pixels_tab[0][12]_mmxext:                210.7 ( 3.47x)
put_cavs_qpel_pixels_tab[0][12]_sse2:                   84.1 ( 8.70x)
put_cavs_qpel_pixels_tab[1][4]_c:                      191.7 ( 1.00x)
put_cavs_qpel_pixels_tab[1][4]_mmxext:                  53.8 ( 3.56x)
put_cavs_qpel_pixels_tab[1][4]_sse2:                    24.5 ( 7.83x)
put_cavs_qpel_pixels_tab[1][12]_c:                     179.1 ( 1.00x)
put_cavs_qpel_pixels_tab[1][12]_mmxext:                 53.9 ( 3.32x)
put_cavs_qpel_pixels_tab[1][12]_sse2:                   24.0 ( 7.47x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2025-10-05 16:22:07 +02:00
parent 74a88c0c11
commit 650098955e
5 changed files with 177 additions and 1 deletions

View File

@@ -24,8 +24,14 @@
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
cextern pw_4
cextern pw_5
cextern pw_7
cextern pw_64
pw_42: times 8 dw 42
pw_96: times 8 dw 96
SECTION .text
@@ -78,3 +84,137 @@ cglobal %1_cavs_qpel8_h, 4,4,6
INIT_XMM sse2
CAVS_QPEL_H avg
CAVS_QPEL_H put
%macro FILT_V 1
movh m3, [r1]
punpcklbw m3, m7
mova m4, m1
paddw m4, m2
paddw m0, m3
add r1, r2
pmullw m4, m5
psubw m4, m0
paddw m4, m6
psraw m4, 3
packuswb m4, m7
op_%1h m4, [r0], m0
add r0, r2
SWAP 0, 1, 2, 3
%endmacro
%macro CAVS_QPEL_MC02 1
; ff_put_cavs_qpel8_mc02(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
cglobal %1_cavs_qpel8_mc02, 3,4,8
mov r3d, 8
jmp %1_cavs_qpel8_v2_after_prologue
; ff_put_cavs_qpel8_v2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h)
cglobal %1_cavs_qpel8_v2, 4,4,8
%1_cavs_qpel8_v2_after_prologue:
movh m1, [r1]
sub r1, r2
movh m0, [r1]
lea r1, [r1+2*r2]
pxor m7, m7
movh m2, [r1]
add r1, r2
punpcklbw m1, m7
punpcklbw m0, m7
punpcklbw m2, m7
mova m5, [pw_5]
mova m6, [pw_4]
.loop:
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
sub r3d, 4
jne .loop
RET
%endmacro
INIT_XMM sse2
CAVS_QPEL_MC02 avg
CAVS_QPEL_MC02 put
%macro FILT_V3 1
pmullw m0, PW_7
movh m4, [r1]
mova m5, m1
mova m6, m2
pmullw m5, PW_42
punpcklbw m4, m7
pmullw m6, PW_96
paddw m0, m3
add r1, r2
paddw m0, m3
paddw m5, m6
paddw m0, m4
; m5-m0 can be in the -10*255..(42 + 96)*255 range and
; therefore is not guaranteed to fit into either a signed or
; an unsigned word. Because we need to clamp the result to 0..255
; anyway, we use saturated subtraction and a logical right shift
; for rescaling.
psubusw m5, m0
paddw m5, PW_64
psrlw m5, 7
packuswb m5, m7
op_%1h m5, [r0], m0
add r0, r2
SWAP 0, 1, 2, 3, 4
%endmacro
%macro CAVS_QPEL_MC03 1
; ff_put_cavs_qpel8_mc03(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
cglobal %1_cavs_qpel8_mc03, 3,4,8+4*ARCH_X86_64
mov r3d, 8
jmp %1_cavs_qpel8_v3_after_prologue
; ff_put_cavs_qpel8_v3(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h)
cglobal %1_cavs_qpel8_v3, 4,4,8+4*ARCH_X86_64
%1_cavs_qpel8_v3_after_prologue:
movh m1, [r1]
movh m2, [r1+r2]
movh m3, [r1+2*r2]
sub r1, r2
pxor m7, m7
movh m0, [r1]
lea r1, [r1+4*r2]
punpcklbw m1, m7
punpcklbw m2, m7
%if ARCH_X86_64
%define PW_7 m8
%define PW_42 m9
%define PW_96 m10
%define PW_64 m11
mova m8, [pw_7]
mova m9, [pw_42]
mova m10, [pw_96]
mova m11, [pw_64]
%else
%define PW_7 [pw_7]
%define PW_42 [pw_42]
%define PW_96 [pw_96]
%define PW_64 [pw_64]
%endif
punpcklbw m3, m7
punpcklbw m0, m7
.loop:
FILT_V3 %1
FILT_V3 %1
FILT_V3 %1
FILT_V3 %1
SWAP 0, 1, 2, 3, 4
mova m3, m2
mova m2, m1
mova m1, m0
mova m0, m4
sub r3d, 4
jne .loop
RET
%endmacro
INIT_XMM sse2
CAVS_QPEL_MC03 avg
CAVS_QPEL_MC03 put

View File

@@ -373,12 +373,34 @@ CAVS_MC(avg_, 16, mmxext)
#if HAVE_SSE2_EXTERNAL
#define DEF_QPEL(OPNAME) \
void ff_ ## OPNAME ## _cavs_qpel8_mc20_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
void ff_ ## OPNAME ## _cavs_qpel8_mc02_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
void ff_ ## OPNAME ## _cavs_qpel8_mc03_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
void ff_ ## OPNAME ## _cavs_qpel8_h_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h); \
void ff_ ## OPNAME ## _cavs_qpel8_v2_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h);\
void ff_ ## OPNAME ## _cavs_qpel8_v3_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h);\
DEF_QPEL(put)
DEF_QPEL(avg)
#define QPEL_CAVS_XMM(OPNAME, XMM) \
static void OPNAME ## _cavs_qpel16_mc02_ ## XMM(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## _cavs_qpel8_v2_ ## XMM(dst, src, stride, 16); \
ff_ ## OPNAME ## _cavs_qpel8_v2_ ## XMM(dst + 8, src + 8, stride, 16); \
} \
static void OPNAME ## _cavs_qpel16_mc03_ ## XMM(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## _cavs_qpel8_v3_ ## XMM(dst, src, stride, 16); \
ff_ ## OPNAME ## _cavs_qpel8_v3_ ## XMM(dst + 8, src + 8, stride, 16); \
} \
static void OPNAME ## _cavs_qpel8_mc01_ ## XMM(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## _cavs_qpel8_mc03_ ## XMM(dst + 7 * stride, src + 8 * stride, -stride); \
} \
static void OPNAME ## _cavs_qpel16_mc01_ ## XMM(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) \
{ \
OPNAME ## _cavs_qpel16_mc03_ ## XMM(dst + 15 * stride, src + 16 * stride, -stride); \
} \
static void OPNAME ## _cavs_qpel16_mc20_ ## XMM(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## _cavs_qpel8_h_ ## XMM(dst, src, stride, 16); \
@@ -413,11 +435,23 @@ av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c)
if (EXTERNAL_SSE2(cpu_flags)) {
c->put_cavs_qpel_pixels_tab[0][ 0] = put_cavs_qpel16_mc00_sse2;
c->put_cavs_qpel_pixels_tab[0][ 2] = put_cavs_qpel16_mc20_sse2;
c->put_cavs_qpel_pixels_tab[0][ 4] = put_cavs_qpel16_mc01_sse2;
c->put_cavs_qpel_pixels_tab[0][ 8] = put_cavs_qpel16_mc02_sse2;
c->put_cavs_qpel_pixels_tab[0][12] = put_cavs_qpel16_mc03_sse2;
c->put_cavs_qpel_pixels_tab[1][ 2] = ff_put_cavs_qpel8_mc20_sse2;
c->put_cavs_qpel_pixels_tab[1][ 4] = put_cavs_qpel8_mc01_sse2;
c->put_cavs_qpel_pixels_tab[1][ 8] = ff_put_cavs_qpel8_mc02_sse2;
c->put_cavs_qpel_pixels_tab[1][12] = ff_put_cavs_qpel8_mc03_sse2;
c->avg_cavs_qpel_pixels_tab[0][ 0] = avg_cavs_qpel16_mc00_sse2;
c->avg_cavs_qpel_pixels_tab[0][ 2] = avg_cavs_qpel16_mc20_sse2;
c->avg_cavs_qpel_pixels_tab[0][ 4] = avg_cavs_qpel16_mc01_sse2;
c->avg_cavs_qpel_pixels_tab[0][ 8] = avg_cavs_qpel16_mc02_sse2;
c->avg_cavs_qpel_pixels_tab[0][12] = avg_cavs_qpel16_mc03_sse2;
c->avg_cavs_qpel_pixels_tab[1][ 2] = ff_avg_cavs_qpel8_mc20_sse2;
c->avg_cavs_qpel_pixels_tab[1][ 4] = avg_cavs_qpel8_mc01_sse2;
c->avg_cavs_qpel_pixels_tab[1][ 8] = ff_avg_cavs_qpel8_mc02_sse2;
c->avg_cavs_qpel_pixels_tab[1][12] = ff_avg_cavs_qpel8_mc03_sse2;
c->cavs_idct8_add = cavs_idct8_add_sse2;
c->idct_perm = FF_IDCT_PERM_TRANSPOSE;

View File

@@ -30,6 +30,7 @@ DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0
DECLARE_ASM_ALIGNED(32, const ymm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL,
0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_7) = { 0x0007000700070007ULL, 0x0007000700070007ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;

View File

@@ -30,6 +30,7 @@ extern const ymm_reg ff_pw_2;
extern const xmm_reg ff_pw_3;
extern const ymm_reg ff_pw_4;
extern const xmm_reg ff_pw_5;
extern const xmm_reg ff_pw_7;
extern const xmm_reg ff_pw_8;
extern const xmm_reg ff_pw_9;
extern const uint64_t ff_pw_15;

View File

@@ -21,11 +21,11 @@
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
pw_7: times 8 dw 7
convert_to_unsigned_10bit: times 4 dd 0x200
clip_10bit: times 8 dw 0x3ff
cextern pw_3
cextern pw_7
cextern pw_16
cextern pw_32
cextern pb_80