mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2026-02-04 14:30:55 +08:00
avcodec/x86/cavs_qpel: Add SSE2 vertical motion compensation
This is not based on the MMXEXT one, because the latter is quite suboptimal: Motion vector types mc01 and mc03 (vertical motion vectors with a remainder of one quarter or three quarters) use different neighboring lines for interpolation: mc01 uses two lines above and two lines below, mc03 one line above and three lines below. The MMXEXT code uses a common macro for all of them and therefore reads six lines before it processes them (even reading lines which are not used at all), leading to severe register pressure. Another difference to the old code is that the positive and negative parts of the sum to calculate are accumulated separately and the subtraction is performed with unsigned saturation, so that one can avoid biasing the sum. The fact that the mc01 and mc03 filter coefficients are mirrors of each other has been exploited to reduce mc01 to mc03. But of course the most important difference between this code and the MMXEXT one is that XMM registers allow processing eight words at a time, ideal for 8x8 subblocks, whereas the MMXEXT code processes them in 4x8 or 4x16 blocks.
Benchmarks: avg_cavs_qpel_pixels_tab[0][4]_c: 917.0 ( 1.00x) avg_cavs_qpel_pixels_tab[0][4]_mmxext: 222.0 ( 4.13x) avg_cavs_qpel_pixels_tab[0][4]_sse2: 89.0 (10.31x) avg_cavs_qpel_pixels_tab[0][12]_c: 885.7 ( 1.00x) avg_cavs_qpel_pixels_tab[0][12]_mmxext: 223.2 ( 3.97x) avg_cavs_qpel_pixels_tab[0][12]_sse2: 88.5 (10.01x) avg_cavs_qpel_pixels_tab[1][4]_c: 222.4 ( 1.00x) avg_cavs_qpel_pixels_tab[1][4]_mmxext: 57.2 ( 3.89x) avg_cavs_qpel_pixels_tab[1][4]_sse2: 23.3 ( 9.55x) avg_cavs_qpel_pixels_tab[1][12]_c: 216.0 ( 1.00x) avg_cavs_qpel_pixels_tab[1][12]_mmxext: 57.4 ( 3.76x) avg_cavs_qpel_pixels_tab[1][12]_sse2: 22.6 ( 9.56x) put_cavs_qpel_pixels_tab[0][4]_c: 750.9 ( 1.00x) put_cavs_qpel_pixels_tab[0][4]_mmxext: 210.4 ( 3.57x) put_cavs_qpel_pixels_tab[0][4]_sse2: 84.2 ( 8.92x) put_cavs_qpel_pixels_tab[0][12]_c: 731.6 ( 1.00x) put_cavs_qpel_pixels_tab[0][12]_mmxext: 210.7 ( 3.47x) put_cavs_qpel_pixels_tab[0][12]_sse2: 84.1 ( 8.70x) put_cavs_qpel_pixels_tab[1][4]_c: 191.7 ( 1.00x) put_cavs_qpel_pixels_tab[1][4]_mmxext: 53.8 ( 3.56x) put_cavs_qpel_pixels_tab[1][4]_sse2: 24.5 ( 7.83x) put_cavs_qpel_pixels_tab[1][12]_c: 179.1 ( 1.00x) put_cavs_qpel_pixels_tab[1][12]_mmxext: 53.9 ( 3.32x) put_cavs_qpel_pixels_tab[1][12]_sse2: 24.0 ( 7.47x) Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
@@ -24,8 +24,14 @@
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA
|
||||
|
||||
cextern pw_4
|
||||
cextern pw_5
|
||||
cextern pw_7
|
||||
cextern pw_64
|
||||
pw_42: times 8 dw 42
|
||||
pw_96: times 8 dw 96
|
||||
|
||||
SECTION .text
|
||||
|
||||
@@ -78,3 +84,137 @@ cglobal %1_cavs_qpel8_h, 4,4,6
|
||||
; Instantiate the SSE2 horizontal 8-wide qpel functions
; (ff_{avg,put}_cavs_qpel8_h_sse2).
INIT_XMM sse2
CAVS_QPEL_H avg
CAVS_QPEL_H put
|
||||
|
||||
; One output line of the vertical half-pel filter: computes
; (5*(b + c) - (a + d) + 4) >> 3 for eight pixels, where a..d are four
; consecutive source lines. On entry m0/m1/m2 hold the previous three
; lines as words, m5 = pw_5, m6 = pw_4, m7 = 0, r1 points at the next
; source line, r2 is the stride. %1 is the op name (put/avg) selecting
; the op_puth/op_avgh store macro.
%macro FILT_V 1
    movh      m3, [r1]        ; load the next source line (8 bytes)
    punpcklbw m3, m7          ; zero-extend bytes to words
    mova      m4, m1
    paddw     m4, m2          ; m4 = b + c (center taps)
    paddw     m0, m3          ; m0 = a + d (outer taps)
    add       r1, r2          ; advance source pointer by one stride
    pmullw    m4, m5          ; m4 = 5*(b + c)
    psubw     m4, m0          ; m4 = 5*(b + c) - (a + d)
    paddw     m4, m6          ; add rounding constant 4
    psraw     m4, 3           ; >> 3
    packuswb  m4, m7          ; clamp to 0..255 and pack to bytes
    op_%1h    m4, [r0], m0    ; store (put) or average (avg) to dst; m0 is scratch
    add       r0, r2          ; advance destination pointer
    SWAP 0, 1, 2, 3           ; rotate line registers for the next row
%endmacro
|
||||
|
||||
; Vertical half-pel (mc02) motion compensation for an 8-pixel-wide block.
; Defines both the fixed-height-8 mc02 entry point and the
; variable-height v2 entry point (the latter is called twice for 16-wide
; blocks from the C side). %1 is "put" or "avg".
%macro CAVS_QPEL_MC02 1
; ff_put_cavs_qpel8_mc02(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
cglobal %1_cavs_qpel8_mc02, 3,4,8
    mov  r3d, 8               ; fixed height of 8 rows
    jmp %1_cavs_qpel8_v2_after_prologue

; ff_put_cavs_qpel8_v2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h)
cglobal %1_cavs_qpel8_v2, 4,4,8
%1_cavs_qpel8_v2_after_prologue:
    ; Preload the three lines src[0], src[-1], src[1] into m1, m0, m2.
    movh      m1, [r1]        ; src[0]
    sub       r1, r2
    movh      m0, [r1]        ; src[-1]
    lea       r1, [r1+2*r2]
    pxor      m7, m7          ; m7 = 0, used for byte->word unpacking
    movh      m2, [r1]        ; src[1]
    add       r1, r2          ; r1 now points at src[2]
    punpcklbw m1, m7
    punpcklbw m0, m7
    punpcklbw m2, m7
    mova      m5, [pw_5]      ; center-tap coefficient
    mova      m6, [pw_4]      ; rounding constant
.loop:
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    sub  r3d, 4               ; four rows per iteration; h is a multiple of 4
    jne .loop
    RET
%endmacro
|
||||
|
||||
; Instantiate the SSE2 vertical half-pel functions
; (ff_{avg,put}_cavs_qpel8_{mc02,v2}_sse2).
INIT_XMM sse2
CAVS_QPEL_MC02 avg
CAVS_QPEL_MC02 put
|
||||
|
||||
; One output line of the vertical three-quarter-pel filter: computes
; ((42*b + 96*c) - (7*a + 2*d + e) + 64) >> 7 for eight pixels, where
; a..e are five consecutive source lines. On entry m0..m3 hold lines
; a..d as words, m7 = 0, r1 points at line e, r2 is the stride; the
; PW_* operands are set up by CAVS_QPEL_MC03 (registers on x86-64,
; memory on x86-32). %1 is "put" or "avg".
%macro FILT_V3 1
    pmullw    m0, PW_7        ; 7*a (oldest line, negative tap)
    movh      m4, [r1]        ; load the newest source line e
    mova      m5, m1
    mova      m6, m2
    pmullw    m5, PW_42       ; 42*b (positive tap)
    punpcklbw m4, m7          ; zero-extend e to words
    pmullw    m6, PW_96       ; 96*c (positive tap)
    paddw     m0, m3          ; negative part: + d
    add       r1, r2          ; advance source pointer by one stride
    paddw     m0, m3          ; + d again (tap weight 2)
    paddw     m5, m6          ; positive part: 42*b + 96*c
    paddw     m0, m4          ; + e (tap weight 1)
; m5-m0 can be in the -10*255..(42 + 96)*255 range and
; therefore is not guaranteed to fit into either a signed or
; an unsigned word. Because we need to clamp the result to 0..255
; anyway, we use saturated subtraction and a logical right shift
; for rescaling.
    psubusw   m5, m0
    paddw     m5, PW_64       ; rounding constant
    psrlw     m5, 7           ; /128 (logical shift, see comment above)
    packuswb  m5, m7          ; clamp to 0..255 and pack to bytes
    op_%1h    m5, [r0], m0    ; store (put) or average (avg) to dst; m0 is scratch
    add       r0, r2          ; advance destination pointer
    SWAP 0, 1, 2, 3, 4        ; rotate line registers for the next row
%endmacro
|
||||
|
||||
; Vertical three-quarter-pel (mc03) motion compensation for an
; 8-pixel-wide block; the C side reduces mc01 to this by walking the
; block backwards, since the mc01 filter coefficients are the mirror
; image of the mc03 ones. Defines both the fixed-height-8 mc03 entry
; point and the variable-height v3 entry point. Four extra XMM
; registers are requested on x86-64 to hold the filter constants.
; %1 is "put" or "avg".
%macro CAVS_QPEL_MC03 1
; ff_put_cavs_qpel8_mc03(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
cglobal %1_cavs_qpel8_mc03, 3,4,8+4*ARCH_X86_64
    mov  r3d, 8               ; fixed height of 8 rows
    jmp %1_cavs_qpel8_v3_after_prologue

; ff_put_cavs_qpel8_v3(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h)
cglobal %1_cavs_qpel8_v3, 4,4,8+4*ARCH_X86_64
%1_cavs_qpel8_v3_after_prologue:
    ; Preload src[0..2] into m1..m3 and src[-1] into m0.
    movh      m1, [r1]        ; src[0]
    movh      m2, [r1+r2]     ; src[1]
    movh      m3, [r1+2*r2]   ; src[2]
    sub       r1, r2
    pxor      m7, m7          ; m7 = 0, used for byte->word unpacking
    movh      m0, [r1]        ; src[-1]
    lea       r1, [r1+4*r2]   ; r1 now points at src[3]
    punpcklbw m1, m7
    punpcklbw m2, m7
%if ARCH_X86_64
    ; Enough XMM registers available: keep the constants in m8-m11.
%define PW_7  m8
%define PW_42 m9
%define PW_96 m10
%define PW_64 m11
    mova      m8,  [pw_7]
    mova      m9,  [pw_42]
    mova      m10, [pw_96]
    mova      m11, [pw_64]
%else
    ; x86-32: only eight XMM registers, so use memory operands instead.
%define PW_7  [pw_7]
%define PW_42 [pw_42]
%define PW_96 [pw_96]
%define PW_64 [pw_64]
%endif
    punpcklbw m3, m7
    punpcklbw m0, m7

.loop:
    FILT_V3 %1
    FILT_V3 %1
    FILT_V3 %1
    FILT_V3 %1
    ; The four FILT_V3 invocations each performed an assemble-time
    ; SWAP 0,1,2,3,4; the fifth SWAP below completes the five-cycle so
    ; the register names map to the same physical registers as at loop
    ; entry, and the movas then move the four retained source lines
    ; into the registers the next iteration expects.
    SWAP 0, 1, 2, 3, 4
    mova m3, m2
    mova m2, m1
    mova m1, m0
    mova m0, m4
    sub  r3d, 4               ; four rows per iteration; h is a multiple of 4
    jne .loop
    RET
%endmacro
|
||||
|
||||
; Instantiate the SSE2 vertical three-quarter-pel functions
; (ff_{avg,put}_cavs_qpel8_{mc03,v3}_sse2).
INIT_XMM sse2
CAVS_QPEL_MC03 avg
CAVS_QPEL_MC03 put
|
||||
|
||||
@@ -373,12 +373,34 @@ CAVS_MC(avg_, 16, mmxext)
|
||||
#if HAVE_SSE2_EXTERNAL
|
||||
/* Prototypes for the SSE2 assembly functions defined in cavs_qpel.asm:
 * the fixed-height-8 mc20/mc02/mc03 entry points and the
 * variable-height h/v2/v3 entry points, instantiated for put and avg. */
#define DEF_QPEL(OPNAME) \
void ff_ ## OPNAME ## _cavs_qpel8_mc20_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
void ff_ ## OPNAME ## _cavs_qpel8_mc02_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
void ff_ ## OPNAME ## _cavs_qpel8_mc03_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
void ff_ ## OPNAME ## _cavs_qpel8_h_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h); \
void ff_ ## OPNAME ## _cavs_qpel8_v2_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h);\
void ff_ ## OPNAME ## _cavs_qpel8_v3_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h);\

DEF_QPEL(put)
DEF_QPEL(avg)
|
||||
|
||||
#define QPEL_CAVS_XMM(OPNAME, XMM) \
|
||||
static void OPNAME ## _cavs_qpel16_mc02_ ## XMM(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) \
|
||||
{ \
|
||||
ff_ ## OPNAME ## _cavs_qpel8_v2_ ## XMM(dst, src, stride, 16); \
|
||||
ff_ ## OPNAME ## _cavs_qpel8_v2_ ## XMM(dst + 8, src + 8, stride, 16); \
|
||||
} \
|
||||
static void OPNAME ## _cavs_qpel16_mc03_ ## XMM(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) \
|
||||
{ \
|
||||
ff_ ## OPNAME ## _cavs_qpel8_v3_ ## XMM(dst, src, stride, 16); \
|
||||
ff_ ## OPNAME ## _cavs_qpel8_v3_ ## XMM(dst + 8, src + 8, stride, 16); \
|
||||
} \
|
||||
static void OPNAME ## _cavs_qpel8_mc01_ ## XMM(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) \
|
||||
{ \
|
||||
ff_ ## OPNAME ## _cavs_qpel8_mc03_ ## XMM(dst + 7 * stride, src + 8 * stride, -stride); \
|
||||
} \
|
||||
static void OPNAME ## _cavs_qpel16_mc01_ ## XMM(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) \
|
||||
{ \
|
||||
OPNAME ## _cavs_qpel16_mc03_ ## XMM(dst + 15 * stride, src + 16 * stride, -stride); \
|
||||
} \
|
||||
static void OPNAME ## _cavs_qpel16_mc20_ ## XMM(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) \
|
||||
{ \
|
||||
ff_ ## OPNAME ## _cavs_qpel8_h_ ## XMM(dst, src, stride, 16); \
|
||||
@@ -413,11 +435,23 @@ av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c)
|
||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||
c->put_cavs_qpel_pixels_tab[0][ 0] = put_cavs_qpel16_mc00_sse2;
|
||||
c->put_cavs_qpel_pixels_tab[0][ 2] = put_cavs_qpel16_mc20_sse2;
|
||||
c->put_cavs_qpel_pixels_tab[0][ 4] = put_cavs_qpel16_mc01_sse2;
|
||||
c->put_cavs_qpel_pixels_tab[0][ 8] = put_cavs_qpel16_mc02_sse2;
|
||||
c->put_cavs_qpel_pixels_tab[0][12] = put_cavs_qpel16_mc03_sse2;
|
||||
c->put_cavs_qpel_pixels_tab[1][ 2] = ff_put_cavs_qpel8_mc20_sse2;
|
||||
c->put_cavs_qpel_pixels_tab[1][ 4] = put_cavs_qpel8_mc01_sse2;
|
||||
c->put_cavs_qpel_pixels_tab[1][ 8] = ff_put_cavs_qpel8_mc02_sse2;
|
||||
c->put_cavs_qpel_pixels_tab[1][12] = ff_put_cavs_qpel8_mc03_sse2;
|
||||
|
||||
c->avg_cavs_qpel_pixels_tab[0][ 0] = avg_cavs_qpel16_mc00_sse2;
|
||||
c->avg_cavs_qpel_pixels_tab[0][ 2] = avg_cavs_qpel16_mc20_sse2;
|
||||
c->avg_cavs_qpel_pixels_tab[0][ 4] = avg_cavs_qpel16_mc01_sse2;
|
||||
c->avg_cavs_qpel_pixels_tab[0][ 8] = avg_cavs_qpel16_mc02_sse2;
|
||||
c->avg_cavs_qpel_pixels_tab[0][12] = avg_cavs_qpel16_mc03_sse2;
|
||||
c->avg_cavs_qpel_pixels_tab[1][ 2] = ff_avg_cavs_qpel8_mc20_sse2;
|
||||
c->avg_cavs_qpel_pixels_tab[1][ 4] = avg_cavs_qpel8_mc01_sse2;
|
||||
c->avg_cavs_qpel_pixels_tab[1][ 8] = ff_avg_cavs_qpel8_mc02_sse2;
|
||||
c->avg_cavs_qpel_pixels_tab[1][12] = ff_avg_cavs_qpel8_mc03_sse2;
|
||||
|
||||
c->cavs_idct8_add = cavs_idct8_add_sse2;
|
||||
c->idct_perm = FF_IDCT_PERM_TRANSPOSE;
|
||||
|
||||
@@ -30,6 +30,7 @@ DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0
|
||||
DECLARE_ASM_ALIGNED(32, const ymm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL,
|
||||
0x0004000400040004ULL, 0x0004000400040004ULL };
|
||||
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
|
||||
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_7) = { 0x0007000700070007ULL, 0x0007000700070007ULL };
|
||||
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
|
||||
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
|
||||
DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
|
||||
|
||||
@@ -30,6 +30,7 @@ extern const ymm_reg ff_pw_2;
|
||||
extern const xmm_reg ff_pw_3;
|
||||
extern const ymm_reg ff_pw_4;
|
||||
extern const xmm_reg ff_pw_5;
|
||||
extern const xmm_reg ff_pw_7;
|
||||
extern const xmm_reg ff_pw_8;
|
||||
extern const xmm_reg ff_pw_9;
|
||||
extern const uint64_t ff_pw_15;
|
||||
|
||||
@@ -21,11 +21,11 @@
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA
|
||||
pw_7: times 8 dw 7
|
||||
convert_to_unsigned_10bit: times 4 dd 0x200
|
||||
clip_10bit: times 8 dw 0x3ff
|
||||
|
||||
cextern pw_3
|
||||
cextern pw_7
|
||||
cextern pw_16
|
||||
cextern pw_32
|
||||
cextern pb_80
|
||||
|
||||
Reference in New Issue
Block a user