avcodec/x86/fpel: Port ff_put_pixels8_mmx() to SSE2

This avoids the ABI violation of using MMX registers
without issuing emms; for example, it allows removing
an emms_c() call from bink.c.

On Unix64, this commit uses general-purpose registers for
put8 (Win64 does not have enough volatile GPRs to do
likewise), which reduces code size and is faster on some CPUs.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2025-10-14 15:06:13 +02:00
parent d91b1559e0
commit ed007ad427
7 changed files with 28 additions and 37 deletions

View File

@@ -21,7 +21,6 @@
*/
#include "libavutil/attributes.h"
#include "libavutil/emms.h"
#include "libavutil/imgutils.h"
#include "libavutil/mem.h"
#include "libavutil/mem_internal.h"
@@ -1297,7 +1296,6 @@ static int decode_frame(AVCodecContext *avctx, AVFrame *frame,
if (get_bits_count(&gb) >= bits_count)
break;
}
emms_c();
if (c->version > 'b') {
if ((ret = av_frame_replace(c->last, frame)) < 0)

View File

@@ -46,13 +46,6 @@ static void cavs_idct8_add_sse2(uint8_t *dst, int16_t *block, ptrdiff_t stride)
#endif /* HAVE_SSE2_EXTERNAL */
static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c)
{
#if HAVE_MMX_EXTERNAL
c->put_cavs_qpel_pixels_tab[1][0] = ff_put_pixels8x8_mmx;
#endif /* HAVE_MMX_EXTERNAL */
}
#if HAVE_SSE2_EXTERNAL
#define DEF_QPEL(OPNAME) \
void ff_ ## OPNAME ## _cavs_qpel8_mc20_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
@@ -98,9 +91,6 @@ av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c)
{
av_unused int cpu_flags = av_get_cpu_flags();
if (X86_MMX(cpu_flags))
cavsdsp_init_mmx(c);
#if HAVE_MMX_EXTERNAL
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->avg_cavs_qpel_pixels_tab[1][0] = ff_avg_pixels8x8_mmxext;
@@ -113,6 +103,7 @@ av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c)
c->put_cavs_qpel_pixels_tab[0][ 4] = put_cavs_qpel16_mc01_sse2;
c->put_cavs_qpel_pixels_tab[0][ 8] = put_cavs_qpel16_mc02_sse2;
c->put_cavs_qpel_pixels_tab[0][12] = put_cavs_qpel16_mc03_sse2;
c->put_cavs_qpel_pixels_tab[1][ 0] = ff_put_pixels8x8_sse2;
c->put_cavs_qpel_pixels_tab[1][ 2] = ff_put_cavs_qpel8_mc20_sse2;
c->put_cavs_qpel_pixels_tab[1][ 4] = put_cavs_qpel8_mc01_sse2;
c->put_cavs_qpel_pixels_tab[1][ 8] = ff_put_cavs_qpel8_mc02_sse2;

View File

@@ -27,7 +27,7 @@ SECTION .text
; void ff_put/avg_pixels(uint8_t *block, const uint8_t *pixels,
; ptrdiff_t line_size, int h)
%macro OP_PIXELS 2
%macro OP_PIXELS 2-3 0
%if %2 == mmsize/2
%define LOAD movh
%define SAVE movh
@@ -35,14 +35,25 @@ SECTION .text
%define LOAD movu
%define SAVE mova
%endif
cglobal %1_pixels%2x%2, 3,5,4
; %3 != 0 selects the GPR copy path: it needs four extra GPRs (r5-r8) and
; no vector registers. Otherwise the vector path uses m0-m3.
cglobal %1_pixels%2x%2, 3,5+4*%3,%3 ? 0 : 4
mov r3d, %2
jmp %1_pixels%2_after_prologue
cglobal %1_pixels%2, 4,5,4
cglobal %1_pixels%2, 4,5+4*%3,%3 ? 0 : 4
%1_pixels%2_after_prologue:
lea r4, [r2*3]
.loop:
%if %3
; Use GPRs on UNIX64 for put8, but not on Win64 due to a lack of volatile GPRs
mov r5q, [r1]
mov r6q, [r1+r2]
mov r7q, [r1+r2*2]
mov r8q, [r1+r4]
mov [r0], r5q
mov [r0+r2], r6q
mov [r0+r2*2], r7q
mov [r0+r4], r8q
%else
LOAD m0, [r1]
LOAD m1, [r1+r2]
LOAD m2, [r1+r2*2]
@@ -57,6 +68,7 @@ cglobal %1_pixels%2, 4,5,4
SAVE [r0+r2], m1
SAVE [r0+r2*2], m2
SAVE [r0+r4], m3
%endif
sub r3d, 4
lea r1, [r1+r2*4]
lea r0, [r0+r2*4]
@@ -64,12 +76,10 @@ cglobal %1_pixels%2, 4,5,4
RET
%endmacro
INIT_MMX mmx
OP_PIXELS put, 8
INIT_MMX mmxext
OP_PIXELS avg, 8
INIT_XMM sse2
OP_PIXELS put, 8, UNIX64
OP_PIXELS put, 16
OP_PIXELS avg, 16

View File

@@ -30,10 +30,10 @@ void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16x16_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size);
void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8x8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size);
void ff_put_pixels8_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8x8_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size);
void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16x16_sse2(uint8_t *block, const uint8_t *pixels,

View File

@@ -74,14 +74,6 @@ void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
{
#if HAVE_MMX_EXTERNAL
c->put_no_rnd_pixels_tab[1][0] =
c->put_pixels_tab[1][0] = ff_put_pixels8_mmx;
#endif
}
static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
{
#if HAVE_MMXEXT_EXTERNAL
@@ -115,6 +107,9 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags)
c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_sse2;
c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_sse2;
c->put_no_rnd_pixels_tab[1][0] =
c->put_pixels_tab[1][0] = ff_put_pixels8_sse2;
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2;
c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2;
@@ -143,9 +138,6 @@ av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags))
hpeldsp_init_mmx(c, flags);
if (EXTERNAL_MMXEXT(cpu_flags))
hpeldsp_init_mmxext(c, flags);

View File

@@ -521,8 +521,6 @@ av_cold void ff_qpeldsp_init_x86(QpelDSPContext *c)
SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
c->put_no_rnd_qpel_pixels_tab[1][0] =
c->put_qpel_pixels_tab[1][0] = ff_put_pixels8x8_mmx;
SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
@@ -532,6 +530,8 @@ av_cold void ff_qpeldsp_init_x86(QpelDSPContext *c)
if (EXTERNAL_SSE2(cpu_flags)) {
c->put_no_rnd_qpel_pixels_tab[0][0] =
c->put_qpel_pixels_tab[0][0] = ff_put_pixels16x16_sse2;
c->put_no_rnd_qpel_pixels_tab[1][0] =
c->put_qpel_pixels_tab[1][0] = ff_put_pixels8x8_sse2;
c->avg_qpel_pixels_tab[0][0] = ff_avg_pixels16x16_sse2;
}
#endif

View File

@@ -73,7 +73,7 @@ static void vc1_h_loop_filter16_sse4(uint8_t *src, ptrdiff_t stride, int pq)
ff_ ## OP ## pixels ## DEPTH ## INSN(dst, src, stride, DEPTH); \
}
DECLARE_FUNCTION(put_, 8, _mmx)
DECLARE_FUNCTION(put_, 8, _sse2)
DECLARE_FUNCTION(avg_, 8, _mmxext)
DECLARE_FUNCTION(put_, 16, _sse2)
DECLARE_FUNCTION(avg_, 16, _sse2)
@@ -125,7 +125,6 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
if (EXTERNAL_MMX(cpu_flags)) {
dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_mmx;
dsp->put_vc1_mspel_pixels_tab[1][0] = put_vc1_mspel_mc00_8_mmx;
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
ASSIGN_LF4(mmxext);
@@ -142,6 +141,7 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
ASSIGN_LF816(sse2);
dsp->put_vc1_mspel_pixels_tab[0][0] = put_vc1_mspel_mc00_16_sse2;
dsp->put_vc1_mspel_pixels_tab[1][0] = put_vc1_mspel_mc00_8_sse2;
dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_vc1_mspel_mc00_16_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {