mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2026-01-12 00:06:51 +08:00
avcodec/x86/hevc/idct: Port ff_hevc_idct_4x4_dc_{8,10,12}_mmxext to SSE2
Practically no change in benchmarks (and in codesize).

hevc_idct_4x4_dc_8_c:        7.8 ( 1.00x)
hevc_idct_4x4_dc_8_mmxext:   6.9 ( 1.14x)
hevc_idct_4x4_dc_8_sse2:     6.8 ( 1.15x)
hevc_idct_4x4_dc_10_c:       7.9 ( 1.00x)
hevc_idct_4x4_dc_10_mmxext:  6.9 ( 1.16x)
hevc_idct_4x4_dc_10_sse2:    6.8 ( 1.16x)
hevc_idct_4x4_dc_12_c:       7.8 ( 1.00x)
hevc_idct_4x4_dc_12_mmxext:  7.0 ( 1.13x)
hevc_idct_4x4_dc_12_sse2:    6.8 ( 1.15x)

Reviewed-by: James Almer <jamrial@gmail.com>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
@@ -65,7 +65,7 @@ void ff_hevc_idct_ ## W ## _dc_8_ ## opt(int16_t *coeffs); \
|
||||
void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs); \
|
||||
void ff_hevc_idct_ ## W ## _dc_12_ ## opt(int16_t *coeffs)
|
||||
|
||||
IDCT_DC_FUNCS(4x4, mmxext);
|
||||
IDCT_DC_FUNCS(4x4, sse2);
|
||||
IDCT_DC_FUNCS(8x8, sse2);
|
||||
IDCT_DC_FUNCS(16x16, sse2);
|
||||
IDCT_DC_FUNCS(32x32, sse2);
|
||||
@@ -816,8 +816,6 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
|
||||
|
||||
if (bit_depth == 8) {
|
||||
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
||||
c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_mmxext;
|
||||
|
||||
c->add_residual[0] = ff_hevc_add_residual_4_8_mmxext;
|
||||
}
|
||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||
@@ -832,6 +830,7 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
|
||||
}
|
||||
SAO_BAND_INIT(8, sse2);
|
||||
|
||||
c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_sse2;
|
||||
c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2;
|
||||
c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2;
|
||||
c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_sse2;
|
||||
@@ -998,7 +997,6 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
|
||||
} else if (bit_depth == 10) {
|
||||
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
||||
c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
|
||||
c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
|
||||
}
|
||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
|
||||
@@ -1013,6 +1011,7 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
|
||||
SAO_BAND_INIT(10, sse2);
|
||||
SAO_EDGE_INIT(10, sse2);
|
||||
|
||||
c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_sse2;
|
||||
c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_sse2;
|
||||
c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_sse2;
|
||||
c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_sse2;
|
||||
@@ -1218,9 +1217,6 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
|
||||
}
|
||||
#endif /* HAVE_AVX2_EXTERNAL */
|
||||
} else if (bit_depth == 12) {
|
||||
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
||||
c->idct_dc[0] = ff_hevc_idct_4x4_dc_12_mmxext;
|
||||
}
|
||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
|
||||
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
|
||||
@@ -1231,6 +1227,7 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
|
||||
SAO_BAND_INIT(12, sse2);
|
||||
SAO_EDGE_INIT(12, sse2);
|
||||
|
||||
c->idct_dc[0] = ff_hevc_idct_4x4_dc_12_sse2;
|
||||
c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_sse2;
|
||||
c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_sse2;
|
||||
c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_sse2;
|
||||
|
||||
@@ -273,16 +273,11 @@ cglobal hevc_idct_%1x%1_dc_%2, 1, 2, 1, coeff, tmp
|
||||
sar tmpd, (15 - %2)
|
||||
movd m0, tmpd
|
||||
SPLATW m0, xm0
|
||||
mova [coeffq+mmsize*0], m0
|
||||
mova [coeffq+mmsize*1], m0
|
||||
mova [coeffq+mmsize*2], m0
|
||||
mova [coeffq+mmsize*3], m0
|
||||
%if mmsize == 16
|
||||
mova [coeffq+mmsize*4], m0
|
||||
mova [coeffq+mmsize*5], m0
|
||||
mova [coeffq+mmsize*6], m0
|
||||
mova [coeffq+mmsize*7], m0
|
||||
%endif
|
||||
%assign %%offset 0
|
||||
%rep 2*%1*%1/mmsize
|
||||
mova [coeffq+%%offset], m0
|
||||
%assign %%offset %%offset+mmsize
|
||||
%endrep
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
@@ -809,10 +804,8 @@ cglobal hevc_idct_32x32_%1, 1, 6, 16, 256, coeffs
|
||||
%endmacro
|
||||
|
||||
%macro INIT_IDCT_DC 1
|
||||
INIT_MMX mmxext
|
||||
IDCT_DC_NL 4, %1
|
||||
|
||||
INIT_XMM sse2
|
||||
IDCT_DC_NL 4, %1
|
||||
IDCT_DC_NL 8, %1
|
||||
IDCT_DC 16, 4, %1
|
||||
IDCT_DC 32, 16, %1
|
||||
|
||||
@@ -69,7 +69,7 @@ static void check_idct_dc(HEVCDSPContext *h, int bit_depth)
|
||||
for (i = 2; i <= 5; i++) {
|
||||
int block_size = 1 << i;
|
||||
int size = block_size * block_size;
|
||||
declare_func_emms(AV_CPU_FLAG_MMXEXT, void, int16_t *coeffs);
|
||||
declare_func(void, int16_t *coeffs);
|
||||
|
||||
randomize_buffers(coeffs0, size);
|
||||
memcpy(coeffs1, coeffs0, sizeof(*coeffs0) * size);
|
||||
|
||||
Reference in New Issue
Block a user