vp9: Remove 8bpc AVX asm for inverse transforms

There's very little performance difference compared to SSE2/SSSE3,
and most systems will use the AVX2 implementations anyway.

This reduces code size and compilation time by a significant amount.
This commit is contained in:
Henrik Gramner
2025-09-15 14:11:58 +02:00
committed by Henrik Gramner
parent 0b5d46ee1c
commit 82b5a0faba
2 changed files with 0 additions and 25 deletions

View File

@@ -111,14 +111,11 @@ itxfm_funcs(4, ssse3);
itxfm_funcs(4, avx2);
itxfm_funcs(8, sse2);
itxfm_funcs(8, ssse3);
itxfm_funcs(8, avx);
itxfm_funcs(8, avx2);
itxfm_funcs(16, sse2);
itxfm_funcs(16, ssse3);
itxfm_funcs(16, avx);
itxfm_func(idct, idct, 32, sse2);
itxfm_func(idct, idct, 32, ssse3);
itxfm_func(idct, idct, 32, avx);
itxfm_func(iwht, iwht, 4, mmx);
itxfm_func(iwht, iwht, 4, avx2);
itxfm_funcs(16, avx2);
@@ -368,18 +365,6 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
}
if (EXTERNAL_AVX(cpu_flags)) {
dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx;
dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_avx;
dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_avx;
dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx;
dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx;
dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx;
dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx;
dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx;
dsp->itxfm_add[TX_32X32][ADST_ADST] =
dsp->itxfm_add[TX_32X32][ADST_DCT] =
dsp->itxfm_add[TX_32X32][DCT_ADST] =
dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx;
init_lpf(avx);
init_dir_tm_h_ipred(8, avx);
init_dir_tm_h_ipred(16, avx);

View File

@@ -463,7 +463,6 @@ IADST4_FN iadst, IADST4, iadst, IADST4, ssse3
pmulhrsw m7, W_11585x2_REG ; m7=t5
pmulhrsw m5, W_11585x2_REG ; m5=t6
SWAP 5, 1
; merged VP9_IDCT8_1D_FINALIZE to make register-sharing w/ avx easier
psubw m6, m0, m3 ; m6=t0-t7
paddw m3, m0 ; m3=t0+t7
psubw m2, m0, m1 ; m2=t1-t6
@@ -711,7 +710,6 @@ cglobal vp9_idct_idct_8x8_add, 4, 4, %2, dst, stride, block, eob
VP9_IDCT_IDCT_8x8_ADD_XMM sse2, 12
VP9_IDCT_IDCT_8x8_ADD_XMM ssse3, 13
VP9_IDCT_IDCT_8x8_ADD_XMM avx, 13
;---------------------------------------------------------------------------------------------
; void vp9_iadst_iadst_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
@@ -885,11 +883,8 @@ IADST8_FN idct, IDCT8, iadst, IADST8, sse2, 15
IADST8_FN iadst, IADST8, idct, IDCT8, sse2, 15
IADST8_FN iadst, IADST8, iadst, IADST8, sse2, 15
IADST8_FN idct, IDCT8, iadst, IADST8, ssse3, 16
IADST8_FN idct, IDCT8, iadst, IADST8, avx, 16
IADST8_FN iadst, IADST8, idct, IDCT8, ssse3, 16
IADST8_FN iadst, IADST8, idct, IDCT8, avx, 16
IADST8_FN iadst, IADST8, iadst, IADST8, ssse3, 16
IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
;---------------------------------------------------------------------------------------------
; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
@@ -1427,7 +1422,6 @@ cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
VP9_IDCT_IDCT_16x16_ADD_XMM sse2
VP9_IDCT_IDCT_16x16_ADD_XMM ssse3
VP9_IDCT_IDCT_16x16_ADD_XMM avx
%macro VP9_IDCT16_YMM_1D 0
VP9_UNPACK_MULSUB_2W_4X 1, 15, 16305, 1606, [pd_8192], 0, 4 ; t8, t15
@@ -1894,9 +1888,6 @@ IADST16_FN iadst, IADST16, iadst, IADST16, sse2
IADST16_FN idct, IDCT16, iadst, IADST16, ssse3
IADST16_FN iadst, IADST16, idct, IDCT16, ssse3
IADST16_FN iadst, IADST16, iadst, IADST16, ssse3
IADST16_FN idct, IDCT16, iadst, IADST16, avx
IADST16_FN iadst, IADST16, idct, IDCT16, avx
IADST16_FN iadst, IADST16, iadst, IADST16, avx
; in: data in m[0-15] except m0/m4, which are in [blockq+0] and [blockq+128]
; out: m[0-15] except m6, which is in [blockq+192]
@@ -2836,4 +2827,3 @@ cglobal vp9_idct_idct_32x32_add, 0, 6 + ARCH_X86_64 * 3, 16, 2048, dst, stride,
VP9_IDCT_IDCT_32x32_ADD_XMM sse2
VP9_IDCT_IDCT_32x32_ADD_XMM ssse3
VP9_IDCT_IDCT_32x32_ADD_XMM avx