vp9: Remove 8bpc AVX asm for inverse transforms

There is very little performance difference compared to SSE2/SSSE3, and most systems will use the AVX2 implementations anyway. Removing the AVX versions reduces code size and compilation time by a significant amount.
2026-02-04 14:30:55 +08:00 · 2025-09-15 14:11:58 +02:00
parent 0b5d46ee1c
commit 82b5a0faba
2 changed files with 0 additions and 25 deletions
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -111,14 +111,11 @@ itxfm_funcs(4, ssse3);
 itxfm_funcs(4, avx2);
 itxfm_funcs(8, sse2);
 itxfm_funcs(8, ssse3);
-itxfm_funcs(8, avx);
 itxfm_funcs(8, avx2);
 itxfm_funcs(16, sse2);
 itxfm_funcs(16, ssse3);
-itxfm_funcs(16, avx);
 itxfm_func(idct, idct, 32, sse2);
 itxfm_func(idct, idct, 32, ssse3);
-itxfm_func(idct, idct, 32, avx);
 itxfm_func(iwht, iwht, 4, mmx);
 itxfm_func(iwht, iwht, 4, avx2);
 itxfm_funcs(16, avx2);
@@ -368,18 +365,6 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
    }

    if (EXTERNAL_AVX(cpu_flags)) {
-        dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx;
-        dsp->itxfm_add[TX_8X8][ADST_DCT]  = ff_vp9_idct_iadst_8x8_add_avx;
-        dsp->itxfm_add[TX_8X8][DCT_ADST]  = ff_vp9_iadst_idct_8x8_add_avx;
-        dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx;
-        dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx;
-        dsp->itxfm_add[TX_16X16][ADST_DCT]  = ff_vp9_idct_iadst_16x16_add_avx;
-        dsp->itxfm_add[TX_16X16][DCT_ADST]  = ff_vp9_iadst_idct_16x16_add_avx;
-        dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx;
-        dsp->itxfm_add[TX_32X32][ADST_ADST] =
-        dsp->itxfm_add[TX_32X32][ADST_DCT] =
-        dsp->itxfm_add[TX_32X32][DCT_ADST] =
-        dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx;
        init_lpf(avx);
        init_dir_tm_h_ipred(8, avx);
        init_dir_tm_h_ipred(16, avx);
--- a/libavcodec/x86/vp9itxfm.asm
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -463,7 +463,6 @@ IADST4_FN iadst, IADST4, iadst, IADST4, ssse3
    pmulhrsw            m7, W_11585x2_REG                   ; m7=t5
    pmulhrsw            m5, W_11585x2_REG                   ; m5=t6
    SWAP                 5,  1
-    ; merged VP9_IDCT8_1D_FINALIZE to make register-sharing w/ avx easier
    psubw               m6, m0, m3                          ; m6=t0-t7
    paddw               m3, m0                              ; m3=t0+t7
    psubw               m2, m0, m1                          ; m2=t1-t6
@@ -711,7 +710,6 @@ cglobal vp9_idct_idct_8x8_add, 4, 4, %2, dst, stride, block, eob

 VP9_IDCT_IDCT_8x8_ADD_XMM sse2, 12
 VP9_IDCT_IDCT_8x8_ADD_XMM ssse3, 13
-VP9_IDCT_IDCT_8x8_ADD_XMM avx, 13

 ;---------------------------------------------------------------------------------------------
 ; void vp9_iadst_iadst_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
@@ -885,11 +883,8 @@ IADST8_FN idct,  IDCT8,  iadst, IADST8, sse2, 15
 IADST8_FN iadst, IADST8, idct,  IDCT8,  sse2, 15
 IADST8_FN iadst, IADST8, iadst, IADST8, sse2, 15
 IADST8_FN idct,  IDCT8,  iadst, IADST8, ssse3, 16
-IADST8_FN idct,  IDCT8,  iadst, IADST8, avx, 16
 IADST8_FN iadst, IADST8, idct,  IDCT8,  ssse3, 16
-IADST8_FN iadst, IADST8, idct,  IDCT8,  avx, 16
 IADST8_FN iadst, IADST8, iadst, IADST8, ssse3, 16
-IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16

 ;---------------------------------------------------------------------------------------------
 ; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
@@ -1427,7 +1422,6 @@ cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob

 VP9_IDCT_IDCT_16x16_ADD_XMM sse2
 VP9_IDCT_IDCT_16x16_ADD_XMM ssse3
-VP9_IDCT_IDCT_16x16_ADD_XMM avx

 %macro VP9_IDCT16_YMM_1D 0
    VP9_UNPACK_MULSUB_2W_4X  1,  15, 16305,  1606, [pd_8192], 0, 4 ; t8,  t15
@@ -1894,9 +1888,6 @@ IADST16_FN iadst, IADST16, iadst, IADST16, sse2
 IADST16_FN idct,  IDCT16,  iadst, IADST16, ssse3
 IADST16_FN iadst, IADST16, idct,  IDCT16,  ssse3
 IADST16_FN iadst, IADST16, iadst, IADST16, ssse3
-IADST16_FN idct,  IDCT16,  iadst, IADST16, avx
-IADST16_FN iadst, IADST16, idct,  IDCT16,  avx
-IADST16_FN iadst, IADST16, iadst, IADST16, avx

 ; in: data in m[0-15] except m0/m4, which are in [blockq+0] and [blockq+128]
 ; out: m[0-15] except m6, which is in [blockq+192]
@@ -2836,4 +2827,3 @@ cglobal vp9_idct_idct_32x32_add, 0, 6 + ARCH_X86_64 * 3, 16, 2048, dst, stride,

 VP9_IDCT_IDCT_32x32_ADD_XMM sse2
 VP9_IDCT_IDCT_32x32_ADD_XMM ssse3
-VP9_IDCT_IDCT_32x32_ADD_XMM avx