diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile index d6a66f860a..832d802daf 100644 --- a/libavcodec/x86/vvc/Makefile +++ b/libavcodec/x86/vvc/Makefile @@ -5,4 +5,5 @@ OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o \ x86/h26x/h2656dsp.o X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvc_alf.o \ x86/vvc/vvc_mc.o \ + x86/vvc/vvc_sad.o \ x86/h26x/h2656_inter.o diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm new file mode 100644 index 0000000000..13224583a5 --- /dev/null +++ b/libavcodec/x86/vvc/vvc_sad.asm @@ -0,0 +1,133 @@ +; /* +; * Provide SIMD DMVR SAD functions for VVC decoding +; * +; * Copyright (c) 2024 Stone Chen +; * +; * This file is part of FFmpeg. +; * +; * FFmpeg is free software; you can redistribute it and/or +; * modify it under the terms of the GNU Lesser General Public +; * License as published by the Free Software Foundation; either +; * version 2.1 of the License, or (at your option) any later version. +; * +; * FFmpeg is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; * Lesser General Public License for more details. +; * +; * You should have received a copy of the GNU Lesser General Public +; * License along with FFmpeg; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +; */ + +%include "libavutil/x86/x86util.asm" +%define MAX_PB_SIZE 128 +%define ROWS 2 + +SECTION_RODATA + +pw_1: times 2 dw 1 + +; DMVR SAD is only calculated on even rows to reduce complexity +SECTION .text + +%macro MIN_MAX_SAD 3 + pminuw %3, %2, %1 + pmaxuw %1, %2, %1 + psubusw %1, %1, %3 +%endmacro + +%macro HORIZ_ADD 3 ; xm0, xm1, m1 + vextracti128 %1, %3, q0001 ; 3 2 1 0 + paddd %1, %2 ; xm0 (7 + 3) (6 + 2) (5 + 1) (4 + 0) + pshufd %2, %1, q0032 ; xm1 - - (7 + 3) (6 + 2) + paddd %1, %1, %2 ; xm0 _ _ (5 1 7 3) (4 0 6 2) + pshufd %2, %1, q0001 ; xm1 _ _ (5 1 7 3) (5 1 7 3) + paddd %1, %1, %2 ; (01234567) +%endmacro + +%if ARCH_X86_64 +%if HAVE_AVX2_EXTERNAL + +INIT_YMM avx2 + +cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx + movsxdifnidn dxq, dxd + movsxdifnidn dyq, dyd + + sub dxq, 2 + sub dyq, 2 + + mov off1q, 2 + mov off2q, 2 + + add off1q, dyq + sub off2q, dyq + + shl off1q, 7 + shl off2q, 7 + + add off1q, dxq + sub off2q, dxq + + lea src1q, [src1q + off1q * 2 + 2 * 2] + lea src2q, [src2q + off2q * 2 + 2 * 2] + + pxor m3, m3 + vpbroadcastd m4, [pw_1] + + cmp block_wd, 16 + jge vvc_sad_16_128 + + vvc_sad_8: + .loop_height: + movu xm0, [src1q] + vinserti128 m0, [src1q + MAX_PB_SIZE * ROWS * 2], 1 + movu xm1, [src2q] + vinserti128 m1, [src2q + MAX_PB_SIZE * ROWS * 2], 1 + + MIN_MAX_SAD m1, m0, m2 + pmaddwd m1, m4 + paddd m3, m1 + + add src1q, 2 * MAX_PB_SIZE * ROWS * 2 + add src2q, 2 * MAX_PB_SIZE * ROWS * 2 + + sub block_hd, 4 + jg .loop_height + + HORIZ_ADD xm0, xm3, m3 + movd eax, xm0 + RET + + vvc_sad_16_128: + sar block_wd, 4 + .loop_height: + mov off1q, src1q + mov off2q, src2q + mov row_idxd, block_wd + + .loop_width: + movu m0, [src1q] + movu m1, [src2q] + MIN_MAX_SAD m1, m0, m2 + pmaddwd m1, m4 + paddd m3, m1 + + add src1q, 32 + add src2q, 32 + dec row_idxd + jg .loop_width + + lea src1q, [off1q + ROWS * MAX_PB_SIZE * 2] + lea src2q, [off2q + ROWS * MAX_PB_SIZE * 2] + + sub block_hd, 2 + jg .loop_height + + HORIZ_ADD xm0, xm3, m3 + movd eax, xm0 + RET + +%endif +%endif diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c index 0e68971b2c..4b4a2aa937 100644 --- a/libavcodec/x86/vvc/vvcdsp_init.c +++ b/libavcodec/x86/vvc/vvcdsp_init.c @@ -311,6 +311,9 @@ ALF_FUNCS(16, 12, avx2) c->alf.filter[CHROMA] = ff_vvc_alf_filter_chroma_##bd##_avx2; \ c->alf.classify = ff_vvc_alf_classify_##bd##_avx2; \ } while (0) + +int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h); +#define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2 #endif void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd) @@ -327,6 +330,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd) ALF_INIT(8); AVG_INIT(8, avx2); MC_LINKS_AVX2(8); + SAD_INIT(); } break; case 10: @@ -338,6 +342,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd) AVG_INIT(10, avx2); MC_LINKS_AVX2(10); MC_LINKS_16BPC_AVX2(10); + SAD_INIT(); } break; case 12: @@ -349,6 +354,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd) AVG_INIT(12, avx2); MC_LINKS_AVX2(12); MC_LINKS_16BPC_AVX2(12); + SAD_INIT(); } break; default: