From 6eced881885201e4834cea8d6aee4ce114d7fc9a Mon Sep 17 00:00:00 2001 From: Lynne Date: Thu, 18 Dec 2025 13:30:30 +0100 Subject: [PATCH] vulkan: merge ProRes and ProRes RAW iDCTs This cleans up the code a bit, and reduces binary size. --- libavcodec/vulkan/Makefile | 4 +- libavcodec/vulkan/dct.comp | 119 +++++++++++++++++++++++++ libavcodec/vulkan/prores_idct.comp | 84 ----------------- libavcodec/vulkan/prores_raw_idct.comp | 82 ----------------- libavcodec/vulkan_prores.c | 4 + libavcodec/vulkan_prores_raw.c | 5 ++ 6 files changed, 130 insertions(+), 168 deletions(-) create mode 100644 libavcodec/vulkan/dct.comp diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile index 78d511a90e..35e96c506d 100644 --- a/libavcodec/vulkan/Makefile +++ b/libavcodec/vulkan/Makefile @@ -14,11 +14,11 @@ OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL) += vulkan/common.o \ vulkan/ffv1_common.o vulkan/ffv1_reset.o \ vulkan/ffv1_dec_setup.o vulkan/ffv1_dec.o -OBJS-$(CONFIG_PRORES_RAW_VULKAN_HWACCEL) += vulkan/common.o \ +OBJS-$(CONFIG_PRORES_RAW_VULKAN_HWACCEL) += vulkan/common.o vulkan/dct.o \ vulkan/prores_raw_decode.o \ vulkan/prores_raw_idct.o -OBJS-$(CONFIG_PRORES_VULKAN_HWACCEL) += vulkan/common.o \ +OBJS-$(CONFIG_PRORES_VULKAN_HWACCEL) += vulkan/common.o vulkan/dct.o \ vulkan/prores_vld.o \ vulkan/prores_idct.o diff --git a/libavcodec/vulkan/dct.comp b/libavcodec/vulkan/dct.comp new file mode 100644 index 0000000000..34c6ad128f --- /dev/null +++ b/libavcodec/vulkan/dct.comp @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2025 Lynne + * Copyright (c) 2016 Nathan Egge + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * Orthonormal inverse 8-point Type-II DCT based on the Chen factorization[1]. + * 1D with scale factors moved up front. + * This computes the inverse n-point Type-II DCT by first computing an inverse n/2-point Type-II DCT + * of the even indexed inputs and an inverse n/2-point Type-IV DST of the odd indexed inputs, + * and then combining them using a "butterfly" operation. + * + * [1] W.H. Chen, C. Smith, and S. Fralick, + * "A Fast Computational Algorithm for the Discrete Cosine Transform", + * IEEE Transactions on Communications, Vol. 25, No. 9, pp 1004-1009, Sept. 
1977 + */ + +#ifndef NB_COMPONENTS +#define NB_COMPONENTS 1 +#endif + +/* Padded by 1 row to avoid bank conflicts */ +shared float blocks[NB_BLOCKS][NB_COMPONENTS*8*(8 + 1)]; + +const float idct_scale[64] = { + 0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199, + 0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679, + 0.1733799806652684, 0.2404849415639108, 0.2265318615882219, 0.2038732892122293, + 0.1733799806652684, 0.1362237766939547, 0.0938325693794663, 0.0478354290456362, + 0.1633203706095471, 0.2265318615882219, 0.2133883476483184, 0.1920444391778541, + 0.1633203706095471, 0.1283199917898342, 0.0883883476483185, 0.0450599888754343, + 0.1469844503024199, 0.2038732892122293, 0.1920444391778541, 0.1728354290456362, + 0.1469844503024199, 0.1154849415639109, 0.0795474112858021, 0.0405529186026822, + 0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199, + 0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679, + 0.0982118697983878, 0.1362237766939547, 0.1283199917898342, 0.1154849415639109, + 0.0982118697983878, 0.0771645709543638, 0.0531518809229535, 0.0270965939155924, + 0.0676495125182746, 0.0938325693794663, 0.0883883476483185, 0.0795474112858021, + 0.0676495125182746, 0.0531518809229535, 0.0366116523516816, 0.0186644585125857, + 0.0344874224103679, 0.0478354290456362, 0.0450599888754343, 0.0405529186026822, + 0.0344874224103679, 0.0270965939155924, 0.0186644585125857, 0.0095150584360892, +}; + +void idct8(uint block, uint offset, uint stride) +{ + float t0, t1, t2, t3, t4, t5, t6, t7, u8; + float u0, u1, u2, u3, u4, u5, u6, u7; + + /* Input */ + t0 = blocks[block][0*stride + offset]; + u4 = blocks[block][1*stride + offset]; + t2 = blocks[block][2*stride + offset]; + u6 = blocks[block][3*stride + offset]; + t1 = blocks[block][4*stride + offset]; + u5 = blocks[block][5*stride + offset]; + t3 = blocks[block][6*stride + offset]; + u7 = blocks[block][7*stride + 
offset]; + + /* Embedded scaled inverse 4-point Type-II DCT */ + u0 = t0 + t1; + u1 = t0 - t1; + u3 = t2 + t3; + u2 = (t2 - t3)*(1.4142135623730950488016887242097f) - u3; + t0 = u0 + u3; + t3 = u0 - u3; + t1 = u1 + u2; + t2 = u1 - u2; + + /* Embedded scaled inverse 4-point Type-IV DST */ + t5 = u5 + u6; + t6 = u5 - u6; + t7 = u4 + u7; + t4 = u4 - u7; + u7 = t7 + t5; + u5 = (t7 - t5)*(1.4142135623730950488016887242097f); + u8 = (t4 + t6)*(1.8477590650225735122563663787936f); + u4 = u8 - t4*(1.0823922002923939687994464107328f); + u6 = u8 - t6*(2.6131259297527530557132863468544f); + t7 = u7; + t6 = t7 - u6; + t5 = t6 + u5; + t4 = t5 - u4; + + /* Butterflies */ + u0 = t0 + t7; + u7 = t0 - t7; + u6 = t1 + t6; + u1 = t1 - t6; + u2 = t2 + t5; + u5 = t2 - t5; + u4 = t3 + t4; + u3 = t3 - t4; + + /* Output */ + blocks[block][0*stride + offset] = u0; + blocks[block][1*stride + offset] = u1; + blocks[block][2*stride + offset] = u2; + blocks[block][3*stride + offset] = u3; + blocks[block][4*stride + offset] = u4; + blocks[block][5*stride + offset] = u5; + blocks[block][6*stride + offset] = u6; + blocks[block][7*stride + offset] = u7; +} diff --git a/libavcodec/vulkan/prores_idct.comp b/libavcodec/vulkan/prores_idct.comp index 5eef61e57a..25431d61c1 100644 --- a/libavcodec/vulkan/prores_idct.comp +++ b/libavcodec/vulkan/prores_idct.comp @@ -16,9 +16,6 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -/* Two macroblocks, padded to avoid bank conflicts */ -shared float blocks[4*2][8*(8+1)]; - uint get_px(uint tex_idx, ivec2 pos) { #ifndef INTERLACED @@ -37,87 +34,6 @@ void put_px(uint tex_idx, ivec2 pos, uint v) #endif } -const float idct_scale[64] = { - 0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199, - 0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679, - 0.1733799806652684, 0.2404849415639108, 0.2265318615882219, 0.2038732892122293, - 0.1733799806652684, 0.1362237766939547, 
0.0938325693794663, 0.0478354290456362, - 0.1633203706095471, 0.2265318615882219, 0.2133883476483184, 0.1920444391778541, - 0.1633203706095471, 0.1283199917898342, 0.0883883476483185, 0.0450599888754343, - 0.1469844503024199, 0.2038732892122293, 0.1920444391778541, 0.1728354290456362, - 0.1469844503024199, 0.1154849415639109, 0.0795474112858021, 0.0405529186026822, - 0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199, - 0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679, - 0.0982118697983878, 0.1362237766939547, 0.1283199917898342, 0.1154849415639109, - 0.0982118697983878, 0.0771645709543638, 0.0531518809229535, 0.0270965939155924, - 0.0676495125182746, 0.0938325693794663, 0.0883883476483185, 0.0795474112858021, - 0.0676495125182746, 0.0531518809229535, 0.0366116523516816, 0.0186644585125857, - 0.0344874224103679, 0.0478354290456362, 0.0450599888754343, 0.0405529186026822, - 0.0344874224103679, 0.0270965939155924, 0.0186644585125857, 0.0095150584360892, -}; - -/* 7.4 Inverse Transform */ -void idct8(uint block, uint offset, uint stride) -{ - float t0, t1, t2, t3, t4, t5, t6, t7, u8; - float u0, u1, u2, u3, u4, u5, u6, u7; - - /* Input */ - t0 = blocks[block][0*stride + offset]; - u4 = blocks[block][1*stride + offset]; - t2 = blocks[block][2*stride + offset]; - u6 = blocks[block][3*stride + offset]; - t1 = blocks[block][4*stride + offset]; - u5 = blocks[block][5*stride + offset]; - t3 = blocks[block][6*stride + offset]; - u7 = blocks[block][7*stride + offset]; - - /* Embedded scaled inverse 4-point Type-II DCT */ - u0 = t0 + t1; - u1 = t0 - t1; - u3 = t2 + t3; - u2 = (t2 - t3)*(1.4142135623730950488016887242097f) - u3; - t0 = u0 + u3; - t3 = u0 - u3; - t1 = u1 + u2; - t2 = u1 - u2; - - /* Embedded scaled inverse 4-point Type-IV DST */ - t5 = u5 + u6; - t6 = u5 - u6; - t7 = u4 + u7; - t4 = u4 - u7; - u7 = t7 + t5; - u5 = (t7 - t5)*(1.4142135623730950488016887242097f); - u8 = (t4 + 
t6)*(1.8477590650225735122563663787936f); - u4 = u8 - t4*(1.0823922002923939687994464107328f); - u6 = u8 - t6*(2.6131259297527530557132863468544f); - t7 = u7; - t6 = t7 - u6; - t5 = t6 + u5; - t4 = t5 - u4; - - /* Butterflies */ - u0 = t0 + t7; - u7 = t0 - t7; - u6 = t1 + t6; - u1 = t1 - t6; - u2 = t2 + t5; - u5 = t2 - t5; - u4 = t3 + t4; - u3 = t3 - t4; - - /* Output */ - blocks[block][0*stride + offset] = u0; - blocks[block][1*stride + offset] = u1; - blocks[block][2*stride + offset] = u2; - blocks[block][3*stride + offset] = u3; - blocks[block][4*stride + offset] = u4; - blocks[block][5*stride + offset] = u5; - blocks[block][6*stride + offset] = u6; - blocks[block][7*stride + offset] = u7; -} - void main(void) { uvec3 gid = gl_GlobalInvocationID, lid = gl_LocalInvocationID; diff --git a/libavcodec/vulkan/prores_raw_idct.comp b/libavcodec/vulkan/prores_raw_idct.comp index 29ddf3b9e8..01aad98330 100644 --- a/libavcodec/vulkan/prores_raw_idct.comp +++ b/libavcodec/vulkan/prores_raw_idct.comp @@ -24,8 +24,6 @@ #define BLOCK_ID (gl_LocalInvocationID.y) #define ROW_ID (gl_LocalInvocationID.x) -shared float blocks[16][4*64]; - const ivec2 scan[64] = { ivec2( 0, 0), ivec2( 4, 0), ivec2( 0, 2), ivec2( 4, 2), ivec2( 0, 8), ivec2( 4, 8), ivec2( 6, 8), ivec2( 2, 10), @@ -45,86 +43,6 @@ const ivec2 scan[64] = { ivec2(12, 12), ivec2( 8, 14), ivec2(10, 14), ivec2(14, 14), }; -const float idct_scale[64] = { - 0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199, - 0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679, - 0.1733799806652684, 0.2404849415639108, 0.2265318615882219, 0.2038732892122293, - 0.1733799806652684, 0.1362237766939547, 0.0938325693794663, 0.0478354290456362, - 0.1633203706095471, 0.2265318615882219, 0.2133883476483184, 0.1920444391778541, - 0.1633203706095471, 0.1283199917898342, 0.0883883476483185, 0.0450599888754343, - 0.1469844503024199, 0.2038732892122293, 0.1920444391778541, 0.1728354290456362, - 
0.1469844503024199, 0.1154849415639109, 0.0795474112858021, 0.0405529186026822, - 0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199, - 0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679, - 0.0982118697983878, 0.1362237766939547, 0.1283199917898342, 0.1154849415639109, - 0.0982118697983878, 0.0771645709543638, 0.0531518809229535, 0.0270965939155924, - 0.0676495125182746, 0.0938325693794663, 0.0883883476483185, 0.0795474112858021, - 0.0676495125182746, 0.0531518809229535, 0.0366116523516816, 0.0186644585125857, - 0.0344874224103679, 0.0478354290456362, 0.0450599888754343, 0.0405529186026822, - 0.0344874224103679, 0.0270965939155924, 0.0186644585125857, 0.0095150584360892, -}; - -void idct8(uint block, uint offset, uint stride) -{ - float t0, t1, t2, t3, t4, t5, t6, t7, u8; - float u0, u1, u2, u3, u4, u5, u6, u7; - - /* Input */ - t0 = blocks[block][0*stride + offset]; - u4 = blocks[block][1*stride + offset]; - t2 = blocks[block][2*stride + offset]; - u6 = blocks[block][3*stride + offset]; - t1 = blocks[block][4*stride + offset]; - u5 = blocks[block][5*stride + offset]; - t3 = blocks[block][6*stride + offset]; - u7 = blocks[block][7*stride + offset]; - - /* Embedded scaled inverse 4-point Type-II DCT */ - u0 = t0 + t1; - u1 = t0 - t1; - u3 = t2 + t3; - u2 = (t2 - t3)*(1.4142135623730950488016887242097f) - u3; - t0 = u0 + u3; - t3 = u0 - u3; - t1 = u1 + u2; - t2 = u1 - u2; - - /* Embedded scaled inverse 4-point Type-IV DST */ - t5 = u5 + u6; - t6 = u5 - u6; - t7 = u4 + u7; - t4 = u4 - u7; - u7 = t7 + t5; - u5 = (t7 - t5)*(1.4142135623730950488016887242097f); - u8 = (t4 + t6)*(1.8477590650225735122563663787936f); - u4 = u8 - t4*(1.0823922002923939687994464107328f); - u6 = u8 - t6*(2.6131259297527530557132863468544f); - t7 = u7; - t6 = t7 - u6; - t5 = t6 + u5; - t4 = t5 - u4; - - /* Butterflies */ - u0 = t0 + t7; - u7 = t0 - t7; - u6 = t1 + t6; - u1 = t1 - t6; - u2 = t2 + t5; - u5 = t2 - t5; - u4 = t3 + t4; - u3 = 
t3 - t4; - - /* Output */ - blocks[block][0*stride + offset] = u0; - blocks[block][1*stride + offset] = u1; - blocks[block][2*stride + offset] = u2; - blocks[block][3*stride + offset] = u3; - blocks[block][4*stride + offset] = u4; - blocks[block][5*stride + offset] = u5; - blocks[block][6*stride + offset] = u6; - blocks[block][7*stride + offset] = u7; -} - void main(void) { const uint tile_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x; diff --git a/libavcodec/vulkan_prores.c b/libavcodec/vulkan_prores.c index bc0364609c..afea8857e8 100644 --- a/libavcodec/vulkan_prores.c +++ b/libavcodec/vulkan_prores.c @@ -24,6 +24,7 @@ #include "libavutil/vulkan_spirv.h" extern const char *ff_source_common_comp; +extern const char *ff_source_dct_comp; extern const char *ff_source_prores_vld_comp; extern const char *ff_source_prores_idct_comp; @@ -511,6 +512,9 @@ static int init_idct_shader(AVCodecContext *avctx, FFVulkanContext *s, }; RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2, 0, 0)); + GLSLC(0, #define NB_BLOCKS 4*2); + GLSLD(ff_source_dct_comp); + GLSLD(ff_source_prores_idct_comp); RET(spv->compile_shader(s, spv, shd, &spv_data, &spv_len, "main", diff --git a/libavcodec/vulkan_prores_raw.c b/libavcodec/vulkan_prores_raw.c index 82e8f3ad16..0a844156fb 100644 --- a/libavcodec/vulkan_prores_raw.c +++ b/libavcodec/vulkan_prores_raw.c @@ -26,6 +26,7 @@ #include "libavutil/mem.h" extern const char *ff_source_common_comp; +extern const char *ff_source_dct_comp; extern const char *ff_source_prores_raw_decode_comp; extern const char *ff_source_prores_raw_idct_comp; @@ -385,6 +386,10 @@ static int init_idct_shader(AVCodecContext *avctx, FFVulkanContext *s, RET(add_common_data(avctx, s, shd, 0)); + GLSLC(0, #define NB_BLOCKS 16); + GLSLC(0, #define NB_COMPONENTS 4); + GLSLD(ff_source_dct_comp); + GLSLD(ff_source_prores_raw_idct_comp); RET(spv->compile_shader(s, spv, shd, &spv_data, &spv_len, "main",