mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2026-01-12 00:06:51 +08:00
vulkan: merge ProRes and ProRes RAW iDCTs
This cleans up the code a bit, and reduces binary size.
This commit is contained in:
@@ -14,11 +14,11 @@ OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL) += vulkan/common.o \
|
||||
vulkan/ffv1_common.o vulkan/ffv1_reset.o \
|
||||
vulkan/ffv1_dec_setup.o vulkan/ffv1_dec.o
|
||||
|
||||
OBJS-$(CONFIG_PRORES_RAW_VULKAN_HWACCEL) += vulkan/common.o \
|
||||
OBJS-$(CONFIG_PRORES_RAW_VULKAN_HWACCEL) += vulkan/common.o vulkan/dct.o \
|
||||
vulkan/prores_raw_decode.o \
|
||||
vulkan/prores_raw_idct.o
|
||||
|
||||
OBJS-$(CONFIG_PRORES_VULKAN_HWACCEL) += vulkan/common.o \
|
||||
OBJS-$(CONFIG_PRORES_VULKAN_HWACCEL) += vulkan/common.o vulkan/dct.o \
|
||||
vulkan/prores_vld.o \
|
||||
vulkan/prores_idct.o
|
||||
|
||||
|
||||
119
libavcodec/vulkan/dct.comp
Normal file
119
libavcodec/vulkan/dct.comp
Normal file
@@ -0,0 +1,119 @@
|
||||
/*
|
||||
* Copyright (c) 2025 Lynne <dev@lynne.ee>
|
||||
* Copyright (c) 2016 Nathan Egge <unlord@xiph.org>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/**
|
||||
* Orthonormal inverse 8-point Type-II DCT based on the Chen factorization[1].
|
||||
* 1D with scale factors moved up front.
|
||||
* This computes the inverse n-point Type-II DCT by first computing the inverse n/2-point Type-II DCT
|
||||
* of the even indexed inputs and the inverse n/2-point Type-IV DST of the odd indexed inputs,
|
||||
* and then combining them using a "butterfly" operation.
|
||||
*
|
||||
* [1] W.H. Chen, C. Smith, and S. Fralick,
|
||||
* "A Fast Computational Algorithm for the Discrete Cosine Transform",
|
||||
* IEEE Transactions on Communications, Vol. 25, No. 9, pp 1004-1009, Sept. 1977
|
||||
*/
|
||||
|
||||
/*
 * NB_COMPONENTS falls back to 1 when the including shader has not set it.
 * NB_BLOCKS has no fallback here and must already be defined at this point.
 */
#ifndef NB_COMPONENTS
|
||||
#define NB_COMPONENTS 1
|
||||
#endif
|
||||
|
||||
/* Padded by 1 row to avoid bank conflicts */
/* Each blocks[] row holds 8*(8 + 1) = 72 floats per component. */
|
||||
shared float blocks[NB_BLOCKS][NB_COMPONENTS*8*(8 + 1)];
|
||||
|
||||
/*
 * 8x8 table of per-coefficient scale factors, stored row by row.
 * The table is symmetric and entry 0 is exactly 1/8.
 * NOTE(review): presumably multiplied into the coefficients ahead of the
 * idct8() passes - the use site is outside this excerpt, confirm there.
 */
const float idct_scale[64] = {
    0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199, 0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679,
    0.1733799806652684, 0.2404849415639108, 0.2265318615882219, 0.2038732892122293, 0.1733799806652684, 0.1362237766939547, 0.0938325693794663, 0.0478354290456362,
    0.1633203706095471, 0.2265318615882219, 0.2133883476483184, 0.1920444391778541, 0.1633203706095471, 0.1283199917898342, 0.0883883476483185, 0.0450599888754343,
    0.1469844503024199, 0.2038732892122293, 0.1920444391778541, 0.1728354290456362, 0.1469844503024199, 0.1154849415639109, 0.0795474112858021, 0.0405529186026822,
    0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199, 0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679,
    0.0982118697983878, 0.1362237766939547, 0.1283199917898342, 0.1154849415639109, 0.0982118697983878, 0.0771645709543638, 0.0531518809229535, 0.0270965939155924,
    0.0676495125182746, 0.0938325693794663, 0.0883883476483185, 0.0795474112858021, 0.0676495125182746, 0.0531518809229535, 0.0366116523516816, 0.0186644585125857,
    0.0344874224103679, 0.0478354290456362, 0.0450599888754343, 0.0405529186026822, 0.0344874224103679, 0.0270965939155924, 0.0186644585125857, 0.0095150584360892,
};
|
||||
|
||||
/*
 * In-place 8-point inverse transform of one line stored in blocks[block].
 *
 * Element k of the line is blocks[block][k*stride + offset], k = 0..7.
 * All eight elements are read before any of them is overwritten.
 * No barrier is issued in here.
 */
void idct8(uint block, uint offset, uint stride)
{
    const float k_sqrt2   = 1.4142135623730950488016887242097f;
    const float k_rot_sum = 1.8477590650225735122563663787936f;
    const float k_rot_a   = 1.0823922002923939687994464107328f;
    const float k_rot_b   = 2.6131259297527530557132863468544f;

    /* Even-indexed inputs, in the order the 4-point DCT half consumes them */
    float e0 = blocks[block][0*stride + offset];
    float e1 = blocks[block][4*stride + offset];
    float e2 = blocks[block][2*stride + offset];
    float e3 = blocks[block][6*stride + offset];

    /* Odd-indexed inputs, in the order the 4-point DST half consumes them */
    float o4 = blocks[block][1*stride + offset];
    float o5 = blocks[block][5*stride + offset];
    float o6 = blocks[block][3*stride + offset];
    float o7 = blocks[block][7*stride + offset];

    /* Embedded scaled inverse 4-point Type-II DCT */
    float sum04  = e0 + e1;
    float diff04 = e0 - e1;
    float sum26  = e2 + e3;
    float rot26  = (e2 - e3)*(k_sqrt2) - sum26;

    float even0 = sum04 + sum26;
    float even3 = sum04 - sum26;
    float even1 = diff04 + rot26;
    float even2 = diff04 - rot26;

    /* Embedded scaled inverse 4-point Type-IV DST */
    float sum53  = o5 + o6;
    float diff53 = o5 - o6;
    float sum17  = o4 + o7;
    float diff17 = o4 - o7;

    float odd7    = sum17 + sum53;
    float mid_rot = (sum17 - sum53)*(k_sqrt2);
    float shared_rot = (diff17 + diff53)*(k_rot_sum);
    float rot17   = shared_rot - diff17*(k_rot_a);
    float rot53   = shared_rot - diff53*(k_rot_b);

    /* Each odd term builds on the previous one, so this order is fixed */
    float odd6 = odd7 - rot53;
    float odd5 = odd6 + mid_rot;
    float odd4 = odd5 - rot17;

    /* Butterflies, written straight back to the line */
    blocks[block][0*stride + offset] = even0 + odd7;
    blocks[block][1*stride + offset] = even1 - odd6;
    blocks[block][2*stride + offset] = even2 + odd5;
    blocks[block][3*stride + offset] = even3 - odd4;
    blocks[block][4*stride + offset] = even3 + odd4;
    blocks[block][5*stride + offset] = even2 - odd5;
    blocks[block][6*stride + offset] = even1 + odd6;
    blocks[block][7*stride + offset] = even0 - odd7;
}
|
||||
@@ -16,9 +16,6 @@
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/* Two macroblocks, padded to avoid bank conflicts */
/* 4*2 = 8 rows of 8*(8+1) = 72 floats each. */
|
||||
shared float blocks[4*2][8*(8+1)];
|
||||
|
||||
uint get_px(uint tex_idx, ivec2 pos)
|
||||
{
|
||||
#ifndef INTERLACED
|
||||
@@ -37,87 +34,6 @@ void put_px(uint tex_idx, ivec2 pos, uint v)
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
 * 8x8 table of per-coefficient scale factors, stored row by row.
 * The table is symmetric and entry 0 is exactly 1/8.
 * NOTE(review): presumably multiplied into the coefficients ahead of the
 * idct8() passes - the use site is outside this excerpt, confirm there.
 */
const float idct_scale[64] = {
    0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199, 0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679,
    0.1733799806652684, 0.2404849415639108, 0.2265318615882219, 0.2038732892122293, 0.1733799806652684, 0.1362237766939547, 0.0938325693794663, 0.0478354290456362,
    0.1633203706095471, 0.2265318615882219, 0.2133883476483184, 0.1920444391778541, 0.1633203706095471, 0.1283199917898342, 0.0883883476483185, 0.0450599888754343,
    0.1469844503024199, 0.2038732892122293, 0.1920444391778541, 0.1728354290456362, 0.1469844503024199, 0.1154849415639109, 0.0795474112858021, 0.0405529186026822,
    0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199, 0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679,
    0.0982118697983878, 0.1362237766939547, 0.1283199917898342, 0.1154849415639109, 0.0982118697983878, 0.0771645709543638, 0.0531518809229535, 0.0270965939155924,
    0.0676495125182746, 0.0938325693794663, 0.0883883476483185, 0.0795474112858021, 0.0676495125182746, 0.0531518809229535, 0.0366116523516816, 0.0186644585125857,
    0.0344874224103679, 0.0478354290456362, 0.0450599888754343, 0.0405529186026822, 0.0344874224103679, 0.0270965939155924, 0.0186644585125857, 0.0095150584360892,
};
|
||||
|
||||
/* 7.4 Inverse Transform */
|
||||
/*
 * In-place 8-point inverse transform of one line stored in blocks[block].
 *
 * Element k of the line is blocks[block][k*stride + offset], k = 0..7.
 * All eight elements are read before any of them is overwritten.
 * No barrier is issued in here.
 */
void idct8(uint block, uint offset, uint stride)
{
    const float k_sqrt2   = 1.4142135623730950488016887242097f;
    const float k_rot_sum = 1.8477590650225735122563663787936f;
    const float k_rot_a   = 1.0823922002923939687994464107328f;
    const float k_rot_b   = 2.6131259297527530557132863468544f;

    /* Even-indexed inputs, in the order the 4-point DCT half consumes them */
    float e0 = blocks[block][0*stride + offset];
    float e1 = blocks[block][4*stride + offset];
    float e2 = blocks[block][2*stride + offset];
    float e3 = blocks[block][6*stride + offset];

    /* Odd-indexed inputs, in the order the 4-point DST half consumes them */
    float o4 = blocks[block][1*stride + offset];
    float o5 = blocks[block][5*stride + offset];
    float o6 = blocks[block][3*stride + offset];
    float o7 = blocks[block][7*stride + offset];

    /* Embedded scaled inverse 4-point Type-II DCT */
    float sum04  = e0 + e1;
    float diff04 = e0 - e1;
    float sum26  = e2 + e3;
    float rot26  = (e2 - e3)*(k_sqrt2) - sum26;

    float even0 = sum04 + sum26;
    float even3 = sum04 - sum26;
    float even1 = diff04 + rot26;
    float even2 = diff04 - rot26;

    /* Embedded scaled inverse 4-point Type-IV DST */
    float sum53  = o5 + o6;
    float diff53 = o5 - o6;
    float sum17  = o4 + o7;
    float diff17 = o4 - o7;

    float odd7    = sum17 + sum53;
    float mid_rot = (sum17 - sum53)*(k_sqrt2);
    float shared_rot = (diff17 + diff53)*(k_rot_sum);
    float rot17   = shared_rot - diff17*(k_rot_a);
    float rot53   = shared_rot - diff53*(k_rot_b);

    /* Each odd term builds on the previous one, so this order is fixed */
    float odd6 = odd7 - rot53;
    float odd5 = odd6 + mid_rot;
    float odd4 = odd5 - rot17;

    /* Butterflies, written straight back to the line */
    blocks[block][0*stride + offset] = even0 + odd7;
    blocks[block][1*stride + offset] = even1 - odd6;
    blocks[block][2*stride + offset] = even2 + odd5;
    blocks[block][3*stride + offset] = even3 - odd4;
    blocks[block][4*stride + offset] = even3 + odd4;
    blocks[block][5*stride + offset] = even2 - odd5;
    blocks[block][6*stride + offset] = even1 + odd6;
    blocks[block][7*stride + offset] = even0 - odd7;
}
|
||||
|
||||
void main(void)
|
||||
{
|
||||
uvec3 gid = gl_GlobalInvocationID, lid = gl_LocalInvocationID;
|
||||
|
||||
@@ -24,8 +24,6 @@
|
||||
/* BLOCK_ID / ROW_ID alias the y / x components of gl_LocalInvocationID. */
#define BLOCK_ID (gl_LocalInvocationID.y)
|
||||
#define ROW_ID (gl_LocalInvocationID.x)
|
||||
|
||||
/* 16 rows of 4*64 = 256 floats each. */
shared float blocks[16][4*64];
|
||||
|
||||
const ivec2 scan[64] = {
|
||||
ivec2( 0, 0), ivec2( 4, 0), ivec2( 0, 2), ivec2( 4, 2),
|
||||
ivec2( 0, 8), ivec2( 4, 8), ivec2( 6, 8), ivec2( 2, 10),
|
||||
@@ -45,86 +43,6 @@ const ivec2 scan[64] = {
|
||||
ivec2(12, 12), ivec2( 8, 14), ivec2(10, 14), ivec2(14, 14),
|
||||
};
|
||||
|
||||
/*
 * 8x8 table of per-coefficient scale factors, stored row by row.
 * The table is symmetric and entry 0 is exactly 1/8.
 * NOTE(review): presumably multiplied into the coefficients ahead of the
 * idct8() passes - the use site is outside this excerpt, confirm there.
 */
const float idct_scale[64] = {
    0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199, 0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679,
    0.1733799806652684, 0.2404849415639108, 0.2265318615882219, 0.2038732892122293, 0.1733799806652684, 0.1362237766939547, 0.0938325693794663, 0.0478354290456362,
    0.1633203706095471, 0.2265318615882219, 0.2133883476483184, 0.1920444391778541, 0.1633203706095471, 0.1283199917898342, 0.0883883476483185, 0.0450599888754343,
    0.1469844503024199, 0.2038732892122293, 0.1920444391778541, 0.1728354290456362, 0.1469844503024199, 0.1154849415639109, 0.0795474112858021, 0.0405529186026822,
    0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199, 0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679,
    0.0982118697983878, 0.1362237766939547, 0.1283199917898342, 0.1154849415639109, 0.0982118697983878, 0.0771645709543638, 0.0531518809229535, 0.0270965939155924,
    0.0676495125182746, 0.0938325693794663, 0.0883883476483185, 0.0795474112858021, 0.0676495125182746, 0.0531518809229535, 0.0366116523516816, 0.0186644585125857,
    0.0344874224103679, 0.0478354290456362, 0.0450599888754343, 0.0405529186026822, 0.0344874224103679, 0.0270965939155924, 0.0186644585125857, 0.0095150584360892,
};
|
||||
|
||||
/*
 * In-place 8-point inverse transform of one line stored in blocks[block].
 *
 * Element k of the line is blocks[block][k*stride + offset], k = 0..7.
 * All eight elements are read before any of them is overwritten.
 * No barrier is issued in here.
 */
void idct8(uint block, uint offset, uint stride)
{
    const float k_sqrt2   = 1.4142135623730950488016887242097f;
    const float k_rot_sum = 1.8477590650225735122563663787936f;
    const float k_rot_a   = 1.0823922002923939687994464107328f;
    const float k_rot_b   = 2.6131259297527530557132863468544f;

    /* Even-indexed inputs, in the order the 4-point DCT half consumes them */
    float e0 = blocks[block][0*stride + offset];
    float e1 = blocks[block][4*stride + offset];
    float e2 = blocks[block][2*stride + offset];
    float e3 = blocks[block][6*stride + offset];

    /* Odd-indexed inputs, in the order the 4-point DST half consumes them */
    float o4 = blocks[block][1*stride + offset];
    float o5 = blocks[block][5*stride + offset];
    float o6 = blocks[block][3*stride + offset];
    float o7 = blocks[block][7*stride + offset];

    /* Embedded scaled inverse 4-point Type-II DCT */
    float sum04  = e0 + e1;
    float diff04 = e0 - e1;
    float sum26  = e2 + e3;
    float rot26  = (e2 - e3)*(k_sqrt2) - sum26;

    float even0 = sum04 + sum26;
    float even3 = sum04 - sum26;
    float even1 = diff04 + rot26;
    float even2 = diff04 - rot26;

    /* Embedded scaled inverse 4-point Type-IV DST */
    float sum53  = o5 + o6;
    float diff53 = o5 - o6;
    float sum17  = o4 + o7;
    float diff17 = o4 - o7;

    float odd7    = sum17 + sum53;
    float mid_rot = (sum17 - sum53)*(k_sqrt2);
    float shared_rot = (diff17 + diff53)*(k_rot_sum);
    float rot17   = shared_rot - diff17*(k_rot_a);
    float rot53   = shared_rot - diff53*(k_rot_b);

    /* Each odd term builds on the previous one, so this order is fixed */
    float odd6 = odd7 - rot53;
    float odd5 = odd6 + mid_rot;
    float odd4 = odd5 - rot17;

    /* Butterflies, written straight back to the line */
    blocks[block][0*stride + offset] = even0 + odd7;
    blocks[block][1*stride + offset] = even1 - odd6;
    blocks[block][2*stride + offset] = even2 + odd5;
    blocks[block][3*stride + offset] = even3 - odd4;
    blocks[block][4*stride + offset] = even3 + odd4;
    blocks[block][5*stride + offset] = even2 - odd5;
    blocks[block][6*stride + offset] = even1 + odd6;
    blocks[block][7*stride + offset] = even0 - odd7;
}
|
||||
|
||||
void main(void)
|
||||
{
|
||||
const uint tile_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
#include "libavutil/vulkan_spirv.h"
|
||||
|
||||
extern const char *ff_source_common_comp;
|
||||
extern const char *ff_source_dct_comp;
|
||||
extern const char *ff_source_prores_vld_comp;
|
||||
extern const char *ff_source_prores_idct_comp;
|
||||
|
||||
@@ -511,6 +512,9 @@ static int init_idct_shader(AVCodecContext *avctx, FFVulkanContext *s,
|
||||
};
|
||||
RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2, 0, 0));
|
||||
|
||||
GLSLC(0, #define NB_BLOCKS 4*2);
|
||||
GLSLD(ff_source_dct_comp);
|
||||
|
||||
GLSLD(ff_source_prores_idct_comp);
|
||||
|
||||
RET(spv->compile_shader(s, spv, shd, &spv_data, &spv_len, "main",
|
||||
|
||||
@@ -26,6 +26,7 @@
|
||||
#include "libavutil/mem.h"
|
||||
|
||||
extern const char *ff_source_common_comp;
|
||||
extern const char *ff_source_dct_comp;
|
||||
extern const char *ff_source_prores_raw_decode_comp;
|
||||
extern const char *ff_source_prores_raw_idct_comp;
|
||||
|
||||
@@ -385,6 +386,10 @@ static int init_idct_shader(AVCodecContext *avctx, FFVulkanContext *s,
|
||||
|
||||
RET(add_common_data(avctx, s, shd, 0));
|
||||
|
||||
GLSLC(0, #define NB_BLOCKS 16);
|
||||
GLSLC(0, #define NB_COMPONENTS 4);
|
||||
GLSLD(ff_source_dct_comp);
|
||||
|
||||
GLSLD(ff_source_prores_raw_idct_comp);
|
||||
|
||||
RET(spv->compile_shader(s, spv, shd, &spv_data, &spv_len, "main",
|
||||
|
||||
Reference in New Issue
Block a user