mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2026-01-12 00:06:51 +08:00
lavc: add a ProRes Vulkan hwaccel
Add a shader-based Apple ProRes decoder. It supports all codec features for profiles up to the 4444 XQ profile, i.e.: - 4:2:2 and 4:4:4 chroma subsampling - 10- and 12-bit component depth - Interlacing - Alpha The implementation consists of two shaders: the VLD kernel does entropy decoding for color/alpha, and the IDCT kernel performs the inverse transform on color components. Benchmarks for a 4k yuv422p10 sample: - AMD Radeon 6700XT: 178 fps - Intel i7 Tiger Lake: 37 fps - NVidia Orin Nano: 70 fps
This commit is contained in:
2
configure
vendored
2
configure
vendored
@@ -3343,6 +3343,8 @@ prores_videotoolbox_hwaccel_deps="videotoolbox"
|
||||
prores_videotoolbox_hwaccel_select="prores_decoder"
|
||||
prores_raw_vulkan_hwaccel_deps="vulkan spirv_compiler"
|
||||
prores_raw_vulkan_hwaccel_select="prores_raw_decoder"
|
||||
prores_vulkan_hwaccel_deps="vulkan spirv_compiler"
|
||||
prores_vulkan_hwaccel_select="prores_decoder"
|
||||
vc1_d3d11va_hwaccel_deps="d3d11va"
|
||||
vc1_d3d11va_hwaccel_select="vc1_decoder"
|
||||
vc1_d3d11va2_hwaccel_deps="d3d11va"
|
||||
|
||||
@@ -1106,6 +1106,7 @@ OBJS-$(CONFIG_VP9_VULKAN_HWACCEL) += vulkan_decode.o vulkan_vp9.o
|
||||
OBJS-$(CONFIG_VP8_QSV_HWACCEL) += qsvdec.o
|
||||
OBJS-$(CONFIG_VVC_VAAPI_HWACCEL) += vaapi_vvc.o
|
||||
OBJS-$(CONFIG_PRORES_RAW_VULKAN_HWACCEL) += vulkan_decode.o vulkan_prores_raw.o
|
||||
OBJS-$(CONFIG_PRORES_VULKAN_HWACCEL) += vulkan_decode.o vulkan_prores.o
|
||||
|
||||
# Objects duplicated from other libraries for shared builds
|
||||
SHLIBOBJS += log2_tab.o reverse.o
|
||||
@@ -1350,7 +1351,7 @@ SKIPHEADERS-$(CONFIG_QSVENC) += qsvenc.h
|
||||
SKIPHEADERS-$(CONFIG_VAAPI) += vaapi_decode.h vaapi_hevc.h vaapi_encode.h
|
||||
SKIPHEADERS-$(CONFIG_VDPAU) += vdpau.h vdpau_internal.h
|
||||
SKIPHEADERS-$(CONFIG_VIDEOTOOLBOX) += videotoolbox.h vt_internal.h
|
||||
SKIPHEADERS-$(CONFIG_VULKAN) += ffv1_vulkan.h vulkan_video.h \
|
||||
SKIPHEADERS-$(CONFIG_VULKAN) += ffv1_vulkan.h prores_vulkan.h vulkan_video.h \
|
||||
vulkan_encode.h vulkan_decode.h
|
||||
SKIPHEADERS-$(CONFIG_V4L2_M2M) += v4l2_buffers.h v4l2_context.h v4l2_m2m.h
|
||||
SKIPHEADERS-$(CONFIG_ZLIB) += zlib_wrapper.h
|
||||
|
||||
@@ -68,6 +68,7 @@ extern const struct FFHWAccel ff_mpeg4_vdpau_hwaccel;
|
||||
extern const struct FFHWAccel ff_mpeg4_videotoolbox_hwaccel;
|
||||
extern const struct FFHWAccel ff_prores_videotoolbox_hwaccel;
|
||||
extern const struct FFHWAccel ff_prores_raw_vulkan_hwaccel;
|
||||
extern const struct FFHWAccel ff_prores_vulkan_hwaccel;
|
||||
extern const struct FFHWAccel ff_vc1_d3d11va_hwaccel;
|
||||
extern const struct FFHWAccel ff_vc1_d3d11va2_hwaccel;
|
||||
extern const struct FFHWAccel ff_vc1_d3d12va_hwaccel;
|
||||
|
||||
@@ -251,7 +251,7 @@ static int decode_frame_header(ProresContext *ctx, const uint8_t *buf,
|
||||
}
|
||||
|
||||
if (pix_fmt != ctx->pix_fmt) {
|
||||
#define HWACCEL_MAX (CONFIG_PRORES_VIDEOTOOLBOX_HWACCEL)
|
||||
#define HWACCEL_MAX (CONFIG_PRORES_VIDEOTOOLBOX_HWACCEL + CONFIG_PRORES_VULKAN_HWACCEL)
|
||||
#if HWACCEL_MAX
|
||||
enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts;
|
||||
int ret;
|
||||
@@ -260,6 +260,9 @@ static int decode_frame_header(ProresContext *ctx, const uint8_t *buf,
|
||||
|
||||
#if CONFIG_PRORES_VIDEOTOOLBOX_HWACCEL
|
||||
*fmtp++ = AV_PIX_FMT_VIDEOTOOLBOX;
|
||||
#endif
|
||||
#if CONFIG_PRORES_VULKAN_HWACCEL
|
||||
*fmtp++ = AV_PIX_FMT_VULKAN;
|
||||
#endif
|
||||
*fmtp++ = ctx->pix_fmt;
|
||||
*fmtp = AV_PIX_FMT_NONE;
|
||||
@@ -872,6 +875,9 @@ const FFCodec ff_prores_decoder = {
|
||||
.hw_configs = (const AVCodecHWConfigInternal *const []) {
|
||||
#if CONFIG_PRORES_VIDEOTOOLBOX_HWACCEL
|
||||
HWACCEL_VIDEOTOOLBOX(prores),
|
||||
#endif
|
||||
#if CONFIG_PRORES_VULKAN_HWACCEL
|
||||
HWACCEL_VULKAN(prores),
|
||||
#endif
|
||||
NULL
|
||||
},
|
||||
|
||||
@@ -17,6 +17,11 @@ OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL) += vulkan/common.o \
|
||||
OBJS-$(CONFIG_PRORES_RAW_VULKAN_HWACCEL) += vulkan/common.o \
|
||||
vulkan/prores_raw.o
|
||||
|
||||
OBJS-$(CONFIG_PRORES_VULKAN_HWACCEL) += vulkan/common.o \
|
||||
vulkan/prores_reset.o \
|
||||
vulkan/prores_vld.o \
|
||||
vulkan/prores_idct.o
|
||||
|
||||
VULKAN = $(subst $(SRC_PATH)/,,$(wildcard $(SRC_PATH)/libavcodec/vulkan/*.comp))
|
||||
.SECONDARY: $(VULKAN:.comp=.c)
|
||||
libavcodec/vulkan/%.c: TAG = VULKAN
|
||||
|
||||
123
libavcodec/vulkan/prores_idct.comp
Normal file
123
libavcodec/vulkan/prores_idct.comp
Normal file
@@ -0,0 +1,123 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/**
 * Shared scratch storage for the inverse transform: 2 macroblocks x 4 blocks,
 * each block stored as 8 rows of 9 floats. Only 8 of the 9 floats in a row
 * carry data; the extra one is padding to avoid bank conflicts.
 */
shared float blocks[2 * 4][8 * 9];
|
||||
|
||||
/**
 * Load the first channel of one texel from plane tex_idx of dst.
 *
 * When INTERLACED is defined, pos.y is treated as a row inside the current
 * field and is remapped to frame row (pos.y * 2 + bottom_field) before the load.
 */
uint get_px(uint tex_idx, ivec2 pos)
{
#ifdef INTERLACED
    pos = ivec2(pos.x, (pos.y << 1) + bottom_field);
#endif
    return imageLoad(dst[tex_idx], pos).x;
}
|
||||
|
||||
/**
 * Store v (replicated into all four channels) into one texel of plane tex_idx.
 *
 * When INTERLACED is defined, pos.y is treated as a row inside the current
 * field and is remapped to frame row (pos.y * 2 + bottom_field) before the store.
 */
void put_px(uint tex_idx, ivec2 pos, uint v)
{
#ifdef INTERLACED
    pos = ivec2(pos.x, (pos.y << 1) + bottom_field);
#endif
    imageStore(dst[tex_idx], pos, uvec4(v));
}
|
||||
|
||||
/**
 * 7.4 Inverse Transform
 *
 * In-place 8-point 1D inverse transform on shared memory.
 * The eight inputs are blocks[block][k * stride + offset] for k = 0..7 and the
 * eight results are written back to the same locations.
 */
void idct(uint block, uint offset, uint stride)
{
    /* Transform constants, kept as the exact literals used by the decoder */
    const float K_1_4142 = 1.4142134189605712891;
    const float K_0_3536 = 0.35355341434478759766;
    const float K_0_4619 = 0.46193981170654296875;
    const float K_0_1913 = 0.19134169816970825195;
    const float K_0_7071 = 0.70710682868957519531;
    const float K_2_6131 = 2.6131260395050048828;
    const float K_1_0824 = 1.0823919773101806641;
    const float K_0_4904 = 0.49039259552955627441;
    const float K_0_4157 = 0.41573479771614074707;
    const float K_0_2778 = 0.27778509259223937988;
    const float K_0_0975 = 0.097545139491558074951;

    /* Gather the eight input coefficients */
    float c[8];
    [[unroll]] for (uint k = 0; k < 8; ++k)
        c[k] = blocks[block][k*stride + offset];

    /* Terms built from the even-indexed inputs (0, 2, 4, 6) */
    float ev_p = c[6] * K_1_4142 + (c[2] - c[6]);
    float ev_m = c[6] * K_1_4142 - (c[2] - c[6]);

    float e0 = (c[0] + c[4]) * K_0_3536 + ev_p * K_0_4619;
    float e3 = (c[0] + c[4]) * K_0_3536 - ev_p * K_0_4619;

    float e2 = (c[0] - c[4]) * K_0_3536 + ev_m * K_0_1913;
    float e1 = (c[0] - c[4]) * K_0_3536 - ev_m * K_0_1913;

    /* Terms built from the odd-indexed inputs (1, 3, 5, 7) */
    float od_a = (c[3] - c[5]) * K_0_7071 + c[7];
    float od_b = (c[3] - c[5]) * K_0_7071 - c[7];

    float od_c = (c[5] - c[7]) *  K_1_4142 + (c[5] - c[7]) + (c[1] - c[3]);
    float od_d = (c[5] - c[7]) * -K_1_4142 + (c[5] - c[7]) + (c[1] - c[3]);

    float o0 = od_a *  K_2_6131 + od_c;
    float o3 = od_a * -K_2_6131 + od_c;

    float o1 = od_b *  K_1_0824 + od_d;
    float o2 = od_b * -K_1_0824 + od_d;

    /* Final butterflies: outputs k and 7 - k share the same even/odd pair */
    blocks[block][0*stride + offset] = o0 *  K_0_4904 + e0;
    blocks[block][7*stride + offset] = o0 * -K_0_4904 + e0;
    blocks[block][1*stride + offset] = o1 *  K_0_4157 + e1;
    blocks[block][6*stride + offset] = o1 * -K_0_4157 + e1;
    blocks[block][2*stride + offset] = o2 *  K_0_2778 + e2;
    blocks[block][5*stride + offset] = o2 * -K_0_2778 + e2;
    blocks[block][3*stride + offset] = o3 *  K_0_0975 + e3;
    blocks[block][4*stride + offset] = o3 * -K_0_0975 + e3;
}
|
||||
|
||||
/**
 * IDCT kernel entry point.
 *
 * Each invocation handles one column (lid.x & 7) of one 8x8 block: it loads
 * and scales eight coefficients into shared memory, takes part in a row pass
 * and a column pass of idct(), then rescales, clamps and writes eight samples
 * back to the same image positions.
 */
void main(void)
{
    uvec3 gid = gl_GlobalInvocationID;
    uvec3 lid = gl_LocalInvocationID;

    uint comp  = gid.z;                          /* plane index */
    uint block = (lid.y << 2) | (lid.x >> 3);    /* slot in the shared array */
    uint col   = lid.x & 0x7;                    /* column inside the block */

    uint chroma_shift = comp != 0 ? log2_chroma_w : 0;
    bool in_picture = gid.x < mb_width << (4 - chroma_shift);

    /* Coalesced load of DCT coeffs in shared memory, second part of inverse quantization */
    if (in_picture) {
        /**
         * According to spec indexing an array in push constant memory with
         * a non-dynamically uniform value is illegal ($15.9.1 in v1.4.326),
         * so copy the whole matrix locally.
         */
        uint8_t[64] qmat = comp == 0 ? qmat_luma : qmat_chroma;

        [[unroll]] for (uint row = 0; row < 8; ++row) {
            ivec2 px = ivec2(gid.x, (gid.y << 3) | row);
            /* Stored coefficients are 16-bit values; sign_extend() is defined elsewhere */
            int coeff = sign_extend(int(get_px(comp, px)), 16);
            blocks[block][row * 9 + col] = float(coeff * int(qmat[(row << 3) + col]));
        }
    }

    /* Row-wise iDCT: this invocation transforms row number `col` of its block */
    barrier();
    idct(block, col * 9, 1);

    /* Column-wise iDCT: this invocation transforms column number `col` */
    barrier();
    idct(block, col, 9);

    float fact = 1.0f / (1 << (12 - depth));
    float off = 1 << (depth - 1);
    int maxv = (1 << depth) - 1;

    /* 7.5.1 Color Component Samples. Rescale, clamp and write back to global memory */
    barrier();
    if (in_picture) {
        [[unroll]] for (uint row = 0; row < 8; ++row) {
            float sample = blocks[block][row * 9 + col] * fact + off;
            ivec2 px = ivec2(gid.x, (gid.y << 3) | row);
            put_px(comp, px, clamp(int(sample), 0, maxv));
        }
    }
}
|
||||
38
libavcodec/vulkan/prores_reset.comp
Normal file
38
libavcodec/vulkan/prores_reset.comp
Normal file
@@ -0,0 +1,38 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/**
 * Reset kernel entry point: writes zero to one texel of the luma plane and,
 * when the x coordinate lies inside the chroma width, to the same position of
 * both chroma planes.
 */
void main(void)
{
    const uvec4 zero = uvec4(0);
    uvec3 gid = gl_GlobalInvocationID;

#ifdef INTERLACED
    /* Address the row belonging to the current field */
    ivec2 pos = ivec2(gid.x, (gid.y << 1) + bottom_field);
#else
    ivec2 pos = ivec2(gid);
#endif

    /* Clear luma plane */
    imageStore(dst[0], pos, zero);

    /* Clear chroma planes, which may be narrower than luma */
    if (gid.x < mb_width << (4 - log2_chroma_w)) {
        imageStore(dst[1], pos, zero);
        imageStore(dst[2], pos, zero);
    }

    /* Alpha plane doesn't need a clear because it is not sparsely encoded */
}
|
||||
317
libavcodec/vulkan/prores_vld.comp
Normal file
317
libavcodec/vulkan/prores_vld.comp
Normal file
@@ -0,0 +1,317 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#define U8(x) (uint8_t (x))
|
||||
#define U16(x) (uint16_t(x))
|
||||
|
||||
/**
 * Store v (replicated into all four channels) into one texel of plane tex_idx.
 *
 * When INTERLACED is defined, pos.y is treated as a row inside the current
 * field and is remapped to frame row (pos.y * 2 + bottom_field) before the store.
 */
void put_px(uint tex_idx, ivec2 pos, uint v)
{
#ifdef INTERLACED
    pos = ivec2(pos.x, (pos.y << 1) + bottom_field);
#endif
    imageStore(dst[tex_idx], pos, uvec4(v));
}
|
||||
|
||||
/**
 * 7.5.3 Pixel Arrangement
 *
 * Convert a block index inside a slice into the pixel offset of that block's
 * top-left corner.
 *
 * With luma == 1, indices 0..3 map to block coordinates (0,0) (1,0) (0,1) (1,1),
 * and indices 4..7 continue two blocks further right.
 * With luma == 0, indices 0..3 map to (0,0) (0,1) (1,0) (1,1).
 * The block coordinates are then multiplied by 8.
 */
ivec2 pos_to_block(uint pos, uint luma)
{
    uint bx = ((pos & (-luma - 2)) + luma) >> 1;
    uint by = (pos >> luma) & 1;

    return ivec2(bx, by) << 3;
}
|
||||
|
||||
/**
 * 7.1.1.2 Signed Golomb Combination Codes
 *
 * Map an unsigned code to a two's-complement value held in a uint:
 * even x gives x / 2, odd x gives -(x / 2) - 1.
 */
uint to_signed(uint x)
{
    uint half_val = x >> 1;

    return (x & 1) != 0 ? ~half_val : half_val;
}
|
||||
|
||||
/**
 * 7.1.1.1 Golomb Combination Codes
 *
 * Read one codeword from gb. The codebook packs three 4-bit fields:
 * bits 0-3 hold last_rice_q, bits 4-7 hold krice and bits 8-11 hold kexp.
 *
 * q is 31 minus the MSB position of the next 32 bits. When q does not exceed
 * last_rice_q the value is Golomb-Rice coded, otherwise it is exp-Golomb coded.
 */
uint decode_codeword(inout GetBitContext gb, int codebook)
{
    int rice_limit = bitfieldExtract(codebook, 0, 4);
    int krice      = bitfieldExtract(codebook, 4, 4);
    int kexp       = bitfieldExtract(codebook, 8, 4);

    int q = 31 - findMSB(show_bits(gb, 32));

    if (q > rice_limit) {
        /* exp-Golomb encoding */
        int nbits = (q << 1) + kexp - rice_limit;
        return get_bits(gb, nbits) - (1 << kexp) + ((rice_limit + 1) << krice);
    }

    /* Golomb-Rice encoding: the mask clears the bit that terminates the prefix */
    return (get_bits(gb, krice + q + 1) & ~(1 << krice)) + (q << krice);
}
|
||||
|
||||
/**
 * Entropy-decode one color component of one slice and scatter the scaled
 * coefficients into the destination image.
 *
 * gb        bitstream reader positioned at the component's coded data
 * mb_pos    position of the slice's first macroblock, in macroblock units
 * mb_count  number of macroblocks in the slice
 * qscale    slice quantization scale applied to every coefficient
 *
 * The component index is gl_GlobalInvocationID.z; component 0 uses the luma
 * block layout. Values are stored masked to 16 bits.
 */
void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count, uint qscale)
{
    uvec3 gid = gl_GlobalInvocationID;
    uint is_luma = uint(gid.z == 0);
    uint chroma_shift = bool(is_luma) ? 0 : log2_chroma_w;

    uint num_blocks = mb_count << (2 - chroma_shift);
    ivec2 base_pos = ivec2(mb_pos.x << (4 - chroma_shift), mb_pos.y << 4);

    /* 7.1.1.3 DC Coefficients */
    {
        /* First coeff */
        uint c = to_signed(decode_codeword(gb, 0x650));
        put_px(gid.z, base_pos, (c * qscale) & 0xffff);

        /**
         * Table 9, encoded as (last_rice_q << 0) | (krice or kexp << 4) | ((kexp or kexp + 1) << 8)
         * According to the SMPTE document, abs(prev_dc_diff) should be used
         * to index the table, duplicating the entries removes the abs operation.
         */
        const uint16_t dc_codebook[] = { U16(0x100),
                                         U16(0x210), U16(0x210),
                                         U16(0x321), U16(0x321),
                                         U16(0x430), U16(0x430), };

        uint cw = 5, prev_dc_diff = 0;
        for (int i = 1; i < num_blocks; ++i) {
            cw = decode_codeword(gb, dc_codebook[min(cw, 6)]);

            /* s is all ones when the previous difference was negative; it flips the sign */
            int s = int(prev_dc_diff) >> 31;
            prev_dc_diff = (to_signed(cw) ^ s) - s;
            c += prev_dc_diff;

            put_px(gid.z, base_pos + pos_to_block(i, is_luma), (c * qscale) & 0xffff);
        }
    }

    /* 7.1.1.4 AC Coefficients */
    {
        /* Table 10 */
        const uint16_t ac_run_codebook  [] = { U16(0x102), U16(0x102), U16(0x101), U16(0x101),
                                               U16(0x100), U16(0x211), U16(0x211), U16(0x211),
                                               U16(0x211), U16(0x210), U16(0x210), U16(0x210),
                                               U16(0x210), U16(0x210), U16(0x210), U16(0x320), };

        /* Table 11 */
        const uint16_t ac_level_codebook[] = { U16(0x202), U16(0x101), U16(0x102), U16(0x100),
                                               U16(0x210), U16(0x210), U16(0x210), U16(0x210),
                                               U16(0x320) };

#ifndef INTERLACED
        /* Figure 4, encoded as (x << 0) | (y << 4) */
        const uint8_t scan_tbl[] = {
            U8(0x00), U8(0x01), U8(0x10), U8(0x11), U8(0x02), U8(0x03), U8(0x12), U8(0x13),
            U8(0x20), U8(0x21), U8(0x30), U8(0x31), U8(0x22), U8(0x23), U8(0x32), U8(0x33),
            U8(0x04), U8(0x05), U8(0x14), U8(0x24), U8(0x15), U8(0x06), U8(0x07), U8(0x16),
            U8(0x25), U8(0x34), U8(0x35), U8(0x26), U8(0x17), U8(0x27), U8(0x36), U8(0x37),
            U8(0x40), U8(0x41), U8(0x50), U8(0x60), U8(0x51), U8(0x42), U8(0x43), U8(0x52),
            U8(0x61), U8(0x70), U8(0x71), U8(0x62), U8(0x53), U8(0x44), U8(0x45), U8(0x54),
            U8(0x63), U8(0x72), U8(0x73), U8(0x64), U8(0x55), U8(0x46), U8(0x47), U8(0x56),
            U8(0x65), U8(0x74), U8(0x75), U8(0x66), U8(0x57), U8(0x67), U8(0x76), U8(0x77),
        };
#else
        /* Figure 5 */
        const uint8_t scan_tbl[] = {
            U8(0x00), U8(0x10), U8(0x01), U8(0x11), U8(0x20), U8(0x30), U8(0x21), U8(0x31),
            U8(0x02), U8(0x12), U8(0x03), U8(0x13), U8(0x22), U8(0x32), U8(0x23), U8(0x33),
            U8(0x40), U8(0x50), U8(0x41), U8(0x42), U8(0x51), U8(0x60), U8(0x70), U8(0x61),
            U8(0x52), U8(0x43), U8(0x53), U8(0x62), U8(0x71), U8(0x72), U8(0x63), U8(0x73),
            U8(0x04), U8(0x14), U8(0x05), U8(0x06), U8(0x15), U8(0x24), U8(0x34), U8(0x25),
            U8(0x16), U8(0x07), U8(0x17), U8(0x26), U8(0x35), U8(0x44), U8(0x54), U8(0x45),
            U8(0x36), U8(0x27), U8(0x37), U8(0x46), U8(0x55), U8(0x64), U8(0x74), U8(0x65),
            U8(0x56), U8(0x47), U8(0x57), U8(0x66), U8(0x75), U8(0x76), U8(0x67), U8(0x77),
        };
#endif

        /* num_blocks is a power of two, so pos splits into (scan index, block index) */
        uint block_mask  = num_blocks - 1;
        uint block_shift = findLSB(num_blocks);

        uint pos = num_blocks - 1, run = 4, level = 1, s;
        while (pos < num_blocks << 6) {
            /* Stop when the data is exhausted or only zero padding remains */
            int left = left_bits(gb);
            if (left <= 0 || (left < 32 && show_bits(gb, left) == 0))
                break;

            run   = decode_codeword(gb, ac_run_codebook  [min(run,   15)]);
            level = decode_codeword(gb, ac_level_codebook[min(level, 8 )]);
            s     = get_bits(gb, 1);

            pos += run + 1;

            /**
             * A damaged stream can code a run that lands past the last
             * coefficient. Without this check scan_tbl would be indexed
             * beyond its 64 entries and the store would fall outside the slice.
             */
            if (pos >= num_blocks << 6)
                break;

            uint bidx = pos & block_mask, scan = scan_tbl[pos >> block_shift];
            ivec2 spos = pos_to_block(bidx, is_luma);
            ivec2 bpos = ivec2(scan & 0xf, scan >> 4);

            /* level + 1 is the magnitude; s == 1 negates it */
            uint c = ((level + 1) ^ -s) + s;
            put_px(gid.z, base_pos + spos + bpos, (c * qscale) & 0xffff);
        }
    }
}
|
||||
|
||||
/**
 * 7.1.2 Scanned Alpha
 *
 * Decode the run-length coded alpha data of one slice and write it to plane 3.
 *
 * gb        bitstream reader positioned at the alpha data
 * mb_pos    position of the slice's first macroblock, in macroblock units
 * mb_count  number of macroblocks in the slice
 *
 * Samples are visited in raster order across the slice (mb_count * 16 wide).
 * Each step decodes an alpha difference followed by a run length.
 */
void decode_alpha(in GetBitContext gb, uvec2 mb_pos, uint mb_count)
{
    uvec3 gid = gl_GlobalInvocationID;

    ivec2 origin = ivec2(mb_pos) << 4;

    /* Slice width in pixels is mb_count << 4; split pos into (x, y) with it */
    uint row_shift = findMSB(mb_count) + 4;
    uint row_mask = (1 << row_shift) - 1;

    /* alpha_info == 1 gives an 8-bit mask, alpha_info == 2 a 16-bit mask */
    uint mask = (1 << (4 << alpha_info)) - 1;
    uint num_values = (mb_count << 4) * min(height - (gid.y << 4), 16);

    /* Lengths of the short difference code and of the fixed-length escape */
    int num_cw_bits = alpha_info == 1 ? 5 : 8;
    int num_flc_bits = alpha_info == 1 ? 9 : 17;

    uint alpha_rescale_lshift = alpha_info == 1 ? depth - 8 : 16;
    uint alpha_rescale_rshift = 16 - depth;

    uint alpha = -1;
    uint pos = 0;
    while (pos < num_values) {
        uint diff, run;

        /* Decode run value (Tables 13/14) */
        uint vbits = show_bits(gb, num_cw_bits);
        uint vq = num_cw_bits - 1 - findMSB(vbits);
        if (vq == 0) {
            /* Leading bit set: fixed-length value; the leading bit is masked off below */
            diff = get_bits(gb, num_flc_bits);
        } else {
            /* Short code: magnitude in the upper bits, sign in the lowest bit */
            uint mag = (vbits >> 1) + 1;
            uint neg = vbits & 1;
            diff = (mag ^ -neg) + neg;
            skip_bits(gb, num_cw_bits);
        }
        alpha = (alpha + diff) & mask;

        /* Decode run length (Table 12) */
        uint rbits = show_bits(gb, 5);
        uint rq = 4 - findMSB(rbits);
        if (rq > 4) {
            /* All five bits are zero: long run taken from a 16-bit field */
            run = get_bits(gb, 16) + 1;
        } else if (rq != 0) {
            run = rbits + 1;
            skip_bits(gb, 5);
        } else {
            run = 1;
            skip_bits(gb, 1);
        }
        /* Never run past the end of the slice */
        run = min(run, num_values - pos);

        /**
         * FFmpeg doesn't support color and alpha with different precision,
         * so we need to rescale to the color range.
         */
        uint val = (alpha << alpha_rescale_lshift) | (alpha >> alpha_rescale_rshift);
        uint px = val & 0xffff;

        for (uint k = 0; k < run; ++k) {
            put_px(3, origin + ivec2(pos & row_mask, pos >> row_shift), px);
            ++pos;
        }
    }
}
|
||||
|
||||
/**
 * VLD kernel entry point.
 *
 * One invocation decodes one component (gid.z: 0..2 color, 3 alpha) of one
 * slice (gid.x, gid.y in the slice grid). It parses the slice header, checks
 * that the component's coded data lies inside the slice, works out which
 * macroblocks the slice covers and then runs the entropy decoder.
 */
void main(void)
{
    uvec3 gid = gl_GlobalInvocationID;
    if (gid.x >= slice_width || gid.y >= slice_height)
        return;

    uint slice_idx = gid.y * slice_width + gid.x;
    uint slice_off = slice_offsets[slice_idx],
         slice_size = slice_offsets[slice_idx + 1] - slice_off;

    u8buf bs = u8buf(slice_data + slice_off);

    /* Decode slice header */
    uint hdr_size, y_size, u_size, v_size, a_size;
    hdr_size = bs[0].v >> 3;

    /* Table 15 */
    uint qidx = clamp(bs[1].v, 1, 224),
         qscale = qidx > 128 ? (qidx - 96) << 2 : qidx;

    y_size = (uint(bs[2].v) << 8) | bs[3].v;
    u_size = (uint(bs[4].v) << 8) | bs[5].v;

    /**
     * The alpha_info field can be 0 even when an alpha plane is present,
     * if skip_alpha is enabled, so use the header size instead.
     */
    if (hdr_size > 6)
        v_size = (uint(bs[6].v) << 8) | bs[7].v;
    else
        v_size = slice_size - hdr_size - y_size - u_size;

    a_size = slice_size - hdr_size - y_size - u_size - v_size;

    /* Locate the data of this invocation's component inside the slice */
    uint plane_off, plane_size;
    switch (gid.z) {
    case 0:
        plane_off  = hdr_size;
        plane_size = y_size;
        break;
    case 1:
        plane_off  = hdr_size + y_size;
        plane_size = u_size;
        break;
    case 2:
        plane_off  = hdr_size + y_size + u_size;
        plane_size = v_size;
        break;
    case 3:
        plane_off  = hdr_size + y_size + u_size + v_size;
        plane_size = a_size;
        break;
    default:
        /* No such component; the bitstream reader would be left uninitialized */
        return;
    }

    /**
     * The sizes above come straight from the bitstream. A damaged header can
     * claim more data than the slice holds, and the unsigned subtractions
     * above can wrap around. Skip the component rather than read outside
     * the slice. The comparison is written so that it cannot overflow.
     */
    if (plane_off > slice_size || plane_size > slice_size - plane_off)
        return;

    GetBitContext gb;
    init_get_bits(gb, u8buf(bs + plane_off), int(plane_size));

    /**
     * Support for the grayscale "extension" in the prores_aw encoder.
     * According to the spec, entropy coded data should never be empty,
     * and instead contain at least the DC coefficients.
     * This avoids undefined behavior.
     */
    if (left_bits(gb) == 0)
        return;

    /**
     * 4 ProRes Frame Structure
     * ProRes tiles pictures into a grid of slices, whose size is determined
     * by the log2_slice_width parameter (height is always 1 MB).
     * Each slice has a width of (1 << log2_slice_width) MBs, until the picture
     * cannot accommodate a full one. At this point, the remaining space
     * is recursively completed using the first smaller power of two that fits
     * (see Figure 1).
     * The maximum number of extra slices is 3, when log2_slice_width is 3,
     * with sizes 4, 2 and 1 MBs.
     * The mb_width parameter therefore also represents the number of full slices,
     * when interpreted as a fixed-point number with log2_slice_width fractional bits.
     */
    uint frac = bitfieldExtract(uint(mb_width), 0, log2_slice_width),
         num_extra = bitCount(frac);

    uint diff = slice_width - gid.x - 1,
         off = max(int(diff - num_extra + 1) << 2, 0);

    uint log2_width = min(findLSB(frac - diff >> diff) + diff + off, log2_slice_width);

    /* Note: the shift amount below is (log2_width + 1) */
    uint mb_x = (min(gid.x, slice_width - num_extra) << log2_slice_width) +
                (frac & (0xf << (log2_width + 1))),
         mb_y = gid.y;
    uint mb_count = 1 << log2_width;

    if (gid.z < 3) {
        /* Color entropy decoding, inverse scanning, first part of inverse quantization */
        decode_comp(gb, uvec2(mb_x, mb_y), mb_count, qscale);
    } else {
        /* Alpha entropy decoding */
        decode_alpha(gb, uvec2(mb_x, mb_y), mb_count);
    }
}
|
||||
@@ -26,7 +26,8 @@
|
||||
|
||||
#define DECODER_IS_SDR(codec_id) \
|
||||
(((codec_id) == AV_CODEC_ID_FFV1) || \
|
||||
((codec_id) == AV_CODEC_ID_PRORES_RAW))
|
||||
((codec_id) == AV_CODEC_ID_PRORES_RAW) || \
|
||||
((codec_id) == AV_CODEC_ID_PRORES))
|
||||
|
||||
#if CONFIG_H264_VULKAN_HWACCEL
|
||||
extern const FFVulkanDecodeDescriptor ff_vk_dec_h264_desc;
|
||||
@@ -46,6 +47,9 @@ extern const FFVulkanDecodeDescriptor ff_vk_dec_ffv1_desc;
|
||||
#if CONFIG_PRORES_RAW_VULKAN_HWACCEL
|
||||
extern const FFVulkanDecodeDescriptor ff_vk_dec_prores_raw_desc;
|
||||
#endif
|
||||
#if CONFIG_PRORES_VULKAN_HWACCEL
|
||||
extern const FFVulkanDecodeDescriptor ff_vk_dec_prores_desc;
|
||||
#endif
|
||||
|
||||
static const FFVulkanDecodeDescriptor *dec_descs[] = {
|
||||
#if CONFIG_H264_VULKAN_HWACCEL
|
||||
@@ -66,6 +70,9 @@ static const FFVulkanDecodeDescriptor *dec_descs[] = {
|
||||
#if CONFIG_PRORES_RAW_VULKAN_HWACCEL
|
||||
&ff_vk_dec_prores_raw_desc,
|
||||
#endif
|
||||
#if CONFIG_PRORES_VULKAN_HWACCEL
|
||||
&ff_vk_dec_prores_desc,
|
||||
#endif
|
||||
};
|
||||
|
||||
typedef struct FFVulkanDecodeProfileData {
|
||||
|
||||
541
libavcodec/vulkan_prores.c
Normal file
541
libavcodec/vulkan_prores.c
Normal file
@@ -0,0 +1,541 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "proresdec.h"
|
||||
#include "vulkan_decode.h"
|
||||
#include "hwaccel_internal.h"
|
||||
#include "libavutil/mem.h"
|
||||
#include "libavutil/vulkan.h"
|
||||
#include "libavutil/vulkan_loader.h"
|
||||
#include "libavutil/vulkan_spirv.h"
|
||||
|
||||
extern const char *ff_source_common_comp;
|
||||
extern const char *ff_source_prores_reset_comp;
|
||||
extern const char *ff_source_prores_vld_comp;
|
||||
extern const char *ff_source_prores_idct_comp;
|
||||
|
||||
const FFVulkanDecodeDescriptor ff_vk_dec_prores_desc = {
|
||||
.codec_id = AV_CODEC_ID_PRORES,
|
||||
.queue_flags = VK_QUEUE_COMPUTE_BIT,
|
||||
};
|
||||
|
||||
typedef struct ProresVulkanDecodePicture {
|
||||
FFVulkanDecodePicture vp;
|
||||
|
||||
AVBufferRef *slice_offset_buf;
|
||||
uint32_t slice_num;
|
||||
|
||||
uint32_t bitstream_start;
|
||||
uint32_t bitstream_size;
|
||||
} ProresVulkanDecodePicture;
|
||||
|
||||
typedef struct ProresVulkanDecodeContext {
|
||||
struct ProresVulkanShaderVariants {
|
||||
FFVulkanShader reset;
|
||||
FFVulkanShader vld;
|
||||
FFVulkanShader idct;
|
||||
} shaders[2]; /* Progressive/interlaced */
|
||||
|
||||
AVBufferPool *slice_offset_pool;
|
||||
} ProresVulkanDecodeContext;
|
||||
|
||||
typedef struct ProresVkParameters {
|
||||
VkDeviceAddress slice_data;
|
||||
uint32_t bitstream_size;
|
||||
|
||||
uint16_t width;
|
||||
uint16_t height;
|
||||
uint16_t mb_width;
|
||||
uint16_t mb_height;
|
||||
uint16_t slice_width;
|
||||
uint16_t slice_height;
|
||||
uint8_t log2_slice_width;
|
||||
uint8_t log2_chroma_w;
|
||||
uint8_t depth;
|
||||
uint8_t alpha_info;
|
||||
uint8_t bottom_field;
|
||||
|
||||
uint8_t qmat_luma [64];
|
||||
uint8_t qmat_chroma[64];
|
||||
} ProresVkParameters;
|
||||
|
||||
static int vk_prores_start_frame(AVCodecContext *avctx,
|
||||
const AVBufferRef *buffer_ref,
|
||||
av_unused const uint8_t *buffer,
|
||||
av_unused uint32_t size)
|
||||
{
|
||||
ProresContext *pr = avctx->priv_data;
|
||||
FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data;
|
||||
FFVulkanDecodeShared *ctx = dec->shared_ctx;
|
||||
ProresVulkanDecodeContext *pv = ctx->sd_ctx;
|
||||
ProresVulkanDecodePicture *pp = pr->hwaccel_picture_private;
|
||||
FFVulkanDecodePicture *vp = &pp->vp;
|
||||
|
||||
int err;
|
||||
|
||||
/* Host map the input slices data if supported */
|
||||
if (!vp->slices_buf && ctx->s.extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY)
|
||||
RET(ff_vk_host_map_buffer(&ctx->s, &vp->slices_buf, buffer_ref->data,
|
||||
buffer_ref,
|
||||
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
|
||||
VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT));
|
||||
|
||||
/* Allocate slice offsets buffer */
|
||||
RET(ff_vk_get_pooled_buffer(&ctx->s, &pv->slice_offset_pool,
|
||||
&pp->slice_offset_buf,
|
||||
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
|
||||
VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
|
||||
NULL, (pr->slice_count + 1) * sizeof(uint32_t),
|
||||
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
|
||||
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT));
|
||||
|
||||
/* Prepare frame to be used */
|
||||
RET(ff_vk_decode_prepare_frame_sdr(dec, pr->frame, vp, 1,
|
||||
FF_VK_REP_NATIVE, 0));
|
||||
|
||||
pp->slice_num = 0;
|
||||
pp->bitstream_start = pp->bitstream_size = 0;
|
||||
|
||||
fail:
|
||||
return err;
|
||||
}
|
||||
|
||||
/**
 * Register one slice of the current picture.
 *
 * Records the slice's start and end byte offsets in the slice offset buffer.
 * When the input is not host-mapped, the slice data is also handed to
 * ff_vk_decode_add_slice().
 *
 * @param avctx codec context
 * @param data  start of the slice data
 * @param size  size of the slice data in bytes
 * @return 0 on success, or the negative error code from ff_vk_decode_add_slice()
 */
static int vk_prores_decode_slice(AVCodecContext *avctx,
                                  const uint8_t  *data,
                                  uint32_t        size)
{
    ProresContext *pr = avctx->priv_data;
    ProresVulkanDecodePicture *pp = pr->hwaccel_picture_private;
    FFVulkanDecodePicture *vp = &pp->vp;

    FFVkBuffer *slice_offset = (FFVkBuffer *)pp->slice_offset_buf->data;
    FFVkBuffer *slices_buf = vp->slices_buf ? (FFVkBuffer *)vp->slices_buf->data : NULL;
    const int host_mapped = slices_buf && slices_buf->host_ref;

    /* Skip picture header: with a host-mapped packet, offsets are relative to its start */
    if (host_mapped && !pp->slice_num)
        pp->bitstream_size = data - slices_buf->mapped_mem;

    /* Start offset of this slice */
    AV_WN32(slice_offset->mapped_mem + (pp->slice_num + 0) * sizeof(uint32_t),
            pp->bitstream_size);

    /* Advance outside the macro call so the argument has no side effect */
    pp->bitstream_size += size;

    /* End offset of this slice, which is also the start of the next one */
    AV_WN32(slice_offset->mapped_mem + (pp->slice_num + 1) * sizeof(uint32_t),
            pp->bitstream_size);

    if (!host_mapped) {
        /*
         * NOTE(review): &pp->slice_num is passed in, presumably so the helper
         * advances the slice counter itself — confirm against its definition.
         */
        int err = ff_vk_decode_add_slice(avctx, vp, data, size, 0,
                                         &pp->slice_num, NULL);
        if (err < 0)
            return err;
    } else {
        pp->slice_num++;
    }

    return 0;
}
|
||||
|
||||
static int vk_prores_end_frame(AVCodecContext *avctx)
|
||||
{
|
||||
ProresContext *pr = avctx->priv_data;
|
||||
FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data;
|
||||
FFVulkanDecodeShared *ctx = dec->shared_ctx;
|
||||
FFVulkanFunctions *vk = &ctx->s.vkfn;
|
||||
ProresVulkanDecodeContext *pv = ctx->sd_ctx;
|
||||
ProresVulkanDecodePicture *pp = pr->hwaccel_picture_private;
|
||||
FFVulkanDecodePicture *vp = &pp->vp;
|
||||
|
||||
ProresVkParameters pd;
|
||||
FFVkBuffer *slice_data, *slice_offsets;
|
||||
struct ProresVulkanShaderVariants *shaders;
|
||||
VkImageMemoryBarrier2 img_bar[AV_NUM_DATA_POINTERS];
|
||||
VkBufferMemoryBarrier2 buf_bar[2];
|
||||
int nb_img_bar = 0, nb_buf_bar = 0, err;
|
||||
const AVPixFmtDescriptor *pix_desc;
|
||||
|
||||
if (!pp->slice_num)
|
||||
return 0;
|
||||
|
||||
pix_desc = av_pix_fmt_desc_get(avctx->sw_pix_fmt);
|
||||
if (!pix_desc)
|
||||
return AVERROR(EINVAL);
|
||||
|
||||
slice_data = (FFVkBuffer *)vp->slices_buf->data;
|
||||
slice_offsets = (FFVkBuffer *)pp->slice_offset_buf->data;
|
||||
|
||||
shaders = &pv->shaders[pr->frame_type != 0];
|
||||
|
||||
pd = (ProresVkParameters) {
|
||||
.slice_data = slice_data->address,
|
||||
.bitstream_size = pp->bitstream_size,
|
||||
|
||||
.width = avctx->width,
|
||||
.height = avctx->height,
|
||||
.mb_width = pr->mb_width,
|
||||
.mb_height = pr->mb_height,
|
||||
.slice_width = pr->slice_count / pr->mb_height,
|
||||
.slice_height = pr->mb_height,
|
||||
.log2_slice_width = av_log2(pr->slice_mb_width),
|
||||
.log2_chroma_w = pix_desc->log2_chroma_w,
|
||||
.depth = avctx->bits_per_raw_sample,
|
||||
.alpha_info = pr->alpha_info,
|
||||
.bottom_field = pr->first_field ^ (pr->frame_type == 1),
|
||||
};
|
||||
|
||||
memcpy(pd.qmat_luma, pr->qmat_luma, sizeof(pd.qmat_luma ));
|
||||
memcpy(pd.qmat_chroma, pr->qmat_chroma, sizeof(pd.qmat_chroma));
|
||||
|
||||
FFVkExecContext *exec = ff_vk_exec_get(&ctx->s, &ctx->exec_pool);
|
||||
RET(ff_vk_exec_start(&ctx->s, exec));
|
||||
|
||||
/* Prepare deps */
|
||||
RET(ff_vk_exec_add_dep_frame(&ctx->s, exec, pr->frame,
|
||||
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
|
||||
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
|
||||
|
||||
RET(ff_vk_exec_mirror_sem_value(&ctx->s, exec, &vp->sem, &vp->sem_value,
|
||||
pr->frame));
|
||||
|
||||
RET(ff_vk_exec_add_dep_buf(&ctx->s, exec,
|
||||
(AVBufferRef *[]){ vp->slices_buf, pp->slice_offset_buf },
|
||||
2, 0));
|
||||
|
||||
/* Transfer ownership to the exec context */
|
||||
vp->slices_buf = pp->slice_offset_buf = NULL;
|
||||
|
||||
/* Input frame barrier */
|
||||
ff_vk_frame_barrier(&ctx->s, exec, pr->frame, img_bar, &nb_img_bar,
|
||||
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
|
||||
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
|
||||
VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
|
||||
VK_IMAGE_LAYOUT_GENERAL,
|
||||
VK_QUEUE_FAMILY_IGNORED);
|
||||
|
||||
vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
|
||||
.sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
|
||||
.pBufferMemoryBarriers = buf_bar,
|
||||
.bufferMemoryBarrierCount = nb_buf_bar,
|
||||
.pImageMemoryBarriers = img_bar,
|
||||
.imageMemoryBarrierCount = nb_img_bar,
|
||||
});
|
||||
nb_img_bar = nb_buf_bar = 0;
|
||||
|
||||
/* Reset */
|
||||
ff_vk_shader_update_img_array(&ctx->s, exec, &shaders->reset,
|
||||
pr->frame, vp->view.out,
|
||||
0, 0,
|
||||
VK_IMAGE_LAYOUT_GENERAL,
|
||||
VK_NULL_HANDLE);
|
||||
|
||||
ff_vk_shader_update_push_const(&ctx->s, exec, &shaders->reset,
|
||||
VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
0, sizeof(pd), &pd);
|
||||
|
||||
ff_vk_exec_bind_shader(&ctx->s, exec, &shaders->reset);
|
||||
|
||||
vk->CmdDispatch(exec->buf, pr->mb_width << 1, pr->mb_height << 1, 1);
|
||||
|
||||
/* Input frame barrier after reset */
|
||||
ff_vk_frame_barrier(&ctx->s, exec, pr->frame, img_bar, &nb_img_bar,
|
||||
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
|
||||
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
|
||||
VK_ACCESS_SHADER_WRITE_BIT,
|
||||
VK_IMAGE_LAYOUT_GENERAL,
|
||||
VK_QUEUE_FAMILY_IGNORED);
|
||||
|
||||
vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
|
||||
.sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
|
||||
.pBufferMemoryBarriers = buf_bar,
|
||||
.bufferMemoryBarrierCount = nb_buf_bar,
|
||||
.pImageMemoryBarriers = img_bar,
|
||||
.imageMemoryBarrierCount = nb_img_bar,
|
||||
});
|
||||
nb_img_bar = nb_buf_bar = 0;
|
||||
|
||||
/* Entropy decode */
|
||||
ff_vk_shader_update_desc_buffer(&ctx->s, exec, &shaders->vld,
|
||||
0, 0, 0,
|
||||
slice_offsets,
|
||||
0, (pp->slice_num + 1) * sizeof(uint32_t),
|
||||
VK_FORMAT_UNDEFINED);
|
||||
ff_vk_shader_update_img_array(&ctx->s, exec, &shaders->vld,
|
||||
pr->frame, vp->view.out,
|
||||
0, 1,
|
||||
VK_IMAGE_LAYOUT_GENERAL,
|
||||
VK_NULL_HANDLE);
|
||||
|
||||
ff_vk_shader_update_push_const(&ctx->s, exec, &shaders->vld,
|
||||
VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
0, sizeof(pd), &pd);
|
||||
|
||||
ff_vk_exec_bind_shader(&ctx->s, exec, &shaders->vld);
|
||||
|
||||
vk->CmdDispatch(exec->buf, AV_CEIL_RSHIFT(pr->slice_count / pr->mb_height, 3), AV_CEIL_RSHIFT(pr->mb_height, 3),
|
||||
3 + !!pr->alpha_info);
|
||||
|
||||
/* Synchronize vld and idct shaders */
|
||||
nb_img_bar = 0;
|
||||
ff_vk_frame_barrier(&ctx->s, exec, pr->frame, img_bar, &nb_img_bar,
|
||||
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
|
||||
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
|
||||
VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
|
||||
VK_IMAGE_LAYOUT_GENERAL,
|
||||
VK_QUEUE_FAMILY_IGNORED);
|
||||
|
||||
vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
|
||||
.sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
|
||||
.pBufferMemoryBarriers = buf_bar,
|
||||
.bufferMemoryBarrierCount = nb_buf_bar,
|
||||
.pImageMemoryBarriers = img_bar,
|
||||
.imageMemoryBarrierCount = nb_img_bar,
|
||||
});
|
||||
nb_img_bar = nb_buf_bar = 0;
|
||||
|
||||
/* Inverse transform */
|
||||
ff_vk_shader_update_img_array(&ctx->s, exec, &shaders->idct,
|
||||
pr->frame, vp->view.out,
|
||||
0, 0,
|
||||
VK_IMAGE_LAYOUT_GENERAL,
|
||||
VK_NULL_HANDLE);
|
||||
|
||||
ff_vk_exec_bind_shader(&ctx->s, exec, &shaders->idct);
|
||||
|
||||
ff_vk_shader_update_push_const(&ctx->s, exec, &shaders->idct,
|
||||
VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
0, sizeof(pd), &pd);
|
||||
|
||||
vk->CmdDispatch(exec->buf, AV_CEIL_RSHIFT(pr->mb_width, 1), pr->mb_height, 3);
|
||||
|
||||
RET(ff_vk_exec_submit(&ctx->s, exec));
|
||||
|
||||
fail:
|
||||
return err;
|
||||
}
|
||||
|
||||
/**
 * Append the GLSL push constant block declaration to the shader source and
 * register a compute-stage push constant range of
 * sizeof(ProresVkParameters) bytes at offset 0.
 *
 * The block uses the scalar layout; its members are emitted in a fixed
 * order which is presumably meant to mirror ProresVkParameters field by
 * field (the struct is declared outside this function - keep both in sync).
 *
 * @param shd shader being built; its source buffer is appended to
 * @return the result of ff_vk_shader_add_push_const()
 */
static int add_push_data(FFVulkanShader *shd)
{
    int ret;

    GLSLC(0, layout(push_constant, scalar) uniform pushConstants {  );

    /* Bitstream location */
    GLSLC(1,    u8buf    slice_data;                                );
    GLSLC(1,    uint     bitstream_size;                            );
    GLSLC(0,                                                        );

    /* Picture geometry and format */
    GLSLC(1,    uint16_t width;                                     );
    GLSLC(1,    uint16_t height;                                    );
    GLSLC(1,    uint16_t mb_width;                                  );
    GLSLC(1,    uint16_t mb_height;                                 );
    GLSLC(1,    uint16_t slice_width;                               );
    GLSLC(1,    uint16_t slice_height;                              );
    GLSLC(1,    uint8_t  log2_slice_width;                          );
    GLSLC(1,    uint8_t  log2_chroma_w;                             );
    GLSLC(1,    uint8_t  depth;                                     );
    GLSLC(1,    uint8_t  alpha_info;                                );
    GLSLC(1,    uint8_t  bottom_field;                              );
    GLSLC(0,                                                        );

    /* Quantization matrices */
    GLSLC(1,    uint8_t  qmat_luma  [8*8];                          );
    GLSLC(1,    uint8_t  qmat_chroma[8*8];                          );
    GLSLC(0, };                                                     );

    ret = ff_vk_shader_add_push_const(shd, 0, sizeof(ProresVkParameters),
                                      VK_SHADER_STAGE_COMPUTE_BIT);
    return ret;
}
|
||||
|
||||
static int init_shader(AVCodecContext *avctx, FFVulkanContext *s,
|
||||
FFVkExecPool *pool, FFVkSPIRVCompiler *spv,
|
||||
FFVulkanShader *shd, const char *name, const char *entrypoint,
|
||||
FFVulkanDescriptorSetBinding *descs, int num_descs,
|
||||
const char *source, int local_size, int interlaced)
|
||||
{
|
||||
uint8_t *spv_data;
|
||||
size_t spv_len;
|
||||
void *spv_opaque = NULL;
|
||||
int err;
|
||||
|
||||
RET(ff_vk_shader_init(s, shd, name,
|
||||
VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
(const char *[]) { "GL_EXT_buffer_reference",
|
||||
"GL_EXT_buffer_reference2" }, 2,
|
||||
local_size >> 16 & 0xff, local_size >> 8 & 0xff, local_size >> 0 & 0xff,
|
||||
0));
|
||||
|
||||
/* Common code */
|
||||
GLSLD(ff_source_common_comp);
|
||||
|
||||
/* Push constants layout */
|
||||
RET(add_push_data(shd));
|
||||
|
||||
RET(ff_vk_shader_add_descriptor_set(s, shd, descs, num_descs, 0, 0));
|
||||
|
||||
if (interlaced)
|
||||
av_bprintf(&shd->src, "#define INTERLACED\n");
|
||||
|
||||
/* Main code */
|
||||
GLSLD(source);
|
||||
|
||||
RET(spv->compile_shader(s, spv, shd, &spv_data, &spv_len, entrypoint,
|
||||
&spv_opaque));
|
||||
RET(ff_vk_shader_link(s, shd, spv_data, spv_len, entrypoint));
|
||||
|
||||
RET(ff_vk_shader_register_exec(s, pool, shd));
|
||||
|
||||
fail:
|
||||
if (spv_opaque)
|
||||
spv->free_shader(spv, &spv_opaque);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * Free the ProRes-specific decoder state stored in ctx->sd_ctx: every shader
 * of every variant, the slice offset buffer pool, and the state structure
 * itself.
 */
static void vk_decode_prores_uninit(FFVulkanDecodeShared *ctx)
{
    ProresVulkanDecodeContext *pv = ctx->sd_ctx;

    for (int n = 0; n < FF_ARRAY_ELEMS(pv->shaders); n++) {
        struct ProresVulkanShaderVariants *variant = &pv->shaders[n];

        ff_vk_shader_free(&ctx->s, &variant->reset);
        ff_vk_shader_free(&ctx->s, &variant->vld);
        ff_vk_shader_free(&ctx->s, &variant->idct);
    }

    av_buffer_pool_uninit(&pv->slice_offset_pool);

    /* Only the local pointer is cleared; ctx->sd_ctx is left as is */
    av_freep(&pv);
}
|
||||
|
||||
/**
 * hwaccel init callback: set up the shared Vulkan decode context and compile
 * the reset/vld/idct compute shaders, once for each entry of pv->shaders
 * (progressive and interlaced).
 *
 * The SPIR-V compiler instance is only needed during initialization and is
 * released on every exit path after it was created.
 *
 * @return 0 on success, a negative error code on failure
 */
static int vk_decode_prores_init(AVCodecContext *avctx)
{
    FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data;
    FFVulkanDecodeShared *ctx = NULL;

    AVHWFramesContext *out_frames_ctx;
    ProresVulkanDecodeContext *pv;
    FFVkSPIRVCompiler *spv;
    FFVulkanDescriptorSetBinding *desc_set;
    int max_num_slices, i, err;

    /* Upper bound used to size the slice offset array declared in the vld
     * shader: one slice per 16x16 block of the coded picture */
    max_num_slices = (avctx->coded_width >> 4) * (avctx->coded_height >> 4);

    spv = ff_vk_spirv_init();
    if (!spv) {
        av_log(avctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n");
        return AVERROR_EXTERNAL;
    }

    err = ff_vk_decode_init(avctx);
    if (err < 0)
        goto fail; /* do not leak the SPIR-V compiler */
    ctx = dec->shared_ctx;

    pv = ctx->sd_ctx = av_mallocz(sizeof(*pv));
    if (!pv) {
        err = AVERROR(ENOMEM);
        goto fail;
    }

    out_frames_ctx = (AVHWFramesContext *)avctx->hw_frames_ctx->data;

    /* From here on pv is released through the shared context */
    ctx->sd_ctx_free = vk_decode_prores_uninit;

    for (i = 0; i < FF_ARRAY_ELEMS(pv->shaders); ++i) { /* Progressive/interlaced */
        struct ProresVulkanShaderVariants *shaders = &pv->shaders[i];

        /* Reset shader: output planes only */
        desc_set = (FFVulkanDescriptorSetBinding []) {
            {
                .name       = "dst",
                .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
                .dimensions = 2,
                .mem_layout = ff_vk_shader_rep_fmt(out_frames_ctx->sw_format,
                                                   FF_VK_REP_NATIVE),
                .mem_quali  = "writeonly",
                .elems      = av_pix_fmt_count_planes(out_frames_ctx->sw_format),
                .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
            },
        };
        RET(init_shader(avctx, &ctx->s, &ctx->exec_pool, spv, &shaders->reset,
                        "prores_dec_reset", "main", desc_set, 1,
                        ff_source_prores_reset_comp, 0x080801, i));

        /* Entropy decode shader: slice offset table + output planes */
        desc_set = (FFVulkanDescriptorSetBinding []) {
            {
                .name        = "slice_offsets_buf",
                .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
                .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
                .mem_quali   = "readonly",
                .buf_content = "uint32_t slice_offsets",
                .buf_elems   = max_num_slices + 1,
            },
            {
                .name       = "dst",
                .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
                .dimensions = 2,
                .mem_layout = ff_vk_shader_rep_fmt(out_frames_ctx->sw_format,
                                                   FF_VK_REP_NATIVE),
                .mem_quali  = "writeonly",
                .elems      = av_pix_fmt_count_planes(out_frames_ctx->sw_format),
                .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
            },
        };
        RET(init_shader(avctx, &ctx->s, &ctx->exec_pool, spv, &shaders->vld,
                        "prores_dec_vld", "main", desc_set, 2,
                        ff_source_prores_vld_comp, 0x080801, i));

        /* Inverse transform shader: output planes, no memory qualifier */
        desc_set = (FFVulkanDescriptorSetBinding []) {
            {
                .name       = "dst",
                .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
                .dimensions = 2,
                .mem_layout = ff_vk_shader_rep_fmt(out_frames_ctx->sw_format,
                                                   FF_VK_REP_NATIVE),
                .elems      = av_pix_fmt_count_planes(out_frames_ctx->sw_format),
                .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
            },
        };
        RET(init_shader(avctx, &ctx->s, &ctx->exec_pool, spv, &shaders->idct,
                        "prores_dec_idct", "main", desc_set, 1,
                        ff_source_prores_idct_comp, 0x200201, i));
    }

    err = 0;

fail:
    spv->uninit(&spv);

    return err;
}
|
||||
|
||||
/**
 * Free callback for the per-picture hwaccel private data.
 *
 * Releases the generic Vulkan decode picture state and the slice offset
 * buffer reference. The latter is normally handed over to the exec context
 * in vk_prores_end_frame(), which clears the pointer; if end_frame bailed
 * out before that point (or never ran), the reference is still owned by the
 * picture and must be dropped here.
 *
 * @param _hwctx opaque carrying the AVHWDeviceContext
 * @param data   the ProresVulkanDecodePicture being destroyed
 */
static void vk_prores_free_frame_priv(AVRefStructOpaque _hwctx, void *data)
{
    AVHWDeviceContext *dev_ctx = _hwctx.nc;
    ProresVulkanDecodePicture *pp = data;

    ff_vk_decode_free_frame(dev_ctx, &pp->vp);

    /* No-op when ownership was already transferred (pointer is NULL) */
    av_buffer_unref(&pp->slice_offset_buf);
}
|
||||
|
||||
const FFHWAccel ff_prores_vulkan_hwaccel = {
|
||||
.p.name = "prores_vulkan",
|
||||
.p.type = AVMEDIA_TYPE_VIDEO,
|
||||
.p.id = AV_CODEC_ID_PRORES,
|
||||
.p.pix_fmt = AV_PIX_FMT_VULKAN,
|
||||
.start_frame = &vk_prores_start_frame,
|
||||
.decode_slice = &vk_prores_decode_slice,
|
||||
.end_frame = &vk_prores_end_frame,
|
||||
.free_frame_priv = &vk_prores_free_frame_priv,
|
||||
.frame_priv_data_size = sizeof(ProresVulkanDecodePicture),
|
||||
.init = &vk_decode_prores_init,
|
||||
.update_thread_context = &ff_vk_update_thread_context,
|
||||
.decode_params = &ff_vk_params_invalidate,
|
||||
.flush = &ff_vk_decode_flush,
|
||||
.uninit = &ff_vk_decode_uninit,
|
||||
.frame_params = &ff_vk_frame_params,
|
||||
.priv_data_size = sizeof(FFVulkanDecodeContext),
|
||||
.caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_THREAD_SAFE,
|
||||
};
|
||||
Reference in New Issue
Block a user