diff --git a/libswscale/Makefile b/libswscale/Makefile
index c9dfa78c89..c6e32638e6 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -16,6 +16,7 @@ OBJS = alphablend.o \
        input.o \
        lut3d.o \
        ops.o \
+       ops_backend.o \
        ops_chain.o \
        ops_optimizer.o \
        options.o \
diff --git a/libswscale/ops.c b/libswscale/ops.c
index 7c5f315c8f..10463513f2 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -27,7 +27,10 @@
 #include "ops.h"
 #include "ops_internal.h"
 
+extern const SwsOpBackend backend_c; /* C reference backend, defined in ops_backend.c */
+
 const SwsOpBackend * const ff_sws_op_backends[] = {
+    &backend_c,
     NULL
 };
 
diff --git a/libswscale/ops_backend.c b/libswscale/ops_backend.c
new file mode 100644
index 0000000000..ac76dedbee
--- /dev/null
+++ b/libswscale/ops_backend.c
@@ -0,0 +1,109 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "ops_backend.h"
+
+#if AV_GCC_VERSION_AT_LEAST(4, 4)
+#pragma GCC optimize ("finite-math-only")
+#endif
+
+/* Array-based reference implementation */
+
+#ifndef SWS_BLOCK_SIZE
+#  define SWS_BLOCK_SIZE 32 /* number of pixels in each block_t array */
+#endif
+
+typedef  uint8_t  u8block_t[SWS_BLOCK_SIZE];
+typedef uint16_t u16block_t[SWS_BLOCK_SIZE];
+typedef uint32_t u32block_t[SWS_BLOCK_SIZE];
+typedef    float f32block_t[SWS_BLOCK_SIZE];
+
+#define BIT_DEPTH 8
+#  include "ops_tmpl_int.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 16
+#  include "ops_tmpl_int.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 32
+#  include "ops_tmpl_int.c"
+#  include "ops_tmpl_float.c"
+#undef BIT_DEPTH
+/* Run the op chain in `priv` over rows [y_start, y_end) and blocks [bx_start, bx_end). */
+static void process(const SwsOpExec *exec, const void *priv,
+                    const int bx_start, const int y_start, int bx_end, int y_end)
+{
+    const SwsOpChain *chain = priv;
+    const SwsOpImpl *impl = chain->impl;
+    SwsOpIter iter;
+
+    for (iter.y = y_start; iter.y < y_end; iter.y++) {
+        for (int i = 0; i < 4; i++) { /* rewind every pointer to the start of this row */
+            iter.in[i]  = exec->in[i]  + (iter.y - y_start) * exec->in_stride[i];
+            iter.out[i] = exec->out[i] + (iter.y - y_start) * exec->out_stride[i];
+        }
+
+        for (int block = bx_start; block < bx_end; block++) {
+            iter.x = block * SWS_BLOCK_SIZE;
+            ((void (*)(SwsOpIter *, const SwsOpImpl *)) impl->cont)
+                (&iter, &impl[1]);
+        }
+    }
+}
+/* Build an op chain for `ops` from the C op tables; on success `out` owns the chain. */
+static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
+{
+    int ret;
+
+    SwsOpChain *chain = ff_sws_op_chain_alloc();
+    if (!chain)
+        return AVERROR(ENOMEM);
+
+    static const SwsOpTable *const tables[] = {
+        &bitfn(op_table_int,    u8),
+        &bitfn(op_table_int,   u16),
+        &bitfn(op_table_int,   u32),
+        &bitfn(op_table_float, f32),
+    };
+
+    do { /* keep compiling for as long as the helper reports EAGAIN */
+        ret = ff_sws_op_compile_tables(tables, FF_ARRAY_ELEMS(tables), ops,
+                                       SWS_BLOCK_SIZE, chain);
+    } while (ret == AVERROR(EAGAIN));
+    if (ret < 0) {
+        ff_sws_op_chain_free(chain);
+        return ret;
+    }
+
+    *out = (SwsCompiledOp) {
+        .func       = process,
+        .block_size = SWS_BLOCK_SIZE,
+        .cpu_flags  = chain->cpu_flags,
+        .priv       = chain,
+        .free       = (void (*)(void *)) ff_sws_op_chain_free,
+    };
+    return 0;
+}
+/* Backend descriptor referenced from ff_sws_op_backends[] in ops.c. */
+const SwsOpBackend backend_c = {
+    .name       = "c",
+    .compile    = compile,
+};
diff --git a/libswscale/ops_backend.h b/libswscale/ops_backend.h
new file mode 100644
index 0000000000..7880bb608e
--- /dev/null
+++ b/libswscale/ops_backend.h
@@ -0,0 +1,167 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWSCALE_OPS_BACKEND_H
+#define SWSCALE_OPS_BACKEND_H
+
+/**
+ * Helper macros for the C-based backend.
+ *
+ * To use these macros, the following types must be defined:
+ *  - PIXEL_TYPE should be one of SWS_PIXEL_*
+ *  - pixel_t should be the type of pixels
+ *  - block_t should be the type of blocks (groups of pixels)
+ */
+
+#include <assert.h>
+#include <float.h>
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/mem.h"
+
+#include "ops_chain.h"
+
+/**
+ * Internal context holding per-iter execution data. The data pointers will be
+ * directly incremented by the corresponding read/write functions.
+ */
+typedef struct SwsOpIter {
+    const uint8_t *in[4]; /* current read positions, advanced by the read ops */
+    uint8_t *out[4];      /* current write positions, advanced by the write ops */
+    int x, y;             /* first pixel of the current block, and current row */
+} SwsOpIter;
+
+#ifdef __clang__
+#  define SWS_FUNC
+#  define SWS_LOOP AV_PRAGMA(clang loop vectorize(assume_safety))
+#elif defined(__GNUC__)
+#  define SWS_FUNC __attribute__((optimize("tree-vectorize")))
+#  define SWS_LOOP AV_PRAGMA(GCC ivdep)
+#else
+#  define SWS_FUNC
+#  define SWS_LOOP
+#endif
+
+/* Miscellaneous helpers */
+#define bitfn2(name, ext) name ## _ ## ext
+#define bitfn(name, ext) bitfn2(name, ext)
+
+#define FN_SUFFIX AV_JOIN(FMT_CHAR, BIT_DEPTH)
+#define fn(name) bitfn(name, FN_SUFFIX)
+/* Convert an AVRational to pixel_t; a zero denominator yields 0 */
+#define av_q2pixel(q) ((q).den ? (pixel_t) (q).num / (q).den : 0)
+
+/* Helper macros to make writing common function signatures less painful */
+#define DECL_FUNC(NAME, ...) \
+    static av_always_inline void fn(NAME)(SwsOpIter *restrict iter, \
+                                          const SwsOpImpl *restrict impl, \
+                                          block_t x, block_t y, \
+                                          block_t z, block_t w, \
+                                          __VA_ARGS__)
+
+#define DECL_READ(NAME, ...) \
+    static av_always_inline void fn(NAME)(SwsOpIter *restrict iter, \
+                                          const SwsOpImpl *restrict impl, \
+                                          const pixel_t *restrict in0, \
+                                          const pixel_t *restrict in1, \
+                                          const pixel_t *restrict in2, \
+                                          const pixel_t *restrict in3, \
+                                          __VA_ARGS__)
+
+#define DECL_WRITE(NAME, ...) \
+    DECL_FUNC(NAME, pixel_t *restrict out0, pixel_t *restrict out1, \
+                    pixel_t *restrict out2, pixel_t *restrict out3, \
+                    __VA_ARGS__)
+
+/* Helper macros to call into functions declared with DECL_FUNC, DECL_READ and DECL_WRITE */
+#define CALL(FUNC, ...) \
+    fn(FUNC)(iter, impl, x, y, z, w, __VA_ARGS__)
+
+#define CALL_READ(FUNC, ...) \
+    fn(FUNC)(iter, impl, (const pixel_t *) iter->in[0], \
+                         (const pixel_t *) iter->in[1], \
+                         (const pixel_t *) iter->in[2], \
+                         (const pixel_t *) iter->in[3], __VA_ARGS__)
+
+#define CALL_WRITE(FUNC, ...) \
+    CALL(FUNC, (pixel_t *) iter->out[0], (pixel_t *) iter->out[1], \
+               (pixel_t *) iter->out[2], (pixel_t *) iter->out[3], __VA_ARGS__)
+
+/* Helper macros to declare continuation functions */
+#define DECL_IMPL(NAME) \
+    static SWS_FUNC void fn(NAME)(SwsOpIter *restrict iter, \
+                                  const SwsOpImpl *restrict impl, \
+                                  block_t x, block_t y, \
+                                  block_t z, block_t w)
+
+/* Helper macro to call into the next continuation with a given type */
+#define CONTINUE(TYPE, ...) \
+    ((void (*)(SwsOpIter *, const SwsOpImpl *, \
+               TYPE x, TYPE y, TYPE z, TYPE w)) impl->cont) \
+        (iter, &impl[1], __VA_ARGS__)
+
+/* Helper macros for common op setup code */
+#define DECL_SETUP(NAME) \
+    static int fn(NAME)(const SwsOp *op, SwsOpPriv *out)
+/* Store a heap copy of `c` in out->ptr; evaluates to 0 or AVERROR(ENOMEM) */
+#define SETUP_MEMDUP(c) ff_setup_memdup(&(c), sizeof(c), out)
+static inline int ff_setup_memdup(const void *c, size_t size, SwsOpPriv *out)
+{
+    out->ptr = av_memdup(c, size);
+    return out->ptr ? 0 : AVERROR(ENOMEM);
+}
+
+/* Helper macro for declaring op table entries */
+#define DECL_ENTRY(NAME, ...) \
+    static const SwsOpEntry fn(op_##NAME) = { \
+        .func = (SwsFuncPtr) fn(NAME), \
+        .type = PIXEL_TYPE, \
+        __VA_ARGS__ \
+    }
+
+/* Helpers to define functions for common subsets of components */
+#define DECL_PATTERN(NAME) \
+    DECL_FUNC(NAME, const bool X, const bool Y, const bool Z, const bool W)
+
+#define WRAP_PATTERN(FUNC, X, Y, Z, W, ...) \
+    DECL_IMPL(FUNC##_##X##Y##Z##W) \
+    { \
+        CALL(FUNC, X, Y, Z, W); \
+    } \
+\
+    DECL_ENTRY(FUNC##_##X##Y##Z##W, \
+        .unused = { !X, !Y, !Z, !W }, \
+        __VA_ARGS__ \
+    )
+
+#define WRAP_COMMON_PATTERNS(FUNC, ...) \
+    WRAP_PATTERN(FUNC, 1, 0, 0, 0, __VA_ARGS__); \
+    WRAP_PATTERN(FUNC, 1, 0, 0, 1, __VA_ARGS__); \
+    WRAP_PATTERN(FUNC, 1, 1, 1, 0, __VA_ARGS__); \
+    WRAP_PATTERN(FUNC, 1, 1, 1, 1, __VA_ARGS__)
+
+#define REF_COMMON_PATTERNS(NAME) \
+    &fn(op_##NAME##_1000), \
+    &fn(op_##NAME##_1001), \
+    &fn(op_##NAME##_1110), \
+    &fn(op_##NAME##_1111)
+
+#endif /* SWSCALE_OPS_BACKEND_H */
diff --git a/libswscale/ops_tmpl_common.c b/libswscale/ops_tmpl_common.c
new file mode 100644
index 0000000000..9490be0313
--- /dev/null
+++ b/libswscale/ops_tmpl_common.c
@@ -0,0 +1,176 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "ops_backend.h"
+
+#ifndef BIT_DEPTH
+#  error Should only be included from ops_tmpl_*.c!
+#endif
+/* Convert the enabled components of a block to uintN_t blocks. */
+#define WRAP_CONVERT_UINT(N) \
+DECL_PATTERN(convert_uint##N) \
+{ \
+    u##N##block_t xu, yu, zu, wu; \
+\
+    SWS_LOOP \
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) { \
+        if (X) \
+            xu[i] = x[i]; \
+        if (Y) \
+            yu[i] = y[i]; \
+        if (Z) \
+            zu[i] = z[i]; \
+        if (W) \
+            wu[i] = w[i]; \
+    } \
+\
+    CONTINUE(u##N##block_t, xu, yu, zu, wu); \
+} \
+\
+WRAP_COMMON_PATTERNS(convert_uint##N, \
+    .op = SWS_OP_CONVERT, \
+    .convert.to = SWS_PIXEL_U##N, \
+);
+
+#if BIT_DEPTH != 8
+WRAP_CONVERT_UINT(8)
+#endif
+
+#if BIT_DEPTH != 16
+WRAP_CONVERT_UINT(16)
+#endif
+/* The integer template defines IS_FLOAT as 0, so its value must be tested, not defined(). */
+#if BIT_DEPTH != 32 || (defined(IS_FLOAT) && IS_FLOAT)
+WRAP_CONVERT_UINT(32)
+#endif
+/* Overwrite every component whose pattern flag is 0 with the constant in priv.px[]. */
+DECL_PATTERN(clear)
+{
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (!X)
+            x[i] = impl->priv.px[0];
+        if (!Y)
+            y[i] = impl->priv.px[1];
+        if (!Z)
+            z[i] = impl->priv.px[2];
+        if (!W)
+            w[i] = impl->priv.px[3];
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+#define WRAP_CLEAR(X, Y, Z, W) \
+DECL_IMPL(clear##_##X##Y##Z##W) \
+{ \
+    CALL(clear, X, Y, Z, W); \
+} \
+\
+DECL_ENTRY(clear##_##X##Y##Z##W, \
+    .setup = ff_sws_setup_q4, \
+    .op = SWS_OP_CLEAR, \
+    .flexible = true, \
+    .unused = { !X, !Y, !Z, !W }, \
+);
+
+WRAP_CLEAR(1, 1, 1, 0) /* rgba alpha */
+WRAP_CLEAR(0, 1, 1, 1) /* argb alpha */
+
+WRAP_CLEAR(0, 0, 1, 1) /* vuya chroma */
+WRAP_CLEAR(1, 0, 0, 1) /* yuva chroma */
+WRAP_CLEAR(1, 1, 0, 0) /* ayuv chroma */
+WRAP_CLEAR(0, 1, 0, 1) /* uyva chroma */
+WRAP_CLEAR(1, 0, 1, 0) /* xvyu chroma */
+
+WRAP_CLEAR(1, 0, 0, 0) /* gray -> yuva */
+WRAP_CLEAR(0, 1, 0, 0) /* gray -> ayuv */
+WRAP_CLEAR(0, 0, 1, 0) /* gray -> vuya */
+/* Clamp each enabled component from above by the matching priv.px[] entry. */
+DECL_PATTERN(min)
+{
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X)
+            x[i] = FFMIN(x[i], impl->priv.px[0]);
+        if (Y)
+            y[i] = FFMIN(y[i], impl->priv.px[1]);
+        if (Z)
+            z[i] = FFMIN(z[i], impl->priv.px[2]);
+        if (W)
+            w[i] = FFMIN(w[i], impl->priv.px[3]);
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+/* Clamp each enabled component from below by the matching priv.px[] entry. */
+DECL_PATTERN(max)
+{
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X)
+            x[i] = FFMAX(x[i], impl->priv.px[0]);
+        if (Y)
+            y[i] = FFMAX(y[i], impl->priv.px[1]);
+        if (Z)
+            z[i] = FFMAX(z[i], impl->priv.px[2]);
+        if (W)
+            w[i] = FFMAX(w[i], impl->priv.px[3]);
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+WRAP_COMMON_PATTERNS(min,
+    .op = SWS_OP_MIN,
+    .setup = ff_sws_setup_q4,
+    .flexible = true,
+);
+
+WRAP_COMMON_PATTERNS(max,
+    .op = SWS_OP_MAX,
+    .setup = ff_sws_setup_q4,
+    .flexible = true,
+);
+/* Multiply every enabled component by the single factor stored in priv.px[0]. */
+DECL_PATTERN(scale)
+{
+    const pixel_t scale = impl->priv.px[0];
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X)
+            x[i] *= scale;
+        if (Y)
+            y[i] *= scale;
+        if (Z)
+            z[i] *= scale;
+        if (W)
+            w[i] *= scale;
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+WRAP_COMMON_PATTERNS(scale,
+    .op = SWS_OP_SCALE,
+    .setup = ff_sws_setup_q,
+    .flexible = true,
+);
diff --git a/libswscale/ops_tmpl_float.c b/libswscale/ops_tmpl_float.c
new file mode 100644
index 0000000000..345f9dd57b
--- /dev/null
+++ b/libswscale/ops_tmpl_float.c
@@ -0,0 +1,257 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+
+#include "ops_backend.h"
+
+#ifndef BIT_DEPTH
+#  define BIT_DEPTH 32
+#endif
+
+#if BIT_DEPTH == 32
+#  define PIXEL_TYPE SWS_PIXEL_F32
+#  define PIXEL_MAX FLT_MAX
+#  define PIXEL_MIN (-FLT_MAX) /* lowest finite value; FLT_MIN is the smallest positive one */
+#  define pixel_t float
+#  define block_t f32block_t
+#  define px f32
+#else
+#  error Invalid BIT_DEPTH
+#endif
+
+#define IS_FLOAT 1
+#define FMT_CHAR f
+#include "ops_tmpl_common.c"
+/* Expand the dither matrix into rows padded to at least SWS_BLOCK_SIZE entries. */
+DECL_SETUP(setup_dither)
+{
+    const int size = 1 << op->dither.size_log2;
+    if (size == 1) {
+        /* 1x1 matrix: dither() adds a constant 0.5 and never reads the table */
+        av_assert1(!av_cmp_q(op->dither.matrix[0], av_make_q(1, 2)));
+        out->ptr = NULL;
+        return 0;
+    }
+
+    const int width = FFMAX(size, SWS_BLOCK_SIZE);
+    pixel_t *matrix = out->ptr = av_malloc(sizeof(pixel_t) * size * width);
+    if (!matrix)
+        return AVERROR(ENOMEM);
+
+    for (int y = 0; y < size; y++) {
+        for (int x = 0; x < size; x++)
+            matrix[y * width + x] = av_q2pixel(op->dither.matrix[y * size + x]);
+        for (int x = size; x < width; x++) /* pad to block size */
+            matrix[y * width + x] = matrix[y * width + (x % size)];
+    }
+
+    return 0;
+}
+/* Add the dither matrix (constant 0.5 when size_log2 == 0) to all four components. */
+DECL_FUNC(dither, const int size_log2)
+{
+    const pixel_t *restrict matrix = impl->priv.ptr;
+    const int mask = (1 << size_log2) - 1;
+    const int y_line = iter->y;
+    const int row0 = (y_line + 0) & mask; /* each component reads a different matrix row */
+    const int row1 = (y_line + 3) & mask;
+    const int row2 = (y_line + 2) & mask;
+    const int row3 = (y_line + 5) & mask;
+    const int size = 1 << size_log2;
+    const int width = FFMAX(size, SWS_BLOCK_SIZE);
+    const int base = iter->x & ~(SWS_BLOCK_SIZE - 1) & (size - 1);
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        x[i] += size_log2 ? matrix[row0 * width + base + i] : (pixel_t) 0.5;
+        y[i] += size_log2 ? matrix[row1 * width + base + i] : (pixel_t) 0.5;
+        z[i] += size_log2 ? matrix[row2 * width + base + i] : (pixel_t) 0.5;
+        w[i] += size_log2 ? matrix[row3 * width + base + i] : (pixel_t) 0.5;
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+#define WRAP_DITHER(N) \
+DECL_IMPL(dither##N) \
+{ \
+    CALL(dither, N); \
+} \
+\
+DECL_ENTRY(dither##N, \
+    .op = SWS_OP_DITHER, \
+    .dither_size = N, \
+    .setup = fn(setup_dither), \
+    .free = av_free, \
+);
+
+WRAP_DITHER(0)
+WRAP_DITHER(1)
+WRAP_DITHER(2)
+WRAP_DITHER(3)
+WRAP_DITHER(4)
+WRAP_DITHER(5)
+WRAP_DITHER(6)
+WRAP_DITHER(7)
+WRAP_DITHER(8)
+
+typedef struct {
+    /* Stored in split form for convenience */
+    pixel_t m[4][4];
+    pixel_t k[4];
+} fn(LinCoeffs);
+/* Convert the rational coefficients of op->lin into a heap copy stored in out->ptr. */
+DECL_SETUP(setup_linear)
+{
+    fn(LinCoeffs) c;
+
+    for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 4; j++)
+            c.m[i][j] = av_q2pixel(op->lin.m[i][j]);
+        c.k[i] = av_q2pixel(op->lin.m[i][4]); /* fifth column holds the offset */
+    }
+
+    return SETUP_MEMDUP(c);
+}
+
+/**
+ * Fully general case for a 5x5 linear affine transformation. Should never be
+ * called without constant `mask`. This function will compile down to the
+ * appropriately optimized version for the required subset of operations when
+ * called with a constant mask.
+ */
+DECL_FUNC(linear_mask, const uint32_t mask)
+{
+    const fn(LinCoeffs) c = *(const fn(LinCoeffs) *) impl->priv.ptr;
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        const pixel_t xx = x[i];
+        const pixel_t yy = y[i];
+        const pixel_t zz = z[i];
+        const pixel_t ww = w[i];
+
+        x[i]  = (mask & SWS_MASK_OFF(0)) ? c.k[0] : 0;
+        x[i] += (mask & SWS_MASK(0, 0))  ? c.m[0][0] * xx : xx;
+        x[i] += (mask & SWS_MASK(0, 1))  ? c.m[0][1] * yy : 0;
+        x[i] += (mask & SWS_MASK(0, 2))  ? c.m[0][2] * zz : 0;
+        x[i] += (mask & SWS_MASK(0, 3))  ? c.m[0][3] * ww : 0;
+
+        y[i]  = (mask & SWS_MASK_OFF(1)) ? c.k[1] : 0;
+        y[i] += (mask & SWS_MASK(1, 0))  ? c.m[1][0] * xx : 0;
+        y[i] += (mask & SWS_MASK(1, 1))  ? c.m[1][1] * yy : yy;
+        y[i] += (mask & SWS_MASK(1, 2))  ? c.m[1][2] * zz : 0;
+        y[i] += (mask & SWS_MASK(1, 3))  ? c.m[1][3] * ww : 0;
+
+        z[i]  = (mask & SWS_MASK_OFF(2)) ? c.k[2] : 0;
+        z[i] += (mask & SWS_MASK(2, 0))  ? c.m[2][0] * xx : 0;
+        z[i] += (mask & SWS_MASK(2, 1))  ? c.m[2][1] * yy : 0;
+        z[i] += (mask & SWS_MASK(2, 2))  ? c.m[2][2] * zz : zz;
+        z[i] += (mask & SWS_MASK(2, 3))  ? c.m[2][3] * ww : 0;
+
+        w[i]  = (mask & SWS_MASK_OFF(3)) ? c.k[3] : 0;
+        w[i] += (mask & SWS_MASK(3, 0))  ? c.m[3][0] * xx : 0;
+        w[i] += (mask & SWS_MASK(3, 1))  ? c.m[3][1] * yy : 0;
+        w[i] += (mask & SWS_MASK(3, 2))  ? c.m[3][2] * zz : 0;
+        w[i] += (mask & SWS_MASK(3, 3))  ? c.m[3][3] * ww : ww;
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+#define WRAP_LINEAR(NAME, MASK) \
+DECL_IMPL(linear_##NAME) \
+{ \
+    CALL(linear_mask, MASK); \
+} \
+\
+DECL_ENTRY(linear_##NAME, \
+    .op = SWS_OP_LINEAR, \
+    .setup = fn(setup_linear), \
+    .free = av_free, \
+    .linear_mask = (MASK), \
+);
+
+WRAP_LINEAR(luma,      SWS_MASK_LUMA)
+WRAP_LINEAR(alpha,     SWS_MASK_ALPHA)
+WRAP_LINEAR(lumalpha,  SWS_MASK_LUMA | SWS_MASK_ALPHA)
+WRAP_LINEAR(dot3,      0x7)
+WRAP_LINEAR(row0,      SWS_MASK_ROW(0))
+WRAP_LINEAR(row0a,     SWS_MASK_ROW(0) | SWS_MASK_ALPHA)
+WRAP_LINEAR(diag3,     SWS_MASK_DIAG3)
+WRAP_LINEAR(diag4,     SWS_MASK_DIAG4)
+WRAP_LINEAR(diagoff3,  SWS_MASK_DIAG3 | SWS_MASK_OFF3)
+WRAP_LINEAR(matrix3,   SWS_MASK_MAT3)
+WRAP_LINEAR(affine3,   SWS_MASK_MAT3 | SWS_MASK_OFF3)
+WRAP_LINEAR(affine3a,  SWS_MASK_MAT3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA)
+WRAP_LINEAR(matrix4,   SWS_MASK_MAT4)
+WRAP_LINEAR(affine4,   SWS_MASK_MAT4 | SWS_MASK_OFF4)
+/* Op table for the f32 pixel type; compile() in ops_backend.c refers to it. */
+static const SwsOpTable fn(op_table_float) = {
+    .block_size = SWS_BLOCK_SIZE,
+    .entries = {
+        REF_COMMON_PATTERNS(convert_uint8),
+        REF_COMMON_PATTERNS(convert_uint16),
+        REF_COMMON_PATTERNS(convert_uint32),
+
+        &fn(op_clear_1110),
+        REF_COMMON_PATTERNS(min),
+        REF_COMMON_PATTERNS(max),
+        REF_COMMON_PATTERNS(scale),
+
+        &fn(op_dither0),
+        &fn(op_dither1),
+        &fn(op_dither2),
+        &fn(op_dither3),
+        &fn(op_dither4),
+        &fn(op_dither5),
+        &fn(op_dither6),
+        &fn(op_dither7),
+        &fn(op_dither8),
+
+        &fn(op_linear_luma),
+        &fn(op_linear_alpha),
+        &fn(op_linear_lumalpha),
+        &fn(op_linear_dot3),
+        &fn(op_linear_row0),
+        &fn(op_linear_row0a),
+        &fn(op_linear_diag3),
+        &fn(op_linear_diag4),
+        &fn(op_linear_diagoff3),
+        &fn(op_linear_matrix3),
+        &fn(op_linear_affine3),
+        &fn(op_linear_affine3a),
+        &fn(op_linear_matrix4),
+        &fn(op_linear_affine4),
+
+        NULL
+    },
+};
+
+#undef PIXEL_TYPE
+#undef PIXEL_MAX
+#undef PIXEL_MIN
+#undef pixel_t
+#undef block_t
+#undef px
+
+#undef FMT_CHAR
+#undef IS_FLOAT
diff --git a/libswscale/ops_tmpl_int.c b/libswscale/ops_tmpl_int.c
new file mode 100644
index 0000000000..0337ae5708
--- /dev/null
+++ b/libswscale/ops_tmpl_int.c
@@ -0,0 +1,603 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/bswap.h"
+
+#include "ops_backend.h"
+
+#ifndef BIT_DEPTH
+#  define BIT_DEPTH 8
+#endif
+
+#if BIT_DEPTH == 32
+#  define PIXEL_TYPE SWS_PIXEL_U32
+#  define PIXEL_MAX 0xFFFFFFFFu
+#  define SWAP_BYTES av_bswap32
+#  define pixel_t uint32_t
+#  define block_t u32block_t
+#  define px u32
+#elif BIT_DEPTH == 16
+#  define PIXEL_TYPE SWS_PIXEL_U16
+#  define PIXEL_MAX 0xFFFFu
+#  define SWAP_BYTES av_bswap16
+#  define pixel_t uint16_t
+#  define block_t u16block_t
+#  define px u16
+#elif BIT_DEPTH == 8
+#  define PIXEL_TYPE SWS_PIXEL_U8
+#  define PIXEL_MAX 0xFFu
+#  define pixel_t uint8_t
+#  define block_t u8block_t
+#  define px u8
+#else
+#  error Invalid BIT_DEPTH
+#endif
+
+#define IS_FLOAT  0
+#define FMT_CHAR  u
+#define PIXEL_MIN 0
+#include "ops_tmpl_common.c"
+/* Load one block from up to `elems` separate input pointers. */
+DECL_READ(read_planar, const int elems)
+{
+    block_t x, y, z, w;
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        x[i] = in0[i];
+        if (elems > 1)
+            y[i] = in1[i];
+        if (elems > 2)
+            z[i] = in2[i];
+        if (elems > 3)
+            w[i] = in3[i];
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+/* Load one block of interleaved pixels with `elems` components each from in0. */
+DECL_READ(read_packed, const int elems)
+{
+    block_t x, y, z, w;
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        x[i] = in0[elems * i + 0];
+        if (elems > 1)
+            y[i] = in0[elems * i + 1];
+        if (elems > 2)
+            z[i] = in0[elems * i + 2];
+        if (elems > 3)
+            w[i] = in0[elems * i + 3];
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+/* Store one block to up to `elems` separate output pointers. */
+DECL_WRITE(write_planar, const int elems)
+{
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        out0[i] = x[i];
+        if (elems > 1)
+            out1[i] = y[i];
+        if (elems > 2)
+            out2[i] = z[i];
+        if (elems > 3)
+            out3[i] = w[i];
+    }
+}
+/* Store one block as interleaved pixels with `elems` components each to out0. */
+DECL_WRITE(write_packed, const int elems)
+{
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        out0[elems * i + 0] = x[i];
+        if (elems > 1)
+            out0[elems * i + 1] = y[i];
+        if (elems > 2)
+            out0[elems * i + 2] = z[i];
+        if (elems > 3)
+            out0[elems * i + 3] = w[i];
+    }
+}
+/* Wrap a reader and advance the consumed input pointers by one block (scaled by FRAC). */
+#define WRAP_READ(FUNC, ELEMS, FRAC, PACKED) \
+DECL_IMPL(FUNC##ELEMS) \
+{ \
+    CALL_READ(FUNC, ELEMS); \
+    for (int i = 0; i < (PACKED ? 1 : ELEMS); i++) \
+        iter->in[i] += sizeof(block_t) * (PACKED ? ELEMS : 1) >> FRAC; \
+} \
+\
+DECL_ENTRY(FUNC##ELEMS, \
+    .op = SWS_OP_READ, \
+    .rw = { \
+        .elems  = ELEMS, \
+        .packed = PACKED, \
+        .frac   = FRAC, \
+    }, \
+);
+
+WRAP_READ(read_planar, 1, 0, false)
+WRAP_READ(read_planar, 2, 0, false)
+WRAP_READ(read_planar, 3, 0, false)
+WRAP_READ(read_planar, 4, 0, false)
+WRAP_READ(read_packed, 2, 0, true)
+WRAP_READ(read_packed, 3, 0, true)
+WRAP_READ(read_packed, 4, 0, true)
+/* Wrap a writer and advance the written output pointers by one block (scaled by FRAC). */
+#define WRAP_WRITE(FUNC, ELEMS, FRAC, PACKED) \
+DECL_IMPL(FUNC##ELEMS) \
+{ \
+    CALL_WRITE(FUNC, ELEMS); \
+    for (int i = 0; i < (PACKED ? 1 : ELEMS); i++) \
+        iter->out[i] += sizeof(block_t) * (PACKED ? ELEMS : 1) >> FRAC; \
+} \
+\
+DECL_ENTRY(FUNC##ELEMS, \
+    .op = SWS_OP_WRITE, \
+    .rw = { \
+        .elems  = ELEMS, \
+        .packed = PACKED, \
+        .frac   = FRAC, \
+    }, \
+);
+
+WRAP_WRITE(write_planar, 1, 0, false)
+WRAP_WRITE(write_planar, 2, 0, false)
+WRAP_WRITE(write_planar, 3, 0, false)
+WRAP_WRITE(write_planar, 4, 0, false)
+WRAP_WRITE(write_packed, 2, 0, true)
+WRAP_WRITE(write_packed, 3, 0, true)
+WRAP_WRITE(write_packed, 4, 0, true)
+
+#if BIT_DEPTH == 8
+DECL_READ(read_nibbles, const int elems)
+{
+    block_t x, y, z, w;
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i += 2) {
+        const pixel_t val = ((const pixel_t *) in0)[i >> 1];
+        x[i + 0] = val >> 4;  /* high nibble */
+        x[i + 1] = val & 0xF; /* low nibble */
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+DECL_READ(read_bits, const int elems)
+{
+    block_t x, y, z, w;
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i += 8) {
+        const pixel_t val = ((const pixel_t *) in0)[i >> 3];
+        x[i + 0] = (val >> 7) & 1; /* most significant bit comes first */
+        x[i + 1] = (val >> 6) & 1;
+        x[i + 2] = (val >> 5) & 1;
+        x[i + 3] = (val >> 4) & 1;
+        x[i + 4] = (val >> 3) & 1;
+        x[i + 5] = (val >> 2) & 1;
+        x[i + 6] = (val >> 1) & 1;
+        x[i + 7] = (val >> 0) & 1;
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+WRAP_READ(read_nibbles, 1, 1, false)
+WRAP_READ(read_bits,    1, 3, false)
+
+DECL_WRITE(write_nibbles, const int elems)
+{
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i += 2)
+        out0[i >> 1] = x[i] << 4 | x[i + 1];
+}
+
+DECL_WRITE(write_bits, const int elems)
+{
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i += 8) {
+        out0[i >> 3] = x[i + 0] << 7 |
+                       x[i + 1] << 6 |
+                       x[i + 2] << 5 |
+                       x[i + 3] << 4 |
+                       x[i + 4] << 3 |
+                       x[i + 5] << 2 |
+                       x[i + 6] << 1 |
+                       x[i + 7];
+    }
+}
+
+WRAP_WRITE(write_nibbles, 1, 1, false)
+WRAP_WRITE(write_bits,    1, 3, false)
+#endif /* BIT_DEPTH == 8 */
+
+#ifdef SWAP_BYTES
+DECL_PATTERN(swap_bytes)
+{
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X)
+            x[i] = SWAP_BYTES(x[i]);
+        if (Y)
+            y[i] = SWAP_BYTES(y[i]);
+        if (Z)
+            z[i] = SWAP_BYTES(z[i]);
+        if (W)
+            w[i] = SWAP_BYTES(w[i]);
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+WRAP_COMMON_PATTERNS(swap_bytes, .op = SWS_OP_SWAP_BYTES);
+#endif /* SWAP_BYTES */
+
+#if BIT_DEPTH == 8
+DECL_PATTERN(expand16)
+{
+    u16block_t x16, y16, z16, w16;
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X)
+            x16[i] = x[i] << 8 | x[i];
+        if (Y)
+            y16[i] = y[i] << 8 | y[i];
+        if (Z)
+            z16[i] = z[i] << 8 | z[i];
+        if (W)
+            w16[i] = w[i] << 8 | w[i];
+    }
+
+    CONTINUE(u16block_t, x16, y16, z16, w16);
+}
+
+WRAP_COMMON_PATTERNS(expand16,
+    .op = SWS_OP_CONVERT,
+    .convert.to = SWS_PIXEL_U16,
+    .convert.expand = true,
+);
+
+DECL_PATTERN(expand32)
+{
+    u32block_t x32, y32, z32, w32;
+    /* Replicate each byte into all four bytes; unsigned math avoids overflowing int in `<< 24` */
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        x32[i] = x[i] * 0x01010101u;
+        y32[i] = y[i] * 0x01010101u;
+        z32[i] = z[i] * 0x01010101u;
+        w32[i] = w[i] * 0x01010101u;
+    }
+
+    CONTINUE(u32block_t, x32, y32, z32, w32);
+}
+
+WRAP_COMMON_PATTERNS(expand32,
+    .op = SWS_OP_CONVERT,
+    .convert.to = SWS_PIXEL_U32,
+    .convert.expand = true,
+);
+#endif
+/* Pack X/Y/Z/W-bit fields into x (x in the top bits), and the inverse unpack. */
+#define WRAP_PACK_UNPACK(X, Y, Z, W) \
+inline DECL_IMPL(pack_##X##Y##Z##W) \
+{ \
+    SWS_LOOP \
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) { \
+        x[i] = x[i] << (Y+Z+W); \
+        if (Y) \
+            x[i] |= y[i] << (Z+W); \
+        if (Z) \
+            x[i] |= z[i] << W; \
+        if (W) \
+            x[i] |= w[i]; \
+    } \
+\
+    CONTINUE(block_t, x, y, z, w); \
+} \
+\
+DECL_ENTRY(pack_##X##Y##Z##W, \
+    .op = SWS_OP_PACK, \
+    .pack.pattern = { X, Y, Z, W }, \
+); \
+\
+inline DECL_IMPL(unpack_##X##Y##Z##W) \
+{ \
+    SWS_LOOP \
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) { \
+        const pixel_t val = x[i]; \
+        x[i] = val >> (Y+Z+W); \
+        if (Y) \
+            y[i] = (val >> (Z+W)) & ((1 << Y) - 1); \
+        if (Z) \
+            z[i] = (val >> W) & ((1 << Z) - 1); \
+        if (W) \
+            w[i] = val & ((1 << W) - 1); \
+    } \
+\
+    CONTINUE(block_t, x, y, z, w); \
+} \
+\
+DECL_ENTRY(unpack_##X##Y##Z##W, \
+    .op = SWS_OP_UNPACK, \
+    .pack.pattern = { X, Y, Z, W }, \
+);
+
+WRAP_PACK_UNPACK( 3,  3,  2,  0)
+WRAP_PACK_UNPACK( 2,  3,  3,  0)
+WRAP_PACK_UNPACK( 1,  2,  1,  0)
+WRAP_PACK_UNPACK( 5,  6,  5,  0)
+WRAP_PACK_UNPACK( 5,  5,  5,  0)
+WRAP_PACK_UNPACK( 4,  4,  4,  0)
+WRAP_PACK_UNPACK( 2, 10, 10, 10)
+WRAP_PACK_UNPACK(10, 10, 10,  2)
+
+#if BIT_DEPTH != 8
+DECL_PATTERN(lshift)
+{
+    const uint8_t amount = impl->priv.u8[0];
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        x[i] <<= amount;
+        y[i] <<= amount;
+        z[i] <<= amount;
+        w[i] <<= amount;
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+DECL_PATTERN(rshift)
+{
+    const uint8_t amount = impl->priv.u8[0];
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        x[i] >>= amount;
+        y[i] >>= amount;
+        z[i] >>= amount;
+        w[i] >>= amount;
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+WRAP_COMMON_PATTERNS(lshift,
+    .op = SWS_OP_LSHIFT,
+    .setup = ff_sws_setup_u8,
+    .flexible = true,
+);
+
+WRAP_COMMON_PATTERNS(rshift,
+    .op = SWS_OP_RSHIFT,
+    .setup = ff_sws_setup_u8,
+    .flexible = true,
+);
+#endif /* BIT_DEPTH != 8 */
+
+DECL_PATTERN(convert_float)
+{
+    f32block_t xf, yf, zf, wf;
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        xf[i] = x[i];
+        yf[i] = y[i];
+        zf[i] = z[i];
+        wf[i] = w[i];
+    }
+
+    CONTINUE(f32block_t, xf, yf, zf, wf);
+}
+
+WRAP_COMMON_PATTERNS(convert_float,
+    .op = SWS_OP_CONVERT,
+    .convert.to = SWS_PIXEL_F32,
+);
+
+/**
+ * Swizzle by directly swapping the order of arguments to the continuation.
+ * Note that this is only safe to do if no arguments are duplicated.
+ */
+#define DECL_SWIZZLE(X, Y, Z, W) \
+static SWS_FUNC void \
+fn(swizzle_##X##Y##Z##W)(SwsOpIter *restrict iter, \
+                         const SwsOpImpl *restrict impl, \
+                         block_t c0, block_t c1, block_t c2, block_t c3) \
+{ \
+    CONTINUE(block_t, c##X, c##Y, c##Z, c##W); \
+} \
+\
+DECL_ENTRY(swizzle_##X##Y##Z##W, \
+    .op = SWS_OP_SWIZZLE, \
+    .swizzle.in = { X, Y, Z, W }, \
+);
+
+DECL_SWIZZLE(3, 0, 1, 2)
+DECL_SWIZZLE(3, 0, 2, 1)
+DECL_SWIZZLE(2, 1, 0, 3)
+DECL_SWIZZLE(3, 2, 1, 0)
+DECL_SWIZZLE(3, 1, 0, 2)
+DECL_SWIZZLE(3, 2, 0, 1)
+DECL_SWIZZLE(1, 2, 0, 3)
+DECL_SWIZZLE(1, 0, 2, 3)
+DECL_SWIZZLE(2, 0, 1, 3)
+DECL_SWIZZLE(2, 3, 1, 0)
+DECL_SWIZZLE(2, 1, 3, 0)
+DECL_SWIZZLE(1, 2, 3, 0)
+DECL_SWIZZLE(1, 3, 2, 0)
+DECL_SWIZZLE(0, 2, 1, 3)
+DECL_SWIZZLE(0, 2, 3, 1)
+DECL_SWIZZLE(0, 3, 1, 2)
+DECL_SWIZZLE(3, 1, 2, 0)
+DECL_SWIZZLE(0, 3, 2, 1)
+
+/* Broadcast luma -> rgb (only used for y(a) -> rgb(a)); T0/T1 name the two blocks overwritten with c0 */
+#define DECL_EXPAND_LUMA(X, W, T0, T1) \
+static SWS_FUNC void \
+fn(expand_luma_##X##W)(SwsOpIter *restrict iter, \
+                       const SwsOpImpl *restrict impl, \
+                       block_t c0, block_t c1, block_t c2, block_t c3) \
+{ \
+    SWS_LOOP \
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) \
+        T0[i] = T1[i] = c0[i]; \
+\
+    CONTINUE(block_t, c##X, T0, T1, c##W); \
+} \
+\
+DECL_ENTRY(expand_luma_##X##W, \
+    .op = SWS_OP_SWIZZLE, \
+    .swizzle.in = { X, 0, 0, W }, \
+);
+
+DECL_EXPAND_LUMA(0, 3, c1, c2)
+DECL_EXPAND_LUMA(3, 0, c1, c2)
+DECL_EXPAND_LUMA(1, 0, c2, c3)
+DECL_EXPAND_LUMA(0, 1, c2, c3)
+/* Op table for the current integer BIT_DEPTH; compile() in ops_backend.c refers to the u8/u16/u32 tables. */
+static const SwsOpTable fn(op_table_int) = {
+    .block_size = SWS_BLOCK_SIZE,
+    .entries = {
+        &fn(op_read_planar1),
+        &fn(op_read_planar2),
+        &fn(op_read_planar3),
+        &fn(op_read_planar4),
+        &fn(op_read_packed2),
+        &fn(op_read_packed3),
+        &fn(op_read_packed4),
+
+        &fn(op_write_planar1),
+        &fn(op_write_planar2),
+        &fn(op_write_planar3),
+        &fn(op_write_planar4),
+        &fn(op_write_packed2),
+        &fn(op_write_packed3),
+        &fn(op_write_packed4),
+
+#if BIT_DEPTH == 8
+        &fn(op_read_bits1),
+        &fn(op_read_nibbles1),
+        &fn(op_write_bits1),
+        &fn(op_write_nibbles1),
+
+        &fn(op_pack_1210),
+        &fn(op_pack_2330),
+        &fn(op_pack_3320),
+
+        &fn(op_unpack_1210),
+        &fn(op_unpack_2330),
+        &fn(op_unpack_3320),
+
+        REF_COMMON_PATTERNS(expand16),
+        REF_COMMON_PATTERNS(expand32),
+#elif BIT_DEPTH == 16
+        &fn(op_pack_4440),
+        &fn(op_pack_5550),
+        &fn(op_pack_5650),
+        &fn(op_unpack_4440),
+        &fn(op_unpack_5550),
+        &fn(op_unpack_5650),
+#elif BIT_DEPTH == 32
+        &fn(op_pack_2101010),
+        &fn(op_pack_1010102),
+        &fn(op_unpack_2101010),
+        &fn(op_unpack_1010102),
+#endif
+
+#ifdef SWAP_BYTES
+        REF_COMMON_PATTERNS(swap_bytes),
+#endif
+
+        REF_COMMON_PATTERNS(min),
+        REF_COMMON_PATTERNS(max),
+        REF_COMMON_PATTERNS(scale),
+        REF_COMMON_PATTERNS(convert_float),
+
+        &fn(op_clear_1110),
+        &fn(op_clear_0111),
+        &fn(op_clear_0011),
+        &fn(op_clear_1001),
+        &fn(op_clear_1100),
+        &fn(op_clear_0101),
+        &fn(op_clear_1010),
+        &fn(op_clear_1000),
+        &fn(op_clear_0100),
+        &fn(op_clear_0010),
+
+        &fn(op_swizzle_3012),
+        &fn(op_swizzle_3021),
+        &fn(op_swizzle_2103),
+        &fn(op_swizzle_3210),
+        &fn(op_swizzle_3102),
+        &fn(op_swizzle_3201),
+        &fn(op_swizzle_1203),
+        &fn(op_swizzle_1023),
+        &fn(op_swizzle_2013),
+        &fn(op_swizzle_2310),
+        &fn(op_swizzle_2130),
+        &fn(op_swizzle_1230),
+        &fn(op_swizzle_1320),
+        &fn(op_swizzle_0213),
+        &fn(op_swizzle_0231),
+        &fn(op_swizzle_0312),
+        &fn(op_swizzle_3120),
+        &fn(op_swizzle_0321),
+
+        &fn(op_expand_luma_03),
+        &fn(op_expand_luma_30),
+        &fn(op_expand_luma_10),
+        &fn(op_expand_luma_01),
+
+#if BIT_DEPTH != 8
+        REF_COMMON_PATTERNS(lshift),
+        REF_COMMON_PATTERNS(rshift),
+        REF_COMMON_PATTERNS(convert_uint8),
+#endif /* BIT_DEPTH != 8 */
+
+#if BIT_DEPTH != 16
+        REF_COMMON_PATTERNS(convert_uint16),
+#endif
+#if BIT_DEPTH != 32
+        REF_COMMON_PATTERNS(convert_uint32),
+#endif
+
+        NULL /* last entry of the list */
+    },
+};
+
+#undef PIXEL_TYPE
+#undef PIXEL_MAX
+#undef PIXEL_MIN
+#undef SWAP_BYTES
+#undef pixel_t
+#undef block_t
+#undef px
+
+#undef FMT_CHAR
+#undef IS_FLOAT