diff --git a/libswscale/Makefile b/libswscale/Makefile
index c9dfa78c89..c6e32638e6 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -16,6 +16,7 @@ OBJS = alphablend.o                                     \
        input.o                                          \
        lut3d.o                                          \
        ops.o                                            \
+       ops_backend.o                                    \
        ops_chain.o                                      \
        ops_optimizer.o                                  \
        options.o                                        \
diff --git a/libswscale/ops.c b/libswscale/ops.c
index 7c5f315c8f..10463513f2 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -27,7 +27,10 @@
 #include "ops.h"
 #include "ops_internal.h"
 
+extern const SwsOpBackend backend_c; /* defined in ops_backend.c */
+
 const SwsOpBackend * const ff_sws_op_backends[] = {
+    &backend_c,
     NULL
 };
 
diff --git a/libswscale/ops_backend.c b/libswscale/ops_backend.c
new file mode 100644
index 0000000000..ac76dedbee
--- /dev/null
+++ b/libswscale/ops_backend.c
@@ -0,0 +1,109 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "ops_backend.h"
+
+#if AV_GCC_VERSION_AT_LEAST(4, 4)
+#pragma GCC optimize ("finite-math-only")
+#endif
+
+/* Array-based reference implementation */
+
+#ifndef SWS_BLOCK_SIZE
+#  define SWS_BLOCK_SIZE 32
+#endif
+
+typedef  uint8_t  u8block_t[SWS_BLOCK_SIZE];
+typedef uint16_t u16block_t[SWS_BLOCK_SIZE];
+typedef uint32_t u32block_t[SWS_BLOCK_SIZE];
+typedef    float f32block_t[SWS_BLOCK_SIZE];
+
+#define BIT_DEPTH 8
+# include "ops_tmpl_int.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 16
+# include "ops_tmpl_int.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 32
+# include "ops_tmpl_int.c"
+# include "ops_tmpl_float.c"
+#undef BIT_DEPTH
+
+/* Run the compiled chain over lines [y_start, y_end) and blocks [bx_start, bx_end) */
+static void process(const SwsOpExec *exec, const void *priv,
+                    const int bx_start, const int y_start, int bx_end, int y_end)
+{
+    typedef void (*entry_fn)(SwsOpIter *, const SwsOpImpl *);
+    const SwsOpImpl *const head = ((const SwsOpChain *) priv)->impl;
+    SwsOpIter iter;
+
+    for (iter.y = y_start; iter.y < y_end; iter.y++) {
+        const int line = iter.y - y_start; /* exec->in/out address line y_start */
+        for (int p = 0; p < 4; p++) {
+            iter.in[p]  = exec->in[p]  + line * exec->in_stride[p];
+            iter.out[p] = exec->out[p] + line * exec->out_stride[p];
+        }
+        for (int bx = bx_start; bx < bx_end; bx++) { /* one chain run per block */
+            iter.x = bx * SWS_BLOCK_SIZE;
+            ((entry_fn) head->cont)(&iter, &head[1]);
+        }
+    }
+}
+
+/* Build an SwsOpChain for `ops` from the C op tables and export it through `out` */
+static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
+{
+    static const SwsOpTable *const tables[] = {
+        &bitfn(op_table_int,    u8),
+        &bitfn(op_table_int,   u16),
+        &bitfn(op_table_int,   u32),
+        &bitfn(op_table_float, f32),
+    };
+
+    SwsOpChain *chain = ff_sws_op_chain_alloc();
+    if (!chain)
+        return AVERROR(ENOMEM);
+
+    int ret;
+    do {
+        ret = ff_sws_op_compile_tables(tables, FF_ARRAY_ELEMS(tables), ops,
+                                       SWS_BLOCK_SIZE, chain);
+    } while (ret == AVERROR(EAGAIN)); /* retried for as long as EAGAIN comes back */
+
+    if (ret >= 0) {
+        *out = (SwsCompiledOp) {
+            .func       = process,
+            .block_size = SWS_BLOCK_SIZE,
+            .cpu_flags  = chain->cpu_flags,
+            .priv       = chain,
+            .free       = (void (*)(void *)) ff_sws_op_chain_free,
+        };
+        return 0;
+    }
+    ff_sws_op_chain_free(chain); /* failure: `out` never received the chain */
+    return ret;
+}
+
+const SwsOpBackend backend_c = { /* referenced from ff_sws_op_backends[] in ops.c */
+    .name       = "c",
+    .compile    = compile,
+};
diff --git a/libswscale/ops_backend.h b/libswscale/ops_backend.h
new file mode 100644
index 0000000000..7880bb608e
--- /dev/null
+++ b/libswscale/ops_backend.h
@@ -0,0 +1,167 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWSCALE_OPS_BACKEND_H
+#define SWSCALE_OPS_BACKEND_H
+
+/**
+ * Helper macros for the C-based backend.
+ *
+ * To use these macros, the including template must define:
+ *  - PIXEL_TYPE should be one of SWS_PIXEL_*
+ *  - pixel_t / block_t should be the types of pixels / blocks (groups of pixels)
+ *  - BIT_DEPTH and FMT_CHAR, which fn() joins into the per-type name suffix
+ */
+
+#include <assert.h>
+#include <float.h>
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/mem.h"
+
+#include "ops_chain.h"
+
+/**
+ * Internal context holding per-iter execution data. The data pointers will be
+ * directly incremented by the corresponding read/write functions.
+ */
+typedef struct SwsOpIter {
+    const uint8_t *in[4];   /* current read cursors, one per SwsOpExec.in[] entry */
+    uint8_t *out[4];        /* current write cursors, one per SwsOpExec.out[] entry */
+    int x, y;               /* x: first pixel of the current block, y: current line */
+} SwsOpIter;
+
+#ifdef __clang__
+#  define SWS_FUNC
+#  define SWS_LOOP AV_PRAGMA(clang loop vectorize(assume_safety))
+#elif defined(__GNUC__)
+#  define SWS_FUNC __attribute__((optimize("tree-vectorize")))
+#  define SWS_LOOP AV_PRAGMA(GCC ivdep)
+#else
+#  define SWS_FUNC
+#  define SWS_LOOP
+#endif
+
+/* Miscellaneous helpers */
+#define bitfn2(name, ext) name ## _ ## ext
+#define bitfn(name, ext)  bitfn2(name, ext)
+
+#define FN_SUFFIX AV_JOIN(FMT_CHAR, BIT_DEPTH)
+#define fn(name)  bitfn(name, FN_SUFFIX)
+
+#define av_q2pixel(q) ((q).den ? (pixel_t) (q).num / (q).den : 0)
+
+/* Helper macros to make writing common function signatures less painful */
+#define DECL_FUNC(NAME, ...)                                                    \
+    static av_always_inline void fn(NAME)(SwsOpIter *restrict iter,             \
+                                          const SwsOpImpl *restrict impl,       \
+                                          block_t x, block_t y,                 \
+                                          block_t z, block_t w,                 \
+                                          __VA_ARGS__)
+
+#define DECL_READ(NAME, ...)                                                    \
+    static av_always_inline void fn(NAME)(SwsOpIter *restrict iter,             \
+                                          const SwsOpImpl *restrict impl,       \
+                                          const pixel_t *restrict in0,          \
+                                          const pixel_t *restrict in1,          \
+                                          const pixel_t *restrict in2,          \
+                                          const pixel_t *restrict in3,          \
+                                          __VA_ARGS__)
+
+#define DECL_WRITE(NAME, ...)                                                   \
+    DECL_FUNC(NAME, pixel_t *restrict out0, pixel_t *restrict out1,             \
+                    pixel_t *restrict out2, pixel_t *restrict out3,             \
+                    __VA_ARGS__)
+
+/* Helper macros to call into functions declared with DECL_FUNC / DECL_READ / DECL_WRITE */
+#define CALL(FUNC, ...) \
+    fn(FUNC)(iter, impl, x, y, z, w, __VA_ARGS__)
+
+#define CALL_READ(FUNC, ...)                                                    \
+    fn(FUNC)(iter, impl, (const pixel_t *) iter->in[0],                         \
+                         (const pixel_t *) iter->in[1],                         \
+                         (const pixel_t *) iter->in[2],                         \
+                         (const pixel_t *) iter->in[3], __VA_ARGS__)
+
+#define CALL_WRITE(FUNC, ...)                                                   \
+    CALL(FUNC, (pixel_t *) iter->out[0], (pixel_t *) iter->out[1],              \
+               (pixel_t *) iter->out[2], (pixel_t *) iter->out[3], __VA_ARGS__)
+
+/* Helper macros to declare continuation functions */
+#define DECL_IMPL(NAME)                                                         \
+    static SWS_FUNC void fn(NAME)(SwsOpIter *restrict iter,                     \
+                                  const SwsOpImpl *restrict impl,               \
+                                  block_t x, block_t y,                         \
+                                  block_t z, block_t w)                         \
+
+/* Helper macro to call into the next continuation with a given type */
+#define CONTINUE(TYPE, ...)                                                     \
+    ((void (*)(SwsOpIter *, const SwsOpImpl *,                                  \
+               TYPE x, TYPE y, TYPE z, TYPE w)) impl->cont)                     \
+        (iter, &impl[1], __VA_ARGS__)
+
+/* Helper macros for common op setup code */
+#define DECL_SETUP(NAME)                                                        \
+    static int fn(NAME)(const SwsOp *op, SwsOpPriv *out)
+
+#define SETUP_MEMDUP(c) ff_setup_memdup(&(c), sizeof(c), out)
+static inline int ff_setup_memdup(const void *c, size_t size, SwsOpPriv *out)
+{
+    void *const copy = out->ptr = av_memdup(c, size); /* store a duplicate of `c` */
+    return copy ? 0 : AVERROR(ENOMEM);
+}
+
+/* Helper macro for declaring op table entries */
+#define DECL_ENTRY(NAME, ...)                                                   \
+    static const SwsOpEntry fn(op_##NAME) = {                                   \
+        .func = (SwsFuncPtr) fn(NAME),                                          \
+        .type = PIXEL_TYPE,                                                     \
+        __VA_ARGS__                                                             \
+    }
+
+/* Helpers to define functions for common subsets of components */
+#define DECL_PATTERN(NAME) \
+    DECL_FUNC(NAME, const bool X, const bool Y, const bool Z, const bool W)
+
+#define WRAP_PATTERN(FUNC, X, Y, Z, W, ...)                                     \
+    DECL_IMPL(FUNC##_##X##Y##Z##W)                                              \
+    {                                                                           \
+        CALL(FUNC, X, Y, Z, W);                                                 \
+    }                                                                           \
+                                                                                \
+    DECL_ENTRY(FUNC##_##X##Y##Z##W,                                             \
+        .unused = { !X, !Y, !Z, !W },                                           \
+        __VA_ARGS__                                                             \
+    )
+
+#define WRAP_COMMON_PATTERNS(FUNC, ...)                                         \
+    WRAP_PATTERN(FUNC, 1, 0, 0, 0, __VA_ARGS__);                                \
+    WRAP_PATTERN(FUNC, 1, 0, 0, 1, __VA_ARGS__);                                \
+    WRAP_PATTERN(FUNC, 1, 1, 1, 0, __VA_ARGS__);                                \
+    WRAP_PATTERN(FUNC, 1, 1, 1, 1, __VA_ARGS__)
+
+#define REF_COMMON_PATTERNS(NAME)                                               \
+    &fn(op_##NAME##_1000),                                                      \
+    &fn(op_##NAME##_1001),                                                      \
+    &fn(op_##NAME##_1110),                                                      \
+    &fn(op_##NAME##_1111)
+
+#endif
diff --git a/libswscale/ops_tmpl_common.c b/libswscale/ops_tmpl_common.c
new file mode 100644
index 0000000000..9490be0313
--- /dev/null
+++ b/libswscale/ops_tmpl_common.c
@@ -0,0 +1,176 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "ops_backend.h"
+
+#ifndef BIT_DEPTH
+#  error Should only be included from ops_tmpl_*.c!
+#endif
+
+#define WRAP_CONVERT_UINT(N)                                                    \
+DECL_PATTERN(convert_uint##N)                                                   \
+{                                                                               \
+    u##N##block_t xu, yu, zu, wu;                                               \
+                                                                                \
+    SWS_LOOP                                                                    \
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {                                  \
+        if (X)                                                                  \
+            xu[i] = x[i];                                                       \
+        if (Y)                                                                  \
+            yu[i] = y[i];                                                       \
+        if (Z)                                                                  \
+            zu[i] = z[i];                                                       \
+        if (W)                                                                  \
+            wu[i] = w[i];                                                       \
+    }                                                                           \
+                                                                                \
+    CONTINUE(u##N##block_t, xu, yu, zu, wu);                                    \
+}                                                                               \
+                                                                                \
+WRAP_COMMON_PATTERNS(convert_uint##N,                                           \
+    .op = SWS_OP_CONVERT,                                                       \
+    .convert.to = SWS_PIXEL_U##N,                                               \
+);
+
+#if BIT_DEPTH != 8
+WRAP_CONVERT_UINT(8)
+#endif
+
+#if BIT_DEPTH != 16
+WRAP_CONVERT_UINT(16)
+#endif
+
+#if BIT_DEPTH != 32 || defined(IS_FLOAT)
+WRAP_CONVERT_UINT(32)
+#endif
+
+DECL_PATTERN(clear) /* flag true: keep that component; flag false: overwrite it */
+{
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (!X)
+            x[i] = impl->priv.px[0]; /* per-component constant stored in the op's priv */
+        if (!Y)
+            y[i] = impl->priv.px[1];
+        if (!Z)
+            z[i] = impl->priv.px[2];
+        if (!W)
+            w[i] = impl->priv.px[3];
+    }
+
+    CONTINUE(block_t, x, y, z, w); /* hand the block to the next op in the chain */
+}
+
+#define WRAP_CLEAR(X, Y, Z, W)                                                  \
+DECL_IMPL(clear##_##X##Y##Z##W)                                                 \
+{                                                                               \
+    CALL(clear, X, Y, Z, W);                                                    \
+}                                                                               \
+                                                                                \
+DECL_ENTRY(clear##_##X##Y##Z##W,                                                \
+    .setup = ff_sws_setup_q4,                                                   \
+    .op = SWS_OP_CLEAR,                                                         \
+    .flexible = true,                                                           \
+    .unused = { !X, !Y, !Z, !W },                                               \
+);
+
+WRAP_CLEAR(1, 1, 1, 0) /* rgba alpha */
+WRAP_CLEAR(0, 1, 1, 1) /* argb alpha */
+
+WRAP_CLEAR(0, 0, 1, 1) /* vuya chroma */
+WRAP_CLEAR(1, 0, 0, 1) /* yuva chroma */
+WRAP_CLEAR(1, 1, 0, 0) /* ayuv chroma */
+WRAP_CLEAR(0, 1, 0, 1) /* uyva chroma */
+WRAP_CLEAR(1, 0, 1, 0) /* xvyu chroma */
+
+WRAP_CLEAR(1, 0, 0, 0) /* gray -> yuva */
+WRAP_CLEAR(0, 1, 0, 0) /* gray -> ayuv */
+WRAP_CLEAR(0, 0, 1, 0) /* gray -> vuya */
+
+DECL_PATTERN(min) /* flag true: limit that component from above by impl->priv.px[n] */
+{
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X)
+            x[i] = FFMIN(x[i], impl->priv.px[0]);
+        if (Y)
+            y[i] = FFMIN(y[i], impl->priv.px[1]);
+        if (Z)
+            z[i] = FFMIN(z[i], impl->priv.px[2]);
+        if (W)
+            w[i] = FFMIN(w[i], impl->priv.px[3]);
+    }
+
+    CONTINUE(block_t, x, y, z, w); /* hand the block to the next op in the chain */
+}
+
+DECL_PATTERN(max) /* flag true: limit that component from below by impl->priv.px[n] */
+{
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X)
+            x[i] = FFMAX(x[i], impl->priv.px[0]);
+        if (Y)
+            y[i] = FFMAX(y[i], impl->priv.px[1]);
+        if (Z)
+            z[i] = FFMAX(z[i], impl->priv.px[2]);
+        if (W)
+            w[i] = FFMAX(w[i], impl->priv.px[3]);
+    }
+
+    CONTINUE(block_t, x, y, z, w); /* hand the block to the next op in the chain */
+}
+
+WRAP_COMMON_PATTERNS(min,
+    .op = SWS_OP_MIN,
+    .setup = ff_sws_setup_q4,
+    .flexible = true,
+);
+
+WRAP_COMMON_PATTERNS(max,
+    .op = SWS_OP_MAX,
+    .setup = ff_sws_setup_q4,
+    .flexible = true,
+);
+
+DECL_PATTERN(scale) /* flag true: multiply that component by the scale factor */
+{
+    const pixel_t scale = impl->priv.px[0]; /* one factor shared by all components */
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X)
+            x[i] *= scale;
+        if (Y)
+            y[i] *= scale;
+        if (Z)
+            z[i] *= scale;
+        if (W)
+            w[i] *= scale;
+    }
+
+    CONTINUE(block_t, x, y, z, w); /* hand the block to the next op in the chain */
+}
+
+WRAP_COMMON_PATTERNS(scale,
+    .op = SWS_OP_SCALE,
+    .setup = ff_sws_setup_q,
+    .flexible = true,
+);
diff --git a/libswscale/ops_tmpl_float.c b/libswscale/ops_tmpl_float.c
new file mode 100644
index 0000000000..345f9dd57b
--- /dev/null
+++ b/libswscale/ops_tmpl_float.c
@@ -0,0 +1,257 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+
+#include "ops_backend.h"
+
+#ifndef BIT_DEPTH
+#  define BIT_DEPTH 32
+#endif
+
+#if BIT_DEPTH == 32
+#  define PIXEL_TYPE SWS_PIXEL_F32
+#  define PIXEL_MAX  FLT_MAX
+#  define PIXEL_MIN  (-FLT_MAX) /* lowest finite float; FLT_MIN is the smallest positive normal */
+#  define pixel_t    float
+#  define block_t    f32block_t
+#  define px         f32
+#else
+#  error Invalid BIT_DEPTH
+#endif
+
+#define IS_FLOAT 1
+#define FMT_CHAR f
+#include "ops_tmpl_common.c"
+
+DECL_SETUP(setup_dither)
+{
+    const int size = 1 << op->dither.size_log2; /* matrix is size x size */
+    if (size == 1) {
+        /* 1x1 case: the kernel adds a constant 0.5 and never reads a matrix */
+        av_assert1(!av_cmp_q(op->dither.matrix[0], av_make_q(1, 2)));
+        out->ptr = NULL;
+        return 0;
+    }
+
+    const int width = FFMAX(size, SWS_BLOCK_SIZE);
+    pixel_t *matrix = out->ptr = av_malloc(sizeof(pixel_t) * size * width);
+    if (!matrix)
+        return AVERROR(ENOMEM);
+
+    for (int y = 0; y < size; y++) {
+        for (int x = 0; x < size; x++)
+            matrix[y * width + x] = av_q2pixel(op->dither.matrix[y * size + x]);
+        for (int x = size; x < width; x++) /* pad to block size */
+            matrix[y * width + x] = matrix[y * width + (x % size)];
+    }
+
+    return 0;
+}
+
+DECL_FUNC(dither, const int size_log2) /* add a dither value to all four components */
+{
+    const pixel_t *restrict matrix = impl->priv.ptr; /* padded matrix built by setup_dither */
+    const int mask = (1 << size_log2) - 1;
+    const int y_line = iter->y;
+    const int row0 = (y_line +  0) & mask; /* x, y, z, w each read a different matrix row: */
+    const int row1 = (y_line +  3) & mask; /* the current line offset by 0, 3, 2 and 5,    */
+    const int row2 = (y_line +  2) & mask; /* wrapped to the matrix height                 */
+    const int row3 = (y_line +  5) & mask;
+    const int size = 1 << size_log2;
+    const int width = FFMAX(size, SWS_BLOCK_SIZE); /* row stride, same formula as setup_dither */
+    const int base = iter->x & ~(SWS_BLOCK_SIZE - 1) & (size - 1); /* block-aligned column, wrapped to the matrix width */
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        x[i] += size_log2 ? matrix[row0 * width + base + i] : (pixel_t) 0.5; /* size_log2 == 0: constant 0.5, matrix not read */
+        y[i] += size_log2 ? matrix[row1 * width + base + i] : (pixel_t) 0.5;
+        z[i] += size_log2 ? matrix[row2 * width + base + i] : (pixel_t) 0.5;
+        w[i] += size_log2 ? matrix[row3 * width + base + i] : (pixel_t) 0.5;
+    }
+
+    CONTINUE(block_t, x, y, z, w); /* hand the block to the next op in the chain */
+}
+
+#define WRAP_DITHER(N)                                                          \
+DECL_IMPL(dither##N)                                                            \
+{                                                                               \
+    CALL(dither, N);                                                            \
+}                                                                               \
+                                                                                \
+DECL_ENTRY(dither##N,                                                           \
+    .op = SWS_OP_DITHER,                                                        \
+    .dither_size = N,                                                           \
+    .setup = fn(setup_dither),                                                  \
+    .free = av_free,                                                            \
+);
+
+WRAP_DITHER(0)
+WRAP_DITHER(1)
+WRAP_DITHER(2)
+WRAP_DITHER(3)
+WRAP_DITHER(4)
+WRAP_DITHER(5)
+WRAP_DITHER(6)
+WRAP_DITHER(7)
+WRAP_DITHER(8)
+
+typedef struct {
+    /* Stored in split form for convenience */
+    pixel_t m[4][4];
+    pixel_t k[4];
+} fn(LinCoeffs);
+
+DECL_SETUP(setup_linear)
+{
+    fn(LinCoeffs) coeffs;
+
+    for (int row = 0; row < 4; row++) {
+        coeffs.k[row] = av_q2pixel(op->lin.m[row][4]); /* 5th column: constant term */
+        for (int col = 0; col < 4; col++)
+            coeffs.m[row][col] = av_q2pixel(op->lin.m[row][col]);
+    }
+
+    return SETUP_MEMDUP(coeffs);
+}
+
+/**
+ * Fully general case for a 5x5 linear affine transformation. Should never be
+ * called without constant `mask`, so that the per-coefficient mask tests fold
+ * away at compile time. Coefficients not selected by `mask` are not used: an
+ * unselected diagonal entry passes its input through, all others add zero.
+ */
+DECL_FUNC(linear_mask, const uint32_t mask)
+{
+    const fn(LinCoeffs) c = *(const fn(LinCoeffs) *) impl->priv.ptr; /* local copy made by setup_linear's memdup */
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        const pixel_t xx = x[i]; /* snapshot inputs: outputs are written in place */
+        const pixel_t yy = y[i];
+        const pixel_t zz = z[i];
+        const pixel_t ww = w[i];
+
+        x[i]  = (mask & SWS_MASK_OFF(0)) ? c.k[0] : 0;
+        x[i] += (mask & SWS_MASK(0, 0))  ? c.m[0][0] * xx : xx;
+        x[i] += (mask & SWS_MASK(0, 1))  ? c.m[0][1] * yy : 0;
+        x[i] += (mask & SWS_MASK(0, 2))  ? c.m[0][2] * zz : 0;
+        x[i] += (mask & SWS_MASK(0, 3))  ? c.m[0][3] * ww : 0;
+
+        y[i]  = (mask & SWS_MASK_OFF(1)) ? c.k[1] : 0;
+        y[i] += (mask & SWS_MASK(1, 0))  ? c.m[1][0] * xx : 0;
+        y[i] += (mask & SWS_MASK(1, 1))  ? c.m[1][1] * yy : yy;
+        y[i] += (mask & SWS_MASK(1, 2))  ? c.m[1][2] * zz : 0;
+        y[i] += (mask & SWS_MASK(1, 3))  ? c.m[1][3] * ww : 0;
+
+        z[i]  = (mask & SWS_MASK_OFF(2)) ? c.k[2] : 0;
+        z[i] += (mask & SWS_MASK(2, 0))  ? c.m[2][0] * xx : 0;
+        z[i] += (mask & SWS_MASK(2, 1))  ? c.m[2][1] * yy : 0;
+        z[i] += (mask & SWS_MASK(2, 2))  ? c.m[2][2] * zz : zz;
+        z[i] += (mask & SWS_MASK(2, 3))  ? c.m[2][3] * ww : 0;
+
+        w[i]  = (mask & SWS_MASK_OFF(3)) ? c.k[3] : 0;
+        w[i] += (mask & SWS_MASK(3, 0))  ? c.m[3][0] * xx : 0;
+        w[i] += (mask & SWS_MASK(3, 1))  ? c.m[3][1] * yy : 0;
+        w[i] += (mask & SWS_MASK(3, 2))  ? c.m[3][2] * zz : 0;
+        w[i] += (mask & SWS_MASK(3, 3))  ? c.m[3][3] * ww : ww;
+    }
+
+    CONTINUE(block_t, x, y, z, w); /* hand the block to the next op in the chain */
+}
+
+#define WRAP_LINEAR(NAME, MASK)                                                 \
+DECL_IMPL(linear_##NAME)                                                        \
+{                                                                               \
+    CALL(linear_mask, MASK);                                                    \
+}                                                                               \
+                                                                                \
+DECL_ENTRY(linear_##NAME,                                                       \
+    .op    = SWS_OP_LINEAR,                                                     \
+    .setup = fn(setup_linear),                                                  \
+    .free  = av_free,                                                           \
+    .linear_mask = (MASK),                                                      \
+);
+
+WRAP_LINEAR(luma,      SWS_MASK_LUMA)
+WRAP_LINEAR(alpha,     SWS_MASK_ALPHA)
+WRAP_LINEAR(lumalpha,  SWS_MASK_LUMA | SWS_MASK_ALPHA)
+WRAP_LINEAR(dot3,      0x7)
+WRAP_LINEAR(row0,      SWS_MASK_ROW(0))
+WRAP_LINEAR(row0a,     SWS_MASK_ROW(0) | SWS_MASK_ALPHA)
+WRAP_LINEAR(diag3,     SWS_MASK_DIAG3)
+WRAP_LINEAR(diag4,     SWS_MASK_DIAG4)
+WRAP_LINEAR(diagoff3,  SWS_MASK_DIAG3 | SWS_MASK_OFF3)
+WRAP_LINEAR(matrix3,   SWS_MASK_MAT3)
+WRAP_LINEAR(affine3,   SWS_MASK_MAT3 | SWS_MASK_OFF3)
+WRAP_LINEAR(affine3a,  SWS_MASK_MAT3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA)
+WRAP_LINEAR(matrix4,   SWS_MASK_MAT4)
+WRAP_LINEAR(affine4,   SWS_MASK_MAT4 | SWS_MASK_OFF4)
+
+static const SwsOpTable fn(op_table_float) = {
+    .block_size = SWS_BLOCK_SIZE,
+    .entries = {
+        REF_COMMON_PATTERNS(convert_uint8),
+        REF_COMMON_PATTERNS(convert_uint16),
+        REF_COMMON_PATTERNS(convert_uint32),
+
+        &fn(op_clear_1110),
+        REF_COMMON_PATTERNS(min),
+        REF_COMMON_PATTERNS(max),
+        REF_COMMON_PATTERNS(scale),
+
+        &fn(op_dither0),
+        &fn(op_dither1),
+        &fn(op_dither2),
+        &fn(op_dither3),
+        &fn(op_dither4),
+        &fn(op_dither5),
+        &fn(op_dither6),
+        &fn(op_dither7),
+        &fn(op_dither8),
+
+        &fn(op_linear_luma),
+        &fn(op_linear_alpha),
+        &fn(op_linear_lumalpha),
+        &fn(op_linear_dot3),
+        &fn(op_linear_row0),
+        &fn(op_linear_row0a),
+        &fn(op_linear_diag3),
+        &fn(op_linear_diag4),
+        &fn(op_linear_diagoff3),
+        &fn(op_linear_matrix3),
+        &fn(op_linear_affine3),
+        &fn(op_linear_affine3a),
+        &fn(op_linear_matrix4),
+        &fn(op_linear_affine4),
+
+        NULL
+    },
+};
+
+#undef PIXEL_TYPE
+#undef PIXEL_MAX
+#undef PIXEL_MIN
+#undef pixel_t
+#undef block_t
+#undef px
+
+#undef FMT_CHAR
+#undef IS_FLOAT
diff --git a/libswscale/ops_tmpl_int.c b/libswscale/ops_tmpl_int.c
new file mode 100644
index 0000000000..0337ae5708
--- /dev/null
+++ b/libswscale/ops_tmpl_int.c
@@ -0,0 +1,603 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/bswap.h"
+
+#include "ops_backend.h"
+
+#ifndef BIT_DEPTH
+#  define BIT_DEPTH 8 /* default when nothing selected a depth before this point */
+#endif
+
+#if BIT_DEPTH == 32
+#  define PIXEL_TYPE SWS_PIXEL_U32
+#  define PIXEL_MAX  0xFFFFFFFFu
+#  define SWAP_BYTES av_bswap32
+#  define pixel_t    uint32_t
+#  define block_t    u32block_t
+#  define px         u32
+#elif BIT_DEPTH == 16
+#  define PIXEL_TYPE SWS_PIXEL_U16
+#  define PIXEL_MAX  0xFFFFu
+#  define SWAP_BYTES av_bswap16
+#  define pixel_t    uint16_t
+#  define block_t    u16block_t
+#  define px         u16
+#elif BIT_DEPTH == 8
+#  define PIXEL_TYPE SWS_PIXEL_U8
+#  define PIXEL_MAX  0xFFu
+#  define pixel_t    uint8_t
+#  define block_t    u8block_t
+#  define px         u8 /* SWAP_BYTES is not defined for 8 bit, so the #ifdef SWAP_BYTES parts below are skipped */
+#else
+#  error Invalid BIT_DEPTH
+#endif
+
+#define IS_FLOAT  0 /* this is the integer instantiation */
+#define FMT_CHAR  u
+#define PIXEL_MIN 0 /* all pixel_t choices above are unsigned */
+#include "ops_tmpl_common.c" /* included only after the per-depth macros are set; presumably provides the min/max/scale/clear/convert_uint* entries used in the table below */
+
+DECL_READ(read_planar, const int elems) /* copy one block from each of the first `elems` inputs into x/y/z/w */
+{
+    block_t x, y, z, w;
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        x[i] = in0[i]; /* in0..in3 are not declared here; presumably supplied by DECL_READ */
+        if (elems > 1)
+            y[i] = in1[i];
+        if (elems > 2)
+            z[i] = in2[i];
+        if (elems > 3)
+            w[i] = in3[i];
+    }
+
+    CONTINUE(block_t, x, y, z, w); /* components beyond `elems` are passed on without having been written */
+}
+
+DECL_READ(read_packed, const int elems) /* de-interleave `elems` components per pixel from the single input in0 */
+{
+    block_t x, y, z, w;
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        x[i] = in0[elems * i + 0]; /* pixel i starts at element index elems * i */
+        if (elems > 1)
+            y[i] = in0[elems * i + 1];
+        if (elems > 2)
+            z[i] = in0[elems * i + 2];
+        if (elems > 3)
+            w[i] = in0[elems * i + 3];
+    }
+
+    CONTINUE(block_t, x, y, z, w); /* components beyond `elems` are passed on without having been written */
+}
+
+DECL_WRITE(write_planar, const int elems) /* store the first `elems` of x/y/z/w, one block per output */
+{
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        out0[i] = x[i]; /* out0..out3 and x..w are not declared here; presumably supplied by DECL_WRITE */
+        if (elems > 1)
+            out1[i] = y[i];
+        if (elems > 2)
+            out2[i] = z[i];
+        if (elems > 3)
+            out3[i] = w[i];
+    }
+}
+
+DECL_WRITE(write_packed, const int elems) /* interleave the first `elems` of x/y/z/w into the single output out0 */
+{
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        out0[elems * i + 0] = x[i]; /* pixel i starts at element index elems * i */
+        if (elems > 1)
+            out0[elems * i + 1] = y[i];
+        if (elems > 2)
+            out0[elems * i + 2] = z[i];
+        if (elems > 3)
+            out0[elems * i + 3] = w[i];
+    }
+}
+
+#define WRAP_READ(FUNC, ELEMS, FRAC, PACKED) /* kernel wrapper + op entry */    \
+DECL_IMPL(FUNC##ELEMS)                                                          \
+{                                                                               \
+    CALL_READ(FUNC, ELEMS);                                                     \
+    for (int i = 0; i < (PACKED ? 1 : ELEMS); i++) /* packed: in[0] only */     \
+        iter->in[i] += sizeof(block_t) * (PACKED ? ELEMS : 1) >> FRAC;          \
+}                                                                               \
+                                                                                \
+DECL_ENTRY(FUNC##ELEMS,                                                         \
+    .op = SWS_OP_READ,                                                          \
+    .rw = {                                                                     \
+        .elems  = ELEMS,                                                        \
+        .packed = PACKED,                                                       \
+        .frac   = FRAC,                                                         \
+    },                                                                          \
+);
+
+WRAP_READ(read_planar, 1, 0, false) /* args: kernel, ELEMS, FRAC (right shift applied to the pointer step), PACKED */
+WRAP_READ(read_planar, 2, 0, false)
+WRAP_READ(read_planar, 3, 0, false)
+WRAP_READ(read_planar, 4, 0, false)
+WRAP_READ(read_packed, 2, 0, true) /* packed step is ELEMS blocks' worth on the one pointer */
+WRAP_READ(read_packed, 3, 0, true)
+WRAP_READ(read_packed, 4, 0, true)
+
+#define WRAP_WRITE(FUNC, ELEMS, FRAC, PACKED) /* kernel wrapper + op entry */   \
+DECL_IMPL(FUNC##ELEMS)                                                          \
+{                                                                               \
+    CALL_WRITE(FUNC, ELEMS);                                                    \
+    for (int i = 0; i < (PACKED ? 1 : ELEMS); i++) /* packed: out[0] only */    \
+        iter->out[i] += sizeof(block_t) * (PACKED ? ELEMS : 1) >> FRAC;         \
+}                                                                               \
+                                                                                \
+DECL_ENTRY(FUNC##ELEMS,                                                         \
+    .op = SWS_OP_WRITE,                                                         \
+    .rw = {                                                                     \
+        .elems  = ELEMS,                                                        \
+        .packed = PACKED,                                                       \
+        .frac   = FRAC,                                                         \
+    },                                                                          \
+);
+
+WRAP_WRITE(write_planar, 1, 0, false) /* args: kernel, ELEMS, FRAC (right shift applied to the pointer step), PACKED */
+WRAP_WRITE(write_planar, 2, 0, false)
+WRAP_WRITE(write_planar, 3, 0, false)
+WRAP_WRITE(write_planar, 4, 0, false)
+WRAP_WRITE(write_packed, 2, 0, true) /* packed step is ELEMS blocks' worth on the one pointer */
+WRAP_WRITE(write_packed, 3, 0, true)
+WRAP_WRITE(write_packed, 4, 0, true)
+
+#if BIT_DEPTH == 8
+DECL_READ(read_nibbles, const int elems) /* split each input byte into two 4-bit values of x; `elems` is unused */
+{
+    block_t x, y, z, w;
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i += 2) {
+        const pixel_t val = ((const pixel_t *) in0)[i >> 1]; /* one byte feeds outputs i and i + 1 */
+        x[i + 0] = val >> 4;  /* high nibble */
+        x[i + 1] = val & 0xF; /* low nibble */
+    }
+
+    CONTINUE(block_t, x, y, z, w); /* only x is written; y/z/w are passed on as they are */
+}
+
+DECL_READ(read_bits, const int elems) /* split each input byte into eight 1-bit values of x, most significant bit first; `elems` is unused */
+{
+    block_t x, y, z, w;
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i += 8) {
+        const pixel_t val = ((const pixel_t *) in0)[i >> 3]; /* one byte feeds outputs i .. i + 7 */
+        x[i + 0] = (val >> 7) & 1;
+        x[i + 1] = (val >> 6) & 1;
+        x[i + 2] = (val >> 5) & 1;
+        x[i + 3] = (val >> 4) & 1;
+        x[i + 4] = (val >> 3) & 1;
+        x[i + 5] = (val >> 2) & 1;
+        x[i + 6] = (val >> 1) & 1;
+        x[i + 7] = (val >> 0) & 1;
+    }
+
+    CONTINUE(block_t, x, y, z, w); /* only x is written; y/z/w are passed on as they are */
+}
+
+WRAP_READ(read_nibbles, 1, 1, false) /* FRAC = 1: pointer step is sizeof(block_t) >> 1, matching two values per byte */
+WRAP_READ(read_bits,    1, 3, false) /* FRAC = 3: pointer step is sizeof(block_t) >> 3, matching eight values per byte */
+
+DECL_WRITE(write_nibbles, const int elems) /* pack pairs of x values into one byte each; `elems` is unused */
+{
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i += 2)
+        out0[i >> 1] = x[i] << 4 | x[i + 1]; /* first value lands in the high nibble; neither value is masked */
+}
+
+DECL_WRITE(write_bits, const int elems) /* pack groups of eight x values into one byte each; `elems` is unused */
+{
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i += 8) {
+        out0[i >> 3] = x[i + 0] << 7 | /* first value becomes the most significant bit; values are not masked */
+                       x[i + 1] << 6 |
+                       x[i + 2] << 5 |
+                       x[i + 3] << 4 |
+                       x[i + 4] << 3 |
+                       x[i + 5] << 2 |
+                       x[i + 6] << 1 |
+                       x[i + 7];
+    }
+}
+
+WRAP_WRITE(write_nibbles, 1, 1, false) /* FRAC = 1: pointer step is sizeof(block_t) >> 1 */
+WRAP_WRITE(write_bits,    1, 3, false) /* FRAC = 3: pointer step is sizeof(block_t) >> 3 */
+#endif /* BIT_DEPTH == 8 */
+
+#ifdef SWAP_BYTES
+DECL_PATTERN(swap_bytes) /* apply SWAP_BYTES to every component whose X/Y/Z/W flag is set */
+{
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X) /* X/Y/Z/W are not defined in this file; presumably per-variant flags from DECL_PATTERN */
+            x[i] = SWAP_BYTES(x[i]);
+        if (Y)
+            y[i] = SWAP_BYTES(y[i]);
+        if (Z)
+            z[i] = SWAP_BYTES(z[i]);
+        if (W)
+            w[i] = SWAP_BYTES(w[i]);
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+WRAP_COMMON_PATTERNS(swap_bytes, .op = SWS_OP_SWAP_BYTES);
+#endif /* SWAP_BYTES */
+
+#if BIT_DEPTH == 8
+DECL_PATTERN(expand16) /* widen u8 -> u16 by replicating the byte (0xAB -> 0xABAB) */
+{
+    u16block_t x16, y16, z16, w16;
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X) /* components whose flag is clear are left unwritten in the 16-bit blocks */
+            x16[i] = x[i] << 8 | x[i];
+        if (Y)
+            y16[i] = y[i] << 8 | y[i];
+        if (Z)
+            z16[i] = z[i] << 8 | z[i];
+        if (W)
+            w16[i] = w[i] << 8 | w[i];
+    }
+
+    CONTINUE(u16block_t, x16, y16, z16, w16); /* the continuation receives the wider block type */
+}
+
+WRAP_COMMON_PATTERNS(expand16,
+    .op = SWS_OP_CONVERT,
+    .convert.to = SWS_PIXEL_U16,
+    .convert.expand = true,
+);
+
+DECL_PATTERN(expand32) /* widen u8 -> u32 by replicating the byte (0xAB -> 0xABABABAB) */
+{
+    u32block_t x32, y32, z32, w32;
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) { /* all four components are expanded, regardless of X/Y/Z/W */
+        x32[i] = (uint32_t) x[i] << 24 | x[i] << 16 | x[i] << 8 | x[i]; /* cast first: a uint8_t promotes to int, and int << 24 overflows for values >= 0x80 */
+        y32[i] = (uint32_t) y[i] << 24 | y[i] << 16 | y[i] << 8 | y[i];
+        z32[i] = (uint32_t) z[i] << 24 | z[i] << 16 | z[i] << 8 | z[i];
+        w32[i] = (uint32_t) w[i] << 24 | w[i] << 16 | w[i] << 8 | w[i];
+    }
+
+    CONTINUE(u32block_t, x32, y32, z32, w32); /* the continuation receives the wider block type */
+}
+
+WRAP_COMMON_PATTERNS(expand32,
+    .op = SWS_OP_CONVERT,
+    .convert.to = SWS_PIXEL_U32,
+    .convert.expand = true,
+);
+#endif
+
+#define WRAP_PACK_UNPACK(X, Y, Z, W) /* X..W: bit widths, X in the top bits */  \
+inline DECL_IMPL(pack_##X##Y##Z##W)                                             \
+{                                                                               \
+    SWS_LOOP                                                                    \
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {                                  \
+        x[i] = x[i] << (Y+Z+W);                                                 \
+        if (Y)                                                                  \
+            x[i] |= y[i] << (Z+W);                                              \
+        if (Z)                                                                  \
+            x[i] |= z[i] << W;                                                  \
+        if (W)                                                                  \
+            x[i] |= w[i];                                                       \
+    }                                                                           \
+                                                                                \
+    CONTINUE(block_t, x, y, z, w);                                              \
+}                                                                               \
+                                                                                \
+DECL_ENTRY(pack_##X##Y##Z##W,                                                   \
+    .op = SWS_OP_PACK,                                                          \
+    .pack.pattern = { X, Y, Z, W },                                             \
+);                                                                              \
+                                                                                \
+inline DECL_IMPL(unpack_##X##Y##Z##W)                                           \
+{                                                                               \
+    SWS_LOOP                                                                    \
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {                                  \
+        const pixel_t val = x[i];                                               \
+        x[i] = val >> (Y+Z+W); /* x is not masked to X bits */                  \
+        if (Y)                                                                  \
+            y[i] = (val >> (Z+W)) & ((1 << Y) - 1);                             \
+        if (Z)                                                                  \
+            z[i] = (val >> W) & ((1 << Z) - 1);                                 \
+        if (W)                                                                  \
+            w[i] = val & ((1 << W) - 1);                                        \
+    }                                                                           \
+                                                                                \
+    CONTINUE(block_t, x, y, z, w);                                              \
+}                                                                               \
+                                                                                \
+DECL_ENTRY(unpack_##X##Y##Z##W,                                                 \
+    .op = SWS_OP_UNPACK,                                                        \
+    .pack.pattern = { X, Y, Z, W },                                             \
+);
+
+WRAP_PACK_UNPACK( 3,  3,  2,  0) /* every pattern is instantiated at every BIT_DEPTH; the table below picks per depth */
+WRAP_PACK_UNPACK( 2,  3,  3,  0)
+WRAP_PACK_UNPACK( 1,  2,  1,  0) /* NOTE(review): 4 bits total in an 8-bit pixel_t, so unpack's x also carries the upper 4 bits — confirm inputs keep them zero */
+WRAP_PACK_UNPACK( 5,  6,  5,  0)
+WRAP_PACK_UNPACK( 5,  5,  5,  0)
+WRAP_PACK_UNPACK( 4,  4,  4,  0)
+WRAP_PACK_UNPACK( 2, 10, 10, 10)
+WRAP_PACK_UNPACK(10, 10, 10,  2)
+
+#if BIT_DEPTH != 8
+DECL_PATTERN(lshift) /* shift all four components left by the same amount */
+{
+    const uint8_t amount = impl->priv.u8[0]; /* presumably stored by the entry's setup callback (ff_sws_setup_u8) — confirm */
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) { /* no X/Y/Z/W guards: every component is shifted */
+        x[i] <<= amount;
+        y[i] <<= amount;
+        z[i] <<= amount;
+        w[i] <<= amount;
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+DECL_PATTERN(rshift) /* shift all four components right by the same amount */
+{
+    const uint8_t amount = impl->priv.u8[0]; /* presumably stored by the entry's setup callback (ff_sws_setup_u8) — confirm */
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) { /* no X/Y/Z/W guards: every component is shifted */
+        x[i] >>= amount;
+        y[i] >>= amount;
+        z[i] >>= amount;
+        w[i] >>= amount;
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+WRAP_COMMON_PATTERNS(lshift,
+    .op       = SWS_OP_LSHIFT,
+    .setup    = ff_sws_setup_u8, /* the kernel reads its shift amount from impl->priv.u8[0] */
+    .flexible = true,
+);
+
+WRAP_COMMON_PATTERNS(rshift,
+    .op       = SWS_OP_RSHIFT,
+    .setup    = ff_sws_setup_u8, /* the kernel reads its shift amount from impl->priv.u8[0] */
+    .flexible = true,
+);
+#endif /* BIT_DEPTH != 8 */
+
+DECL_PATTERN(convert_float) /* convert all four components from pixel_t into f32 blocks by plain assignment */
+{
+    f32block_t xf, yf, zf, wf;
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) { /* no X/Y/Z/W guards: every component is converted */
+        xf[i] = x[i];
+        yf[i] = y[i];
+        zf[i] = z[i];
+        wf[i] = w[i];
+    }
+
+    CONTINUE(f32block_t, xf, yf, zf, wf); /* the continuation receives the float block type */
+}
+
+WRAP_COMMON_PATTERNS(convert_float,
+    .op = SWS_OP_CONVERT,
+    .convert.to = SWS_PIXEL_F32,
+);
+
+/**
+ * Swizzle by directly swapping the order of arguments to the continuation.
+ * Note that this is only safe to do if no arguments are duplicated.
+ */
+#define DECL_SWIZZLE(X, Y, Z, W)                                                \
+static SWS_FUNC void                                                            \
+fn(swizzle_##X##Y##Z##W)(SwsOpIter *restrict iter,                              \
+                         const SwsOpImpl *restrict impl,                        \
+                         block_t c0, block_t c1, block_t c2, block_t c3)        \
+{                                                                               \
+    CONTINUE(block_t, c##X, c##Y, c##Z, c##W); /* pure argument reorder */      \
+}                                                                               \
+                                                                                \
+DECL_ENTRY(swizzle_##X##Y##Z##W,                                                \
+    .op = SWS_OP_SWIZZLE,                                                       \
+    .swizzle.in = { X, Y, Z, W },                                               \
+);
+
+DECL_SWIZZLE(3, 0, 1, 2) /* each argument is the index of the input passed on in that position: here (c3, c0, c1, c2) */
+DECL_SWIZZLE(3, 0, 2, 1)
+DECL_SWIZZLE(2, 1, 0, 3)
+DECL_SWIZZLE(3, 2, 1, 0)
+DECL_SWIZZLE(3, 1, 0, 2)
+DECL_SWIZZLE(3, 2, 0, 1)
+DECL_SWIZZLE(1, 2, 0, 3)
+DECL_SWIZZLE(1, 0, 2, 3)
+DECL_SWIZZLE(2, 0, 1, 3)
+DECL_SWIZZLE(2, 3, 1, 0)
+DECL_SWIZZLE(2, 1, 3, 0)
+DECL_SWIZZLE(1, 2, 3, 0)
+DECL_SWIZZLE(1, 3, 2, 0)
+DECL_SWIZZLE(0, 2, 1, 3)
+DECL_SWIZZLE(0, 2, 3, 1)
+DECL_SWIZZLE(0, 3, 1, 2)
+DECL_SWIZZLE(3, 1, 2, 0)
+DECL_SWIZZLE(0, 3, 2, 1)
+
+/* Broadcast luma -> rgb (only used for y(a) -> rgb(a)) */
+#define DECL_EXPAND_LUMA(X, W, T0, T1) /* T0/T1: args holding the c0 copies */  \
+static SWS_FUNC void                                                            \
+fn(expand_luma_##X##W)(SwsOpIter *restrict iter,                                \
+                       const SwsOpImpl *restrict impl,                          \
+                       block_t c0, block_t c1,  block_t c2, block_t c3)         \
+{                                                                               \
+    SWS_LOOP                                                                    \
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++)                                    \
+        T0[i] = T1[i] = c0[i]; /* overwrite T0 and T1 with c0 */                \
+                                                                                \
+    CONTINUE(block_t, c##X, T0, T1, c##W);                                      \
+}                                                                               \
+                                                                                \
+DECL_ENTRY(expand_luma_##X##W,                                                  \
+    .op = SWS_OP_SWIZZLE,                                                       \
+    .swizzle.in = { X, 0, 0, W },                                               \
+);
+
+DECL_EXPAND_LUMA(0, 3, c1, c2) /* (c0, c1, c2, c3) -> (c0, c0, c0, c3) */
+DECL_EXPAND_LUMA(3, 0, c1, c2) /* (c0, c1, c2, c3) -> (c3, c0, c0, c0) */
+DECL_EXPAND_LUMA(1, 0, c2, c3) /* (c0, c1, c2, c3) -> (c1, c0, c0, c0) */
+DECL_EXPAND_LUMA(0, 1, c2, c3) /* (c0, c1, c2, c3) -> (c0, c0, c0, c1) */
+
+static const SwsOpTable fn(op_table_int) = { /* op entries exported by this integer instantiation */
+    .block_size = SWS_BLOCK_SIZE,
+    .entries = {
+        &fn(op_read_planar1), /* read/write names follow the WRAP_READ()/WRAP_WRITE() instantiations above */
+        &fn(op_read_planar2),
+        &fn(op_read_planar3),
+        &fn(op_read_planar4),
+        &fn(op_read_packed2),
+        &fn(op_read_packed3),
+        &fn(op_read_packed4),
+
+        &fn(op_write_planar1),
+        &fn(op_write_planar2),
+        &fn(op_write_planar3),
+        &fn(op_write_planar4),
+        &fn(op_write_packed2),
+        &fn(op_write_packed3),
+        &fn(op_write_packed4),
+
+#if BIT_DEPTH == 8 /* sub-byte I/O, packings of at most 8 bits, and widening to 16/32 bit */
+        &fn(op_read_bits1),
+        &fn(op_read_nibbles1),
+        &fn(op_write_bits1),
+        &fn(op_write_nibbles1),
+
+        &fn(op_pack_1210),
+        &fn(op_pack_2330),
+        &fn(op_pack_3320),
+
+        &fn(op_unpack_1210),
+        &fn(op_unpack_2330),
+        &fn(op_unpack_3320),
+
+        REF_COMMON_PATTERNS(expand16),
+        REF_COMMON_PATTERNS(expand32),
+#elif BIT_DEPTH == 16 /* packings of 12 to 16 bits */
+        &fn(op_pack_4440),
+        &fn(op_pack_5550),
+        &fn(op_pack_5650),
+        &fn(op_unpack_4440),
+        &fn(op_unpack_5550),
+        &fn(op_unpack_5650),
+#elif BIT_DEPTH == 32 /* 32-bit packings */
+        &fn(op_pack_2101010),
+        &fn(op_pack_1010102),
+        &fn(op_unpack_2101010),
+        &fn(op_unpack_1010102),
+#endif
+
+#ifdef SWAP_BYTES
+        REF_COMMON_PATTERNS(swap_bytes),
+#endif
+
+        REF_COMMON_PATTERNS(min),
+        REF_COMMON_PATTERNS(max),
+        REF_COMMON_PATTERNS(scale),
+        REF_COMMON_PATTERNS(convert_float),
+
+        &fn(op_clear_1110), /* the op_clear_* entries are not defined in this file; presumably from ops_tmpl_common.c */
+        &fn(op_clear_0111),
+        &fn(op_clear_0011),
+        &fn(op_clear_1001),
+        &fn(op_clear_1100),
+        &fn(op_clear_0101),
+        &fn(op_clear_1010),
+        &fn(op_clear_1000),
+        &fn(op_clear_0100),
+        &fn(op_clear_0010),
+
+        &fn(op_swizzle_3012), /* one entry per DECL_SWIZZLE() instantiation above, in the same order */
+        &fn(op_swizzle_3021),
+        &fn(op_swizzle_2103),
+        &fn(op_swizzle_3210),
+        &fn(op_swizzle_3102),
+        &fn(op_swizzle_3201),
+        &fn(op_swizzle_1203),
+        &fn(op_swizzle_1023),
+        &fn(op_swizzle_2013),
+        &fn(op_swizzle_2310),
+        &fn(op_swizzle_2130),
+        &fn(op_swizzle_1230),
+        &fn(op_swizzle_1320),
+        &fn(op_swizzle_0213),
+        &fn(op_swizzle_0231),
+        &fn(op_swizzle_0312),
+        &fn(op_swizzle_3120),
+        &fn(op_swizzle_0321),
+
+        &fn(op_expand_luma_03), /* one entry per DECL_EXPAND_LUMA() instantiation above */
+        &fn(op_expand_luma_30),
+        &fn(op_expand_luma_10),
+        &fn(op_expand_luma_01),
+
+#if BIT_DEPTH != 8
+        REF_COMMON_PATTERNS(lshift),
+        REF_COMMON_PATTERNS(rshift),
+        REF_COMMON_PATTERNS(convert_uint8),
+#endif /* BIT_DEPTH != 8 */
+
+#if BIT_DEPTH != 16 /* each convert_uintN is listed only when N differs from this instantiation's depth */
+        REF_COMMON_PATTERNS(convert_uint16),
+#endif
+#if BIT_DEPTH != 32
+        REF_COMMON_PATTERNS(convert_uint32),
+#endif
+
+        NULL /* end of list */
+    },
+};
+
+#undef PIXEL_TYPE
+#undef PIXEL_MAX
+#undef PIXEL_MIN
+#undef SWAP_BYTES
+#undef pixel_t
+#undef block_t
+#undef px
+
+#undef FMT_CHAR
+#undef IS_FLOAT