swscale/optimizer: add packed shuffle solver

This can turn any compatible sequence of operations into a single packed shuffle, including packed swizzling, grayscale->RGB conversion, endianness swapping, RGB bit depth conversions, rgb24->rgb0 alpha clearing and more.
2026-02-04 14:30:55 +08:00 · 2025-05-21 13:49:37 +02:00
parent db2bc11a97
commit 696f3be322
2 changed files with 123 additions and 0 deletions
--- a/libswscale/ops_internal.h
+++ b/libswscale/ops_internal.h
@@ -130,4 +130,32 @@ int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend,
 */
 int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out);

+/**
+ * "Solve" an op list into a fixed shuffle mask, with an optional ability to
+ * also directly clear the output value (for e.g. rgb24 -> rgb0). This can
+ * accept any operation chain that only consists of the following operations:
+ *
+ * - SWS_OP_READ (non-planar, non-fractional)
+ * - SWS_OP_SWIZZLE
+ * - SWS_OP_SWAP_BYTES
+ * - SWS_OP_CLEAR to zero (when clear_val is specified)
+ * - SWS_OP_CONVERT (integer expand)
+ * - SWS_OP_WRITE (non-planar, non-fractional)
+ *
+ * Basically, any operation that purely consists of moving around and reordering
+ * bytes within a single plane, can be turned into a shuffle mask.
+ *
+ * @param ops         The operation list to decompose.
+ * @param shuffle     The output shuffle mask.
+ * @param size        The size (in bytes) of the output shuffle mask.
+ * @param clear_val   If nonzero, this index will be used to clear the output.
+ * @param read_bytes  Returns the number of bytes read per shuffle iteration.
+ * @param write_bytes Returns the number of bytes written per shuffle iteration.
+ *
+ * @return  The number of pixels processed per iteration, or a negative error
+            code; in particular AVERROR(ENOTSUP) for unsupported operations.
+ */
+int ff_sws_solve_shuffle(const SwsOpList *ops, uint8_t shuffle[], int size,
+                         uint8_t clear_val, int *read_bytes, int *write_bytes);
+
 #endif /* SWSCALE_OPS_INTERNAL_H */
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -19,6 +19,7 @@
 */

 #include "libavutil/avassert.h"
+#include <libavutil/bswap.h>
 #include "libavutil/rational.h"

 #include "ops.h"
@@ -769,3 +770,97 @@ retry:

    return 0;
 }
+
+int ff_sws_solve_shuffle(const SwsOpList *const ops, uint8_t shuffle[],
+                         int size, uint8_t clear_val,
+                         int *read_bytes, int *write_bytes)
+{
+    const SwsOp read = ops->ops[0];
+    const int read_size = ff_sws_pixel_type_size(read.type);
+    uint32_t mask[4] = {0};
+
+    if (!ops->num_ops || read.op != SWS_OP_READ)
+        return AVERROR(EINVAL);
+    if (read.rw.frac || (!read.rw.packed && read.rw.elems > 1))
+        return AVERROR(ENOTSUP);
+
+    for (int i = 0; i < read.rw.elems; i++)
+        mask[i] = 0x01010101 * i * read_size + 0x03020100;
+
+    for (int opidx = 1; opidx < ops->num_ops; opidx++) {
+        const SwsOp *op = &ops->ops[opidx];
+        switch (op->op) {
+        case SWS_OP_SWIZZLE: {
+            uint32_t orig[4] = { mask[0], mask[1], mask[2], mask[3] };
+            for (int i = 0; i < 4; i++)
+                mask[i] = orig[op->swizzle.in[i]];
+            break;
+        }
+
+        case SWS_OP_SWAP_BYTES:
+            for (int i = 0; i < 4; i++) {
+                switch (ff_sws_pixel_type_size(op->type)) {
+                case 2: mask[i] = av_bswap16(mask[i]); break;
+                case 4: mask[i] = av_bswap32(mask[i]); break;
+                }
+            }
+            break;
+
+        case SWS_OP_CLEAR:
+            for (int i = 0; i < 4; i++) {
+                if (!op->c.q4[i].den)
+                    continue;
+                if (op->c.q4[i].num != 0 || !clear_val)
+                    return AVERROR(ENOTSUP);
+                mask[i] = 0x1010101ul * clear_val;
+            }
+            break;
+
+        case SWS_OP_CONVERT: {
+            if (!op->convert.expand)
+                return AVERROR(ENOTSUP);
+            for (int i = 0; i < 4; i++) {
+                switch (ff_sws_pixel_type_size(op->type)) {
+                case 1: mask[i] = 0x01010101 * (mask[i] & 0xFF);   break;
+                case 2: mask[i] = 0x00010001 * (mask[i] & 0xFFFF); break;
+                }
+            }
+            break;
+        }
+
+        case SWS_OP_WRITE: {
+            if (op->rw.frac || (!op->rw.packed && op->rw.elems > 1))
+                return AVERROR(ENOTSUP);
+
+            /* Initialize to no-op */
+            memset(shuffle, clear_val, size);
+
+            const int write_size  = ff_sws_pixel_type_size(op->type);
+            const int read_chunk  = read.rw.elems * read_size;
+            const int write_chunk = op->rw.elems * write_size;
+            const int num_groups  = size / FFMAX(read_chunk, write_chunk);
+            for (int n = 0; n < num_groups; n++) {
+                const int base_in  = n * read_chunk;
+                const int base_out = n * write_chunk;
+                for (int i = 0; i < op->rw.elems; i++) {
+                    const int offset = base_out + i * write_size;
+                    for (int b = 0; b < write_size; b++) {
+                        const uint8_t idx = mask[i] >> (b * 8);
+                        if (idx != clear_val)
+                            shuffle[offset + b] = base_in + idx;
+                    }
+                }
+            }
+
+            *read_bytes  = num_groups * read_chunk;
+            *write_bytes = num_groups * write_chunk;
+            return num_groups;
+        }
+
+        default:
+            return AVERROR(ENOTSUP);
+        }
+    }
+
+    return AVERROR(EINVAL);
+}