FFmpeg/libswscale/ops_optimizer.c
Niklas Haas 61eca588dc swscale/ops: move ff_sws_op_list_update_comps() to ops.c
I think this is ultimately a better home, since the semantics of this are
not really tied to optimization itself; and because I want to make it an
explicitly supported part of the user-facing API (rather than just an
internal-use field).

A secondary motivating reason is that I intend to use internal
helpers of `ops.c` in the next commit. (Though this is a weak reason
and not sufficient to justify the move by itself.)
2025-12-24 16:37:22 +00:00


/**
 * Copyright (C) 2025 Niklas Haas
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/avassert.h"
#include "libavutil/bswap.h"
#include "libavutil/rational.h"

#include "ops.h"
#include "ops_internal.h"

#define RET(x)                                                                 \
    do {                                                                       \
        if ((ret = (x)) < 0)                                                   \
            return ret;                                                        \
    } while (0)

/**
 * Try to commute a clear op with the next operation. Makes any adjustments
 * to the operations as needed, but does not perform the actual commutation.
 *
 * Returns whether successful.
 */
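/* For example, a CLEAR of {_, _, _, 1} can be pushed past an LSHIFT by 2 bits
 * by rewriting the cleared constant accordingly:
 *
 *   CLEAR  {_, _, _, 1}        LSHIFT << 2
 *   LSHIFT << 2           ->   CLEAR  {_, _, _, 4}
 *
 * ff_sws_apply_op_q() performs this adjustment on each set constant; ops like
 * WRITE or LINEAR are rejected because the constant cannot in general be
 * pushed through them. */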
static bool op_commute_clear(SwsOp *op, SwsOp *next)
{
    av_assert1(op->op == SWS_OP_CLEAR);

    switch (next->op) {
    case SWS_OP_CONVERT:
        op->type = next->convert.to;
        /* fall through */
    case SWS_OP_LSHIFT:
    case SWS_OP_RSHIFT:
    case SWS_OP_DITHER:
    case SWS_OP_MIN:
    case SWS_OP_MAX:
    case SWS_OP_SCALE:
    case SWS_OP_READ:
    case SWS_OP_SWIZZLE:
        ff_sws_apply_op_q(next, op->c.q4);
        return true;

    case SWS_OP_INVALID:
    case SWS_OP_SWAP_BYTES:
    case SWS_OP_WRITE:
    case SWS_OP_LINEAR:
    case SWS_OP_PACK:
    case SWS_OP_UNPACK:
    case SWS_OP_CLEAR:
        return false;

    case SWS_OP_TYPE_NB:
        break;
    }

    av_unreachable("Invalid operation type!");
    return false;
}

/**
 * Try to commute a swizzle op with the next operation. Makes any adjustments
 * to the operations as needed, but does not perform the actual commutation.
 *
 * Returns whether successful.
 */
static bool op_commute_swizzle(SwsOp *op, SwsOp *next)
{
    bool seen[4] = {0};
    av_assert1(op->op == SWS_OP_SWIZZLE);

    switch (next->op) {
    case SWS_OP_CONVERT:
        op->type = next->convert.to;
        /* fall through */
    case SWS_OP_SWAP_BYTES:
    case SWS_OP_LSHIFT:
    case SWS_OP_RSHIFT:
    case SWS_OP_SCALE:
        return true;

    /**
     * We can commute per-channel ops only if the per-channel constants are the
     * same for all duplicated channels; e.g.:
     *   SWIZZLE {0, 0, 0, 3}
     *   NEXT    {x, x, x, w}
     *     ->
     *   NEXT    {x, _, _, w}
     *   SWIZZLE {0, 0, 0, 3}
     */
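    /* Conversely, e.g. MIN {a, b, _, w} with a != b cannot be pulled past
     * SWIZZLE {0, 0, 0, 3}: both duplicated outputs originate from input
     * channel 0 but would require different clamping constants. */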
    case SWS_OP_MIN:
    case SWS_OP_MAX: {
        const SwsConst c = next->c;
        for (int i = 0; i < 4; i++) {
            if (next->comps.unused[i])
                continue;
            const int j = op->swizzle.in[i];
            if (seen[j] && av_cmp_q(next->c.q4[j], c.q4[i]))
                return false;
            next->c.q4[j] = c.q4[i];
            seen[j] = true;
        }
        return true;
    }

    case SWS_OP_DITHER: {
        const SwsDitherOp d = next->dither;
        for (int i = 0; i < 4; i++) {
            if (next->comps.unused[i])
                continue;
            const int j = op->swizzle.in[i];
            if (seen[j] && next->dither.y_offset[j] != d.y_offset[i])
                return false;
            next->dither.y_offset[j] = d.y_offset[i];
            seen[j] = true;
        }
        return true;
    }

    case SWS_OP_INVALID:
    case SWS_OP_READ:
    case SWS_OP_WRITE:
    case SWS_OP_SWIZZLE:
    case SWS_OP_CLEAR:
    case SWS_OP_LINEAR:
    case SWS_OP_PACK:
    case SWS_OP_UNPACK:
        return false;

    case SWS_OP_TYPE_NB:
        break;
    }

    av_unreachable("Invalid operation type!");
    return false;
}

/* returns log2(x) only if x is a power of two, or 0 otherwise */
static int exact_log2(const int x)
{
    int p;
    if (x <= 0)
        return 0;
    p = av_log2(x);
    return (1 << p) == x ? p : 0;
}

static int exact_log2_q(const AVRational x)
{
    if (x.den == 1)
        return exact_log2(x.num);
    else if (x.num == 1)
        return -exact_log2(x.den);
    else
        return 0;
}
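
/* e.g. exact_log2_q((AVRational) {8, 1}) == 3 and
 * exact_log2_q((AVRational) {1, 4}) == -2, while non-powers of two such as
 * 6/1 or 3/2 (and the no-op factor 1/1) yield 0 */
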
/**
 * If a linear operation can be reduced to a scalar multiplication, writes the
 * corresponding scaling factor to `out_scale` and returns true; otherwise,
 * returns false and leaves `out_scale` untouched.
 */
static bool extract_scalar(const SwsLinearOp *c, SwsComps prev, SwsComps next,
                           SwsConst *out_scale)
{
    SwsConst scale = {0};

    /* There are components not on the main diagonal */
    if (c->mask & ~SWS_MASK_DIAG4)
        return false;

    for (int i = 0; i < 4; i++) {
        const AVRational s = c->m[i][i];
        if ((prev.flags[i] & SWS_COMP_ZERO) || next.unused[i])
            continue;
        if (scale.q.den && av_cmp_q(s, scale.q))
            return false;
        scale.q = s;
    }

    if (scale.q.den)
        *out_scale = scale;

    return scale.q.den;
}
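
/* e.g. a purely diagonal matrix diag(1/2, 1/2, 1/2, q) reduces to SCALE 1/2
 * whenever the fourth component is known-zero on input or unused on output,
 * since only the remaining diagonal entries need to agree */
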
/* Extracts an integer clear operation (subset) from the given linear op. */
static bool extract_constant_rows(SwsLinearOp *c, SwsComps prev,
                                  SwsConst *out_clear)
{
    SwsConst clear = {0};
    bool ret = false;

    for (int i = 0; i < 4; i++) {
        bool const_row = c->m[i][4].den == 1; /* offset is integer */
        for (int j = 0; j < 4; j++) {
            const_row &= c->m[i][j].num == 0 ||          /* scalar is zero */
                         (prev.flags[j] & SWS_COMP_ZERO); /* input is zero */
        }

        if (const_row && (c->mask & SWS_MASK_ROW(i))) {
            clear.q4[i] = c->m[i][4];
            for (int j = 0; j < 5; j++)
                c->m[i][j] = Q(i == j);
            c->mask &= ~SWS_MASK_ROW(i);
            ret = true;
        }
    }

    if (ret)
        *out_clear = clear;

    return ret;
}
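
/* e.g. a row [ 0 0 0 0 16 ], or one whose nonzero coefficients only multiply
 * known-zero inputs, always yields the constant 16; such a row is reset to the
 * identity here and re-emitted as a CLEAR of that component by the caller */
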
/* Unswizzle a linear operation by aligning single-input rows with
 * their corresponding diagonal */
static bool extract_swizzle(SwsLinearOp *op, SwsComps prev, SwsSwizzleOp *out_swiz)
{
    SwsSwizzleOp swiz = SWS_SWIZZLE(0, 1, 2, 3);
    SwsLinearOp c = *op;

    for (int i = 0; i < 4; i++) {
        int idx = -1;
        for (int j = 0; j < 4; j++) {
            if (!c.m[i][j].num || (prev.flags[j] & SWS_COMP_ZERO))
                continue;
            if (idx >= 0)
                return false; /* multiple inputs */
            idx = j;
        }

        if (idx >= 0 && idx != i) {
            /* Move coefficient to the diagonal */
            c.m[i][i] = c.m[i][idx];
            c.m[i][idx] = Q(0);
            swiz.in[i] = idx;
        }
    }

    if (swiz.mask == SWS_SWIZZLE(0, 1, 2, 3).mask)
        return false; /* no swizzle was identified */

    c.mask = ff_sws_linear_mask(c);
    *out_swiz = swiz;
    *op = c;
    return true;
}
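
/* As an example, a matrix whose first row reads only input channel 2 (and
 * whose remaining rows are already diagonal) turns into SWIZZLE {2, 1, 2, 3}
 * followed by the same matrix with that coefficient moved onto the diagonal;
 * the caller inserts the swizzle directly in front of the linear op. */
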
int ff_sws_op_list_optimize(SwsOpList *ops)
{
    int ret;

retry:
    ff_sws_op_list_update_comps(ops);

    /* Apply all in-place optimizations (that do not re-order the list) */
    for (int n = 0; n < ops->num_ops; n++) {
        SwsOp dummy = {0};
        SwsOp *op = &ops->ops[n];
        SwsOp *prev = n ? &ops->ops[n - 1] : &dummy;
        SwsOp *next = n + 1 < ops->num_ops ? &ops->ops[n + 1] : &dummy;

        /* common helper variable */
        bool noop = true;

        switch (op->op) {
        case SWS_OP_READ:
            /* Optimized further into refcopy / memcpy */
            if (next->op == SWS_OP_WRITE &&
                next->rw.elems == op->rw.elems &&
                next->rw.packed == op->rw.packed &&
                next->rw.frac == op->rw.frac)
            {
                ff_sws_op_list_remove_at(ops, n, 2);
                av_assert1(ops->num_ops == 0);
                return 0;
            }

            /* Skip reading extra unneeded components */
            if (!op->rw.packed) {
                int needed = op->rw.elems;
                while (needed > 0 && next->comps.unused[needed - 1])
                    needed--;
                if (op->rw.elems != needed) {
                    op->rw.elems = needed;
                    goto retry;
                }
            }
            break;

        case SWS_OP_SWAP_BYTES:
            /* Redundant (double) swap */
            if (next->op == SWS_OP_SWAP_BYTES) {
                ff_sws_op_list_remove_at(ops, n, 2);
                goto retry;
            }
            break;

        case SWS_OP_UNPACK:
            /* Redundant unpack+pack */
            if (next->op == SWS_OP_PACK && next->type == op->type &&
                next->pack.pattern[0] == op->pack.pattern[0] &&
                next->pack.pattern[1] == op->pack.pattern[1] &&
                next->pack.pattern[2] == op->pack.pattern[2] &&
                next->pack.pattern[3] == op->pack.pattern[3])
            {
                ff_sws_op_list_remove_at(ops, n, 2);
                goto retry;
            }
            break;

        case SWS_OP_LSHIFT:
        case SWS_OP_RSHIFT:
            /* Two shifts in the same direction */
            if (next->op == op->op) {
                op->c.u += next->c.u;
                ff_sws_op_list_remove_at(ops, n + 1, 1);
                goto retry;
            }

            /* No-op shift */
            if (!op->c.u) {
                ff_sws_op_list_remove_at(ops, n, 1);
                goto retry;
            }
            break;

        case SWS_OP_CLEAR:
            for (int i = 0; i < 4; i++) {
                if (!op->c.q4[i].den)
                    continue;

                if ((prev->comps.flags[i] & SWS_COMP_ZERO) &&
                    !(prev->comps.flags[i] & SWS_COMP_GARBAGE) &&
                    op->c.q4[i].num == 0)
                {
                    /* Redundant clear-to-zero of zero component */
                    op->c.q4[i].den = 0;
                } else if (next->comps.unused[i]) {
                    /* Unnecessary clear of unused component */
                    op->c.q4[i] = (AVRational) {0, 0};
                } else if (op->c.q4[i].den) {
                    noop = false;
                }
            }

            if (noop) {
                ff_sws_op_list_remove_at(ops, n, 1);
                goto retry;
            }

            /* Transitive clear */
            if (next->op == SWS_OP_CLEAR) {
                for (int i = 0; i < 4; i++) {
                    if (next->c.q4[i].den)
                        op->c.q4[i] = next->c.q4[i];
                }
                ff_sws_op_list_remove_at(ops, n + 1, 1);
                goto retry;
            }
            break;

        case SWS_OP_SWIZZLE:
            for (int i = 0; i < 4; i++) {
                if (next->comps.unused[i])
                    continue;
                if (op->swizzle.in[i] != i)
                    noop = false;
            }

            /* Identity swizzle */
            if (noop) {
                ff_sws_op_list_remove_at(ops, n, 1);
                goto retry;
            }

            /* Transitive swizzle */
            if (next->op == SWS_OP_SWIZZLE) {
                const SwsSwizzleOp orig = op->swizzle;
                for (int i = 0; i < 4; i++)
                    op->swizzle.in[i] = orig.in[next->swizzle.in[i]];
                ff_sws_op_list_remove_at(ops, n + 1, 1);
                goto retry;
            }
            break;

        case SWS_OP_CONVERT:
            /* No-op conversion */
            if (op->type == op->convert.to) {
                ff_sws_op_list_remove_at(ops, n, 1);
                goto retry;
            }

            /* Transitive conversion */
            if (next->op == SWS_OP_CONVERT &&
                op->convert.expand == next->convert.expand)
            {
                av_assert1(op->convert.to == next->type);
                op->convert.to = next->convert.to;
                ff_sws_op_list_remove_at(ops, n + 1, 1);
                goto retry;
            }

            /* Conversion followed by integer expansion */
            if (next->op == SWS_OP_SCALE && !op->convert.expand &&
                !av_cmp_q(next->c.q, ff_sws_pixel_expand(op->type, op->convert.to)))
            {
                op->convert.expand = true;
                ff_sws_op_list_remove_at(ops, n + 1, 1);
                goto retry;
            }
            break;

        case SWS_OP_MIN:
            for (int i = 0; i < 4; i++) {
                if (next->comps.unused[i] || !op->c.q4[i].den)
                    continue;
                if (av_cmp_q(op->c.q4[i], prev->comps.max[i]) < 0)
                    noop = false;
            }

            if (noop) {
                ff_sws_op_list_remove_at(ops, n, 1);
                goto retry;
            }
            break;

        case SWS_OP_MAX:
            for (int i = 0; i < 4; i++) {
                if (next->comps.unused[i] || !op->c.q4[i].den)
                    continue;
                if (av_cmp_q(prev->comps.min[i], op->c.q4[i]) < 0)
                    noop = false;
            }

            if (noop) {
                ff_sws_op_list_remove_at(ops, n, 1);
                goto retry;
            }
            break;

        case SWS_OP_DITHER:
            for (int i = 0; i < 4; i++) {
                noop &= (prev->comps.flags[i] & SWS_COMP_EXACT) ||
                        next->comps.unused[i];
            }

            if (noop) {
                ff_sws_op_list_remove_at(ops, n, 1);
                goto retry;
            }
            break;

        case SWS_OP_LINEAR: {
            SwsSwizzleOp swizzle;
            SwsConst c;

            /* No-op (identity) linear operation */
            if (!op->lin.mask) {
                ff_sws_op_list_remove_at(ops, n, 1);
                goto retry;
            }

            if (next->op == SWS_OP_LINEAR) {
                /* 5x5 matrix multiplication after appending [ 0 0 0 0 1 ] */
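                /* With this op being y = M1*x + b1 and the next op being
                 * z = M2*y + b2, the merged op computes
                 * z = (M2*M1)*x + (M2*b1 + b2); column 4 holds the offset b,
                 * hence the extra term added below for j == 4. */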
                const SwsLinearOp m1 = op->lin;
                const SwsLinearOp m2 = next->lin;
                for (int i = 0; i < 4; i++) {
                    for (int j = 0; j < 5; j++) {
                        AVRational sum = Q(0);
                        for (int k = 0; k < 4; k++)
                            sum = av_add_q(sum, av_mul_q(m2.m[i][k], m1.m[k][j]));
                        if (j == 4) /* m1.m[4][j] == 1 */
                            sum = av_add_q(sum, m2.m[i][4]);
                        op->lin.m[i][j] = sum;
                    }
                }

                op->lin.mask = ff_sws_linear_mask(op->lin);
                ff_sws_op_list_remove_at(ops, n + 1, 1);
                goto retry;
            }

            /* Optimize away zero columns */
            for (int j = 0; j < 4; j++) {
                const uint32_t col = SWS_MASK_COL(j);
                if (!(prev->comps.flags[j] & SWS_COMP_ZERO) || !(op->lin.mask & col))
                    continue;
                for (int i = 0; i < 4; i++)
                    op->lin.m[i][j] = Q(i == j);
                op->lin.mask &= ~col;
                goto retry;
            }

            /* Optimize away unused rows */
            for (int i = 0; i < 4; i++) {
                const uint32_t row = SWS_MASK_ROW(i);
                if (!next->comps.unused[i] || !(op->lin.mask & row))
                    continue;
                for (int j = 0; j < 5; j++)
                    op->lin.m[i][j] = Q(i == j);
                op->lin.mask &= ~row;
                goto retry;
            }

            /* Convert constant rows to explicit clear instruction */
            if (extract_constant_rows(&op->lin, prev->comps, &c)) {
                RET(ff_sws_op_list_insert_at(ops, n + 1, &(SwsOp) {
                    .op    = SWS_OP_CLEAR,
                    .type  = op->type,
                    .comps = op->comps,
                    .c     = c,
                }));
                goto retry;
            }

            /* Multiplication by scalar constant */
            if (extract_scalar(&op->lin, prev->comps, next->comps, &c)) {
                op->op = SWS_OP_SCALE;
                op->c = c;
                goto retry;
            }

            /* Swizzle by fixed pattern */
            if (extract_swizzle(&op->lin, prev->comps, &swizzle)) {
                RET(ff_sws_op_list_insert_at(ops, n, &(SwsOp) {
                    .op      = SWS_OP_SWIZZLE,
                    .type    = op->type,
                    .swizzle = swizzle,
                }));
                goto retry;
            }
            break;
        }

        case SWS_OP_SCALE: {
            const int factor2 = exact_log2_q(op->c.q);

            /* No-op scaling */
            if (op->c.q.num == 1 && op->c.q.den == 1) {
                ff_sws_op_list_remove_at(ops, n, 1);
                goto retry;
            }

            /* Scaling by exact power of two */
            if (factor2 && ff_sws_pixel_type_is_int(op->type)) {
                op->op = factor2 > 0 ? SWS_OP_LSHIFT : SWS_OP_RSHIFT;
                op->c.u = FFABS(factor2);
                goto retry;
            }
            break;
        }
        }
    }

    /* Push clears to the back, so that the components they overwrite count as
     * unused for all preceding operations */
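    /* For example, pushing a clear of the alpha channel down to just before
     * the write lets every earlier op treat alpha as unused, which in turn can
     * let a planar read drop the alpha plane entirely (see SWS_OP_READ above). */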
    for (int n = 1; n < ops->num_ops - 1; n++) { /* exclude READ/WRITE */
        SwsOp *op = &ops->ops[n];
        SwsOp *next = &ops->ops[n + 1];
        switch (op->op) {
        case SWS_OP_CLEAR:
            if (op_commute_clear(op, next)) {
                FFSWAP(SwsOp, *op, *next);
                goto retry;
            }
            break;
        }
    }

    /* Apply any remaining preferential re-ordering optimizations; do these
     * last because they are more likely to block other optimizations if done
     * too aggressively */
    for (int n = 1; n < ops->num_ops - 1; n++) { /* exclude READ/WRITE */
        SwsOp *op = &ops->ops[n];
        SwsOp *prev = &ops->ops[n - 1];
        SwsOp *next = &ops->ops[n + 1];
        switch (op->op) {
        case SWS_OP_SWIZZLE: {
            bool seen[4] = {0};
            bool has_duplicates = false;
            for (int i = 0; i < 4; i++) {
                if (next->comps.unused[i])
                    continue;
                has_duplicates |= seen[op->swizzle.in[i]];
                seen[op->swizzle.in[i]] = true;
            }

            /* Try to push swizzles with duplicates towards the output */
            if (has_duplicates && op_commute_swizzle(op, next)) {
                FFSWAP(SwsOp, *op, *next);
                goto retry;
            }

            /* Move swizzle out of the way between two converts so that
             * they may be merged */
            if (prev->op == SWS_OP_CONVERT && next->op == SWS_OP_CONVERT) {
                op->type = next->convert.to;
                FFSWAP(SwsOp, *op, *next);
                goto retry;
            }
            break;
        }

        case SWS_OP_SCALE:
            /* Scaling by integer before conversion to int */
            if (op->c.q.den == 1 && next->op == SWS_OP_CONVERT &&
                ff_sws_pixel_type_is_int(next->convert.to))
            {
                op->type = next->convert.to;
                FFSWAP(SwsOp, *op, *next);
                goto retry;
            }
            break;
        }
    }

    return 0;
}
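
/**
 * Tries to express the entire op list as a single byte-level shuffle of the
 * input, writing the per-byte source indices into `shuffle` and returning the
 * number of pixel groups that fit into `size` bytes (the consumed input and
 * output sizes are returned via `read_bytes` / `write_bytes`). Bytes produced
 * by a clear-to-zero keep the sentinel value `clear_val` instead of a source
 * index. Returns a negative error code if the op list cannot be reduced to a
 * shuffle.
 *
 * For instance, the packed 8-bit op list
 *     READ 4x u8, SWIZZLE {2, 1, 0, 3}, WRITE 4x u8   (RGBA -> BGRA)
 * with size = 16 and clear_val = 0x80 yields the shuffle
 *     { 2, 1, 0, 3,  6, 5, 4, 7,  10, 9, 8, 11,  14, 13, 12, 15 }
 * and returns 4 groups (16 bytes read, 16 bytes written).
 */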
int ff_sws_solve_shuffle(const SwsOpList *const ops, uint8_t shuffle[],
                         int size, uint8_t clear_val,
                         int *read_bytes, int *write_bytes)
{
    const SwsOp read = ops->ops[0];
    const int read_size = ff_sws_pixel_type_size(read.type);
    uint32_t mask[4] = {0};

    if (!ops->num_ops || read.op != SWS_OP_READ)
        return AVERROR(EINVAL);

    if (read.rw.frac || (!read.rw.packed && read.rw.elems > 1))
        return AVERROR(ENOTSUP);

    for (int i = 0; i < read.rw.elems; i++)
        mask[i] = 0x01010101 * i * read_size + 0x03020100;
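
    /* At this point, byte b of mask[i] holds the input byte offset that byte b
     * of element i is sourced from; e.g. for two packed u16 elements
     * (read_size = 2): mask[0] = 0x03020100, mask[1] = 0x05040302, of which
     * only the low bytes up to the element size end up being used. */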

    for (int opidx = 1; opidx < ops->num_ops; opidx++) {
        const SwsOp *op = &ops->ops[opidx];
        switch (op->op) {
        case SWS_OP_SWIZZLE: {
            uint32_t orig[4] = { mask[0], mask[1], mask[2], mask[3] };
            for (int i = 0; i < 4; i++)
                mask[i] = orig[op->swizzle.in[i]];
            break;
        }

        case SWS_OP_SWAP_BYTES:
            for (int i = 0; i < 4; i++) {
                switch (ff_sws_pixel_type_size(op->type)) {
                case 2: mask[i] = av_bswap16(mask[i]); break;
                case 4: mask[i] = av_bswap32(mask[i]); break;
                }
            }
            break;

        case SWS_OP_CLEAR:
            for (int i = 0; i < 4; i++) {
                if (!op->c.q4[i].den)
                    continue;
                if (op->c.q4[i].num != 0 || !clear_val)
                    return AVERROR(ENOTSUP);
                mask[i] = 0x1010101ul * clear_val;
            }
            break;

        case SWS_OP_CONVERT: {
            if (!op->convert.expand)
                return AVERROR(ENOTSUP);
            for (int i = 0; i < 4; i++) {
                switch (ff_sws_pixel_type_size(op->type)) {
                case 1: mask[i] = 0x01010101 * (mask[i] & 0xFF);   break;
                case 2: mask[i] = 0x00010001 * (mask[i] & 0xFFFF); break;
                }
            }
            break;
        }

        case SWS_OP_WRITE: {
            if (op->rw.frac || (!op->rw.packed && op->rw.elems > 1))
                return AVERROR(ENOTSUP);

            /* Initialize to no-op */
            memset(shuffle, clear_val, size);

            const int write_size  = ff_sws_pixel_type_size(op->type);
            const int read_chunk  = read.rw.elems * read_size;
            const int write_chunk = op->rw.elems * write_size;
            const int num_groups  = size / FFMAX(read_chunk, write_chunk);
            for (int n = 0; n < num_groups; n++) {
                const int base_in  = n * read_chunk;
                const int base_out = n * write_chunk;
                for (int i = 0; i < op->rw.elems; i++) {
                    const int offset = base_out + i * write_size;
                    for (int b = 0; b < write_size; b++) {
                        const uint8_t idx = mask[i] >> (b * 8);
                        if (idx != clear_val)
                            shuffle[offset + b] = base_in + idx;
                    }
                }
            }

            *read_bytes  = num_groups * read_chunk;
            *write_bytes = num_groups * write_chunk;
            return num_groups;
        }

        default:
            return AVERROR(ENOTSUP);
        }
    }

    return AVERROR(EINVAL);
}