swscale/x86/ops_float: store and load per-row dither offset directly

Instead of computing y + N with a hard-coded index offset in the assembly,
calculate each component's relative row offset as a 16-bit integer in C and
add that to the pointer directly. Since we no longer mask the resulting
combined address, this may result in an overread past the end of the matrix,
but that's fine since we over-provisioned the array in the previous commit.
This commit is contained in:
Niklas Haas
2025-12-03 19:12:37 +01:00
committed by Niklas Haas
parent 0744260ff0
commit 3f7e3cedb5
3 changed files with 21 additions and 8 deletions

View File

@@ -215,6 +215,13 @@ static int setup_dither(const SwsOp *op, SwsOpPriv *out)
memcpy(&matrix[size * size], matrix, max_offset * stride);
/* Store relative pointer offset to each row inside extra space */
static_assert(sizeof(out->ptr) <= sizeof(uint16_t[4]), ">8 byte pointers not supported");
assert(max_offset * stride <= UINT16_MAX);
uint16_t *offset = &out->u16[4];
for (int i = 0; i < 4; i++)
offset[i] = (op->dither.y_offset[i] & (size - 1)) * stride;
return 0;
}

View File

@@ -245,6 +245,9 @@ endstruc
; dword (32-bit) names of the temporaries held in r4 / r5
%define tmp0d r4d
%define tmp1d r5d
; word (16-bit) names of the same two registers (r4 / r5)
%define tmp0w r4w
%define tmp1w r5w
; Registers for plane pointers; put at the end (and in ascending plane order)
; so that we can avoid reserving them when not necessary
%define out0q r6q

View File

@@ -179,10 +179,8 @@ IF W, mulps mw2, m8
CONTINUE tmp0q
%endmacro
%macro load_dither_row 5 ; size_log2, y, addr, out, out2
lea tmp0q, %2
and tmp0q, (1 << %1) - 1
shl tmp0q, %1+2
%macro load_dither_row 5 ; size_log2, comp_idx, addr, out, out2
mov tmp0w, [implq + SwsOpImpl.priv + (4 + %2) * 2] ; priv.u16[4 + i]
%if %1 == 1
vbroadcastsd %4, [%3 + tmp0q]
%elif %1 == 2
@@ -225,6 +223,11 @@ op dither%1
%endif
; dither matrix is stored indirectly at the private data address
mov tmp1q, [implq + SwsOpImpl.priv]
; add y offset
mov tmp0d, yd
and tmp0d, (1 << %1) - 1
shl tmp0d, %1 + 2 ; * sizeof(float)
add tmp1q, tmp0q
%if (4 << %1) > 2 * mmsize
; need to add in x offset
mov tmp0d, bxd
@@ -232,10 +235,10 @@ op dither%1
and tmp0d, (4 << %1) - 1
add tmp1q, tmp0q
%endif
IF X, load_dither_row %1, [yd + 0], tmp1q, DX, DX2
IF Y, load_dither_row %1, [yd + 3], tmp1q, DY, DY2
IF Z, load_dither_row %1, [yd + 2], tmp1q, DZ, DZ2
IF W, load_dither_row %1, [yd + 5], tmp1q, DW, DW2
IF X, load_dither_row %1, 0, tmp1q, DX, DX2
IF Y, load_dither_row %1, 1, tmp1q, DY, DY2
IF Z, load_dither_row %1, 2, tmp1q, DZ, DZ2
IF W, load_dither_row %1, 3, tmp1q, DW, DW2
%endif
LOAD_CONT tmp0q
IF X, addps mx, DX