swscale/x86/ops_float: store and load per-row dither offset directly

Instead of computing y + N with a hard-coded index offset, calculate the relative offset as a 16-bit integer in C and add that to the pointer directly. Since we no longer mask the resulting combined address, this may result in an overread past the end of the nominal size x size matrix, but that's fine since we over-provisioned the array in the previous commit.
2026-01-12 00:06:51 +08:00 · 2025-12-03 19:12:37 +01:00
parent 0744260ff0
commit 3f7e3cedb5
3 changed files with 21 additions and 8 deletions
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -215,6 +215,13 @@ static int setup_dither(const SwsOp *op, SwsOpPriv *out)

    memcpy(&matrix[size * size], matrix, max_offset * stride);

+    /* Store relative pointer offset to each row inside extra space */
+    static_assert(sizeof(out->ptr) <= sizeof(uint16_t[4]), ">8 byte pointers not supported");
+    assert(max_offset * stride <= UINT16_MAX);
+    uint16_t *offset = &out->u16[4];
+    for (int i = 0; i < 4; i++)
+        offset[i] = (op->dither.y_offset[i] & (size - 1)) * stride;
+
    return 0;
 }

--- a/libswscale/x86/ops_common.asm
+++ b/libswscale/x86/ops_common.asm
@@ -245,6 +245,9 @@ endstruc
 %define tmp0d   r4d
 %define tmp1d   r5d

+%define tmp0w   r4w
+%define tmp1w   r5w
+
 ; Registers for plane pointers; put at the end (and in ascending plane order)
 ; so that we can avoid reserving them when not necessary
 %define out0q   r6q
--- a/libswscale/x86/ops_float.asm
+++ b/libswscale/x86/ops_float.asm
@@ -179,10 +179,8 @@ IF W,   mulps mw2, m8
        CONTINUE tmp0q
 %endmacro

-%macro load_dither_row 5 ; size_log2, y, addr, out, out2
-        lea tmp0q, %2
-        and tmp0q, (1 << %1) - 1
-        shl tmp0q, %1+2
+%macro load_dither_row 5 ; size_log2, comp_idx, addr, out, out2
+        mov tmp0w, [implq + SwsOpImpl.priv + (4 + %2) * 2] ; priv.u16[4 + i]
 %if %1 == 1
        vbroadcastsd   %4, [%3 + tmp0q]
 %elif %1 == 2
@@ -225,6 +223,11 @@ op dither%1
    %endif
        ; dither matrix is stored indirectly at the private data address
        mov tmp1q, [implq + SwsOpImpl.priv]
+        ; add y offset
+        mov tmp0d, yd
+        and tmp0d, (1 << %1) - 1
+        shl tmp0d, %1 + 2 ; * sizeof(float)
+        add tmp1q, tmp0q
    %if (4 << %1) > 2 * mmsize
        ; need to add in x offset
        mov tmp0d, bxd
@@ -232,10 +235,10 @@ op dither%1
        and tmp0d, (4 << %1) - 1
        add tmp1q, tmp0q
    %endif
-IF X,   load_dither_row %1, [yd + 0], tmp1q, DX, DX2
-IF Y,   load_dither_row %1, [yd + 3], tmp1q, DY, DY2
-IF Z,   load_dither_row %1, [yd + 2], tmp1q, DZ, DZ2
-IF W,   load_dither_row %1, [yd + 5], tmp1q, DW, DW2
+IF X,   load_dither_row %1, 0, tmp1q, DX, DX2
+IF Y,   load_dither_row %1, 1, tmp1q, DY, DY2
+IF Z,   load_dither_row %1, 2, tmp1q, DZ, DZ2
+IF W,   load_dither_row %1, 3, tmp1q, DW, DW2
 %endif
        LOAD_CONT tmp0q
 IF X,   addps mx, DX