avcodec/ppc/vc1dsp_altivec: Don't read too much data

vc1_inv_trans_8x4_altivec() is supposed to process a block
of 8x4 words, yet it read and processed eight lines. This led
to ASAN failures (see [1]) that this commit intends to fix.
It should also improve performance, but I don't have
real hardware to benchmark it on.

[1]: https://fate.ffmpeg.org/report.cgi?time=20251207214004&slot=ppc64-linux-gcc-14.3-asan

Reviewed-by: Sean McGovern <gseanmcg@gmail.com>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2025-12-08 06:14:24 +01:00
parent 6258e61c4d
commit a72e01b4ec

View File

@@ -235,7 +235,7 @@ static void vc1_inv_trans_8x4_altivec(uint8_t *dest, ptrdiff_t stride,
{
vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
vector signed int s8, s9, sA, sB;
vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
const vector unsigned int vec_7 = vec_splat_u32(7);
@@ -253,40 +253,42 @@ static void vc1_inv_trans_8x4_altivec(uint8_t *dest, ptrdiff_t stride,
src1 = vec_ld( 16, block);
src2 = vec_ld( 32, block);
src3 = vec_ld( 48, block);
src4 = vec_ld( 64, block);
src5 = vec_ld( 80, block);
src6 = vec_ld( 96, block);
src7 = vec_ld(112, block);
TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
s0 = vec_unpackl(src0);
s1 = vec_unpackl(src1);
s2 = vec_unpackl(src2);
s3 = vec_unpackl(src3);
s4 = vec_unpackl(src4);
s5 = vec_unpackl(src5);
s6 = vec_unpackl(src6);
s7 = vec_unpackl(src7);
s8 = vec_unpackh(src0);
s9 = vec_unpackh(src1);
sA = vec_unpackh(src2);
sB = vec_unpackh(src3);
sC = vec_unpackh(src4);
sD = vec_unpackh(src5);
sE = vec_unpackh(src6);
sF = vec_unpackh(src7);
// Transpose src0..src3 (four rows of eight 16-bit elements) into A2..D2;
// each result vector holds two consecutive 4-element columns (not in place)
vec_s16 A1, B1, C1, D1;
vec_s16 A2, B2, C2, D2;
A1 = vec_mergeh(src0, src2);
B1 = vec_mergel(src0, src2);
C1 = vec_mergeh(src1, src3);
D1 = vec_mergel(src1, src3);
A2 = vec_mergeh(A1, C1);
B2 = vec_mergel(A1, C1);
C2 = vec_mergeh(B1, D1);
D2 = vec_mergel(B1, D1);
s0 = vec_unpackh(A2);
s1 = vec_unpackl(A2);
s2 = vec_unpackh(B2);
s3 = vec_unpackl(B2);
s4 = vec_unpackh(C2);
s5 = vec_unpackl(C2);
s6 = vec_unpackh(D2);
s7 = vec_unpackl(D2);
STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
src0 = vec_pack(s8, s0);
src1 = vec_pack(s9, s1);
src2 = vec_pack(sA, s2);
src3 = vec_pack(sB, s3);
src4 = vec_pack(sC, s4);
src5 = vec_pack(sD, s5);
src6 = vec_pack(sE, s6);
src7 = vec_pack(sF, s7);
src0 = vec_pack(s0, s0);
src1 = vec_pack(s1, s1);
src2 = vec_pack(s2, s2);
src3 = vec_pack(s3, s3);
src4 = vec_pack(s4, s4);
src5 = vec_pack(s5, s5);
src6 = vec_pack(s6, s6);
src7 = vec_pack(s7, s7);
TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
s0 = vec_unpackh(src0);