avcodec/{arm,neon}/mpegvideo: Fix h263 unquantize functions

These functions currently assume that the number of coefficients to
process is always of the form 16k+m with m <= 4 or m > 8.
This assumption does not hold when the IDCT permutation is of type
FF_IDCT_PERM_LIBMPEG2 (i.e. when FF_IDCT_INT is in use).

Reviewed-by: Martin Storsjö <martin@martin.st>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2025-11-29 01:05:51 +01:00
parent 581050a175
commit 357fc5243c
2 changed files with 17 additions and 23 deletions

View File

@@ -36,7 +36,7 @@ function ff_dct_unquantize_h263_neon, export=1
vdup.16 q15, r0 @ qmul
vdup.16 q14, r2 @ qadd
vneg.s16 q13, q14
cmp r3, #4
cmp r3, #8
mov r0, r1
ble 2f
1:
@@ -62,14 +62,14 @@ function ff_dct_unquantize_h263_neon, export=1
cmp r3, #8
bgt 1b
2:
vld1.16 {d0}, [r0,:64]
vclt.s16 d3, d0, #0
vceq.s16 d1, d0, #0
vmul.s16 d2, d0, d30
vbsl d3, d26, d28
vadd.s16 d2, d2, d3
vbif d0, d2, d1
vst1.16 {d0}, [r1,:64]
vld1.16 {q0}, [r0,:128]
vclt.s16 q3, q0, #0
vceq.s16 q1, q0, #0
vmul.s16 q2, q0, q15
vbsl q3, q13, q14
vadd.s16 q2, q2, q3
vbif q0, q2, q1
vst1.16 {q0}, [r1,:128]
bx lr
endfunc

View File

@@ -39,12 +39,7 @@ static void inline ff_dct_unquantize_h263_neon(int qscale, int qadd, int nCoeffs
{
int16x8_t q0s16, q2s16, q3s16, q8s16, q10s16, q11s16, q13s16;
int16x8_t q14s16, q15s16, qzs16;
int16x4_t d0s16, d2s16, d3s16, dzs16;
uint16x8_t q1u16, q9u16;
uint16x4_t d1u16;
dzs16 = vdup_n_s16(0);
qzs16 = vdupq_n_s16(0);
q15s16 = vdupq_n_s16(qscale << 1);
q14s16 = vdupq_n_s16(qadd);
@@ -73,15 +68,14 @@ static void inline ff_dct_unquantize_h263_neon(int qscale, int qadd, int nCoeffs
if (nCoeffs <= 0)
return;
d0s16 = vld1_s16(block);
d3s16 = vreinterpret_s16_u16(vclt_s16(d0s16, dzs16));
d1u16 = vceq_s16(d0s16, dzs16);
d2s16 = vmul_s16(d0s16, vget_high_s16(q15s16));
d3s16 = vbsl_s16(vreinterpret_u16_s16(d3s16),
vget_high_s16(q13s16), vget_high_s16(q14s16));
d2s16 = vadd_s16(d2s16, d3s16);
d0s16 = vbsl_s16(d1u16, d0s16, d2s16);
vst1_s16(block, d0s16);
q0s16 = vld1q_s16(block);
q3s16 = vreinterpretq_s16_u16(vcltq_s16(q0s16, qzs16));
q1u16 = vceqq_s16(q0s16, qzs16);
q2s16 = vmulq_s16(q0s16, q15s16);
q3s16 = vbslq_s16(vreinterpretq_u16_s16(q3s16), q13s16, q14s16);
q2s16 = vaddq_s16(q2s16, q3s16);
q0s16 = vbslq_s16(q1u16, q0s16, q2s16);
vst1q_s16(block, q0s16);
}
static void dct_unquantize_h263_inter_neon(const MPVContext *s, int16_t *block,