mirror of
https://github.com/libjpeg-turbo/libjpeg-turbo.git
synced 2026-01-12 00:04:17 +08:00
MMI: Format comments consistently with MMX code
This commit is contained in:
@@ -319,8 +319,8 @@ void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
|
||||
|
||||
#endif
|
||||
|
||||
/* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
|
||||
* ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
|
||||
/* re = (R0 R2 R4 R6), ge = (G0 G2 G4 G6), be = (B0 B2 B4 B6)
|
||||
* ro = (R1 R3 R5 R7), go = (G1 G3 G5 G7), bo = (B1 B3 B5 B7)
|
||||
*
|
||||
* (Original)
|
||||
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
|
||||
|
||||
@@ -310,8 +310,8 @@ void jsimd_rgb_gray_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
|
||||
|
||||
#endif
|
||||
|
||||
/* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
|
||||
* ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
|
||||
/* re = (R0 R2 R4 R6), ge = (G0 G2 G4 G6), be = (B0 B2 B4 B6)
|
||||
* ro = (R1 R3 R5 R7), go = (G1 G3 G5 G7), bo = (B1 B3 B5 B7)
|
||||
*
|
||||
* (Original)
|
||||
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
|
||||
|
||||
@@ -50,9 +50,9 @@ void jsimd_h2v2_downsample_mmi(JDIMENSION image_width, int max_v_samp_factor,
|
||||
output_cols * 2);
|
||||
|
||||
bias = _mm_set1_pi32((1 << 17) + 1); /* 0x00020001 (32-bit bias pattern) */
|
||||
/* bias={1, 2, 1, 2} (16-bit) */
|
||||
/* bias = { 1, 2, 1, 2 } (16-bit) */
|
||||
mask = _mm_cmpeq_pi16(mask, mask);
|
||||
mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */
|
||||
mask = _mm_srli_pi16(mask, BYTE_BIT); /* { 0xFF 0x00 0xFF 0x00 .. } */
|
||||
|
||||
for (inrow = 0, outrow = 0; outrow < v_samp_factor;
|
||||
inrow += 2, outrow++) {
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
* Loongson MMI optimizations for libjpeg-turbo
|
||||
*
|
||||
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
* Copyright (C) 2015, 2019, D. R. Commander. All Rights Reserved.
|
||||
* Copyright (C) 2015, 2019, 2025, D. R. Commander. All Rights Reserved.
|
||||
* Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
|
||||
* All Rights Reserved.
|
||||
* Authors: ZhuChen <zhuchen@loongson.cn>
|
||||
@@ -116,8 +116,8 @@ void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
|
||||
mask = decenter = 0.0;
|
||||
mask = _mm_cmpeq_pi16(mask, mask);
|
||||
decenter = _mm_cmpeq_pi16(decenter, decenter);
|
||||
mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */
|
||||
decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */
|
||||
mask = _mm_srli_pi16(mask, BYTE_BIT); /* { 0xFF 0x00 0xFF 0x00 .. } */
|
||||
decenter = _mm_slli_pi16(decenter, 7); /* { 0xFF80 0xFF80 0xFF80 0xFF80 } */
|
||||
|
||||
cbe = _mm_and_si64(mask, cb); /* Cb(0246) */
|
||||
cbo = _mm_srli_pi16(cb, BYTE_BIT); /* Cb(1357) */
|
||||
@@ -139,15 +139,15 @@ void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
|
||||
* B = Y - 0.22800 * Cb + Cb + Cb
|
||||
*/
|
||||
|
||||
cbe2 = _mm_add_pi16(cbe, cbe); /* 2*CbE */
|
||||
cbo2 = _mm_add_pi16(cbo, cbo); /* 2*CbO */
|
||||
cre2 = _mm_add_pi16(cre, cre); /* 2*CrE */
|
||||
cro2 = _mm_add_pi16(cro, cro); /* 2*CrO */
|
||||
cbe2 = _mm_add_pi16(cbe, cbe); /* 2 * CbE */
|
||||
cbo2 = _mm_add_pi16(cbo, cbo); /* 2 * CbO */
|
||||
cre2 = _mm_add_pi16(cre, cre); /* 2 * CrE */
|
||||
cro2 = _mm_add_pi16(cro, cro); /* 2 * CrO */
|
||||
|
||||
be = _mm_mulhi_pi16(cbe2, PW_MF0228); /* (2*CbE * -FIX(0.22800) */
|
||||
bo = _mm_mulhi_pi16(cbo2, PW_MF0228); /* (2*CbO * -FIX(0.22800) */
|
||||
re = _mm_mulhi_pi16(cre2, PW_F0402); /* (2*CrE * FIX(0.40200)) */
|
||||
ro = _mm_mulhi_pi16(cro2, PW_F0402); /* (2*CrO * FIX(0.40200)) */
|
||||
be = _mm_mulhi_pi16(cbe2, PW_MF0228); /* (2 * CbE * -FIX(0.22800)) */
|
||||
bo = _mm_mulhi_pi16(cbo2, PW_MF0228); /* (2 * CbO * -FIX(0.22800)) */
|
||||
re = _mm_mulhi_pi16(cre2, PW_F0402); /* (2 * CrE * FIX(0.40200)) */
|
||||
ro = _mm_mulhi_pi16(cro2, PW_F0402); /* (2 * CrO * FIX(0.40200)) */
|
||||
|
||||
be = _mm_add_pi16(be, PW_ONE);
|
||||
bo = _mm_add_pi16(bo, PW_ONE);
|
||||
@@ -160,10 +160,10 @@ void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
|
||||
|
||||
be = _mm_add_pi16(be, cbe);
|
||||
bo = _mm_add_pi16(bo, cbo);
|
||||
be = _mm_add_pi16(be, cbe); /* (CbE * FIX(1.77200))=(B-Y)E */
|
||||
bo = _mm_add_pi16(bo, cbo); /* (CbO * FIX(1.77200))=(B-Y)O */
|
||||
re = _mm_add_pi16(re, cre); /* (CrE * FIX(1.40200))=(R-Y)E */
|
||||
ro = _mm_add_pi16(ro, cro); /* (CrO * FIX(1.40200))=(R-Y)O */
|
||||
be = _mm_add_pi16(be, cbe); /* (CbE * FIX(1.77200)) = (B - Y)E */
|
||||
bo = _mm_add_pi16(bo, cbo); /* (CbO * FIX(1.77200)) = (B - Y)O */
|
||||
re = _mm_add_pi16(re, cre); /* (CrE * FIX(1.40200)) = (R - Y)E */
|
||||
ro = _mm_add_pi16(ro, cro); /* (CrO * FIX(1.40200)) = (R - Y)O */
|
||||
|
||||
gle = _mm_unpacklo_pi16(cbe, cre);
|
||||
ghe = _mm_unpackhi_pi16(cbe, cre);
|
||||
@@ -183,54 +183,64 @@ void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
|
||||
glo = _mm_srai_pi32(glo, SCALEBITS);
|
||||
gho = _mm_srai_pi32(gho, SCALEBITS);
|
||||
|
||||
ge = _mm_packs_pi32(gle, ghe); /* CbE*-FIX(0.344)+CrE*FIX(0.285) */
|
||||
go = _mm_packs_pi32(glo, gho); /* CbO*-FIX(0.344)+CrO*FIX(0.285) */
|
||||
ge = _mm_sub_pi16(ge, cre); /* CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */
|
||||
go = _mm_sub_pi16(go, cro); /* CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */
|
||||
ge = _mm_packs_pi32(gle, ghe); /* CbE * -FIX(0.344) + CrE * FIX(0.285) */
|
||||
go = _mm_packs_pi32(glo, gho); /* CbO * -FIX(0.344) + CrO * FIX(0.285) */
|
||||
ge = _mm_sub_pi16(ge, cre); /* CbE * -FIX(0.344) + CrE * -FIX(0.714) = (G - Y)E */
|
||||
go = _mm_sub_pi16(go, cro); /* CbO * -FIX(0.344) + CrO * -FIX(0.714) = (G - Y)O */
|
||||
|
||||
ye = _mm_and_si64(mask, y); /* Y(0246) */
|
||||
yo = _mm_srli_pi16(y, BYTE_BIT); /* Y(1357) */
|
||||
|
||||
re = _mm_add_pi16(re, ye); /* ((R-Y)E+YE)=(R0 R2 R4 R6) */
|
||||
ro = _mm_add_pi16(ro, yo); /* ((R-Y)O+YO)=(R1 R3 R5 R7) */
|
||||
re = _mm_add_pi16(re, ye); /* ((R - Y)E + YE) = (R0 R2 R4 R6) */
|
||||
ro = _mm_add_pi16(ro, yo); /* ((R - Y)O + YO) = (R1 R3 R5 R7) */
|
||||
re = _mm_packs_pu16(re, re); /* (R0 R2 R4 R6 ** ** ** **) */
|
||||
ro = _mm_packs_pu16(ro, ro); /* (R1 R3 R5 R7 ** ** ** **) */
|
||||
|
||||
ge = _mm_add_pi16(ge, ye); /* ((G-Y)E+YE)=(G0 G2 G4 G6) */
|
||||
go = _mm_add_pi16(go, yo); /* ((G-Y)O+YO)=(G1 G3 G5 G7) */
|
||||
ge = _mm_add_pi16(ge, ye); /* ((G - Y)E + YE) = (G0 G2 G4 G6) */
|
||||
go = _mm_add_pi16(go, yo); /* ((G - Y)O + YO) = (G1 G3 G5 G7) */
|
||||
ge = _mm_packs_pu16(ge, ge); /* (G0 G2 G4 G6 ** ** ** **) */
|
||||
go = _mm_packs_pu16(go, go); /* (G1 G3 G5 G7 ** ** ** **) */
|
||||
|
||||
be = _mm_add_pi16(be, ye); /* (YE+(B-Y)E)=(B0 B2 B4 B6) */
|
||||
bo = _mm_add_pi16(bo, yo); /* (YO+(B-Y)O)=(B1 B3 B5 B7) */
|
||||
be = _mm_add_pi16(be, ye); /* (YE + (B - Y)E) = (B0 B2 B4 B6) */
|
||||
bo = _mm_add_pi16(bo, yo); /* (YO + (B - Y)O) = (B1 B3 B5 B7) */
|
||||
be = _mm_packs_pu16(be, be); /* (B0 B2 B4 B6 ** ** ** **) */
|
||||
bo = _mm_packs_pu16(bo, bo); /* (B1 B3 B5 B7 ** ** ** **) */
|
||||
|
||||
#if RGB_PIXELSIZE == 3
|
||||
|
||||
/* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
|
||||
/* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
|
||||
mmA = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
|
||||
mmE = _mm_unpacklo_pi8(mmE, mmB); /* (20 01 22 03 24 05 26 07) */
|
||||
mmD = _mm_unpacklo_pi8(mmD, mmF); /* (11 21 13 23 15 25 17 27) */
|
||||
/* NOTE: The values of RGB_RED, RGB_GREEN, and RGB_BLUE determine the
|
||||
* mapping of components A, B, and C to red, green, and blue.
|
||||
*
|
||||
* mmA = (A0 A2 A4 A6 ** ** ** **) = AE
|
||||
* mmB = (A1 A3 A5 A7 ** ** ** **) = AO
|
||||
* mmC = (B0 B2 B4 B6 ** ** ** **) = BE
|
||||
* mmD = (B1 B3 B5 B7 ** ** ** **) = BO
|
||||
* mmE = (C0 C2 C4 C6 ** ** ** **) = CE
|
||||
* mmF = (C1 C3 C5 C7 ** ** ** **) = CO
|
||||
* mmG = (** ** ** ** ** ** ** **)
|
||||
* mmH = (** ** ** ** ** ** ** **)
|
||||
*/
|
||||
mmA = _mm_unpacklo_pi8(mmA, mmC); /* (A0 B0 A2 B2 A4 B4 A6 B6) */
|
||||
mmE = _mm_unpacklo_pi8(mmE, mmB); /* (C0 A1 C2 A3 C4 A5 C6 A7) */
|
||||
mmD = _mm_unpacklo_pi8(mmD, mmF); /* (B1 C1 B3 C3 B5 C5 B7 C7) */
|
||||
|
||||
mmH = _mm_srli_si64(mmA, 2 * BYTE_BIT);
|
||||
|
||||
mmG = _mm_unpackhi_pi16(mmA, mmE); /* (04 14 24 05 06 16 26 07) */
|
||||
mmA = _mm_unpacklo_pi16(mmA, mmE); /* (00 10 20 01 02 12 22 03) */
|
||||
mmG = _mm_unpackhi_pi16(mmA, mmE); /* (A4 B4 C4 A5 A6 B6 C6 A7) */
|
||||
mmA = _mm_unpacklo_pi16(mmA, mmE); /* (A0 B0 C0 A1 A2 B2 C2 A3) */
|
||||
|
||||
mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT);
|
||||
mmB = _mm_srli_si64(mmD, 2 * BYTE_BIT); /* (13 23 15 25 17 27 -- --) */
|
||||
mmB = _mm_srli_si64(mmD, 2 * BYTE_BIT); /* (B3 C3 B5 C5 B7 C7 -- --) */
|
||||
|
||||
mmC = _mm_unpackhi_pi16(mmD, mmH); /* (15 25 06 16 17 27 -- --) */
|
||||
mmD = _mm_unpacklo_pi16(mmD, mmH); /* (11 21 02 12 13 23 04 14) */
|
||||
mmC = _mm_unpackhi_pi16(mmD, mmH); /* (B5 C5 A6 B6 B7 C7 -- --) */
|
||||
mmD = _mm_unpacklo_pi16(mmD, mmH); /* (B1 C1 A2 B2 B3 C3 A4 B4) */
|
||||
|
||||
mmF = _mm_unpackhi_pi16(mmE, mmB); /* (26 07 17 27 -- -- -- --) */
|
||||
mmE = _mm_unpacklo_pi16(mmE, mmB); /* (22 03 13 23 24 05 15 25) */
|
||||
mmF = _mm_unpackhi_pi16(mmE, mmB); /* (C6 A7 B7 C7 -- -- -- --) */
|
||||
mmE = _mm_unpacklo_pi16(mmE, mmB); /* (C2 A3 B3 C3 C4 A5 B5 C5) */
|
||||
|
||||
mmA = _mm_unpacklo_pi32(mmA, mmD); /* (00 10 20 01 11 21 02 12) */
|
||||
mmE = _mm_unpacklo_pi32(mmE, mmG); /* (22 03 13 23 04 14 24 05) */
|
||||
mmC = _mm_unpacklo_pi32(mmC, mmF); /* (15 25 06 16 26 07 17 27) */
|
||||
mmA = _mm_unpacklo_pi32(mmA, mmD); /* (A0 B0 C0 A1 B1 C1 A2 B2) */
|
||||
mmE = _mm_unpacklo_pi32(mmE, mmG); /* (C2 A3 B3 C3 A4 B4 C4 A5) */
|
||||
mmC = _mm_unpacklo_pi32(mmC, mmF); /* (B5 C5 A6 B6 C6 A7 B7 C7) */
|
||||
|
||||
if (num_cols >= 8) {
|
||||
if (!(((long)outptr) & 7)) {
|
||||
@@ -320,25 +330,33 @@ void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
|
||||
xe = _mm_xor_si64(xe, xe);
|
||||
xo = _mm_xor_si64(xo, xo);
|
||||
#endif
|
||||
/* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
|
||||
/* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
|
||||
/* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */
|
||||
/* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */
|
||||
|
||||
mmA = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
|
||||
mmE = _mm_unpacklo_pi8(mmE, mmG); /* (20 30 22 32 24 34 26 36) */
|
||||
mmB = _mm_unpacklo_pi8(mmB, mmD); /* (01 11 03 13 05 15 07 17) */
|
||||
mmF = _mm_unpacklo_pi8(mmF, mmH); /* (21 31 23 33 25 35 27 37) */
|
||||
/* NOTE: The values of RGB_RED, RGB_GREEN, and RGB_BLUE determine the
|
||||
* mapping of components A, B, C, and D to red, green, and blue.
|
||||
*
|
||||
* mmA = (A0 A2 A4 A6 ** ** ** **) = AE
|
||||
* mmB = (A1 A3 A5 A7 ** ** ** **) = AO
|
||||
* mmC = (B0 B2 B4 B6 ** ** ** **) = BE
|
||||
* mmD = (B1 B3 B5 B7 ** ** ** **) = BO
|
||||
* mmE = (C0 C2 C4 C6 ** ** ** **) = CE
|
||||
* mmF = (C1 C3 C5 C7 ** ** ** **) = CO
|
||||
* mmG = (D0 D2 D4 D6 ** ** ** **) = DE
|
||||
* mmH = (D1 D3 D5 D7 ** ** ** **) = DO
|
||||
*/
|
||||
mmA = _mm_unpacklo_pi8(mmA, mmC); /* (A0 B0 A2 B2 A4 B4 A6 B6) */
|
||||
mmE = _mm_unpacklo_pi8(mmE, mmG); /* (C0 D0 C2 D2 C4 D4 C6 D6) */
|
||||
mmB = _mm_unpacklo_pi8(mmB, mmD); /* (A1 B1 A3 B3 A5 B5 A7 B7) */
|
||||
mmF = _mm_unpacklo_pi8(mmF, mmH); /* (C1 D1 C3 D3 C5 D5 C7 D7) */
|
||||
|
||||
mmC = _mm_unpackhi_pi16(mmA, mmE); /* (04 14 24 34 06 16 26 36) */
|
||||
mmA = _mm_unpacklo_pi16(mmA, mmE); /* (00 10 20 30 02 12 22 32) */
|
||||
mmG = _mm_unpackhi_pi16(mmB, mmF); /* (05 15 25 35 07 17 27 37) */
|
||||
mmB = _mm_unpacklo_pi16(mmB, mmF); /* (01 11 21 31 03 13 23 33) */
|
||||
mmC = _mm_unpackhi_pi16(mmA, mmE); /* (A4 B4 C4 D4 A6 B6 C6 D6) */
|
||||
mmA = _mm_unpacklo_pi16(mmA, mmE); /* (A0 B0 C0 D0 A2 B2 C2 D2) */
|
||||
mmG = _mm_unpackhi_pi16(mmB, mmF); /* (A5 B5 C5 D5 A7 B7 C7 D7) */
|
||||
mmB = _mm_unpacklo_pi16(mmB, mmF); /* (A1 B1 C1 D1 A3 B3 C3 D3) */
|
||||
|
||||
mmD = _mm_unpackhi_pi32(mmA, mmB); /* (02 12 22 32 03 13 23 33) */
|
||||
mmA = _mm_unpacklo_pi32(mmA, mmB); /* (00 10 20 30 01 11 21 31) */
|
||||
mmH = _mm_unpackhi_pi32(mmC, mmG); /* (06 16 26 36 07 17 27 37) */
|
||||
mmC = _mm_unpacklo_pi32(mmC, mmG); /* (04 14 24 34 05 15 25 35) */
|
||||
mmD = _mm_unpackhi_pi32(mmA, mmB); /* (A2 B2 C2 D2 A3 B3 C3 D3) */
|
||||
mmA = _mm_unpacklo_pi32(mmA, mmB); /* (A0 B0 C0 D0 A1 B1 C1 D1) */
|
||||
mmH = _mm_unpackhi_pi32(mmC, mmG); /* (A6 B6 C6 D6 A7 B7 C7 D7) */
|
||||
mmC = _mm_unpacklo_pi32(mmC, mmG); /* (A4 B4 C4 D4 A5 B5 C5 D5) */
|
||||
|
||||
if (num_cols >= 8) {
|
||||
if (!(((long)outptr) & 7)) {
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
* Loongson MMI optimizations for libjpeg-turbo
|
||||
*
|
||||
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
* Copyright (C) 2015, 2019, D. R. Commander. All Rights Reserved.
|
||||
* Copyright (C) 2015, 2019, 2025, D. R. Commander. All Rights Reserved.
|
||||
* Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
|
||||
* All Rights Reserved.
|
||||
* Authors: ZhangLixia <zhanglixia-hf@loongson.cn>
|
||||
@@ -120,8 +120,8 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
|
||||
mask = decenter = 0.0;
|
||||
mask = _mm_cmpeq_pi16(mask, mask);
|
||||
decenter = _mm_cmpeq_pi16(decenter, decenter);
|
||||
mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */
|
||||
decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */
|
||||
mask = _mm_srli_pi16(mask, BYTE_BIT); /* { 0xFF 0x00 0xFF 0x00 .. } */
|
||||
decenter = _mm_slli_pi16(decenter, 7); /* { 0xFF80 0xFF80 0xFF80 0xFF80 } */
|
||||
|
||||
cbl = _mm_unpacklo_pi8(cb, zero); /* Cb(0123) */
|
||||
cbh = _mm_unpackhi_pi8(cb, zero); /* Cb(4567) */
|
||||
@@ -143,15 +143,15 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
|
||||
* B = Y - 0.22800 * Cb + Cb + Cb
|
||||
*/
|
||||
|
||||
cbl2 = _mm_add_pi16(cbl, cbl); /* 2*CbL */
|
||||
cbh2 = _mm_add_pi16(cbh, cbh); /* 2*CbH */
|
||||
crl2 = _mm_add_pi16(crl, crl); /* 2*CrL */
|
||||
crh2 = _mm_add_pi16(crh, crh); /* 2*CrH */
|
||||
cbl2 = _mm_add_pi16(cbl, cbl); /* 2 * CbL */
|
||||
cbh2 = _mm_add_pi16(cbh, cbh); /* 2 * CbH */
|
||||
crl2 = _mm_add_pi16(crl, crl); /* 2 * CrL */
|
||||
crh2 = _mm_add_pi16(crh, crh); /* 2 * CrH */
|
||||
|
||||
bl = _mm_mulhi_pi16(cbl2, PW_MF0228); /* (2*CbL * -FIX(0.22800) */
|
||||
bh = _mm_mulhi_pi16(cbh2, PW_MF0228); /* (2*CbH * -FIX(0.22800) */
|
||||
rl = _mm_mulhi_pi16(crl2, PW_F0402); /* (2*CrL * FIX(0.40200)) */
|
||||
rh = _mm_mulhi_pi16(crh2, PW_F0402); /* (2*CrH * FIX(0.40200)) */
|
||||
bl = _mm_mulhi_pi16(cbl2, PW_MF0228); /* (2 * CbL * -FIX(0.22800)) */
|
||||
bh = _mm_mulhi_pi16(cbh2, PW_MF0228); /* (2 * CbH * -FIX(0.22800)) */
|
||||
rl = _mm_mulhi_pi16(crl2, PW_F0402); /* (2 * CrL * FIX(0.40200)) */
|
||||
rh = _mm_mulhi_pi16(crh2, PW_F0402); /* (2 * CrH * FIX(0.40200)) */
|
||||
|
||||
bl = _mm_add_pi16(bl, PW_ONE);
|
||||
bh = _mm_add_pi16(bh, PW_ONE);
|
||||
@@ -164,10 +164,10 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
|
||||
|
||||
bl = _mm_add_pi16(bl, cbl);
|
||||
bh = _mm_add_pi16(bh, cbh);
|
||||
bl = _mm_add_pi16(bl, cbl); /* (CbL * FIX(1.77200))=(B-Y)L */
|
||||
bh = _mm_add_pi16(bh, cbh); /* (CbH * FIX(1.77200))=(B-Y)H */
|
||||
rl = _mm_add_pi16(rl, crl); /* (CrL * FIX(1.40200))=(R-Y)L */
|
||||
rh = _mm_add_pi16(rh, crh); /* (CrH * FIX(1.40200))=(R-Y)H */
|
||||
bl = _mm_add_pi16(bl, cbl); /* (CbL * FIX(1.77200)) = (B - Y)L */
|
||||
bh = _mm_add_pi16(bh, cbh); /* (CbH * FIX(1.77200)) = (B - Y)H */
|
||||
rl = _mm_add_pi16(rl, crl); /* (CrL * FIX(1.40200)) = (R - Y)L */
|
||||
rh = _mm_add_pi16(rh, crh); /* (CrH * FIX(1.40200)) = (R - Y)H */
|
||||
|
||||
ga = _mm_unpacklo_pi16(cbl, crl);
|
||||
gb = _mm_unpackhi_pi16(cbl, crl);
|
||||
@@ -187,10 +187,10 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
|
||||
gc = _mm_srai_pi32(gc, SCALEBITS);
|
||||
gd = _mm_srai_pi32(gd, SCALEBITS);
|
||||
|
||||
gl = _mm_packs_pi32(ga, gb); /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
|
||||
gh = _mm_packs_pi32(gc, gd); /* CbH*-FIX(0.344)+CrH*FIX(0.285) */
|
||||
gl = _mm_sub_pi16(gl, crl); /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
|
||||
gh = _mm_sub_pi16(gh, crh); /* CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H */
|
||||
gl = _mm_packs_pi32(ga, gb); /* CbL * -FIX(0.344) + CrL * FIX(0.285) */
|
||||
gh = _mm_packs_pi32(gc, gd); /* CbH * -FIX(0.344) + CrH * FIX(0.285) */
|
||||
gl = _mm_sub_pi16(gl, crl); /* CbL * -FIX(0.344) + CrL * -FIX(0.714) = (G - Y)L */
|
||||
gh = _mm_sub_pi16(gh, crh); /* CbH * -FIX(0.344) + CrH * -FIX(0.714) = (G - Y)H */
|
||||
|
||||
ythise = _mm_and_si64(mask, ythis); /* Y(0246) */
|
||||
ythiso = _mm_srli_pi16(ythis, BYTE_BIT); /* Y(1357) */
|
||||
@@ -220,38 +220,47 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
|
||||
|
||||
#if RGB_PIXELSIZE == 3
|
||||
|
||||
/* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
|
||||
/* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
|
||||
/* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
|
||||
mmG = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
|
||||
mmA = _mm_unpackhi_pi8(mmA, mmC); /* (08 18 0A 1A 0C 1C 0E 1E) */
|
||||
mmH = _mm_unpacklo_pi8(mmE, mmB); /* (20 01 22 03 24 05 26 07) */
|
||||
mmE = _mm_unpackhi_pi8(mmE, mmB); /* (28 09 2A 0B 2C 0D 2E 0F) */
|
||||
mmC = _mm_unpacklo_pi8(mmD, mmF); /* (11 21 13 23 15 25 17 27) */
|
||||
mmD = _mm_unpackhi_pi8(mmD, mmF); /* (19 29 1B 2B 1D 2D 1F 2F) */
|
||||
/* NOTE: The values of RGB_RED, RGB_GREEN, and RGB_BLUE determine the
|
||||
* mapping of components A, B, and C to red, green, and blue.
|
||||
*
|
||||
* mmA = (A0 A2 A4 A6 A8 Aa Ac Ae) = AE
|
||||
* mmB = (A1 A3 A5 A7 A9 Ab Ad Af) = AO
|
||||
* mmC = (B0 B2 B4 B6 B8 Ba Bc Be) = BE
|
||||
* mmD = (B1 B3 B5 B7 B9 Bb Bd Bf) = BO
|
||||
* mmE = (C0 C2 C4 C6 C8 Ca Cc Ce) = CE
|
||||
* mmF = (C1 C3 C5 C7 C9 Cb Cd Cf) = CO
|
||||
* mmG = (** ** ** ** ** ** ** **)
|
||||
* mmH = (** ** ** ** ** ** ** **)
|
||||
*/
|
||||
mmG = _mm_unpacklo_pi8(mmA, mmC); /* (A0 B0 A2 B2 A4 B4 A6 B6) */
|
||||
mmA = _mm_unpackhi_pi8(mmA, mmC); /* (A8 B8 Aa Ba Ac Bc Ae Be) */
|
||||
mmH = _mm_unpacklo_pi8(mmE, mmB); /* (C0 A1 C2 A3 C4 A5 C6 A7) */
|
||||
mmE = _mm_unpackhi_pi8(mmE, mmB); /* (C8 A9 Ca Ab Cc Ad Ce Af) */
|
||||
mmC = _mm_unpacklo_pi8(mmD, mmF); /* (B1 C1 B3 C3 B5 C5 B7 C7) */
|
||||
mmD = _mm_unpackhi_pi8(mmD, mmF); /* (B9 C9 Bb Cb Bd Cd Bf Cf) */
|
||||
|
||||
mmB = _mm_unpacklo_pi16(mmG, mmA); /* (00 10 08 18 02 12 0A 1A) */
|
||||
mmA = _mm_unpackhi_pi16(mmG, mmA); /* (04 14 0C 1C 06 16 0E 1E) */
|
||||
mmF = _mm_unpacklo_pi16(mmH, mmE); /* (20 01 28 09 22 03 2A 0B) */
|
||||
mmE = _mm_unpackhi_pi16(mmH, mmE); /* (24 05 2C 0D 26 07 2E 0F) */
|
||||
mmH = _mm_unpacklo_pi16(mmC, mmD); /* (11 21 19 29 13 23 1B 2B) */
|
||||
mmG = _mm_unpackhi_pi16(mmC, mmD); /* (15 25 1D 2D 17 27 1F 2F) */
|
||||
mmB = _mm_unpacklo_pi16(mmG, mmA); /* (A0 B0 A8 B8 A2 B2 Aa Ba) */
|
||||
mmA = _mm_unpackhi_pi16(mmG, mmA); /* (A4 B4 Ac Bc A6 B6 Ae Be) */
|
||||
mmF = _mm_unpacklo_pi16(mmH, mmE); /* (C0 A1 C8 A9 C2 A3 Ca Ab) */
|
||||
mmE = _mm_unpackhi_pi16(mmH, mmE); /* (C4 A5 Cc Ad C6 A7 Ce Af) */
|
||||
mmH = _mm_unpacklo_pi16(mmC, mmD); /* (B1 C1 B9 C9 B3 C3 Bb Cb) */
|
||||
mmG = _mm_unpackhi_pi16(mmC, mmD); /* (B5 C5 Bd Cd B7 C7 Bf Cf) */
|
||||
|
||||
mmC = _mm_unpacklo_pi16(mmB, mmF); /* (00 10 20 01 08 18 28 09) */
|
||||
mmC = _mm_unpacklo_pi16(mmB, mmF); /* (A0 B0 C0 A1 A8 B8 C8 A9) */
|
||||
mmB = _mm_srli_si64(mmB, 4 * BYTE_BIT);
|
||||
mmB = _mm_unpacklo_pi16(mmH, mmB); /* (11 21 02 12 19 29 0A 1A) */
|
||||
mmD = _mm_unpackhi_pi16(mmF, mmH); /* (22 03 13 23 2A 0B 1B 2B) */
|
||||
mmF = _mm_unpacklo_pi16(mmA, mmE); /* (04 14 24 05 0C 1C 2C 0D) */
|
||||
mmB = _mm_unpacklo_pi16(mmH, mmB); /* (B1 C1 A2 B2 B9 C9 Aa Ba) */
|
||||
mmD = _mm_unpackhi_pi16(mmF, mmH); /* (C2 A3 B3 C3 Ca Ab Bb Cb) */
|
||||
mmF = _mm_unpacklo_pi16(mmA, mmE); /* (A4 B4 C4 A5 Ac Bc Cc Ad) */
|
||||
mmA = _mm_srli_si64(mmA, 4 * BYTE_BIT);
|
||||
mmH = _mm_unpacklo_pi16(mmG, mmA); /* (15 25 06 16 1D 2D 0E 1E) */
|
||||
mmG = _mm_unpackhi_pi16(mmE, mmG); /* (26 07 17 27 2E 0F 1F 2F) */
|
||||
mmH = _mm_unpacklo_pi16(mmG, mmA); /* (B5 C5 A6 B6 Bd Cd Ae Be) */
|
||||
mmG = _mm_unpackhi_pi16(mmE, mmG); /* (C6 A7 B7 C7 Ce Af Bf Cf) */
|
||||
|
||||
mmA = _mm_unpacklo_pi32(mmC, mmB); /* (00 10 20 01 11 21 02 12) */
|
||||
mmE = _mm_unpackhi_pi32(mmC, mmB); /* (08 18 28 09 19 29 0A 1A) */
|
||||
mmB = _mm_unpacklo_pi32(mmD, mmF); /* (22 03 13 23 04 14 24 05) */
|
||||
mmF = _mm_unpackhi_pi32(mmD, mmF); /* (2A 0B 1B 2B 0C 1C 2C 0D) */
|
||||
mmC = _mm_unpacklo_pi32(mmH, mmG); /* (15 25 06 16 26 07 17 27) */
|
||||
mmG = _mm_unpackhi_pi32(mmH, mmG); /* (1D 2D 0E 1E 2E 0F 1F 2F) */
|
||||
mmA = _mm_unpacklo_pi32(mmC, mmB); /* (A0 B0 C0 A1 B1 C1 A2 B2) */
|
||||
mmE = _mm_unpackhi_pi32(mmC, mmB); /* (A8 B8 C8 A9 B9 C9 Aa Ba) */
|
||||
mmB = _mm_unpacklo_pi32(mmD, mmF); /* (C2 A3 B3 C3 A4 B4 C4 A5) */
|
||||
mmF = _mm_unpackhi_pi32(mmD, mmF); /* (Ca Ab Bb Cb Ac Bc Cc Ad) */
|
||||
mmC = _mm_unpacklo_pi32(mmH, mmG); /* (B5 C5 A6 B6 C6 A7 B7 C7) */
|
||||
mmG = _mm_unpackhi_pi32(mmH, mmG); /* (Bd Cd Ae Be Ce Af Bf Cf) */
|
||||
|
||||
if (num_cols >= 8) {
|
||||
if (!(((long)outptr) & 7)) {
|
||||
@@ -367,40 +376,48 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
|
||||
xe = _mm_xor_si64(xe, xe);
|
||||
xo = _mm_xor_si64(xo, xo);
|
||||
#endif
|
||||
/* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
|
||||
/* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
|
||||
/* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
|
||||
/* mmG=(30 32 34 36 38 3A 3C 3E), mmH=(31 33 35 37 39 3B 3D 3F) */
|
||||
|
||||
mm8 = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
|
||||
mm9 = _mm_unpackhi_pi8(mmA, mmC); /* (08 18 0A 1A 0C 1C 0E 1E) */
|
||||
mmA = _mm_unpacklo_pi8(mmE, mmG); /* (20 30 22 32 24 34 26 36) */
|
||||
mmE = _mm_unpackhi_pi8(mmE, mmG); /* (28 38 2A 3A 2C 3C 2E 3E) */
|
||||
/* NOTE: The values of RGB_RED, RGB_GREEN, and RGB_BLUE determine the
|
||||
* mapping of components A, B, C, and D to red, green, and blue.
|
||||
*
|
||||
* mmA = (A0 A2 A4 A6 A8 Aa Ac Ae) = AE
|
||||
* mmB = (A1 A3 A5 A7 A9 Ab Ad Af) = AO
|
||||
* mmC = (B0 B2 B4 B6 B8 Ba Bc Be) = BE
|
||||
* mmD = (B1 B3 B5 B7 B9 Bb Bd Bf) = BO
|
||||
* mmE = (C0 C2 C4 C6 C8 Ca Cc Ce) = CE
|
||||
* mmF = (C1 C3 C5 C7 C9 Cb Cd Cf) = CO
|
||||
* mmG = (D0 D2 D4 D6 D8 Da Dc De) = DE
|
||||
* mmH = (D1 D3 D5 D7 D9 Db Dd Df) = DO
|
||||
*/
|
||||
mm8 = _mm_unpacklo_pi8(mmA, mmC); /* (A0 B0 A2 B2 A4 B4 A6 B6) */
|
||||
mm9 = _mm_unpackhi_pi8(mmA, mmC); /* (A8 B8 Aa Ba Ac Bc Ae Be) */
|
||||
mmA = _mm_unpacklo_pi8(mmE, mmG); /* (C0 D0 C2 D2 C4 D4 C6 D6) */
|
||||
mmE = _mm_unpackhi_pi8(mmE, mmG); /* (C8 D8 Ca Da Cc Dc Ce De) */
|
||||
|
||||
mmG = _mm_unpacklo_pi8(mmB, mmD); /* (01 11 03 13 05 15 07 17) */
|
||||
mmB = _mm_unpackhi_pi8(mmB, mmD); /* (09 19 0B 1B 0D 1D 0F 1F) */
|
||||
mmD = _mm_unpacklo_pi8(mmF, mmH); /* (21 31 23 33 25 35 27 37) */
|
||||
mmF = _mm_unpackhi_pi8(mmF, mmH); /* (29 39 2B 3B 2D 3D 2F 3F) */
|
||||
mmG = _mm_unpacklo_pi8(mmB, mmD); /* (A1 B1 A3 B3 A5 B5 A7 B7) */
|
||||
mmB = _mm_unpackhi_pi8(mmB, mmD); /* (A9 B9 Ab Bb Ad Bd Af Bf) */
|
||||
mmD = _mm_unpacklo_pi8(mmF, mmH); /* (C1 D1 C3 D3 C5 D5 C7 D7) */
|
||||
mmF = _mm_unpackhi_pi8(mmF, mmH); /* (C9 D9 Cb Db Cd Dd Cf Df) */
|
||||
|
||||
mmH = _mm_unpacklo_pi16(mm8, mmA); /* (00 10 20 30 02 12 22 32) */
|
||||
mm8 = _mm_unpackhi_pi16(mm8, mmA); /* (04 14 24 34 06 16 26 36) */
|
||||
mmA = _mm_unpacklo_pi16(mmG, mmD); /* (01 11 21 31 03 13 23 33) */
|
||||
mmD = _mm_unpackhi_pi16(mmG, mmD); /* (05 15 25 35 07 17 27 37) */
|
||||
mmH = _mm_unpacklo_pi16(mm8, mmA); /* (A0 B0 C0 D0 A2 B2 C2 D2) */
|
||||
mm8 = _mm_unpackhi_pi16(mm8, mmA); /* (A4 B4 C4 D4 A6 B6 C6 D6) */
|
||||
mmA = _mm_unpacklo_pi16(mmG, mmD); /* (A1 B1 C1 D1 A3 B3 C3 D3) */
|
||||
mmD = _mm_unpackhi_pi16(mmG, mmD); /* (A5 B5 C5 D5 A7 B7 C7 D7) */
|
||||
|
||||
mmG = _mm_unpackhi_pi16(mm9, mmE); /* (0C 1C 2C 3C 0E 1E 2E 3E) */
|
||||
mm9 = _mm_unpacklo_pi16(mm9, mmE); /* (08 18 28 38 0A 1A 2A 3A) */
|
||||
mmE = _mm_unpacklo_pi16(mmB, mmF); /* (09 19 29 39 0B 1B 2B 3B) */
|
||||
mmF = _mm_unpackhi_pi16(mmB, mmF); /* (0D 1D 2D 3D 0F 1F 2F 3F) */
|
||||
mmG = _mm_unpackhi_pi16(mm9, mmE); /* (Ac Bc Cc Dc Ae Be Ce De) */
|
||||
mm9 = _mm_unpacklo_pi16(mm9, mmE); /* (A8 B8 C8 D8 Aa Ba Ca Da) */
|
||||
mmE = _mm_unpacklo_pi16(mmB, mmF); /* (A9 B9 C9 D9 Ab Bb Cb Db) */
|
||||
mmF = _mm_unpackhi_pi16(mmB, mmF); /* (Ad Bd Cd Dd Af Bf Cf Df) */
|
||||
|
||||
mmB = _mm_unpackhi_pi32(mmH, mmA); /* (02 12 22 32 03 13 23 33) */
|
||||
mmA = _mm_unpacklo_pi32(mmH, mmA); /* (00 10 20 30 01 11 21 31) */
|
||||
mmC = _mm_unpacklo_pi32(mm8, mmD); /* (04 14 24 34 05 15 25 35) */
|
||||
mmD = _mm_unpackhi_pi32(mm8, mmD); /* (06 16 26 36 07 17 27 37) */
|
||||
mmB = _mm_unpackhi_pi32(mmH, mmA); /* (A2 B2 C2 D2 A3 B3 C3 D3) */
|
||||
mmA = _mm_unpacklo_pi32(mmH, mmA); /* (A0 B0 C0 D0 A1 B1 C1 D1) */
|
||||
mmC = _mm_unpacklo_pi32(mm8, mmD); /* (A4 B4 C4 D4 A5 B5 C5 D5) */
|
||||
mmD = _mm_unpackhi_pi32(mm8, mmD); /* (A6 B6 C6 D6 A7 B7 C7 D7) */
|
||||
|
||||
mmH = _mm_unpackhi_pi32(mmG, mmF); /* (0E 1E 2E 3E 0F 1F 2F 3F) */
|
||||
mmG = _mm_unpacklo_pi32(mmG, mmF); /* (0C 1C 2C 3C 0D 1D 2D 3D) */
|
||||
mmF = _mm_unpackhi_pi32(mm9, mmE); /* (0A 1A 2A 3A 0B 1B 2B 3B) */
|
||||
mmE = _mm_unpacklo_pi32(mm9, mmE); /* (08 18 28 38 09 19 29 39) */
|
||||
mmH = _mm_unpackhi_pi32(mmG, mmF); /* (Ae Be Ce De Af Bf Cf Df) */
|
||||
mmG = _mm_unpacklo_pi32(mmG, mmF); /* (Ac Bc Cc Dc Ad Bd Cd Dd) */
|
||||
mmF = _mm_unpackhi_pi32(mm9, mmE); /* (Aa Ba Ca Da Ab Bb Cb Db) */
|
||||
mmE = _mm_unpacklo_pi32(mm9, mmE); /* (A8 B8 C8 D8 A9 B9 C9 D9) */
|
||||
|
||||
if (num_cols >= 8) {
|
||||
if (!(((long)outptr) & 7)) {
|
||||
@@ -505,17 +522,17 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
|
||||
|
||||
decenter = 0.0;
|
||||
decenter = _mm_cmpeq_pi16(decenter, decenter);
|
||||
decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */
|
||||
decenter = _mm_slli_pi16(decenter, 7); /* { 0xFF80 0xFF80 0xFF80 0xFF80 } */
|
||||
|
||||
cbl = _mm_unpacklo_pi8(cb, zero); /* Cb(0123) */
|
||||
crl = _mm_unpacklo_pi8(cr, zero); /* Cr(0123) */
|
||||
cbl = _mm_add_pi16(cbl, decenter);
|
||||
crl = _mm_add_pi16(crl, decenter);
|
||||
|
||||
cbl2 = _mm_add_pi16(cbl, cbl); /* 2*CbL */
|
||||
crl2 = _mm_add_pi16(crl, crl); /* 2*CrL */
|
||||
bl = _mm_mulhi_pi16(cbl2, PW_MF0228); /* (2*CbL * -FIX(0.22800) */
|
||||
rl = _mm_mulhi_pi16(crl2, PW_F0402); /* (2*CrL * FIX(0.40200)) */
|
||||
cbl2 = _mm_add_pi16(cbl, cbl); /* 2 * CbL */
|
||||
crl2 = _mm_add_pi16(crl, crl); /* 2 * CrL */
|
||||
bl = _mm_mulhi_pi16(cbl2, PW_MF0228); /* (2 * CbL * -FIX(0.22800)) */
|
||||
rl = _mm_mulhi_pi16(crl2, PW_F0402); /* (2 * CrL * FIX(0.40200)) */
|
||||
|
||||
bl = _mm_add_pi16(bl, PW_ONE);
|
||||
bl = _mm_srai_pi16(bl, 1); /* (CbL * -FIX(0.22800)) */
|
||||
@@ -523,15 +540,15 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
|
||||
rl = _mm_srai_pi16(rl, 1); /* (CrL * FIX(0.40200)) */
|
||||
|
||||
bl = _mm_add_pi16(bl, cbl);
|
||||
bl = _mm_add_pi16(bl, cbl); /* (CbL * FIX(1.77200))=(B-Y)L */
|
||||
rl = _mm_add_pi16(rl, crl); /* (CrL * FIX(1.40200))=(R-Y)L */
|
||||
bl = _mm_add_pi16(bl, cbl); /* (CbL * FIX(1.77200)) = (B - Y)L */
|
||||
rl = _mm_add_pi16(rl, crl); /* (CrL * FIX(1.40200)) = (R - Y)L */
|
||||
|
||||
gl = _mm_unpacklo_pi16(cbl, crl);
|
||||
gl = _mm_madd_pi16(gl, PW_MF0344_F0285);
|
||||
gl = _mm_add_pi32(gl, PD_ONEHALF);
|
||||
gl = _mm_srai_pi32(gl, SCALEBITS);
|
||||
gl = _mm_packs_pi32(gl, zero); /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
|
||||
gl = _mm_sub_pi16(gl, crl); /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
|
||||
gl = _mm_packs_pi32(gl, zero); /* CbL * -FIX(0.344) + CrL * FIX(0.285) */
|
||||
gl = _mm_sub_pi16(gl, crl); /* CbL * -FIX(0.344) + CrL * -FIX(0.714) = (G - Y)L */
|
||||
|
||||
yl = _mm_unpacklo_pi8(y, zero); /* Y(0123) */
|
||||
rl = _mm_add_pi16(rl, yl); /* (R0 R1 R2 R3) */
|
||||
|
||||
@@ -133,35 +133,35 @@ static uint64_t const_value[] = {
|
||||
\
|
||||
/* Transpose coefficients */ \
|
||||
\
|
||||
row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a=(20 30 21 31) */ \
|
||||
row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b=(22 32 23 33) */ \
|
||||
row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c=(24 34 25 35) */ \
|
||||
row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d=(26 36 27 37) */ \
|
||||
row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a = (20 30 21 31) */ \
|
||||
row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b = (22 32 23 33) */ \
|
||||
row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c = (24 34 25 35) */ \
|
||||
row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d = (26 36 27 37) */ \
|
||||
\
|
||||
row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a=(00 10 01 11) */ \
|
||||
row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b=(02 12 03 13) */ \
|
||||
row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c=(04 14 05 15) */ \
|
||||
row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d=(06 16 07 17) */ \
|
||||
row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a = (00 10 01 11) */ \
|
||||
row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b = (02 12 03 13) */ \
|
||||
row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c = (04 14 05 15) */ \
|
||||
row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d = (06 16 07 17) */ \
|
||||
\
|
||||
col0 = _mm_unpacklo_pi32(row01a, row23a); /* col0=(00 10 20 30) */ \
|
||||
col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1=(01 11 21 31) */ \
|
||||
col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6=(06 16 26 36) */ \
|
||||
col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7=(07 17 27 37) */ \
|
||||
col0 = _mm_unpacklo_pi32(row01a, row23a); /* col0 = (00 10 20 30) */ \
|
||||
col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1 = (01 11 21 31) */ \
|
||||
col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6 = (06 16 26 36) */ \
|
||||
col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7 = (07 17 27 37) */ \
|
||||
\
|
||||
tmp6 = _mm_sub_pi16(col1, col6); /* tmp6=col1-col6 */ \
|
||||
tmp7 = _mm_sub_pi16(col0, col7); /* tmp7=col0-col7 */ \
|
||||
tmp1 = _mm_add_pi16(col1, col6); /* tmp1=col1+col6 */ \
|
||||
tmp0 = _mm_add_pi16(col0, col7); /* tmp0=col0+col7 */ \
|
||||
tmp6 = _mm_sub_pi16(col1, col6); /* tmp6 = col1 - col6 */ \
|
||||
tmp7 = _mm_sub_pi16(col0, col7); /* tmp7 = col0 - col7 */ \
|
||||
tmp1 = _mm_add_pi16(col1, col6); /* tmp1 = col1 + col6 */ \
|
||||
tmp0 = _mm_add_pi16(col0, col7); /* tmp0 = col0 + col7 */ \
|
||||
\
|
||||
col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2=(02 12 22 32) */ \
|
||||
col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3=(03 13 23 33) */ \
|
||||
col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4=(04 14 24 34) */ \
|
||||
col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5=(05 15 25 35) */ \
|
||||
col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2 = (02 12 22 32) */ \
|
||||
col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3 = (03 13 23 33) */ \
|
||||
col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4 = (04 14 24 34) */ \
|
||||
col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5 = (05 15 25 35) */ \
|
||||
\
|
||||
tmp3 = _mm_add_pi16(col3, col4); /* tmp3=col3+col4 */ \
|
||||
tmp2 = _mm_add_pi16(col2, col5); /* tmp2=col2+col5 */ \
|
||||
tmp4 = _mm_sub_pi16(col3, col4); /* tmp4=col3-col4 */ \
|
||||
tmp5 = _mm_sub_pi16(col2, col5); /* tmp5=col2-col5 */ \
|
||||
tmp3 = _mm_add_pi16(col3, col4); /* tmp3 = col3 + col4 */ \
|
||||
tmp2 = _mm_add_pi16(col2, col5); /* tmp2 = col2 + col5 */ \
|
||||
tmp4 = _mm_sub_pi16(col3, col4); /* tmp4 = col3 - col4 */ \
|
||||
tmp5 = _mm_sub_pi16(col2, col5); /* tmp5 = col2 - col5 */ \
|
||||
\
|
||||
DO_FDCT_COMMON() \
|
||||
\
|
||||
@@ -191,35 +191,35 @@ static uint64_t const_value[] = {
|
||||
\
|
||||
/* Transpose coefficients */ \
|
||||
\
|
||||
col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a=(02 03 12 13) */ \
|
||||
col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b=(22 23 32 33) */ \
|
||||
col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c=(42 43 52 53) */ \
|
||||
col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d=(62 63 72 73) */ \
|
||||
col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a = (02 03 12 13) */ \
|
||||
col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b = (22 23 32 33) */ \
|
||||
col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c = (42 43 52 53) */ \
|
||||
col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d = (62 63 72 73) */ \
|
||||
\
|
||||
col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a=(00 01 10 11) */ \
|
||||
col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b=(20 21 30 31) */ \
|
||||
col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c=(40 41 50 51) */ \
|
||||
col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d=(60 61 70 71) */ \
|
||||
col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a = (00 01 10 11) */ \
|
||||
col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b = (20 21 30 31) */ \
|
||||
col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c = (40 41 50 51) */ \
|
||||
col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d = (60 61 70 71) */ \
|
||||
\
|
||||
row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0=(00 01 02 03) */ \
|
||||
row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1=(10 11 12 13) */ \
|
||||
row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6=(60 61 62 63) */ \
|
||||
row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7=(70 71 72 73) */ \
|
||||
row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0 = (00 01 02 03) */ \
|
||||
row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1 = (10 11 12 13) */ \
|
||||
row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6 = (60 61 62 63) */ \
|
||||
row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7 = (70 71 72 73) */ \
|
||||
\
|
||||
tmp6 = _mm_sub_pi16(row1, row6); /* tmp6=row1-row6 */ \
|
||||
tmp7 = _mm_sub_pi16(row0, row7); /* tmp7=row0-row7 */ \
|
||||
tmp1 = _mm_add_pi16(row1, row6); /* tmp1=row1+row6 */ \
|
||||
tmp0 = _mm_add_pi16(row0, row7); /* tmp0=row0+row7 */ \
|
||||
tmp6 = _mm_sub_pi16(row1, row6); /* tmp6 = row1 - row6 */ \
|
||||
tmp7 = _mm_sub_pi16(row0, row7); /* tmp7 = row0 - row7 */ \
|
||||
tmp1 = _mm_add_pi16(row1, row6); /* tmp1 = row1 + row6 */ \
|
||||
tmp0 = _mm_add_pi16(row0, row7); /* tmp0 = row0 + row7 */ \
|
||||
\
|
||||
row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2=(20 21 22 23) */ \
|
||||
row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3=(30 31 32 33) */ \
|
||||
row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4=(40 41 42 43) */ \
|
||||
row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5=(50 51 52 53) */ \
|
||||
row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2 = (20 21 22 23) */ \
|
||||
row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3 = (30 31 32 33) */ \
|
||||
row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4 = (40 41 42 43) */ \
|
||||
row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5 = (50 51 52 53) */ \
|
||||
\
|
||||
tmp3 = _mm_add_pi16(row3, row4); /* tmp3=row3+row4 */ \
|
||||
tmp2 = _mm_add_pi16(row2, row5); /* tmp2=row2+row5 */ \
|
||||
tmp4 = _mm_sub_pi16(row3, row4); /* tmp4=row3-row4 */ \
|
||||
tmp5 = _mm_sub_pi16(row2, row5); /* tmp5=row2-row5 */ \
|
||||
tmp3 = _mm_add_pi16(row3, row4); /* tmp3 = row3 + row4 */ \
|
||||
tmp2 = _mm_add_pi16(row2, row5); /* tmp2 = row2 + row5 */ \
|
||||
tmp4 = _mm_sub_pi16(row3, row4); /* tmp4 = row3 - row4 */ \
|
||||
tmp5 = _mm_sub_pi16(row2, row5); /* tmp5 = row2 - row5 */ \
|
||||
\
|
||||
DO_FDCT_COMMON() \
|
||||
\
|
||||
|
||||
@@ -248,45 +248,45 @@ static uint64_t const_value[] = {
|
||||
\
|
||||
/* Transpose coefficients */ \
|
||||
\
|
||||
row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a=(20 30 21 31) */ \
|
||||
row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b=(22 32 23 33) */ \
|
||||
row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c=(24 34 25 35) */ \
|
||||
row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d=(26 36 27 37) */ \
|
||||
row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a = (20 30 21 31) */ \
|
||||
row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b = (22 32 23 33) */ \
|
||||
row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c = (24 34 25 35) */ \
|
||||
row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d = (26 36 27 37) */ \
|
||||
\
|
||||
row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a=(00 10 01 11) */ \
|
||||
row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b=(02 12 03 13) */ \
|
||||
row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c=(04 14 05 15) */ \
|
||||
row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d=(06 16 07 17) */ \
|
||||
row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a = (00 10 01 11) */ \
|
||||
row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b = (02 12 03 13) */ \
|
||||
row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c = (04 14 05 15) */ \
|
||||
row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d = (06 16 07 17) */ \
|
||||
\
|
||||
col0 = _mm_unpacklo_pi32(row01a, row23a); /* col0=(00 10 20 30) */ \
|
||||
col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1=(01 11 21 31) */ \
|
||||
col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6=(06 16 26 36) */ \
|
||||
col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7=(07 17 27 37) */ \
|
||||
col0 = _mm_unpacklo_pi32(row01a, row23a); /* col0 = (00 10 20 30) */ \
|
||||
col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1 = (01 11 21 31) */ \
|
||||
col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6 = (06 16 26 36) */ \
|
||||
col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7 = (07 17 27 37) */ \
|
||||
\
|
||||
tmp6 = _mm_sub_pi16(col1, col6); /* tmp6=col1-col6 */ \
|
||||
tmp7 = _mm_sub_pi16(col0, col7); /* tmp7=col0-col7 */ \
|
||||
tmp1 = _mm_add_pi16(col1, col6); /* tmp1=col1+col6 */ \
|
||||
tmp0 = _mm_add_pi16(col0, col7); /* tmp0=col0+col7 */ \
|
||||
tmp6 = _mm_sub_pi16(col1, col6); /* tmp6 = col1 - col6 */ \
|
||||
tmp7 = _mm_sub_pi16(col0, col7); /* tmp7 = col0 - col7 */ \
|
||||
tmp1 = _mm_add_pi16(col1, col6); /* tmp1 = col1 + col6 */ \
|
||||
tmp0 = _mm_add_pi16(col0, col7); /* tmp0 = col0 + col7 */ \
|
||||
\
|
||||
col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2=(02 12 22 32) */ \
|
||||
col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3=(03 13 23 33) */ \
|
||||
col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4=(04 14 24 34) */ \
|
||||
col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5=(05 15 25 35) */ \
|
||||
col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2 = (02 12 22 32) */ \
|
||||
col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3 = (03 13 23 33) */ \
|
||||
col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4 = (04 14 24 34) */ \
|
||||
col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5 = (05 15 25 35) */ \
|
||||
\
|
||||
tmp3 = _mm_add_pi16(col3, col4); /* tmp3=col3+col4 */ \
|
||||
tmp2 = _mm_add_pi16(col2, col5); /* tmp2=col2+col5 */ \
|
||||
tmp4 = _mm_sub_pi16(col3, col4); /* tmp4=col3-col4 */ \
|
||||
tmp5 = _mm_sub_pi16(col2, col5); /* tmp5=col2-col5 */ \
|
||||
tmp3 = _mm_add_pi16(col3, col4); /* tmp3 = col3 + col4 */ \
|
||||
tmp2 = _mm_add_pi16(col2, col5); /* tmp2 = col2 + col5 */ \
|
||||
tmp4 = _mm_sub_pi16(col3, col4); /* tmp4 = col3 - col4 */ \
|
||||
tmp5 = _mm_sub_pi16(col2, col5); /* tmp5 = col2 - col5 */ \
|
||||
\
|
||||
/* Even part */ \
|
||||
\
|
||||
tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10=tmp0+tmp3 */ \
|
||||
tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13=tmp0-tmp3 */ \
|
||||
tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11=tmp1+tmp2 */ \
|
||||
tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12=tmp1-tmp2 */ \
|
||||
tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10 = tmp0 + tmp3 */ \
|
||||
tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13 = tmp0 - tmp3 */ \
|
||||
tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11 = tmp1 + tmp2 */ \
|
||||
tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12 = tmp1 - tmp2 */ \
|
||||
\
|
||||
out0 = _mm_add_pi16(tmp10, tmp11); /* out0=tmp10+tmp11 */ \
|
||||
out4 = _mm_sub_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \
|
||||
out0 = _mm_add_pi16(tmp10, tmp11); /* out0 = tmp10 + tmp11 */ \
|
||||
out4 = _mm_sub_pi16(tmp10, tmp11); /* out4 = tmp10 - tmp11 */ \
|
||||
out0 = _mm_slli_pi16(out0, PASS1_BITS); \
|
||||
out4 = _mm_slli_pi16(out4, PASS1_BITS); \
|
||||
\
|
||||
@@ -319,45 +319,45 @@ static uint64_t const_value[] = {
|
||||
\
|
||||
/* Transpose coefficients */ \
|
||||
\
|
||||
col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a=(02 03 12 13) */ \
|
||||
col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b=(22 23 32 33) */ \
|
||||
col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c=(42 43 52 53) */ \
|
||||
col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d=(62 63 72 73) */ \
|
||||
col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a = (02 03 12 13) */ \
|
||||
col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b = (22 23 32 33) */ \
|
||||
col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c = (42 43 52 53) */ \
|
||||
col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d = (62 63 72 73) */ \
|
||||
\
|
||||
col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a=(00 01 10 11) */ \
|
||||
col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b=(20 21 30 31) */ \
|
||||
col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c=(40 41 50 51) */ \
|
||||
col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d=(60 61 70 71) */ \
|
||||
col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a = (00 01 10 11) */ \
|
||||
col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b = (20 21 30 31) */ \
|
||||
col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c = (40 41 50 51) */ \
|
||||
col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d = (60 61 70 71) */ \
|
||||
\
|
||||
row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0=(00 01 02 03) */ \
|
||||
row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1=(10 11 12 13) */ \
|
||||
row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6=(60 61 62 63) */ \
|
||||
row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7=(70 71 72 73) */ \
|
||||
row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0 = (00 01 02 03) */ \
|
||||
row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1 = (10 11 12 13) */ \
|
||||
row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6 = (60 61 62 63) */ \
|
||||
row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7 = (70 71 72 73) */ \
|
||||
\
|
||||
tmp6 = _mm_sub_pi16(row1, row6); /* tmp6=row1-row6 */ \
|
||||
tmp7 = _mm_sub_pi16(row0, row7); /* tmp7=row0-row7 */ \
|
||||
tmp1 = _mm_add_pi16(row1, row6); /* tmp1=row1+row6 */ \
|
||||
tmp0 = _mm_add_pi16(row0, row7); /* tmp0=row0+row7 */ \
|
||||
tmp6 = _mm_sub_pi16(row1, row6); /* tmp6 = row1 - row6 */ \
|
||||
tmp7 = _mm_sub_pi16(row0, row7); /* tmp7 = row0 - row7 */ \
|
||||
tmp1 = _mm_add_pi16(row1, row6); /* tmp1 = row1 + row6 */ \
|
||||
tmp0 = _mm_add_pi16(row0, row7); /* tmp0 = row0 + row7 */ \
|
||||
\
|
||||
row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2=(20 21 22 23) */ \
|
||||
row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3=(30 31 32 33) */ \
|
||||
row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4=(40 41 42 43) */ \
|
||||
row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5=(50 51 52 53) */ \
|
||||
row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2 = (20 21 22 23) */ \
|
||||
row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3 = (30 31 32 33) */ \
|
||||
row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4 = (40 41 42 43) */ \
|
||||
row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5 = (50 51 52 53) */ \
|
||||
\
|
||||
tmp3 = _mm_add_pi16(row3, row4); /* tmp3=row3+row4 */ \
|
||||
tmp2 = _mm_add_pi16(row2, row5); /* tmp2=row2+row5 */ \
|
||||
tmp4 = _mm_sub_pi16(row3, row4); /* tmp4=row3-row4 */ \
|
||||
tmp5 = _mm_sub_pi16(row2, row5); /* tmp5=row2-row5 */ \
|
||||
tmp3 = _mm_add_pi16(row3, row4); /* tmp3 = row3 + row4 */ \
|
||||
tmp2 = _mm_add_pi16(row2, row5); /* tmp2 = row2 + row5 */ \
|
||||
tmp4 = _mm_sub_pi16(row3, row4); /* tmp4 = row3 - row4 */ \
|
||||
tmp5 = _mm_sub_pi16(row2, row5); /* tmp5 = row2 - row5 */ \
|
||||
\
|
||||
/* Even part */ \
|
||||
\
|
||||
tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10=tmp0+tmp3 */ \
|
||||
tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13=tmp0-tmp3 */ \
|
||||
tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11=tmp1+tmp2 */ \
|
||||
tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12=tmp1-tmp2 */ \
|
||||
tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10 = tmp0 + tmp3 */ \
|
||||
tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13 = tmp0 - tmp3 */ \
|
||||
tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11 = tmp1 + tmp2 */ \
|
||||
tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12 = tmp1 - tmp2 */ \
|
||||
\
|
||||
out0 = _mm_add_pi16(tmp10, tmp11); /* out0=tmp10+tmp11 */ \
|
||||
out4 = _mm_sub_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \
|
||||
out0 = _mm_add_pi16(tmp10, tmp11); /* out0 = tmp10 + tmp11 */ \
|
||||
out4 = _mm_sub_pi16(tmp10, tmp11); /* out4 = tmp10 - tmp11 */ \
|
||||
\
|
||||
out0 = _mm_add_pi16(out0, PW_DESCALE_P2X); \
|
||||
out4 = _mm_add_pi16(out4, PW_DESCALE_P2X); \
|
||||
|
||||
@@ -156,15 +156,15 @@ static uint64_t const_value[] = {
|
||||
\
|
||||
quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
|
||||
\
|
||||
dcval = _mm_mullo_pi16(col0l, quant0l); /* dcval=(00 10 20 30) */ \
|
||||
dcval = _mm_mullo_pi16(col0l, quant0l); /* dcval = (00 10 20 30) */ \
|
||||
\
|
||||
dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall=(00 00 10 10) */ \
|
||||
dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh=(20 20 30 30) */ \
|
||||
dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall = (00 00 10 10) */ \
|
||||
dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh = (20 20 30 30) */ \
|
||||
\
|
||||
row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0=(00 00 00 00) */ \
|
||||
row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1=(10 10 10 10) */ \
|
||||
row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2=(20 20 20 20) */ \
|
||||
row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3=(30 30 30 30) */ \
|
||||
row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0 = (00 00 00 00) */ \
|
||||
row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1 = (10 10 10 10) */ \
|
||||
row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2 = (20 20 20 20) */ \
|
||||
row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3 = (30 30 30 30) */ \
|
||||
\
|
||||
_mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \
|
||||
_mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \
|
||||
@@ -234,32 +234,33 @@ static uint64_t const_value[] = {
|
||||
\
|
||||
DO_IDCT_COMMON() \
|
||||
\
|
||||
/* out0=(00 10 20 30), out1=(01 11 21 31) */ \
|
||||
/* out2=(02 12 22 32), out3=(03 13 23 33) */ \
|
||||
/* out4=(04 14 24 34), out5=(05 15 25 35) */ \
|
||||
/* out6=(06 16 26 36), out7=(07 17 27 37) */ \
|
||||
/* out0 = (00 10 20 30), out1 = (01 11 21 31) \
|
||||
* out2 = (02 12 22 32), out3 = (03 13 23 33) \
|
||||
* out4 = (04 14 24 34), out5 = (05 15 25 35) \
|
||||
* out6 = (06 16 26 36), out7 = (07 17 27 37) \
|
||||
*/ \
|
||||
\
|
||||
/* Transpose coefficients */ \
|
||||
\
|
||||
row01a = _mm_unpacklo_pi16(out0, out1); /* row01a=(00 01 10 11) */ \
|
||||
row23a = _mm_unpackhi_pi16(out0, out1); /* row23a=(20 21 30 31) */ \
|
||||
row01d = _mm_unpacklo_pi16(out6, out7); /* row01d=(06 07 16 17) */ \
|
||||
row23d = _mm_unpackhi_pi16(out6, out7); /* row23d=(26 27 36 37) */ \
|
||||
row01a = _mm_unpacklo_pi16(out0, out1); /* row01a = (00 01 10 11) */ \
|
||||
row23a = _mm_unpackhi_pi16(out0, out1); /* row23a = (20 21 30 31) */ \
|
||||
row01d = _mm_unpacklo_pi16(out6, out7); /* row01d = (06 07 16 17) */ \
|
||||
row23d = _mm_unpackhi_pi16(out6, out7); /* row23d = (26 27 36 37) */ \
|
||||
\
|
||||
row01b = _mm_unpacklo_pi16(out2, out3); /* row01b=(02 03 12 13) */ \
|
||||
row23b = _mm_unpackhi_pi16(out2, out3); /* row23b=(22 23 32 33) */ \
|
||||
row01c = _mm_unpacklo_pi16(out4, out5); /* row01c=(04 05 14 15) */ \
|
||||
row23c = _mm_unpackhi_pi16(out4, out5); /* row23c=(24 25 34 35) */ \
|
||||
row01b = _mm_unpacklo_pi16(out2, out3); /* row01b = (02 03 12 13) */ \
|
||||
row23b = _mm_unpackhi_pi16(out2, out3); /* row23b = (22 23 32 33) */ \
|
||||
row01c = _mm_unpacklo_pi16(out4, out5); /* row01c = (04 05 14 15) */ \
|
||||
row23c = _mm_unpackhi_pi16(out4, out5); /* row23c = (24 25 34 35) */ \
|
||||
\
|
||||
row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l=(00 01 02 03) */ \
|
||||
row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l=(10 11 12 13) */ \
|
||||
row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l=(20 21 22 23) */ \
|
||||
row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l=(30 31 32 33) */ \
|
||||
row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l = (00 01 02 03) */ \
|
||||
row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l = (10 11 12 13) */ \
|
||||
row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l = (20 21 22 23) */ \
|
||||
row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l = (30 31 32 33) */ \
|
||||
\
|
||||
row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h=(04 05 06 07) */ \
|
||||
row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h=(14 15 16 17) */ \
|
||||
row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h=(24 25 26 27) */ \
|
||||
row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h=(34 35 36 37) */ \
|
||||
row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h = (04 05 06 07) */ \
|
||||
row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h = (14 15 16 17) */ \
|
||||
row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h = (24 25 26 27) */ \
|
||||
row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h = (34 35 36 37) */ \
|
||||
\
|
||||
_mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \
|
||||
_mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \
|
||||
@@ -312,11 +313,11 @@ static uint64_t const_value[] = {
|
||||
\
|
||||
DO_IDCT_COMMON() \
|
||||
\
|
||||
/* out0=(00 01 02 03), out1=(10 11 12 13) */ \
|
||||
/* out2=(20 21 22 23), out3=(30 31 32 33) */ \
|
||||
/* out4=(40 41 42 43), out5=(50 51 52 53) */ \
|
||||
/* out6=(60 61 62 63), out7=(70 71 72 73) */ \
|
||||
\
|
||||
/* out0 = (00 01 02 03), out1 = (10 11 12 13) \
|
||||
* out2 = (20 21 22 23), out3 = (30 31 32 33) \
|
||||
* out4 = (40 41 42 43), out5 = (50 51 52 53) \
|
||||
* out6 = (60 61 62 63), out7 = (70 71 72 73) \
|
||||
*/ \
|
||||
out0 = _mm_srai_pi16(out0, PASS1_BITS + 3); \
|
||||
out1 = _mm_srai_pi16(out1, PASS1_BITS + 3); \
|
||||
out2 = _mm_srai_pi16(out2, PASS1_BITS + 3); \
|
||||
@@ -326,10 +327,10 @@ static uint64_t const_value[] = {
|
||||
out6 = _mm_srai_pi16(out6, PASS1_BITS + 3); \
|
||||
out7 = _mm_srai_pi16(out7, PASS1_BITS + 3); \
|
||||
\
|
||||
row06 = _mm_packs_pi16(out0, out6); /* row06=(00 01 02 03 60 61 62 63) */ \
|
||||
row17 = _mm_packs_pi16(out1, out7); /* row17=(10 11 12 13 70 71 72 73) */ \
|
||||
row24 = _mm_packs_pi16(out2, out4); /* row24=(20 21 22 23 40 41 42 43) */ \
|
||||
row35 = _mm_packs_pi16(out3, out5); /* row35=(30 31 32 33 50 51 52 53) */ \
|
||||
row06 = _mm_packs_pi16(out0, out6); /* row06 = (00 01 02 03 60 61 62 63) */ \
|
||||
row17 = _mm_packs_pi16(out1, out7); /* row17 = (10 11 12 13 70 71 72 73) */ \
|
||||
row24 = _mm_packs_pi16(out2, out4); /* row24 = (20 21 22 23 40 41 42 43) */ \
|
||||
row35 = _mm_packs_pi16(out3, out5); /* row35 = (30 31 32 33 50 51 52 53) */ \
|
||||
\
|
||||
row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \
|
||||
row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \
|
||||
@@ -338,20 +339,20 @@ static uint64_t const_value[] = {
|
||||
\
|
||||
/* Transpose coefficients */ \
|
||||
\
|
||||
col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a=(00 10 01 11 02 12 03 13) */ \
|
||||
col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d=(60 70 61 71 62 72 63 73) */ \
|
||||
col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b=(20 30 21 31 22 32 23 33) */ \
|
||||
col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c=(40 50 41 51 42 52 43 53) */ \
|
||||
col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a = (00 10 01 11 02 12 03 13) */ \
|
||||
col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d = (60 70 61 71 62 72 63 73) */ \
|
||||
col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b = (20 30 21 31 22 32 23 33) */ \
|
||||
col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c = (40 50 41 51 42 52 43 53) */ \
|
||||
\
|
||||
col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l=(00 10 20 30 01 11 21 31) */ \
|
||||
col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l=(02 12 22 32 03 13 23 33) */ \
|
||||
col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h=(40 50 60 70 41 51 61 71) */ \
|
||||
col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h=(42 52 62 72 43 53 63 73) */ \
|
||||
col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l = (00 10 20 30 01 11 21 31) */ \
|
||||
col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l = (02 12 22 32 03 13 23 33) */ \
|
||||
col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h = (40 50 60 70 41 51 61 71) */ \
|
||||
col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h = (42 52 62 72 43 53 63 73) */ \
|
||||
\
|
||||
col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0=(00 10 20 30 40 50 60 70) */ \
|
||||
col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1=(01 11 21 31 41 51 61 71) */ \
|
||||
col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2=(02 12 22 32 42 52 62 72) */ \
|
||||
col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3=(03 13 23 33 43 53 63 73) */ \
|
||||
col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0 = (00 10 20 30 40 50 60 70) */ \
|
||||
col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1 = (01 11 21 31 41 51 61 71) */ \
|
||||
col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2 = (02 12 22 32 42 52 62 72) */ \
|
||||
col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3 = (03 13 23 33 43 53 63 73) */ \
|
||||
\
|
||||
_mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \
|
||||
_mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \
|
||||
|
||||
@@ -293,15 +293,15 @@ static uint64_t const_value[] = {
|
||||
quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
|
||||
\
|
||||
dcval = _mm_mullo_pi16(col0l, quant0l); \
|
||||
dcval = _mm_slli_pi16(dcval, PASS1_BITS); /* dcval=(00 10 20 30) */ \
|
||||
dcval = _mm_slli_pi16(dcval, PASS1_BITS); /* dcval = (00 10 20 30) */ \
|
||||
\
|
||||
dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall=(00 00 10 10) */ \
|
||||
dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh=(20 20 30 30) */ \
|
||||
dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall = (00 00 10 10) */ \
|
||||
dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh = (20 20 30 30) */ \
|
||||
\
|
||||
row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0=(00 00 00 00) */ \
|
||||
row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1=(10 10 10 10) */ \
|
||||
row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2=(20 20 20 20) */ \
|
||||
row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3=(30 30 30 30) */ \
|
||||
row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0 = (00 00 00 00) */ \
|
||||
row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1 = (10 10 10 10) */ \
|
||||
row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2 = (20 20 20 20) */ \
|
||||
row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3 = (30 30 30 30) */ \
|
||||
\
|
||||
_mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \
|
||||
_mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \
|
||||
@@ -392,32 +392,33 @@ static uint64_t const_value[] = {
|
||||
\
|
||||
DO_IDCT_COMMON(1) \
|
||||
\
|
||||
/* out0=(00 10 20 30), out1=(01 11 21 31) */ \
|
||||
/* out2=(02 12 22 32), out3=(03 13 23 33) */ \
|
||||
/* out4=(04 14 24 34), out5=(05 15 25 35) */ \
|
||||
/* out6=(06 16 26 36), out7=(07 17 27 37) */ \
|
||||
/* out0 = (00 10 20 30), out1 = (01 11 21 31) \
|
||||
* out2 = (02 12 22 32), out3 = (03 13 23 33) \
|
||||
* out4 = (04 14 24 34), out5 = (05 15 25 35) \
|
||||
* out6 = (06 16 26 36), out7 = (07 17 27 37) \
|
||||
*/ \
|
||||
\
|
||||
/* Transpose coefficients */ \
|
||||
\
|
||||
row01a = _mm_unpacklo_pi16(out0, out1); /* row01a=(00 01 10 11) */ \
|
||||
row23a = _mm_unpackhi_pi16(out0, out1); /* row23a=(20 21 30 31) */ \
|
||||
row01d = _mm_unpacklo_pi16(out6, out7); /* row01d=(06 07 16 17) */ \
|
||||
row23d = _mm_unpackhi_pi16(out6, out7); /* row23d=(26 27 36 37) */ \
|
||||
row01a = _mm_unpacklo_pi16(out0, out1); /* row01a = (00 01 10 11) */ \
|
||||
row23a = _mm_unpackhi_pi16(out0, out1); /* row23a = (20 21 30 31) */ \
|
||||
row01d = _mm_unpacklo_pi16(out6, out7); /* row01d = (06 07 16 17) */ \
|
||||
row23d = _mm_unpackhi_pi16(out6, out7); /* row23d = (26 27 36 37) */ \
|
||||
\
|
||||
row01b = _mm_unpacklo_pi16(out2, out3); /* row01b=(02 03 12 13) */ \
|
||||
row23b = _mm_unpackhi_pi16(out2, out3); /* row23b=(22 23 32 33) */ \
|
||||
row01c = _mm_unpacklo_pi16(out4, out5); /* row01c=(04 05 14 15) */ \
|
||||
row23c = _mm_unpackhi_pi16(out4, out5); /* row23c=(24 25 34 35) */ \
|
||||
row01b = _mm_unpacklo_pi16(out2, out3); /* row01b = (02 03 12 13) */ \
|
||||
row23b = _mm_unpackhi_pi16(out2, out3); /* row23b = (22 23 32 33) */ \
|
||||
row01c = _mm_unpacklo_pi16(out4, out5); /* row01c = (04 05 14 15) */ \
|
||||
row23c = _mm_unpackhi_pi16(out4, out5); /* row23c = (24 25 34 35) */ \
|
||||
\
|
||||
row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l=(00 01 02 03) */ \
|
||||
row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l=(10 11 12 13) */ \
|
||||
row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l=(20 21 22 23) */ \
|
||||
row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l=(30 31 32 33) */ \
|
||||
row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l = (00 01 02 03) */ \
|
||||
row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l = (10 11 12 13) */ \
|
||||
row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l = (20 21 22 23) */ \
|
||||
row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l = (30 31 32 33) */ \
|
||||
\
|
||||
row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h=(04 05 06 07) */ \
|
||||
row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h=(14 15 16 17) */ \
|
||||
row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h=(24 25 26 27) */ \
|
||||
row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h=(34 35 36 37) */ \
|
||||
row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h = (04 05 06 07) */ \
|
||||
row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h = (14 15 16 17) */ \
|
||||
row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h = (24 25 26 27) */ \
|
||||
row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h = (34 35 36 37) */ \
|
||||
\
|
||||
_mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \
|
||||
_mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \
|
||||
@@ -497,15 +498,15 @@ static uint64_t const_value[] = {
|
||||
\
|
||||
DO_IDCT_COMMON(2) \
|
||||
\
|
||||
/* out0=(00 01 02 03), out1=(10 11 12 13) */ \
|
||||
/* out2=(20 21 22 23), out3=(30 31 32 33) */ \
|
||||
/* out4=(40 41 42 43), out5=(50 51 52 53) */ \
|
||||
/* out6=(60 61 62 63), out7=(70 71 72 73) */ \
|
||||
\
|
||||
row06 = _mm_packs_pi16(out0, out6); /* row06=(00 01 02 03 60 61 62 63) */ \
|
||||
row17 = _mm_packs_pi16(out1, out7); /* row17=(10 11 12 13 70 71 72 73) */ \
|
||||
row24 = _mm_packs_pi16(out2, out4); /* row24=(20 21 22 23 40 41 42 43) */ \
|
||||
row35 = _mm_packs_pi16(out3, out5); /* row35=(30 31 32 33 50 51 52 53) */ \
|
||||
/* out0 = (00 01 02 03), out1 = (10 11 12 13) \
|
||||
* out2 = (20 21 22 23), out3 = (30 31 32 33) \
|
||||
* out4 = (40 41 42 43), out5 = (50 51 52 53) \
|
||||
* out6 = (60 61 62 63), out7 = (70 71 72 73) \
|
||||
*/ \
|
||||
row06 = _mm_packs_pi16(out0, out6); /* row06 = (00 01 02 03 60 61 62 63) */ \
|
||||
row17 = _mm_packs_pi16(out1, out7); /* row17 = (10 11 12 13 70 71 72 73) */ \
|
||||
row24 = _mm_packs_pi16(out2, out4); /* row24 = (20 21 22 23 40 41 42 43) */ \
|
||||
row35 = _mm_packs_pi16(out3, out5); /* row35 = (30 31 32 33 50 51 52 53) */ \
|
||||
\
|
||||
row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \
|
||||
row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \
|
||||
@@ -514,20 +515,20 @@ static uint64_t const_value[] = {
|
||||
\
|
||||
/* Transpose coefficients */ \
|
||||
\
|
||||
col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a=(00 10 01 11 02 12 03 13) */ \
|
||||
col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d=(60 70 61 71 62 72 63 73) */ \
|
||||
col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b=(20 30 21 31 22 32 23 33) */ \
|
||||
col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c=(40 50 41 51 42 52 43 53) */ \
|
||||
col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a = (00 10 01 11 02 12 03 13) */ \
|
||||
col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d = (60 70 61 71 62 72 63 73) */ \
|
||||
col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b = (20 30 21 31 22 32 23 33) */ \
|
||||
col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c = (40 50 41 51 42 52 43 53) */ \
|
||||
\
|
||||
col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l=(00 10 20 30 01 11 21 31) */ \
|
||||
col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l=(02 12 22 32 03 13 23 33) */ \
|
||||
col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h=(40 50 60 70 41 51 61 71) */ \
|
||||
col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h=(42 52 62 72 43 53 63 73) */ \
|
||||
col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l = (00 10 20 30 01 11 21 31) */ \
|
||||
col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l = (02 12 22 32 03 13 23 33) */ \
|
||||
col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h = (40 50 60 70 41 51 61 71) */ \
|
||||
col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h = (42 52 62 72 43 53 63 73) */ \
|
||||
\
|
||||
col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0=(00 10 20 30 40 50 60 70) */ \
|
||||
col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1=(01 11 21 31 41 51 61 71) */ \
|
||||
col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2=(02 12 22 32 42 52 62 72) */ \
|
||||
col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3=(03 13 23 33 43 53 63 73) */ \
|
||||
col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0 = (00 10 20 30 40 50 60 70) */ \
|
||||
col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1 = (01 11 21 31 41 51 61 71) */ \
|
||||
col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2 = (02 12 22 32 42 52 62 72) */ \
|
||||
col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3 = (03 13 23 33 43 53 63 73) */ \
|
||||
\
|
||||
_mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \
|
||||
_mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
* Authors: ZhuChen <zhuchen@loongson.cn>
|
||||
* CaiWanwei <caiwanwei@loongson.cn>
|
||||
* SunZhangzhi <sunzhangzhi-cq@loongson.cn>
|
||||
* Copyright (C) 2018-2019, D. R. Commander. All Rights Reserved.
|
||||
* Copyright (C) 2018-2019, 2025, D. R. Commander. All Rights Reserved.
|
||||
*
|
||||
* Based on the x86 SIMD extension for IJG JPEG library
|
||||
* Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@@ -62,9 +62,10 @@
|
||||
rowl = _mm_mulhi_pi16(rowl, recipl); \
|
||||
rowh = _mm_mulhi_pi16(rowh, reciph); \
|
||||
\
|
||||
/* reciprocal is always negative (MSB=1), so we always need to add the */ \
|
||||
/* initial value (input value is never negative as we inverted it at the */ \
|
||||
/* start of this routine) */ \
|
||||
/* reciprocal is always negative (MSB = 1), so we always need to add the \
|
||||
* initial value. (The input value is never negative, as we inverted it at \
|
||||
* the start of this routine.) \
|
||||
*/ \
|
||||
rowlsave = rowl = _mm_add_pi16(rowl, rowlsave); \
|
||||
rowhsave = rowh = _mm_add_pi16(rowh, rowhsave); \
|
||||
\
|
||||
|
||||
Reference in New Issue
Block a user