MMI: Format comments consistently with MMX code

This commit is contained in:
DRC
2025-09-26 10:09:01 -04:00
parent e0dcefd565
commit a30b3d28dc
12 changed files with 438 additions and 400 deletions

View File

@@ -319,8 +319,8 @@ void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
#endif
/* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
* ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
/* re = (R0 R2 R4 R6), ge = (G0 G2 G4 G6), be = (B0 B2 B4 B6)
* ro = (R1 R3 R5 R7), go = (G1 G3 G5 G7), bo = (B1 B3 B5 B7)
*
* (Original)
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B

View File

@@ -30,15 +30,15 @@
#include "jsimd_mmi.h"
#define F_0_081 ((short)5329) /* FIX(0.08131) */
#define F_0_114 ((short)7471) /* FIX(0.11400) */
#define F_0_168 ((short)11059) /* FIX(0.16874) */
#define F_0_250 ((short)16384) /* FIX(0.25000) */
#define F_0_299 ((short)19595) /* FIX(0.29900) */
#define F_0_331 ((short)21709) /* FIX(0.33126) */
#define F_0_418 ((short)27439) /* FIX(0.41869) */
#define F_0_587 ((short)38470) /* FIX(0.58700) */
#define F_0_337 ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */
#define F_0_081 ((short)5329) /* FIX(0.08131) */
#define F_0_114 ((short)7471) /* FIX(0.11400) */
#define F_0_168 ((short)11059) /* FIX(0.16874) */
#define F_0_250 ((short)16384) /* FIX(0.25000) */
#define F_0_299 ((short)19595) /* FIX(0.29900) */
#define F_0_331 ((short)21709) /* FIX(0.33126) */
#define F_0_418 ((short)27439) /* FIX(0.41869) */
#define F_0_587 ((short)38470) /* FIX(0.58700) */
#define F_0_337 ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */
enum const_index {
index_PD_ONEHALF,

View File

@@ -28,11 +28,11 @@
#include "jsimd_mmi.h"
#define F_0_114 ((short)7471) /* FIX(0.11400) */
#define F_0_250 ((short)16384) /* FIX(0.25000) */
#define F_0_299 ((short)19595) /* FIX(0.29900) */
#define F_0_587 ((short)38470) /* FIX(0.58700) */
#define F_0_337 ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */
#define F_0_114 ((short)7471) /* FIX(0.11400) */
#define F_0_250 ((short)16384) /* FIX(0.25000) */
#define F_0_299 ((short)19595) /* FIX(0.29900) */
#define F_0_587 ((short)38470) /* FIX(0.58700) */
#define F_0_337 ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */
enum const_index {
index_PD_ONEHALF,

View File

@@ -310,8 +310,8 @@ void jsimd_rgb_gray_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
#endif
/* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
* ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
/* re = (R0 R2 R4 R6), ge = (G0 G2 G4 G6), be = (B0 B2 B4 B6)
* ro = (R1 R3 R5 R7), go = (G1 G3 G5 G7), bo = (B1 B3 B5 B7)
*
* (Original)
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B

View File

@@ -50,9 +50,9 @@ void jsimd_h2v2_downsample_mmi(JDIMENSION image_width, int max_v_samp_factor,
output_cols * 2);
bias = _mm_set1_pi32((1 << 17) + 1); /* 0x00020001 (32-bit bias pattern) */
/* bias={1, 2, 1, 2} (16-bit) */
/* bias = { 1, 2, 1, 2 } (16-bit) */
mask = _mm_cmpeq_pi16(mask, mask);
mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */
mask = _mm_srli_pi16(mask, BYTE_BIT); /* { 0xFF 0x00 0xFF 0x00 .. } */
for (inrow = 0, outrow = 0; outrow < v_samp_factor;
inrow += 2, outrow++) {

View File

@@ -2,7 +2,7 @@
* Loongson MMI optimizations for libjpeg-turbo
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2015, 2019, D. R. Commander. All Rights Reserved.
* Copyright (C) 2015, 2019, 2025, D. R. Commander. All Rights Reserved.
* Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
* All Rights Reserved.
* Authors: ZhuChen <zhuchen@loongson.cn>
@@ -116,8 +116,8 @@ void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
mask = decenter = 0.0;
mask = _mm_cmpeq_pi16(mask, mask);
decenter = _mm_cmpeq_pi16(decenter, decenter);
mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */
decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */
mask = _mm_srli_pi16(mask, BYTE_BIT); /* { 0xFF 0x00 0xFF 0x00 .. } */
decenter = _mm_slli_pi16(decenter, 7); /* { 0xFF80 0xFF80 0xFF80 0xFF80 } */
cbe = _mm_and_si64(mask, cb); /* Cb(0246) */
cbo = _mm_srli_pi16(cb, BYTE_BIT); /* Cb(1357) */
@@ -139,15 +139,15 @@ void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
* B = Y - 0.22800 * Cb + Cb + Cb
*/
cbe2 = _mm_add_pi16(cbe, cbe); /* 2*CbE */
cbo2 = _mm_add_pi16(cbo, cbo); /* 2*CbO */
cre2 = _mm_add_pi16(cre, cre); /* 2*CrE */
cro2 = _mm_add_pi16(cro, cro); /* 2*CrO */
cbe2 = _mm_add_pi16(cbe, cbe); /* 2 * CbE */
cbo2 = _mm_add_pi16(cbo, cbo); /* 2 * CbO */
cre2 = _mm_add_pi16(cre, cre); /* 2 * CrE */
cro2 = _mm_add_pi16(cro, cro); /* 2 * CrO */
be = _mm_mulhi_pi16(cbe2, PW_MF0228); /* (2*CbE * -FIX(0.22800) */
bo = _mm_mulhi_pi16(cbo2, PW_MF0228); /* (2*CbO * -FIX(0.22800) */
re = _mm_mulhi_pi16(cre2, PW_F0402); /* (2*CrE * FIX(0.40200)) */
ro = _mm_mulhi_pi16(cro2, PW_F0402); /* (2*CrO * FIX(0.40200)) */
be = _mm_mulhi_pi16(cbe2, PW_MF0228); /* (2 * CbE * -FIX(0.22800)) */
bo = _mm_mulhi_pi16(cbo2, PW_MF0228); /* (2 * CbO * -FIX(0.22800)) */
re = _mm_mulhi_pi16(cre2, PW_F0402); /* (2 * CrE * FIX(0.40200)) */
ro = _mm_mulhi_pi16(cro2, PW_F0402); /* (2 * CrO * FIX(0.40200)) */
be = _mm_add_pi16(be, PW_ONE);
bo = _mm_add_pi16(bo, PW_ONE);
@@ -160,10 +160,10 @@ void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
be = _mm_add_pi16(be, cbe);
bo = _mm_add_pi16(bo, cbo);
be = _mm_add_pi16(be, cbe); /* (CbE * FIX(1.77200))=(B-Y)E */
bo = _mm_add_pi16(bo, cbo); /* (CbO * FIX(1.77200))=(B-Y)O */
re = _mm_add_pi16(re, cre); /* (CrE * FIX(1.40200))=(R-Y)E */
ro = _mm_add_pi16(ro, cro); /* (CrO * FIX(1.40200))=(R-Y)O */
be = _mm_add_pi16(be, cbe); /* (CbE * FIX(1.77200)) = (B - Y)E */
bo = _mm_add_pi16(bo, cbo); /* (CbO * FIX(1.77200)) = (B - Y)O */
re = _mm_add_pi16(re, cre); /* (CrE * FIX(1.40200)) = (R - Y)E */
ro = _mm_add_pi16(ro, cro); /* (CrO * FIX(1.40200)) = (R - Y)O */
gle = _mm_unpacklo_pi16(cbe, cre);
ghe = _mm_unpackhi_pi16(cbe, cre);
@@ -183,54 +183,64 @@ void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
glo = _mm_srai_pi32(glo, SCALEBITS);
gho = _mm_srai_pi32(gho, SCALEBITS);
ge = _mm_packs_pi32(gle, ghe); /* CbE*-FIX(0.344)+CrE*FIX(0.285) */
go = _mm_packs_pi32(glo, gho); /* CbO*-FIX(0.344)+CrO*FIX(0.285) */
ge = _mm_sub_pi16(ge, cre); /* CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */
go = _mm_sub_pi16(go, cro); /* CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */
ge = _mm_packs_pi32(gle, ghe); /* CbE * -FIX(0.344) + CrE * FIX(0.285) */
go = _mm_packs_pi32(glo, gho); /* CbO * -FIX(0.344) + CrO * FIX(0.285) */
ge = _mm_sub_pi16(ge, cre); /* CbE * -FIX(0.344) + CrE * -FIX(0.714) = (G - Y)E */
go = _mm_sub_pi16(go, cro); /* CbO * -FIX(0.344) + CrO * -FIX(0.714) = (G - Y)O */
ye = _mm_and_si64(mask, y); /* Y(0246) */
yo = _mm_srli_pi16(y, BYTE_BIT); /* Y(1357) */
re = _mm_add_pi16(re, ye); /* ((R-Y)E+YE)=(R0 R2 R4 R6) */
ro = _mm_add_pi16(ro, yo); /* ((R-Y)O+YO)=(R1 R3 R5 R7) */
re = _mm_add_pi16(re, ye); /* ((R - Y)E + YE) = (R0 R2 R4 R6) */
ro = _mm_add_pi16(ro, yo); /* ((R - Y)O + YO) = (R1 R3 R5 R7) */
re = _mm_packs_pu16(re, re); /* (R0 R2 R4 R6 ** ** ** **) */
ro = _mm_packs_pu16(ro, ro); /* (R1 R3 R5 R7 ** ** ** **) */
ge = _mm_add_pi16(ge, ye); /* ((G-Y)E+YE)=(G0 G2 G4 G6) */
go = _mm_add_pi16(go, yo); /* ((G-Y)O+YO)=(G1 G3 G5 G7) */
ge = _mm_add_pi16(ge, ye); /* ((G - Y)E + YE) = (G0 G2 G4 G6) */
go = _mm_add_pi16(go, yo); /* ((G - Y)O + YO) = (G1 G3 G5 G7) */
ge = _mm_packs_pu16(ge, ge); /* (G0 G2 G4 G6 ** ** ** **) */
go = _mm_packs_pu16(go, go); /* (G1 G3 G5 G7 ** ** ** **) */
be = _mm_add_pi16(be, ye); /* (YE+(B-Y)E)=(B0 B2 B4 B6) */
bo = _mm_add_pi16(bo, yo); /* (YO+(B-Y)O)=(B1 B3 B5 B7) */
be = _mm_add_pi16(be, ye); /* (YE + (B - Y)E) = (B0 B2 B4 B6) */
bo = _mm_add_pi16(bo, yo); /* (YO + (B - Y)O) = (B1 B3 B5 B7) */
be = _mm_packs_pu16(be, be); /* (B0 B2 B4 B6 ** ** ** **) */
bo = _mm_packs_pu16(bo, bo); /* (B1 B3 B5 B7 ** ** ** **) */
#if RGB_PIXELSIZE == 3
/* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
/* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
mmA = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
mmE = _mm_unpacklo_pi8(mmE, mmB); /* (20 01 22 03 24 05 26 07) */
mmD = _mm_unpacklo_pi8(mmD, mmF); /* (11 21 13 23 15 25 17 27) */
/* NOTE: The values of RGB_RED, RGB_GREEN, and RGB_BLUE determine the
* mapping of components A, B, and C to red, green, and blue.
*
* mmA = (A0 A2 A4 A6 ** ** ** **) = AE
* mmB = (A1 A3 A5 A7 ** ** ** **) = AO
* mmC = (B0 B2 B4 B6 ** ** ** **) = BE
* mmD = (B1 B3 B5 B7 ** ** ** **) = BO
* mmE = (C0 C2 C4 C6 ** ** ** **) = CE
* mmF = (C1 C3 C5 C7 ** ** ** **) = CO
* mmG = (** ** ** ** ** ** ** **)
* mmH = (** ** ** ** ** ** ** **)
*/
mmA = _mm_unpacklo_pi8(mmA, mmC); /* (A0 B0 A2 B2 A4 B4 A6 B6) */
mmE = _mm_unpacklo_pi8(mmE, mmB); /* (C0 A1 C2 A3 C4 A5 C6 A7) */
mmD = _mm_unpacklo_pi8(mmD, mmF); /* (B1 C1 B3 C3 B5 C5 B7 C7) */
mmH = _mm_srli_si64(mmA, 2 * BYTE_BIT);
mmG = _mm_unpackhi_pi16(mmA, mmE); /* (04 14 24 05 06 16 26 07) */
mmA = _mm_unpacklo_pi16(mmA, mmE); /* (00 10 20 01 02 12 22 03) */
mmG = _mm_unpackhi_pi16(mmA, mmE); /* (A4 B4 C4 A5 A6 B6 C6 A7) */
mmA = _mm_unpacklo_pi16(mmA, mmE); /* (A0 B0 C0 A1 A2 B2 C2 A3) */
mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT);
mmB = _mm_srli_si64(mmD, 2 * BYTE_BIT); /* (13 23 15 25 17 27 -- --) */
mmB = _mm_srli_si64(mmD, 2 * BYTE_BIT); /* (B3 C3 B5 C5 B7 C7 -- --) */
mmC = _mm_unpackhi_pi16(mmD, mmH); /* (15 25 06 16 17 27 -- --) */
mmD = _mm_unpacklo_pi16(mmD, mmH); /* (11 21 02 12 13 23 04 14) */
mmC = _mm_unpackhi_pi16(mmD, mmH); /* (B5 C5 A6 B6 B7 C7 -- --) */
mmD = _mm_unpacklo_pi16(mmD, mmH); /* (B1 C1 A2 B2 B3 C3 A4 B4) */
mmF = _mm_unpackhi_pi16(mmE, mmB); /* (26 07 17 27 -- -- -- --) */
mmE = _mm_unpacklo_pi16(mmE, mmB); /* (22 03 13 23 24 05 15 25) */
mmF = _mm_unpackhi_pi16(mmE, mmB); /* (C6 A7 B7 C7 -- -- -- --) */
mmE = _mm_unpacklo_pi16(mmE, mmB); /* (C2 A3 B3 C3 C4 A5 B5 C5) */
mmA = _mm_unpacklo_pi32(mmA, mmD); /* (00 10 20 01 11 21 02 12) */
mmE = _mm_unpacklo_pi32(mmE, mmG); /* (22 03 13 23 04 14 24 05) */
mmC = _mm_unpacklo_pi32(mmC, mmF); /* (15 25 06 16 26 07 17 27) */
mmA = _mm_unpacklo_pi32(mmA, mmD); /* (A0 B0 C0 A1 B1 C1 A2 B2) */
mmE = _mm_unpacklo_pi32(mmE, mmG); /* (C2 A3 B3 C3 A4 B4 C4 A5) */
mmC = _mm_unpacklo_pi32(mmC, mmF); /* (B5 C5 A6 B6 C6 A7 B7 C7) */
if (num_cols >= 8) {
if (!(((long)outptr) & 7)) {
@@ -320,25 +330,33 @@ void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
xe = _mm_xor_si64(xe, xe);
xo = _mm_xor_si64(xo, xo);
#endif
/* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
/* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
/* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */
/* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */
mmA = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
mmE = _mm_unpacklo_pi8(mmE, mmG); /* (20 30 22 32 24 34 26 36) */
mmB = _mm_unpacklo_pi8(mmB, mmD); /* (01 11 03 13 05 15 07 17) */
mmF = _mm_unpacklo_pi8(mmF, mmH); /* (21 31 23 33 25 35 27 37) */
/* NOTE: The values of RGB_RED, RGB_GREEN, and RGB_BLUE determine the
* mapping of components A, B, C, and D to red, green, and blue.
*
* mmA = (A0 A2 A4 A6 ** ** ** **) = AE
* mmB = (A1 A3 A5 A7 ** ** ** **) = AO
* mmC = (B0 B2 B4 B6 ** ** ** **) = BE
* mmD = (B1 B3 B5 B7 ** ** ** **) = BO
* mmE = (C0 C2 C4 C6 ** ** ** **) = CE
* mmF = (C1 C3 C5 C7 ** ** ** **) = CO
* mmG = (D0 D2 D4 D6 ** ** ** **) = DE
* mmH = (D1 D3 D5 D7 ** ** ** **) = DO
*/
mmA = _mm_unpacklo_pi8(mmA, mmC); /* (A0 B0 A2 B2 A4 B4 A6 B6) */
mmE = _mm_unpacklo_pi8(mmE, mmG); /* (C0 D0 C2 D2 C4 D4 C6 D6) */
mmB = _mm_unpacklo_pi8(mmB, mmD); /* (A1 B1 A3 B3 A5 B5 A7 B7) */
mmF = _mm_unpacklo_pi8(mmF, mmH); /* (C1 D1 C3 D3 C5 D5 C7 D7) */
mmC = _mm_unpackhi_pi16(mmA, mmE); /* (04 14 24 34 06 16 26 36) */
mmA = _mm_unpacklo_pi16(mmA, mmE); /* (00 10 20 30 02 12 22 32) */
mmG = _mm_unpackhi_pi16(mmB, mmF); /* (05 15 25 35 07 17 27 37) */
mmB = _mm_unpacklo_pi16(mmB, mmF); /* (01 11 21 31 03 13 23 33) */
mmC = _mm_unpackhi_pi16(mmA, mmE); /* (A4 B4 C4 D4 A6 B6 C6 D6) */
mmA = _mm_unpacklo_pi16(mmA, mmE); /* (A0 B0 C0 D0 A2 B2 C2 D2) */
mmG = _mm_unpackhi_pi16(mmB, mmF); /* (A5 B5 C5 D5 A7 B7 C7 D7) */
mmB = _mm_unpacklo_pi16(mmB, mmF); /* (A1 B1 C1 D1 A3 B3 C3 D3) */
mmD = _mm_unpackhi_pi32(mmA, mmB); /* (02 12 22 32 03 13 23 33) */
mmA = _mm_unpacklo_pi32(mmA, mmB); /* (00 10 20 30 01 11 21 31) */
mmH = _mm_unpackhi_pi32(mmC, mmG); /* (06 16 26 36 07 17 27 37) */
mmC = _mm_unpacklo_pi32(mmC, mmG); /* (04 14 24 34 05 15 25 35) */
mmD = _mm_unpackhi_pi32(mmA, mmB); /* (A2 B2 C2 D2 A3 B3 C3 D3) */
mmA = _mm_unpacklo_pi32(mmA, mmB); /* (A0 B0 C0 D0 A1 B1 C1 D1) */
mmH = _mm_unpackhi_pi32(mmC, mmG); /* (A6 B6 C6 D6 A7 B7 C7 D7) */
mmC = _mm_unpacklo_pi32(mmC, mmG); /* (A4 B4 C4 D4 A5 B5 C5 D5) */
if (num_cols >= 8) {
if (!(((long)outptr) & 7)) {

View File

@@ -2,7 +2,7 @@
* Loongson MMI optimizations for libjpeg-turbo
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2015, 2019, D. R. Commander. All Rights Reserved.
* Copyright (C) 2015, 2019, 2025, D. R. Commander. All Rights Reserved.
* Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
* All Rights Reserved.
* Authors: ZhangLixia <zhanglixia-hf@loongson.cn>
@@ -120,8 +120,8 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
mask = decenter = 0.0;
mask = _mm_cmpeq_pi16(mask, mask);
decenter = _mm_cmpeq_pi16(decenter, decenter);
mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */
decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */
mask = _mm_srli_pi16(mask, BYTE_BIT); /* { 0xFF 0x00 0xFF 0x00 .. } */
decenter = _mm_slli_pi16(decenter, 7); /* { 0xFF80 0xFF80 0xFF80 0xFF80 } */
cbl = _mm_unpacklo_pi8(cb, zero); /* Cb(0123) */
cbh = _mm_unpackhi_pi8(cb, zero); /* Cb(4567) */
@@ -143,15 +143,15 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
* B = Y - 0.22800 * Cb + Cb + Cb
*/
cbl2 = _mm_add_pi16(cbl, cbl); /* 2*CbL */
cbh2 = _mm_add_pi16(cbh, cbh); /* 2*CbH */
crl2 = _mm_add_pi16(crl, crl); /* 2*CrL */
crh2 = _mm_add_pi16(crh, crh); /* 2*CrH */
cbl2 = _mm_add_pi16(cbl, cbl); /* 2 * CbL */
cbh2 = _mm_add_pi16(cbh, cbh); /* 2 * CbH */
crl2 = _mm_add_pi16(crl, crl); /* 2 * CrL */
crh2 = _mm_add_pi16(crh, crh); /* 2 * CrH */
bl = _mm_mulhi_pi16(cbl2, PW_MF0228); /* (2*CbL * -FIX(0.22800) */
bh = _mm_mulhi_pi16(cbh2, PW_MF0228); /* (2*CbH * -FIX(0.22800) */
rl = _mm_mulhi_pi16(crl2, PW_F0402); /* (2*CrL * FIX(0.40200)) */
rh = _mm_mulhi_pi16(crh2, PW_F0402); /* (2*CrH * FIX(0.40200)) */
bl = _mm_mulhi_pi16(cbl2, PW_MF0228); /* (2 * CbL * -FIX(0.22800)) */
bh = _mm_mulhi_pi16(cbh2, PW_MF0228); /* (2 * CbH * -FIX(0.22800)) */
rl = _mm_mulhi_pi16(crl2, PW_F0402); /* (2 * CrL * FIX(0.40200)) */
rh = _mm_mulhi_pi16(crh2, PW_F0402); /* (2 * CrH * FIX(0.40200)) */
bl = _mm_add_pi16(bl, PW_ONE);
bh = _mm_add_pi16(bh, PW_ONE);
@@ -164,10 +164,10 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
bl = _mm_add_pi16(bl, cbl);
bh = _mm_add_pi16(bh, cbh);
bl = _mm_add_pi16(bl, cbl); /* (CbL * FIX(1.77200))=(B-Y)L */
bh = _mm_add_pi16(bh, cbh); /* (CbH * FIX(1.77200))=(B-Y)H */
rl = _mm_add_pi16(rl, crl); /* (CrL * FIX(1.40200))=(R-Y)L */
rh = _mm_add_pi16(rh, crh); /* (CrH * FIX(1.40200))=(R-Y)H */
bl = _mm_add_pi16(bl, cbl); /* (CbL * FIX(1.77200)) = (B - Y)L */
bh = _mm_add_pi16(bh, cbh); /* (CbH * FIX(1.77200)) = (B - Y)H */
rl = _mm_add_pi16(rl, crl); /* (CrL * FIX(1.40200)) = (R - Y)L */
rh = _mm_add_pi16(rh, crh); /* (CrH * FIX(1.40200)) = (R - Y)H */
ga = _mm_unpacklo_pi16(cbl, crl);
gb = _mm_unpackhi_pi16(cbl, crl);
@@ -187,10 +187,10 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
gc = _mm_srai_pi32(gc, SCALEBITS);
gd = _mm_srai_pi32(gd, SCALEBITS);
gl = _mm_packs_pi32(ga, gb); /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
gh = _mm_packs_pi32(gc, gd); /* CbH*-FIX(0.344)+CrH*FIX(0.285) */
gl = _mm_sub_pi16(gl, crl); /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
gh = _mm_sub_pi16(gh, crh); /* CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H */
gl = _mm_packs_pi32(ga, gb); /* CbL * -FIX(0.344) + CrL * FIX(0.285) */
gh = _mm_packs_pi32(gc, gd); /* CbH * -FIX(0.344) + CrH * FIX(0.285) */
gl = _mm_sub_pi16(gl, crl); /* CbL * -FIX(0.344) + CrL * -FIX(0.714) = (G - Y)L */
gh = _mm_sub_pi16(gh, crh); /* CbH * -FIX(0.344) + CrH * -FIX(0.714) = (G - Y)H */
ythise = _mm_and_si64(mask, ythis); /* Y(0246) */
ythiso = _mm_srli_pi16(ythis, BYTE_BIT); /* Y(1357) */
@@ -220,38 +220,47 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
#if RGB_PIXELSIZE == 3
/* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
/* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
/* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
mmG = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
mmA = _mm_unpackhi_pi8(mmA, mmC); /* (08 18 0A 1A 0C 1C 0E 1E) */
mmH = _mm_unpacklo_pi8(mmE, mmB); /* (20 01 22 03 24 05 26 07) */
mmE = _mm_unpackhi_pi8(mmE, mmB); /* (28 09 2A 0B 2C 0D 2E 0F) */
mmC = _mm_unpacklo_pi8(mmD, mmF); /* (11 21 13 23 15 25 17 27) */
mmD = _mm_unpackhi_pi8(mmD, mmF); /* (19 29 1B 2B 1D 2D 1F 2F) */
/* NOTE: The values of RGB_RED, RGB_GREEN, and RGB_BLUE determine the
* mapping of components A, B, and C to red, green, and blue.
*
* mmA = (A0 A2 A4 A6 A8 Aa Ac Ae) = AE
* mmB = (A1 A3 A5 A7 A9 Ab Ad Af) = AO
* mmC = (B0 B2 B4 B6 B8 Ba Bc Be) = BE
* mmD = (B1 B3 B5 B7 B9 Bb Bd Bf) = BO
* mmE = (C0 C2 C4 C6 C8 Ca Cc Ce) = CE
* mmF = (C1 C3 C5 C7 C9 Cb Cd Cf) = CO
* mmG = (** ** ** ** ** ** ** **)
* mmH = (** ** ** ** ** ** ** **)
*/
mmG = _mm_unpacklo_pi8(mmA, mmC); /* (A0 B0 A2 B2 A4 B4 A6 B6) */
mmA = _mm_unpackhi_pi8(mmA, mmC); /* (A8 B8 Aa Ba Ac Bc Ae Be) */
mmH = _mm_unpacklo_pi8(mmE, mmB); /* (C0 A1 C2 A3 C4 A5 C6 A7) */
mmE = _mm_unpackhi_pi8(mmE, mmB); /* (C8 A9 Ca Ab Cc Ad Ce Af) */
mmC = _mm_unpacklo_pi8(mmD, mmF); /* (B1 C1 B3 C3 B5 C5 B7 C7) */
mmD = _mm_unpackhi_pi8(mmD, mmF); /* (B9 C9 Bb Cb Bd Cd Bf Cf) */
mmB = _mm_unpacklo_pi16(mmG, mmA); /* (00 10 08 18 02 12 0A 1A) */
mmA = _mm_unpackhi_pi16(mmG, mmA); /* (04 14 0C 1C 06 16 0E 1E) */
mmF = _mm_unpacklo_pi16(mmH, mmE); /* (20 01 28 09 22 03 2A 0B) */
mmE = _mm_unpackhi_pi16(mmH, mmE); /* (24 05 2C 0D 26 07 2E 0F) */
mmH = _mm_unpacklo_pi16(mmC, mmD); /* (11 21 19 29 13 23 1B 2B) */
mmG = _mm_unpackhi_pi16(mmC, mmD); /* (15 25 1D 2D 17 27 1F 2F) */
mmB = _mm_unpacklo_pi16(mmG, mmA); /* (A0 B0 A8 B8 A2 B2 Aa Ba) */
mmA = _mm_unpackhi_pi16(mmG, mmA); /* (A4 B4 Ac Bc A6 B6 Ae Be) */
mmF = _mm_unpacklo_pi16(mmH, mmE); /* (C0 A1 C8 A9 C2 A3 Ca Ab) */
mmE = _mm_unpackhi_pi16(mmH, mmE); /* (C4 A5 Cc Ad C6 A7 Ce Af) */
mmH = _mm_unpacklo_pi16(mmC, mmD); /* (B1 C1 B9 C9 B3 C3 Bb Cb) */
mmG = _mm_unpackhi_pi16(mmC, mmD); /* (B5 C5 Bd Cd B7 C7 Bf Cf) */
mmC = _mm_unpacklo_pi16(mmB, mmF); /* (00 10 20 01 08 18 28 09) */
mmC = _mm_unpacklo_pi16(mmB, mmF); /* (A0 B0 C0 A1 A8 B8 C8 A9) */
mmB = _mm_srli_si64(mmB, 4 * BYTE_BIT);
mmB = _mm_unpacklo_pi16(mmH, mmB); /* (11 21 02 12 19 29 0A 1A) */
mmD = _mm_unpackhi_pi16(mmF, mmH); /* (22 03 13 23 2A 0B 1B 2B) */
mmF = _mm_unpacklo_pi16(mmA, mmE); /* (04 14 24 05 0C 1C 2C 0D) */
mmB = _mm_unpacklo_pi16(mmH, mmB); /* (B1 C1 A2 B2 B9 C9 Aa Ba) */
mmD = _mm_unpackhi_pi16(mmF, mmH); /* (C2 A3 B3 C3 Ca Ab Bb Cb) */
mmF = _mm_unpacklo_pi16(mmA, mmE); /* (A4 B4 C4 A5 Ac Bc Cc Ad) */
mmA = _mm_srli_si64(mmA, 4 * BYTE_BIT);
mmH = _mm_unpacklo_pi16(mmG, mmA); /* (15 25 06 16 1D 2D 0E 1E) */
mmG = _mm_unpackhi_pi16(mmE, mmG); /* (26 07 17 27 2E 0F 1F 2F) */
mmH = _mm_unpacklo_pi16(mmG, mmA); /* (B5 C5 A6 B6 Bd Cd Ae Be) */
mmG = _mm_unpackhi_pi16(mmE, mmG); /* (C6 A7 B7 C7 Ce Af Bf Cf) */
mmA = _mm_unpacklo_pi32(mmC, mmB); /* (00 10 20 01 11 21 02 12) */
mmE = _mm_unpackhi_pi32(mmC, mmB); /* (08 18 28 09 19 29 0A 1A) */
mmB = _mm_unpacklo_pi32(mmD, mmF); /* (22 03 13 23 04 14 24 05) */
mmF = _mm_unpackhi_pi32(mmD, mmF); /* (2A 0B 1B 2B 0C 1C 2C 0D) */
mmC = _mm_unpacklo_pi32(mmH, mmG); /* (15 25 06 16 26 07 17 27) */
mmG = _mm_unpackhi_pi32(mmH, mmG); /* (1D 2D 0E 1E 2E 0F 1F 2F) */
mmA = _mm_unpacklo_pi32(mmC, mmB); /* (A0 B0 C0 A1 B1 C1 A2 B2) */
mmE = _mm_unpackhi_pi32(mmC, mmB); /* (A8 B8 C8 A9 B9 C9 Aa Ba) */
mmB = _mm_unpacklo_pi32(mmD, mmF); /* (C2 A3 B3 C3 A4 B4 C4 A5) */
mmF = _mm_unpackhi_pi32(mmD, mmF); /* (Ca Ab Bb Cb Ac Bc Cc Ad) */
mmC = _mm_unpacklo_pi32(mmH, mmG); /* (B5 C5 A6 B6 C6 A7 B7 C7) */
mmG = _mm_unpackhi_pi32(mmH, mmG); /* (Bd Cd Ae Be Ce Af Bf Cf) */
if (num_cols >= 8) {
if (!(((long)outptr) & 7)) {
@@ -367,40 +376,48 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
xe = _mm_xor_si64(xe, xe);
xo = _mm_xor_si64(xo, xo);
#endif
/* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
/* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
/* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
/* mmG=(30 32 34 36 38 3A 3C 3E), mmH=(31 33 35 37 39 3B 3D 3F) */
mm8 = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
mm9 = _mm_unpackhi_pi8(mmA, mmC); /* (08 18 0A 1A 0C 1C 0E 1E) */
mmA = _mm_unpacklo_pi8(mmE, mmG); /* (20 30 22 32 24 34 26 36) */
mmE = _mm_unpackhi_pi8(mmE, mmG); /* (28 38 2A 3A 2C 3C 2E 3E) */
/* NOTE: The values of RGB_RED, RGB_GREEN, and RGB_BLUE determine the
* mapping of components A, B, C, and D to red, green, and blue.
*
* mmA = (A0 A2 A4 A6 A8 Aa Ac Ae) = AE
* mmB = (A1 A3 A5 A7 A9 Ab Ad Af) = AO
* mmC = (B0 B2 B4 B6 B8 Ba Bc Be) = BE
* mmD = (B1 B3 B5 B7 B9 Bb Bd Bf) = BO
* mmE = (C0 C2 C4 C6 C8 Ca Cc Ce) = CE
* mmF = (C1 C3 C5 C7 C9 Cb Cd Cf) = CO
* mmG = (D0 D2 D4 D6 D8 Da Dc De) = DE
* mmH = (D1 D3 D5 D7 D9 Db Dd Df) = DO
*/
mm8 = _mm_unpacklo_pi8(mmA, mmC); /* (A0 B0 A2 B2 A4 B4 A6 B6) */
mm9 = _mm_unpackhi_pi8(mmA, mmC); /* (A8 B8 Aa Ba Ac Bc Ae Be) */
mmA = _mm_unpacklo_pi8(mmE, mmG); /* (C0 D0 C2 D2 C4 D4 C6 D6) */
mmE = _mm_unpackhi_pi8(mmE, mmG); /* (C8 D8 Ca Da Cc Dc Ce De) */
mmG = _mm_unpacklo_pi8(mmB, mmD); /* (01 11 03 13 05 15 07 17) */
mmB = _mm_unpackhi_pi8(mmB, mmD); /* (09 19 0B 1B 0D 1D 0F 1F) */
mmD = _mm_unpacklo_pi8(mmF, mmH); /* (21 31 23 33 25 35 27 37) */
mmF = _mm_unpackhi_pi8(mmF, mmH); /* (29 39 2B 3B 2D 3D 2F 3F) */
mmG = _mm_unpacklo_pi8(mmB, mmD); /* (A1 B1 A3 B3 A5 B5 A7 B7) */
mmB = _mm_unpackhi_pi8(mmB, mmD); /* (A9 B9 Ab Bb Ad Bd Af Bf) */
mmD = _mm_unpacklo_pi8(mmF, mmH); /* (C1 D1 C3 D3 C5 D5 C7 D7) */
mmF = _mm_unpackhi_pi8(mmF, mmH); /* (C9 D9 Cb Db Cd Dd Cf Df) */
mmH = _mm_unpacklo_pi16(mm8, mmA); /* (00 10 20 30 02 12 22 32) */
mm8 = _mm_unpackhi_pi16(mm8, mmA); /* (04 14 24 34 06 16 26 36) */
mmA = _mm_unpacklo_pi16(mmG, mmD); /* (01 11 21 31 03 13 23 33) */
mmD = _mm_unpackhi_pi16(mmG, mmD); /* (05 15 25 35 07 17 27 37) */
mmH = _mm_unpacklo_pi16(mm8, mmA); /* (A0 B0 C0 D0 A2 B2 C2 D2) */
mm8 = _mm_unpackhi_pi16(mm8, mmA); /* (A4 B4 C4 D4 A6 B6 C6 D6) */
mmA = _mm_unpacklo_pi16(mmG, mmD); /* (A1 B1 C1 D1 A3 B3 C3 D3) */
mmD = _mm_unpackhi_pi16(mmG, mmD); /* (A5 B5 C5 D5 A7 B7 C7 D7) */
mmG = _mm_unpackhi_pi16(mm9, mmE); /* (0C 1C 2C 3C 0E 1E 2E 3E) */
mm9 = _mm_unpacklo_pi16(mm9, mmE); /* (08 18 28 38 0A 1A 2A 3A) */
mmE = _mm_unpacklo_pi16(mmB, mmF); /* (09 19 29 39 0B 1B 2B 3B) */
mmF = _mm_unpackhi_pi16(mmB, mmF); /* (0D 1D 2D 3D 0F 1F 2F 3F) */
mmG = _mm_unpackhi_pi16(mm9, mmE); /* (Ac Bc Cc Dc Ae Be Ce De) */
mm9 = _mm_unpacklo_pi16(mm9, mmE); /* (A8 B8 C8 D8 Aa Ba Ca Da) */
mmE = _mm_unpacklo_pi16(mmB, mmF); /* (A9 B9 C9 D9 Ab Bb Cb Db) */
mmF = _mm_unpackhi_pi16(mmB, mmF); /* (Ad Bd Cd Dd Af Bf Cf Df) */
mmB = _mm_unpackhi_pi32(mmH, mmA); /* (02 12 22 32 03 13 23 33) */
mmA = _mm_unpacklo_pi32(mmH, mmA); /* (00 10 20 30 01 11 21 31) */
mmC = _mm_unpacklo_pi32(mm8, mmD); /* (04 14 24 34 05 15 25 35) */
mmD = _mm_unpackhi_pi32(mm8, mmD); /* (06 16 26 36 07 17 27 37) */
mmB = _mm_unpackhi_pi32(mmH, mmA); /* (A2 B2 C2 D2 A3 B3 C3 D3) */
mmA = _mm_unpacklo_pi32(mmH, mmA); /* (A0 B0 C0 D0 A1 B1 C1 D1) */
mmC = _mm_unpacklo_pi32(mm8, mmD); /* (A4 B4 C4 D4 A5 B5 C5 D5) */
mmD = _mm_unpackhi_pi32(mm8, mmD); /* (A6 B6 C6 D6 A7 B7 C7 D7) */
mmH = _mm_unpackhi_pi32(mmG, mmF); /* (0E 1E 2E 3E 0F 1F 2F 3F) */
mmG = _mm_unpacklo_pi32(mmG, mmF); /* (0C 1C 2C 3C 0D 1D 2D 3D) */
mmF = _mm_unpackhi_pi32(mm9, mmE); /* (0A 1A 2A 3A 0B 1B 2B 3B) */
mmE = _mm_unpacklo_pi32(mm9, mmE); /* (08 18 28 38 09 19 29 39) */
mmH = _mm_unpackhi_pi32(mmG, mmF); /* (Ae Be Ce De Af Bf Cf Df) */
mmG = _mm_unpacklo_pi32(mmG, mmF); /* (Ac Bc Cc Dc Ad Bd Cd Dd) */
mmF = _mm_unpackhi_pi32(mm9, mmE); /* (Aa Ba Ca Da Ab Bb Cb Db) */
mmE = _mm_unpacklo_pi32(mm9, mmE); /* (A8 B8 C8 D8 A9 B9 C9 D9) */
if (num_cols >= 8) {
if (!(((long)outptr) & 7)) {
@@ -505,17 +522,17 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
decenter = 0.0;
decenter = _mm_cmpeq_pi16(decenter, decenter);
decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */
decenter = _mm_slli_pi16(decenter, 7); /* { 0xFF80 0xFF80 0xFF80 0xFF80 } */
cbl = _mm_unpacklo_pi8(cb, zero); /* Cb(0123) */
crl = _mm_unpacklo_pi8(cr, zero); /* Cr(0123) */
cbl = _mm_add_pi16(cbl, decenter);
crl = _mm_add_pi16(crl, decenter);
cbl2 = _mm_add_pi16(cbl, cbl); /* 2*CbL */
crl2 = _mm_add_pi16(crl, crl); /* 2*CrL */
bl = _mm_mulhi_pi16(cbl2, PW_MF0228); /* (2*CbL * -FIX(0.22800) */
rl = _mm_mulhi_pi16(crl2, PW_F0402); /* (2*CrL * FIX(0.40200)) */
cbl2 = _mm_add_pi16(cbl, cbl); /* 2 * CbL */
crl2 = _mm_add_pi16(crl, crl); /* 2 * CrL */
bl = _mm_mulhi_pi16(cbl2, PW_MF0228); /* (2 * CbL * -FIX(0.22800)) */
rl = _mm_mulhi_pi16(crl2, PW_F0402); /* (2 * CrL * FIX(0.40200)) */
bl = _mm_add_pi16(bl, PW_ONE);
bl = _mm_srai_pi16(bl, 1); /* (CbL * -FIX(0.22800)) */
@@ -523,15 +540,15 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
rl = _mm_srai_pi16(rl, 1); /* (CrL * FIX(0.40200)) */
bl = _mm_add_pi16(bl, cbl);
bl = _mm_add_pi16(bl, cbl); /* (CbL * FIX(1.77200))=(B-Y)L */
rl = _mm_add_pi16(rl, crl); /* (CrL * FIX(1.40200))=(R-Y)L */
bl = _mm_add_pi16(bl, cbl); /* (CbL * FIX(1.77200)) = (B - Y)L */
rl = _mm_add_pi16(rl, crl); /* (CrL * FIX(1.40200)) = (R - Y)L */
gl = _mm_unpacklo_pi16(cbl, crl);
gl = _mm_madd_pi16(gl, PW_MF0344_F0285);
gl = _mm_add_pi32(gl, PD_ONEHALF);
gl = _mm_srai_pi32(gl, SCALEBITS);
gl = _mm_packs_pi32(gl, zero); /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
gl = _mm_sub_pi16(gl, crl); /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
gl = _mm_packs_pi32(gl, zero); /* CbL * -FIX(0.344) + CrL * FIX(0.285) */
gl = _mm_sub_pi16(gl, crl); /* CbL * -FIX(0.344) + CrL * -FIX(0.714) = (G - Y)L */
yl = _mm_unpacklo_pi8(y, zero); /* Y(0123) */
rl = _mm_add_pi16(rl, yl); /* (R0 R1 R2 R3) */

View File

@@ -122,46 +122,46 @@ static uint64_t const_value[] = {
__m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
__m64 col0, col1, col2, col3, col4, col5, col6, col7; \
\
row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 01 02 03) */ \
row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \
row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (10 11 12 13) */ \
row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \
row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (20 21 22 23) */ \
row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \
row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (30 31 32 33) */ \
row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \
row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 01 02 03) */ \
row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \
row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (10 11 12 13) */ \
row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \
row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (20 21 22 23) */ \
row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \
row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (30 31 32 33) */ \
row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \
\
/* Transpose coefficients */ \
\
row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a=(20 30 21 31) */ \
row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b=(22 32 23 33) */ \
row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c=(24 34 25 35) */ \
row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d=(26 36 27 37) */ \
row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a = (20 30 21 31) */ \
row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b = (22 32 23 33) */ \
row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c = (24 34 25 35) */ \
row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d = (26 36 27 37) */ \
\
row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a=(00 10 01 11) */ \
row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b=(02 12 03 13) */ \
row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c=(04 14 05 15) */ \
row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d=(06 16 07 17) */ \
row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a = (00 10 01 11) */ \
row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b = (02 12 03 13) */ \
row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c = (04 14 05 15) */ \
row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d = (06 16 07 17) */ \
\
col0 = _mm_unpacklo_pi32(row01a, row23a); /* col0=(00 10 20 30) */ \
col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1=(01 11 21 31) */ \
col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6=(06 16 26 36) */ \
col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7=(07 17 27 37) */ \
col0 = _mm_unpacklo_pi32(row01a, row23a); /* col0 = (00 10 20 30) */ \
col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1 = (01 11 21 31) */ \
col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6 = (06 16 26 36) */ \
col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7 = (07 17 27 37) */ \
\
tmp6 = _mm_sub_pi16(col1, col6); /* tmp6=col1-col6 */ \
tmp7 = _mm_sub_pi16(col0, col7); /* tmp7=col0-col7 */ \
tmp1 = _mm_add_pi16(col1, col6); /* tmp1=col1+col6 */ \
tmp0 = _mm_add_pi16(col0, col7); /* tmp0=col0+col7 */ \
tmp6 = _mm_sub_pi16(col1, col6); /* tmp6 = col1 - col6 */ \
tmp7 = _mm_sub_pi16(col0, col7); /* tmp7 = col0 - col7 */ \
tmp1 = _mm_add_pi16(col1, col6); /* tmp1 = col1 + col6 */ \
tmp0 = _mm_add_pi16(col0, col7); /* tmp0 = col0 + col7 */ \
\
col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2=(02 12 22 32) */ \
col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3=(03 13 23 33) */ \
col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4=(04 14 24 34) */ \
col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5=(05 15 25 35) */ \
col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2 = (02 12 22 32) */ \
col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3 = (03 13 23 33) */ \
col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4 = (04 14 24 34) */ \
col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5 = (05 15 25 35) */ \
\
tmp3 = _mm_add_pi16(col3, col4); /* tmp3=col3+col4 */ \
tmp2 = _mm_add_pi16(col2, col5); /* tmp2=col2+col5 */ \
tmp4 = _mm_sub_pi16(col3, col4); /* tmp4=col3-col4 */ \
tmp5 = _mm_sub_pi16(col2, col5); /* tmp5=col2-col5 */ \
tmp3 = _mm_add_pi16(col3, col4); /* tmp3 = col3 + col4 */ \
tmp2 = _mm_add_pi16(col2, col5); /* tmp2 = col2 + col5 */ \
tmp4 = _mm_sub_pi16(col3, col4); /* tmp4 = col3 - col4 */ \
tmp5 = _mm_sub_pi16(col2, col5); /* tmp5 = col2 - col5 */ \
\
DO_FDCT_COMMON() \
\
@@ -191,35 +191,35 @@ static uint64_t const_value[] = {
\
/* Transpose coefficients */ \
\
col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a=(02 03 12 13) */ \
col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b=(22 23 32 33) */ \
col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c=(42 43 52 53) */ \
col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d=(62 63 72 73) */ \
col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a = (02 03 12 13) */ \
col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b = (22 23 32 33) */ \
col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c = (42 43 52 53) */ \
col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d = (62 63 72 73) */ \
\
col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a=(00 01 10 11) */ \
col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b=(20 21 30 31) */ \
col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c=(40 41 50 51) */ \
col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d=(60 61 70 71) */ \
col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a = (00 01 10 11) */ \
col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b = (20 21 30 31) */ \
col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c = (40 41 50 51) */ \
col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d = (60 61 70 71) */ \
\
row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0=(00 01 02 03) */ \
row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1=(10 11 12 13) */ \
row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6=(60 61 62 63) */ \
row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7=(70 71 72 73) */ \
row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0 = (00 01 02 03) */ \
row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1 = (10 11 12 13) */ \
row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6 = (60 61 62 63) */ \
row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7 = (70 71 72 73) */ \
\
tmp6 = _mm_sub_pi16(row1, row6); /* tmp6=row1-row6 */ \
tmp7 = _mm_sub_pi16(row0, row7); /* tmp7=row0-row7 */ \
tmp1 = _mm_add_pi16(row1, row6); /* tmp1=row1+row6 */ \
tmp0 = _mm_add_pi16(row0, row7); /* tmp0=row0+row7 */ \
tmp6 = _mm_sub_pi16(row1, row6); /* tmp6 = row1 - row6 */ \
tmp7 = _mm_sub_pi16(row0, row7); /* tmp7 = row0 - row7 */ \
tmp1 = _mm_add_pi16(row1, row6); /* tmp1 = row1 + row6 */ \
tmp0 = _mm_add_pi16(row0, row7); /* tmp0 = row0 + row7 */ \
\
row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2=(20 21 22 23) */ \
row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3=(30 31 32 33) */ \
row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4=(40 41 42 43) */ \
row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5=(50 51 52 53) */ \
row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2 = (20 21 22 23) */ \
row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3 = (30 31 32 33) */ \
row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4 = (40 41 42 43) */ \
row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5 = (50 51 52 53) */ \
\
tmp3 = _mm_add_pi16(row3, row4); /* tmp3=row3+row4 */ \
tmp2 = _mm_add_pi16(row2, row5); /* tmp2=row2+row5 */ \
tmp4 = _mm_sub_pi16(row3, row4); /* tmp4=row3-row4 */ \
tmp5 = _mm_sub_pi16(row2, row5); /* tmp5=row2-row5 */ \
tmp3 = _mm_add_pi16(row3, row4); /* tmp3 = row3 + row4 */ \
tmp2 = _mm_add_pi16(row2, row5); /* tmp2 = row2 + row5 */ \
tmp4 = _mm_sub_pi16(row3, row4); /* tmp4 = row3 - row4 */ \
tmp5 = _mm_sub_pi16(row2, row5); /* tmp5 = row2 - row5 */ \
\
DO_FDCT_COMMON() \
\

View File

@@ -237,56 +237,56 @@ static uint64_t const_value[] = {
__m64 col0, col1, col2, col3, col4, col5, col6, col7; \
__m64 tmp10, tmp11; \
\
row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 01 02 03) */ \
row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \
row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (10 11 12 13) */ \
row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \
row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (20 21 22 23) */ \
row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \
row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (30 31 32 33) */ \
row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \
row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 01 02 03) */ \
row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \
row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (10 11 12 13) */ \
row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \
row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (20 21 22 23) */ \
row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \
row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (30 31 32 33) */ \
row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \
\
/* Transpose coefficients */ \
\
row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a=(20 30 21 31) */ \
row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b=(22 32 23 33) */ \
row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c=(24 34 25 35) */ \
row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d=(26 36 27 37) */ \
row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a = (20 30 21 31) */ \
row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b = (22 32 23 33) */ \
row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c = (24 34 25 35) */ \
row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d = (26 36 27 37) */ \
\
row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a=(00 10 01 11) */ \
row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b=(02 12 03 13) */ \
row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c=(04 14 05 15) */ \
row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d=(06 16 07 17) */ \
row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a = (00 10 01 11) */ \
row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b = (02 12 03 13) */ \
row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c = (04 14 05 15) */ \
row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d = (06 16 07 17) */ \
\
col0 = _mm_unpacklo_pi32(row01a, row23a); /* col0=(00 10 20 30) */ \
col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1=(01 11 21 31) */ \
col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6=(06 16 26 36) */ \
col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7=(07 17 27 37) */ \
col0 = _mm_unpacklo_pi32(row01a, row23a); /* col0 = (00 10 20 30) */ \
col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1 = (01 11 21 31) */ \
col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6 = (06 16 26 36) */ \
col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7 = (07 17 27 37) */ \
\
tmp6 = _mm_sub_pi16(col1, col6); /* tmp6=col1-col6 */ \
tmp7 = _mm_sub_pi16(col0, col7); /* tmp7=col0-col7 */ \
tmp1 = _mm_add_pi16(col1, col6); /* tmp1=col1+col6 */ \
tmp0 = _mm_add_pi16(col0, col7); /* tmp0=col0+col7 */ \
tmp6 = _mm_sub_pi16(col1, col6); /* tmp6 = col1 - col6 */ \
tmp7 = _mm_sub_pi16(col0, col7); /* tmp7 = col0 - col7 */ \
tmp1 = _mm_add_pi16(col1, col6); /* tmp1 = col1 + col6 */ \
tmp0 = _mm_add_pi16(col0, col7); /* tmp0 = col0 + col7 */ \
\
col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2=(02 12 22 32) */ \
col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3=(03 13 23 33) */ \
col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4=(04 14 24 34) */ \
col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5=(05 15 25 35) */ \
col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2 = (02 12 22 32) */ \
col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3 = (03 13 23 33) */ \
col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4 = (04 14 24 34) */ \
col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5 = (05 15 25 35) */ \
\
tmp3 = _mm_add_pi16(col3, col4); /* tmp3=col3+col4 */ \
tmp2 = _mm_add_pi16(col2, col5); /* tmp2=col2+col5 */ \
tmp4 = _mm_sub_pi16(col3, col4); /* tmp4=col3-col4 */ \
tmp5 = _mm_sub_pi16(col2, col5); /* tmp5=col2-col5 */ \
tmp3 = _mm_add_pi16(col3, col4); /* tmp3 = col3 + col4 */ \
tmp2 = _mm_add_pi16(col2, col5); /* tmp2 = col2 + col5 */ \
tmp4 = _mm_sub_pi16(col3, col4); /* tmp4 = col3 - col4 */ \
tmp5 = _mm_sub_pi16(col2, col5); /* tmp5 = col2 - col5 */ \
\
/* Even part */ \
\
tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10=tmp0+tmp3 */ \
tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13=tmp0-tmp3 */ \
tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11=tmp1+tmp2 */ \
tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12=tmp1-tmp2 */ \
tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10 = tmp0 + tmp3 */ \
tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13 = tmp0 - tmp3 */ \
tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11 = tmp1 + tmp2 */ \
tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12 = tmp1 - tmp2 */ \
\
out0 = _mm_add_pi16(tmp10, tmp11); /* out0=tmp10+tmp11 */ \
out4 = _mm_sub_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \
out0 = _mm_add_pi16(tmp10, tmp11); /* out0 = tmp10 + tmp11 */ \
out4 = _mm_sub_pi16(tmp10, tmp11); /* out4 = tmp10 - tmp11 */ \
out0 = _mm_slli_pi16(out0, PASS1_BITS); \
out4 = _mm_slli_pi16(out4, PASS1_BITS); \
\
@@ -319,45 +319,45 @@ static uint64_t const_value[] = {
\
/* Transpose coefficients */ \
\
col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a=(02 03 12 13) */ \
col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b=(22 23 32 33) */ \
col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c=(42 43 52 53) */ \
col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d=(62 63 72 73) */ \
col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a = (02 03 12 13) */ \
col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b = (22 23 32 33) */ \
col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c = (42 43 52 53) */ \
col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d = (62 63 72 73) */ \
\
col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a=(00 01 10 11) */ \
col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b=(20 21 30 31) */ \
col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c=(40 41 50 51) */ \
col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d=(60 61 70 71) */ \
col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a = (00 01 10 11) */ \
col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b = (20 21 30 31) */ \
col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c = (40 41 50 51) */ \
col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d = (60 61 70 71) */ \
\
row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0=(00 01 02 03) */ \
row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1=(10 11 12 13) */ \
row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6=(60 61 62 63) */ \
row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7=(70 71 72 73) */ \
row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0 = (00 01 02 03) */ \
row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1 = (10 11 12 13) */ \
row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6 = (60 61 62 63) */ \
row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7 = (70 71 72 73) */ \
\
tmp6 = _mm_sub_pi16(row1, row6); /* tmp6=row1-row6 */ \
tmp7 = _mm_sub_pi16(row0, row7); /* tmp7=row0-row7 */ \
tmp1 = _mm_add_pi16(row1, row6); /* tmp1=row1+row6 */ \
tmp0 = _mm_add_pi16(row0, row7); /* tmp0=row0+row7 */ \
tmp6 = _mm_sub_pi16(row1, row6); /* tmp6 = row1 - row6 */ \
tmp7 = _mm_sub_pi16(row0, row7); /* tmp7 = row0 - row7 */ \
tmp1 = _mm_add_pi16(row1, row6); /* tmp1 = row1 + row6 */ \
tmp0 = _mm_add_pi16(row0, row7); /* tmp0 = row0 + row7 */ \
\
row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2=(20 21 22 23) */ \
row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3=(30 31 32 33) */ \
row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4=(40 41 42 43) */ \
row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5=(50 51 52 53) */ \
row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2 = (20 21 22 23) */ \
row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3 = (30 31 32 33) */ \
row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4 = (40 41 42 43) */ \
row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5 = (50 51 52 53) */ \
\
tmp3 = _mm_add_pi16(row3, row4); /* tmp3=row3+row4 */ \
tmp2 = _mm_add_pi16(row2, row5); /* tmp2=row2+row5 */ \
tmp4 = _mm_sub_pi16(row3, row4); /* tmp4=row3-row4 */ \
tmp5 = _mm_sub_pi16(row2, row5); /* tmp5=row2-row5 */ \
tmp3 = _mm_add_pi16(row3, row4); /* tmp3 = row3 + row4 */ \
tmp2 = _mm_add_pi16(row2, row5); /* tmp2 = row2 + row5 */ \
tmp4 = _mm_sub_pi16(row3, row4); /* tmp4 = row3 - row4 */ \
tmp5 = _mm_sub_pi16(row2, row5); /* tmp5 = row2 - row5 */ \
\
/* Even part */ \
\
tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10=tmp0+tmp3 */ \
tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13=tmp0-tmp3 */ \
tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11=tmp1+tmp2 */ \
tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12=tmp1-tmp2 */ \
tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10 = tmp0 + tmp3 */ \
tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13 = tmp0 - tmp3 */ \
tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11 = tmp1 + tmp2 */ \
tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12 = tmp1 - tmp2 */ \
\
out0 = _mm_add_pi16(tmp10, tmp11); /* out0=tmp10+tmp11 */ \
out4 = _mm_sub_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \
out0 = _mm_add_pi16(tmp10, tmp11); /* out0 = tmp10 + tmp11 */ \
out4 = _mm_sub_pi16(tmp10, tmp11); /* out4 = tmp10 - tmp11 */ \
\
out0 = _mm_add_pi16(out0, PW_DESCALE_P2X); \
out4 = _mm_add_pi16(out4, PW_DESCALE_P2X); \

View File

@@ -34,11 +34,11 @@
#define CONST_BITS 8
#define PASS1_BITS 2
#define FIX_1_082 ((short)277) /* FIX(1.082392200) */
#define FIX_1_414 ((short)362) /* FIX(1.414213562) */
#define FIX_1_847 ((short)473) /* FIX(1.847759065) */
#define FIX_2_613 ((short)669) /* FIX(2.613125930) */
#define FIX_1_613 ((short)(FIX_2_613 - 256 * 3)) /* FIX(2.613125930) - FIX(1) */
#define FIX_1_082 ((short)277) /* FIX(1.082392200) */
#define FIX_1_414 ((short)362) /* FIX(1.414213562) */
#define FIX_1_847 ((short)473) /* FIX(1.847759065) */
#define FIX_2_613 ((short)669) /* FIX(2.613125930) */
#define FIX_1_613 ((short)(FIX_2_613 - 256 * 3)) /* FIX(2.613125930) - FIX(1) */
#define PRE_MULTIPLY_SCALE_BITS 2
#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
@@ -156,15 +156,15 @@ static uint64_t const_value[] = {
\
quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
\
dcval = _mm_mullo_pi16(col0l, quant0l); /* dcval=(00 10 20 30) */ \
dcval = _mm_mullo_pi16(col0l, quant0l); /* dcval = (00 10 20 30) */ \
\
dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall=(00 00 10 10) */ \
dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh=(20 20 30 30) */ \
dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall = (00 00 10 10) */ \
dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh = (20 20 30 30) */ \
\
row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0=(00 00 00 00) */ \
row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1=(10 10 10 10) */ \
row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2=(20 20 20 20) */ \
row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3=(30 30 30 30) */ \
row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0 = (00 00 00 00) */ \
row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1 = (10 10 10 10) */ \
row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2 = (20 20 20 20) */ \
row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3 = (30 30 30 30) */ \
\
_mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \
_mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \
@@ -234,32 +234,33 @@ static uint64_t const_value[] = {
\
DO_IDCT_COMMON() \
\
/* out0=(00 10 20 30), out1=(01 11 21 31) */ \
/* out2=(02 12 22 32), out3=(03 13 23 33) */ \
/* out4=(04 14 24 34), out5=(05 15 25 35) */ \
/* out6=(06 16 26 36), out7=(07 17 27 37) */ \
/* out0 = (00 10 20 30), out1 = (01 11 21 31) \
* out2 = (02 12 22 32), out3 = (03 13 23 33) \
* out4 = (04 14 24 34), out5 = (05 15 25 35) \
* out6 = (06 16 26 36), out7 = (07 17 27 37) \
*/ \
\
/* Transpose coefficients */ \
\
row01a = _mm_unpacklo_pi16(out0, out1); /* row01a=(00 01 10 11) */ \
row23a = _mm_unpackhi_pi16(out0, out1); /* row23a=(20 21 30 31) */ \
row01d = _mm_unpacklo_pi16(out6, out7); /* row01d=(06 07 16 17) */ \
row23d = _mm_unpackhi_pi16(out6, out7); /* row23d=(26 27 36 37) */ \
row01a = _mm_unpacklo_pi16(out0, out1); /* row01a = (00 01 10 11) */ \
row23a = _mm_unpackhi_pi16(out0, out1); /* row23a = (20 21 30 31) */ \
row01d = _mm_unpacklo_pi16(out6, out7); /* row01d = (06 07 16 17) */ \
row23d = _mm_unpackhi_pi16(out6, out7); /* row23d = (26 27 36 37) */ \
\
row01b = _mm_unpacklo_pi16(out2, out3); /* row01b=(02 03 12 13) */ \
row23b = _mm_unpackhi_pi16(out2, out3); /* row23b=(22 23 32 33) */ \
row01c = _mm_unpacklo_pi16(out4, out5); /* row01c=(04 05 14 15) */ \
row23c = _mm_unpackhi_pi16(out4, out5); /* row23c=(24 25 34 35) */ \
row01b = _mm_unpacklo_pi16(out2, out3); /* row01b = (02 03 12 13) */ \
row23b = _mm_unpackhi_pi16(out2, out3); /* row23b = (22 23 32 33) */ \
row01c = _mm_unpacklo_pi16(out4, out5); /* row01c = (04 05 14 15) */ \
row23c = _mm_unpackhi_pi16(out4, out5); /* row23c = (24 25 34 35) */ \
\
row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l=(00 01 02 03) */ \
row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l=(10 11 12 13) */ \
row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l=(20 21 22 23) */ \
row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l=(30 31 32 33) */ \
row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l = (00 01 02 03) */ \
row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l = (10 11 12 13) */ \
row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l = (20 21 22 23) */ \
row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l = (30 31 32 33) */ \
\
row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h=(04 05 06 07) */ \
row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h=(14 15 16 17) */ \
row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h=(24 25 26 27) */ \
row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h=(34 35 36 37) */ \
row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h = (04 05 06 07) */ \
row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h = (14 15 16 17) */ \
row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h = (24 25 26 27) */ \
row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h = (34 35 36 37) */ \
\
_mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \
_mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \
@@ -312,11 +313,11 @@ static uint64_t const_value[] = {
\
DO_IDCT_COMMON() \
\
/* out0=(00 01 02 03), out1=(10 11 12 13) */ \
/* out2=(20 21 22 23), out3=(30 31 32 33) */ \
/* out4=(40 41 42 43), out5=(50 51 52 53) */ \
/* out6=(60 61 62 63), out7=(70 71 72 73) */ \
\
/* out0 = (00 01 02 03), out1 = (10 11 12 13) \
* out2 = (20 21 22 23), out3 = (30 31 32 33) \
* out4 = (40 41 42 43), out5 = (50 51 52 53) \
* out6 = (60 61 62 63), out7 = (70 71 72 73) \
*/ \
out0 = _mm_srai_pi16(out0, PASS1_BITS + 3); \
out1 = _mm_srai_pi16(out1, PASS1_BITS + 3); \
out2 = _mm_srai_pi16(out2, PASS1_BITS + 3); \
@@ -326,10 +327,10 @@ static uint64_t const_value[] = {
out6 = _mm_srai_pi16(out6, PASS1_BITS + 3); \
out7 = _mm_srai_pi16(out7, PASS1_BITS + 3); \
\
row06 = _mm_packs_pi16(out0, out6); /* row06=(00 01 02 03 60 61 62 63) */ \
row17 = _mm_packs_pi16(out1, out7); /* row17=(10 11 12 13 70 71 72 73) */ \
row24 = _mm_packs_pi16(out2, out4); /* row24=(20 21 22 23 40 41 42 43) */ \
row35 = _mm_packs_pi16(out3, out5); /* row35=(30 31 32 33 50 51 52 53) */ \
row06 = _mm_packs_pi16(out0, out6); /* row06 = (00 01 02 03 60 61 62 63) */ \
row17 = _mm_packs_pi16(out1, out7); /* row17 = (10 11 12 13 70 71 72 73) */ \
row24 = _mm_packs_pi16(out2, out4); /* row24 = (20 21 22 23 40 41 42 43) */ \
row35 = _mm_packs_pi16(out3, out5); /* row35 = (30 31 32 33 50 51 52 53) */ \
\
row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \
row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \
@@ -338,20 +339,20 @@ static uint64_t const_value[] = {
\
/* Transpose coefficients */ \
\
col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a=(00 10 01 11 02 12 03 13) */ \
col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d=(60 70 61 71 62 72 63 73) */ \
col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b=(20 30 21 31 22 32 23 33) */ \
col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c=(40 50 41 51 42 52 43 53) */ \
col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a = (00 10 01 11 02 12 03 13) */ \
col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d = (60 70 61 71 62 72 63 73) */ \
col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b = (20 30 21 31 22 32 23 33) */ \
col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c = (40 50 41 51 42 52 43 53) */ \
\
col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l=(00 10 20 30 01 11 21 31) */ \
col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l=(02 12 22 32 03 13 23 33) */ \
col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h=(40 50 60 70 41 51 61 71) */ \
col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h=(42 52 62 72 43 53 63 73) */ \
col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l = (00 10 20 30 01 11 21 31) */ \
col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l = (02 12 22 32 03 13 23 33) */ \
col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h = (40 50 60 70 41 51 61 71) */ \
col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h = (42 52 62 72 43 53 63 73) */ \
\
col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0=(00 10 20 30 40 50 60 70) */ \
col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1=(01 11 21 31 41 51 61 71) */ \
col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2=(02 12 22 32 42 52 62 72) */ \
col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3=(03 13 23 33 43 53 63 73) */ \
col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0 = (00 10 20 30 40 50 60 70) */ \
col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1 = (01 11 21 31 41 51 61 71) */ \
col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2 = (02 12 22 32 42 52 62 72) */ \
col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3 = (03 13 23 33 43 53 63 73) */ \
\
_mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \
_mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \

View File

@@ -40,18 +40,18 @@
#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
#define CENTERJSAMPLE 128
/* Fixed-point multipliers: each value equals the quoted real constant scaled
 * by 8192 (2^13) and rounded to the nearest integer.  CONST_BITS is defined
 * outside this excerpt -- presumably 13; confirm.  All values fit in a signed
 * 16-bit short.
 */
#define FIX_0_298 ((short)2446) /* FIX(0.298631336) */
#define FIX_0_390 ((short)3196) /* FIX(0.390180644) */
#define FIX_0_899 ((short)7373) /* FIX(0.899976223) */
#define FIX_0_541 ((short)4433) /* FIX(0.541196100) */
#define FIX_0_765 ((short)6270) /* FIX(0.765366865) */
#define FIX_1_175 ((short)9633) /* FIX(1.175875602) */
#define FIX_1_501 ((short)12299) /* FIX(1.501321110) */
#define FIX_1_847 ((short)15137) /* FIX(1.847759065) */
#define FIX_1_961 ((short)16069) /* FIX(1.961570560) */
#define FIX_2_053 ((short)16819) /* FIX(2.053119869) */
#define FIX_2_562 ((short)20995) /* FIX(2.562915447) */
#define FIX_3_072 ((short)25172) /* FIX(3.072711026) */
#define FIX_0_298 ((short)2446) /* FIX(0.298631336) */
#define FIX_0_390 ((short)3196) /* FIX(0.390180644) */
#define FIX_0_899 ((short)7373) /* FIX(0.899976223) */
#define FIX_0_541 ((short)4433) /* FIX(0.541196100) */
#define FIX_0_765 ((short)6270) /* FIX(0.765366865) */
#define FIX_1_175 ((short)9633) /* FIX(1.175875602) */
#define FIX_1_501 ((short)12299) /* FIX(1.501321110) */
#define FIX_1_847 ((short)15137) /* FIX(1.847759065) */
#define FIX_1_961 ((short)16069) /* FIX(1.961570560) */
#define FIX_2_053 ((short)16819) /* FIX(2.053119869) */
#define FIX_2_562 ((short)20995) /* FIX(2.562915447) */
#define FIX_3_072 ((short)25172) /* FIX(3.072711026) */
enum const_index {
index_PW_F130_F054,
@@ -293,15 +293,15 @@ static uint64_t const_value[] = {
quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
\
dcval = _mm_mullo_pi16(col0l, quant0l); \
dcval = _mm_slli_pi16(dcval, PASS1_BITS); /* dcval=(00 10 20 30) */ \
dcval = _mm_slli_pi16(dcval, PASS1_BITS); /* dcval = (00 10 20 30) */ \
\
dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall=(00 00 10 10) */ \
dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh=(20 20 30 30) */ \
dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall = (00 00 10 10) */ \
dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh = (20 20 30 30) */ \
\
row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0=(00 00 00 00) */ \
row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1=(10 10 10 10) */ \
row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2=(20 20 20 20) */ \
row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3=(30 30 30 30) */ \
row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0 = (00 00 00 00) */ \
row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1 = (10 10 10 10) */ \
row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2 = (20 20 20 20) */ \
row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3 = (30 30 30 30) */ \
\
_mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \
_mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \
@@ -392,32 +392,33 @@ static uint64_t const_value[] = {
\
DO_IDCT_COMMON(1) \
\
/* out0=(00 10 20 30), out1=(01 11 21 31) */ \
/* out2=(02 12 22 32), out3=(03 13 23 33) */ \
/* out4=(04 14 24 34), out5=(05 15 25 35) */ \
/* out6=(06 16 26 36), out7=(07 17 27 37) */ \
/* out0 = (00 10 20 30), out1 = (01 11 21 31) \
* out2 = (02 12 22 32), out3 = (03 13 23 33) \
* out4 = (04 14 24 34), out5 = (05 15 25 35) \
* out6 = (06 16 26 36), out7 = (07 17 27 37) \
*/ \
\
/* Transpose coefficients */ \
\
row01a = _mm_unpacklo_pi16(out0, out1); /* row01a=(00 01 10 11) */ \
row23a = _mm_unpackhi_pi16(out0, out1); /* row23a=(20 21 30 31) */ \
row01d = _mm_unpacklo_pi16(out6, out7); /* row01d=(06 07 16 17) */ \
row23d = _mm_unpackhi_pi16(out6, out7); /* row23d=(26 27 36 37) */ \
row01a = _mm_unpacklo_pi16(out0, out1); /* row01a = (00 01 10 11) */ \
row23a = _mm_unpackhi_pi16(out0, out1); /* row23a = (20 21 30 31) */ \
row01d = _mm_unpacklo_pi16(out6, out7); /* row01d = (06 07 16 17) */ \
row23d = _mm_unpackhi_pi16(out6, out7); /* row23d = (26 27 36 37) */ \
\
row01b = _mm_unpacklo_pi16(out2, out3); /* row01b=(02 03 12 13) */ \
row23b = _mm_unpackhi_pi16(out2, out3); /* row23b=(22 23 32 33) */ \
row01c = _mm_unpacklo_pi16(out4, out5); /* row01c=(04 05 14 15) */ \
row23c = _mm_unpackhi_pi16(out4, out5); /* row23c=(24 25 34 35) */ \
row01b = _mm_unpacklo_pi16(out2, out3); /* row01b = (02 03 12 13) */ \
row23b = _mm_unpackhi_pi16(out2, out3); /* row23b = (22 23 32 33) */ \
row01c = _mm_unpacklo_pi16(out4, out5); /* row01c = (04 05 14 15) */ \
row23c = _mm_unpackhi_pi16(out4, out5); /* row23c = (24 25 34 35) */ \
\
row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l=(00 01 02 03) */ \
row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l=(10 11 12 13) */ \
row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l=(20 21 22 23) */ \
row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l=(30 31 32 33) */ \
row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l = (00 01 02 03) */ \
row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l = (10 11 12 13) */ \
row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l = (20 21 22 23) */ \
row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l = (30 31 32 33) */ \
\
row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h=(04 05 06 07) */ \
row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h=(14 15 16 17) */ \
row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h=(24 25 26 27) */ \
row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h=(34 35 36 37) */ \
row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h = (04 05 06 07) */ \
row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h = (14 15 16 17) */ \
row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h = (24 25 26 27) */ \
row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h = (34 35 36 37) */ \
\
_mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \
_mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \
@@ -497,15 +498,15 @@ static uint64_t const_value[] = {
\
DO_IDCT_COMMON(2) \
\
/* out0=(00 01 02 03), out1=(10 11 12 13) */ \
/* out2=(20 21 22 23), out3=(30 31 32 33) */ \
/* out4=(40 41 42 43), out5=(50 51 52 53) */ \
/* out6=(60 61 62 63), out7=(70 71 72 73) */ \
\
row06 = _mm_packs_pi16(out0, out6); /* row06=(00 01 02 03 60 61 62 63) */ \
row17 = _mm_packs_pi16(out1, out7); /* row17=(10 11 12 13 70 71 72 73) */ \
row24 = _mm_packs_pi16(out2, out4); /* row24=(20 21 22 23 40 41 42 43) */ \
row35 = _mm_packs_pi16(out3, out5); /* row35=(30 31 32 33 50 51 52 53) */ \
/* out0 = (00 01 02 03), out1 = (10 11 12 13) \
* out2 = (20 21 22 23), out3 = (30 31 32 33) \
* out4 = (40 41 42 43), out5 = (50 51 52 53) \
* out6 = (60 61 62 63), out7 = (70 71 72 73) \
*/ \
row06 = _mm_packs_pi16(out0, out6); /* row06 = (00 01 02 03 60 61 62 63) */ \
row17 = _mm_packs_pi16(out1, out7); /* row17 = (10 11 12 13 70 71 72 73) */ \
row24 = _mm_packs_pi16(out2, out4); /* row24 = (20 21 22 23 40 41 42 43) */ \
row35 = _mm_packs_pi16(out3, out5); /* row35 = (30 31 32 33 50 51 52 53) */ \
\
row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \
row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \
@@ -514,20 +515,20 @@ static uint64_t const_value[] = {
\
/* Transpose coefficients */ \
\
col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a=(00 10 01 11 02 12 03 13) */ \
col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d=(60 70 61 71 62 72 63 73) */ \
col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b=(20 30 21 31 22 32 23 33) */ \
col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c=(40 50 41 51 42 52 43 53) */ \
col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a = (00 10 01 11 02 12 03 13) */ \
col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d = (60 70 61 71 62 72 63 73) */ \
col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b = (20 30 21 31 22 32 23 33) */ \
col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c = (40 50 41 51 42 52 43 53) */ \
\
col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l=(00 10 20 30 01 11 21 31) */ \
col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l=(02 12 22 32 03 13 23 33) */ \
col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h=(40 50 60 70 41 51 61 71) */ \
col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h=(42 52 62 72 43 53 63 73) */ \
col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l = (00 10 20 30 01 11 21 31) */ \
col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l = (02 12 22 32 03 13 23 33) */ \
col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h = (40 50 60 70 41 51 61 71) */ \
col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h = (42 52 62 72 43 53 63 73) */ \
\
col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0=(00 10 20 30 40 50 60 70) */ \
col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1=(01 11 21 31 41 51 61 71) */ \
col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2=(02 12 22 32 42 52 62 72) */ \
col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3=(03 13 23 33 43 53 63 73) */ \
col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0 = (00 10 20 30 40 50 60 70) */ \
col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1 = (01 11 21 31 41 51 61 71) */ \
col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2 = (02 12 22 32 42 52 62 72) */ \
col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3 = (03 13 23 33 43 53 63 73) */ \
\
_mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \
_mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \

View File

@@ -6,7 +6,7 @@
* Authors: ZhuChen <zhuchen@loongson.cn>
* CaiWanwei <caiwanwei@loongson.cn>
* SunZhangzhi <sunzhangzhi-cq@loongson.cn>
* Copyright (C) 2018-2019, D. R. Commander. All Rights Reserved.
* Copyright (C) 2018-2019, 2025, D. R. Commander. All Rights Reserved.
*
* Based on the x86 SIMD extension for IJG JPEG library
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -62,9 +62,10 @@
rowl = _mm_mulhi_pi16(rowl, recipl); \
rowh = _mm_mulhi_pi16(rowh, reciph); \
\
/* reciprocal is always negative (MSB=1), so we always need to add the */ \
/* initial value (input value is never negative as we inverted it at the */ \
/* start of this routine) */ \
/* reciprocal is always negative (MSB = 1), so we always need to add the \
* initial value. (The input value is never negative, as we inverted it at \
* the start of this routine.) \
*/ \
rowlsave = rowl = _mm_add_pi16(rowl, rowlsave); \
rowhsave = rowh = _mm_add_pi16(rowh, rowhsave); \
\