mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2026-01-12 00:06:51 +08:00
avutil/crc: Use x86 clmul for CRC when available
Observed near 10x speedup on AMD Zen4 7950x: av_crc_c: 22057.0 ( 1.00x) av_crc_clmul: 2202.8 (10.01x) Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
@@ -25,6 +25,9 @@
|
||||
#include "bswap.h"
|
||||
#include "crc.h"
|
||||
#include "error.h"
|
||||
#if ARCH_X86
|
||||
#include "libavutil/x86/crc.h"
|
||||
#endif
|
||||
|
||||
#if CONFIG_HARDCODED_TABLES
|
||||
static const AVCRC av_crc_table[AV_CRC_MAX][257] = {
|
||||
@@ -348,6 +351,12 @@ int av_crc_init(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size)
|
||||
if (ctx_size != sizeof(AVCRC) * 257 && ctx_size != sizeof(AVCRC) * 1024)
|
||||
return AVERROR(EINVAL);
|
||||
|
||||
#if ARCH_X86
|
||||
int done = ff_crc_init_x86(ctx, le, bits, poly, ctx_size);
|
||||
if (done)
|
||||
return 0;
|
||||
#endif
|
||||
|
||||
for (i = 0; i < 256; i++) {
|
||||
if (le) {
|
||||
for (c = i, j = 0; j < 8; j++)
|
||||
@@ -375,6 +384,14 @@ const AVCRC *av_crc_get_table(AVCRCId crc_id)
|
||||
{
|
||||
if ((unsigned)crc_id >= AV_CRC_MAX)
|
||||
return NULL;
|
||||
// Check for arch-specific extensions first to avoid initializing
|
||||
// ordinary CRC tables unnecessarily.
|
||||
#if ARCH_X86
|
||||
const AVCRC *table = ff_crc_get_table_x86(crc_id);
|
||||
if (table)
|
||||
return table;
|
||||
#endif
|
||||
|
||||
#if !CONFIG_HARDCODED_TABLES
|
||||
switch (crc_id) {
|
||||
case AV_CRC_8_ATM: CRC_INIT_TABLE_ONCE(AV_CRC_8_ATM); break;
|
||||
@@ -394,6 +411,13 @@ const AVCRC *av_crc_get_table(AVCRCId crc_id)
|
||||
uint32_t av_crc(const AVCRC *ctx, uint32_t crc,
|
||||
const uint8_t *buffer, size_t length)
|
||||
{
|
||||
if (ctx[0]) {
|
||||
#if ARCH_X86
|
||||
return ff_crc_x86(ctx, crc, buffer, length);
|
||||
#endif
|
||||
}
|
||||
av_assert2(ctx[0] == 0);
|
||||
|
||||
const uint8_t *end = buffer + length;
|
||||
|
||||
#if !CONFIG_SMALL
|
||||
|
||||
@@ -4,6 +4,7 @@ EMMS_OBJS_$(HAVE_MMX_INLINE)_$(HAVE_MMX_EXTERNAL)_$(HAVE_MM_EMPTY) = x86/emms.o
|
||||
|
||||
X86ASM-OBJS += x86/aes.o x86/aes_init.o \
|
||||
x86/cpuid.o \
|
||||
x86/crc.o \
|
||||
$(EMMS_OBJS__yes_) \
|
||||
x86/fixed_dsp.o x86/fixed_dsp_init.o \
|
||||
x86/float_dsp.o x86/float_dsp_init.o \
|
||||
|
||||
@@ -138,6 +138,7 @@ SECTION .text
|
||||
%endmacro
|
||||
|
||||
%macro CRC 1
|
||||
%define CTX r0+4
|
||||
;-----------------------------------------------------------------------------------------------
|
||||
; ff_crc[_le]_clmul(const uint8_t *ctx, uint32_t crc, const uint8_t *buffer, size_t length
|
||||
;-----------------------------------------------------------------------------------------------
|
||||
@@ -177,7 +178,7 @@ cglobal crc, 4, 6, 7+4*ARCH_X86_64, 0x10
|
||||
mov r4, 64
|
||||
cmp r3, 128
|
||||
jb .reduce_4x_to_1
|
||||
movu m4, [r0]
|
||||
movu m4, [CTX]
|
||||
|
||||
.fold_4x_loop:
|
||||
movu m6, [r2 + r4 + 0]
|
||||
@@ -200,7 +201,7 @@ cglobal crc, 4, 6, 7+4*ARCH_X86_64, 0x10
|
||||
jbe .fold_4x_loop
|
||||
|
||||
.reduce_4x_to_1:
|
||||
movu m4, [r0 + 16]
|
||||
movu m4, [CTX + 16]
|
||||
FOLD_SINGLE m5, m1, m4, m3
|
||||
FOLD_SINGLE m5, m1, m4, m2
|
||||
FOLD_SINGLE m5, m1, m4, m0
|
||||
@@ -245,10 +246,10 @@ cglobal crc, 4, 6, 7+4*ARCH_X86_64, 0x10
|
||||
FOLD_SINGLE m5, m1, m4, m2
|
||||
|
||||
.reduce_128_to_64:
|
||||
movu m4, [r0 + 32]
|
||||
movu m4, [CTX + 32]
|
||||
FOLD_128_TO_64 %1, m1, m4, m5
|
||||
.reduce_64_to_32:
|
||||
movu m4, [r0 + 48]
|
||||
movu m4, [CTX + 48]
|
||||
FOLD_64_TO_32 %1, m1, m4, m5
|
||||
RET
|
||||
|
||||
@@ -261,7 +262,7 @@ cglobal crc, 4, 6, 7+4*ARCH_X86_64, 0x10
|
||||
pshufb m1, m10
|
||||
%endif
|
||||
mov r4, 16
|
||||
movu m4, [r0 + 16]
|
||||
movu m4, [CTX + 16]
|
||||
jmp .fold_1x_pre
|
||||
|
||||
.less_than_16bytes:
|
||||
|
||||
@@ -24,8 +24,10 @@
|
||||
#include "config.h"
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/attributes_internal.h"
|
||||
#include "libavutil/avassert.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/crc.h"
|
||||
#include "libavutil/intreadwrite.h"
|
||||
#include "libavutil/reverse.h"
|
||||
#include "libavutil/x86/cpu.h"
|
||||
|
||||
@@ -37,50 +39,64 @@ uint32_t ff_crc_le_clmul(const AVCRC *ctx, uint32_t crc,
|
||||
const uint8_t *buffer, size_t length);
|
||||
FF_VISIBILITY_POP_HIDDEN
|
||||
|
||||
static const AVCRC crc_table_clmul[AV_CRC_MAX][16] = {
|
||||
enum {
|
||||
CRC_C = 0,
|
||||
CLMUL_BE,
|
||||
CLMUL_LE,
|
||||
};
|
||||
|
||||
static const AVCRC crc_table_clmul[AV_CRC_MAX][17] = {
|
||||
[AV_CRC_8_ATM] = {
|
||||
CLMUL_BE,
|
||||
0x32000000, 0x0, 0xbc000000, 0x0,
|
||||
0xc4000000, 0x0, 0x94000000, 0x0,
|
||||
0x62000000, 0x0, 0x79000000, 0x0,
|
||||
0x07156a16, 0x1, 0x07000000, 0x1,
|
||||
},
|
||||
[AV_CRC_8_EBU] = {
|
||||
CLMUL_BE,
|
||||
0xb5000000, 0x0, 0xf3000000, 0x0,
|
||||
0xfc000000, 0x0, 0x0d000000, 0x0,
|
||||
0x6a000000, 0x0, 0x65000000, 0x0,
|
||||
0x1c4b8192, 0x1, 0x1d000000, 0x1,
|
||||
},
|
||||
[AV_CRC_16_ANSI] = {
|
||||
CLMUL_BE,
|
||||
0xf9e30000, 0x0, 0x807d0000, 0x0,
|
||||
0xf9130000, 0x0, 0xff830000, 0x0,
|
||||
0x807b0000, 0x0, 0x86630000, 0x0,
|
||||
0xfffbffe7, 0x1, 0x80050000, 0x1,
|
||||
},
|
||||
[AV_CRC_16_CCITT] = {
|
||||
CLMUL_BE,
|
||||
0x60190000, 0x0, 0x59b00000, 0x0,
|
||||
0xd5f60000, 0x0, 0x45630000, 0x0,
|
||||
0xaa510000, 0x0, 0xeb230000, 0x0,
|
||||
0x11303471, 0x1, 0x10210000, 0x1,
|
||||
},
|
||||
[AV_CRC_24_IEEE] = {
|
||||
CLMUL_BE,
|
||||
0x1f428700, 0x0, 0x467d2400, 0x0,
|
||||
0x2c8c9d00, 0x0, 0x64e4d700, 0x0,
|
||||
0xd9fe8c00, 0x0, 0xfd7e0c00, 0x0,
|
||||
0xf845fe24, 0x1, 0x864cfb00, 0x1,
|
||||
},
|
||||
[AV_CRC_32_IEEE] = {
|
||||
CLMUL_BE,
|
||||
0x8833794c, 0x0, 0xe6228b11, 0x0,
|
||||
0xc5b9cd4c, 0x0, 0xe8a45605, 0x0,
|
||||
0x490d678d, 0x0, 0xf200aa66, 0x0,
|
||||
0x04d101df, 0x1, 0x04c11db7, 0x1,
|
||||
},
|
||||
[AV_CRC_32_IEEE_LE] = {
|
||||
CLMUL_LE,
|
||||
0xc6e41596, 0x1, 0x54442bd4, 0x1,
|
||||
0xccaa009e, 0x0, 0x751997d0, 0x1,
|
||||
0xccaa009e, 0x0, 0x63cd6124, 0x1,
|
||||
0xf7011640, 0x1, 0xdb710641, 0x1,
|
||||
},
|
||||
[AV_CRC_16_ANSI_LE] = {
|
||||
CLMUL_LE,
|
||||
0x0000bffa, 0x0, 0x1b0c2, 0x0,
|
||||
0x00018cc2, 0x0, 0x1d0c2, 0x0,
|
||||
0x00018cc2, 0x0, 0x1bc02, 0x0,
|
||||
@@ -139,9 +155,10 @@ static inline void crc_init_x86(AVCRC *ctx, int le, int bits, uint32_t poly, int
|
||||
// convert to 32 degree polynomial
|
||||
poly_ = ((uint64_t)poly) << (32 - bits);
|
||||
|
||||
uint64_t x1, x2, x3, x4, x5, x6, x7, x8, div;
|
||||
uint8_t *dst = (uint8_t*)ctx;
|
||||
uint64_t div;
|
||||
uint8_t *dst = (uint8_t*)(ctx + 1);
|
||||
if (le) {
|
||||
ctx[0] = CLMUL_LE;
|
||||
AV_WN64(dst, xnmodp(4 * 128 - 32, poly_, 32, &div, le));
|
||||
AV_WN64(dst + 8, xnmodp(4 * 128 + 32, poly_, 32, &div, le));
|
||||
uint64_t tmp = xnmodp(128 - 32, poly_, 32, &div, le);
|
||||
@@ -152,6 +169,7 @@ static inline void crc_init_x86(AVCRC *ctx, int le, int bits, uint32_t poly, int
|
||||
AV_WN64(dst + 48, div);
|
||||
AV_WN64(dst + 56, reverse(poly_ | (1ULL << 32), 32));
|
||||
} else {
|
||||
ctx[0] = CLMUL_BE;
|
||||
AV_WN64(dst, xnmodp(4 * 128 + 64, poly_, 32, &div, le));
|
||||
AV_WN64(dst + 8, xnmodp(4 * 128, poly_, 32, &div, le));
|
||||
AV_WN64(dst + 16, xnmodp(128 + 64, poly_, 32, &div, le));
|
||||
@@ -164,4 +182,42 @@ static inline void crc_init_x86(AVCRC *ctx, int le, int bits, uint32_t poly, int
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline const AVCRC *ff_crc_get_table_x86(AVCRCId crc_id)
|
||||
{
|
||||
#if HAVE_CLMUL_EXTERNAL
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (EXTERNAL_CLMUL(cpu_flags)) {
|
||||
return crc_table_clmul[crc_id];
|
||||
}
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline av_cold int ff_crc_init_x86(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size)
|
||||
{
|
||||
#if HAVE_CLMUL_EXTERNAL
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (EXTERNAL_CLMUL(cpu_flags)) {
|
||||
crc_init_x86(ctx, le, bits, poly, ctx_size);
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline uint32_t ff_crc_x86(const AVCRC *ctx, uint32_t crc,
|
||||
const uint8_t *buffer, size_t length)
|
||||
{
|
||||
switch (ctx[0]) {
|
||||
#if HAVE_CLMUL_EXTERNAL
|
||||
case CLMUL_BE: return ff_crc_clmul(ctx, crc, buffer, length);
|
||||
case CLMUL_LE: return ff_crc_le_clmul(ctx, crc, buffer, length);
|
||||
#endif
|
||||
default: av_unreachable("x86 CRC only uses CLMUL_BE and CLMUL_LE");
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif /* AVUTIL_X86_CRC_H */
|
||||
|
||||
Reference in New Issue
Block a user