|
@@ -38,22 +38,23 @@
|
|
#include "bulk_crc32.h"
|
|
#include "bulk_crc32.h"
|
|
#include "gcc_optimizations.h"
|
|
#include "gcc_optimizations.h"
|
|
|
|
|
|
-#if (!defined(__FreeBSD__) && !defined(WINDOWS))
|
|
|
|
-#define USE_PIPELINED
|
|
|
|
-#endif
|
|
|
|
-
|
|
|
|
#define CRC_INITIAL_VAL 0xffffffff
|
|
#define CRC_INITIAL_VAL 0xffffffff
|
|
|
|
|
|
-typedef uint32_t (*crc_update_func_t)(uint32_t, const uint8_t *, size_t);
|
|
|
|
static uint32_t crc_val(uint32_t crc);
|
|
static uint32_t crc_val(uint32_t crc);
|
|
-static uint32_t crc32_zlib_sb8(uint32_t crc, const uint8_t *buf, size_t length);
|
|
|
|
-static uint32_t crc32c_sb8(uint32_t crc, const uint8_t *buf, size_t length);
|
|
|
|
|
|
|
|
-#ifdef USE_PIPELINED
|
|
|
|
-static void pipelined_crc32c(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3, const uint8_t *p_buf, size_t block_size, int num_blocks);
|
|
|
|
-#endif
|
|
|
|
-static int cached_cpu_supports_crc32; // initialized by constructor below
|
|
|
|
-static uint32_t crc32c_hardware(uint32_t crc, const uint8_t* data, size_t length);
|
|
|
|
|
|
+typedef void (*crc_pipelined_func_t)(uint32_t *, uint32_t *, uint32_t *, const uint8_t *, size_t, int);
|
|
|
|
+
|
|
|
|
+// The software versions of pipelined crc
|
|
|
|
+static void pipelined_crc32c_sb8(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3,
|
|
|
|
+ const uint8_t *p_buf, size_t block_size, int num_blocks);
|
|
|
|
+static void pipelined_crc32_zlib_sb8(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3,
|
|
|
|
+ const uint8_t *p_buf, size_t block_size, int num_blocks);
|
|
|
|
+
|
|
|
|
+// Statically initialise the function pointers to the software versions
|
|
|
|
+// If a HW implementation is available they will subsequently be initialised in the dynamic
|
|
|
|
+// initialisers to point to the HW routines.
|
|
|
|
+crc_pipelined_func_t pipelined_crc32c_func = pipelined_crc32c_sb8;
|
|
|
|
+crc_pipelined_func_t pipelined_crc32_zlib_func = pipelined_crc32_zlib_sb8;
|
|
|
|
|
|
static inline int store_or_verify(uint32_t *sums, uint32_t crc,
|
|
static inline int store_or_verify(uint32_t *sums, uint32_t crc,
|
|
int is_verify) {
|
|
int is_verify) {
|
|
@@ -72,94 +73,66 @@ int bulk_crc(const uint8_t *data, size_t data_len,
|
|
|
|
|
|
int is_verify = error_info != NULL;
|
|
int is_verify = error_info != NULL;
|
|
|
|
|
|
-#ifdef USE_PIPELINED
|
|
|
|
uint32_t crc1, crc2, crc3;
|
|
uint32_t crc1, crc2, crc3;
|
|
int n_blocks = data_len / bytes_per_checksum;
|
|
int n_blocks = data_len / bytes_per_checksum;
|
|
int remainder = data_len % bytes_per_checksum;
|
|
int remainder = data_len % bytes_per_checksum;
|
|
- int do_pipelined = 0;
|
|
|
|
-#endif
|
|
|
|
uint32_t crc;
|
|
uint32_t crc;
|
|
- crc_update_func_t crc_update_func;
|
|
|
|
|
|
+ crc_pipelined_func_t crc_pipelined_func;
|
|
switch (checksum_type) {
|
|
switch (checksum_type) {
|
|
case CRC32_ZLIB_POLYNOMIAL:
|
|
case CRC32_ZLIB_POLYNOMIAL:
|
|
- crc_update_func = crc32_zlib_sb8;
|
|
|
|
|
|
+ crc_pipelined_func = pipelined_crc32_zlib_func;
|
|
break;
|
|
break;
|
|
case CRC32C_POLYNOMIAL:
|
|
case CRC32C_POLYNOMIAL:
|
|
- if (likely(cached_cpu_supports_crc32)) {
|
|
|
|
- crc_update_func = crc32c_hardware;
|
|
|
|
-#ifdef USE_PIPELINED
|
|
|
|
- do_pipelined = 1;
|
|
|
|
-#endif
|
|
|
|
- } else {
|
|
|
|
- crc_update_func = crc32c_sb8;
|
|
|
|
- }
|
|
|
|
|
|
+ crc_pipelined_func = pipelined_crc32c_func;
|
|
break;
|
|
break;
|
|
default:
|
|
default:
|
|
return is_verify ? INVALID_CHECKSUM_TYPE : -EINVAL;
|
|
return is_verify ? INVALID_CHECKSUM_TYPE : -EINVAL;
|
|
}
|
|
}
|
|
|
|
|
|
-#ifdef USE_PIPELINED
|
|
|
|
- if (do_pipelined) {
|
|
|
|
- /* Process three blocks at a time */
|
|
|
|
- while (likely(n_blocks >= 3)) {
|
|
|
|
- crc1 = crc2 = crc3 = CRC_INITIAL_VAL;
|
|
|
|
- pipelined_crc32c(&crc1, &crc2, &crc3, data, bytes_per_checksum, 3);
|
|
|
|
|
|
+ /* Process three blocks at a time */
|
|
|
|
+ while (likely(n_blocks >= 3)) {
|
|
|
|
+ crc1 = crc2 = crc3 = CRC_INITIAL_VAL;
|
|
|
|
+ crc_pipelined_func(&crc1, &crc2, &crc3, data, bytes_per_checksum, 3);
|
|
|
|
|
|
- if (unlikely(!store_or_verify(sums, (crc = ntohl(crc_val(crc1))), is_verify)))
|
|
|
|
- goto return_crc_error;
|
|
|
|
- sums++;
|
|
|
|
- data += bytes_per_checksum;
|
|
|
|
- if (unlikely(!store_or_verify(sums, (crc = ntohl(crc_val(crc2))), is_verify)))
|
|
|
|
- goto return_crc_error;
|
|
|
|
- sums++;
|
|
|
|
- data += bytes_per_checksum;
|
|
|
|
- if (unlikely(!store_or_verify(sums, (crc = ntohl(crc_val(crc3))), is_verify)))
|
|
|
|
- goto return_crc_error;
|
|
|
|
- sums++;
|
|
|
|
- data += bytes_per_checksum;
|
|
|
|
- n_blocks -= 3;
|
|
|
|
- }
|
|
|
|
|
|
+ if (unlikely(!store_or_verify(sums, (crc = ntohl(crc_val(crc1))), is_verify)))
|
|
|
|
+ goto return_crc_error;
|
|
|
|
+ sums++;
|
|
|
|
+ data += bytes_per_checksum;
|
|
|
|
+ if (unlikely(!store_or_verify(sums, (crc = ntohl(crc_val(crc2))), is_verify)))
|
|
|
|
+ goto return_crc_error;
|
|
|
|
+ sums++;
|
|
|
|
+ data += bytes_per_checksum;
|
|
|
|
+ if (unlikely(!store_or_verify(sums, (crc = ntohl(crc_val(crc3))), is_verify)))
|
|
|
|
+ goto return_crc_error;
|
|
|
|
+ sums++;
|
|
|
|
+ data += bytes_per_checksum;
|
|
|
|
+ n_blocks -= 3;
|
|
|
|
+ }
|
|
|
|
|
|
- /* One or two blocks */
|
|
|
|
- if (n_blocks) {
|
|
|
|
- crc1 = crc2 = crc3 = CRC_INITIAL_VAL;
|
|
|
|
- pipelined_crc32c(&crc1, &crc2, &crc3, data, bytes_per_checksum, n_blocks);
|
|
|
|
|
|
+ /* One or two blocks */
|
|
|
|
+ if (n_blocks) {
|
|
|
|
+ crc1 = crc2 = crc3 = CRC_INITIAL_VAL;
|
|
|
|
+ crc_pipelined_func(&crc1, &crc2, &crc3, data, bytes_per_checksum, n_blocks);
|
|
|
|
|
|
- if (unlikely(!store_or_verify(sums, (crc = ntohl(crc_val(crc1))), is_verify)))
|
|
|
|
|
|
+ if (unlikely(!store_or_verify(sums, (crc = ntohl(crc_val(crc1))), is_verify)))
|
|
|
|
+ goto return_crc_error;
|
|
|
|
+ data += bytes_per_checksum;
|
|
|
|
+ sums++;
|
|
|
|
+ if (n_blocks == 2) {
|
|
|
|
+ if (unlikely(!store_or_verify(sums, (crc = ntohl(crc_val(crc2))), is_verify)))
|
|
goto return_crc_error;
|
|
goto return_crc_error;
|
|
- data += bytes_per_checksum;
|
|
|
|
sums++;
|
|
sums++;
|
|
- if (n_blocks == 2) {
|
|
|
|
- if (unlikely(!store_or_verify(sums, (crc = ntohl(crc_val(crc2))), is_verify)))
|
|
|
|
- goto return_crc_error;
|
|
|
|
- sums++;
|
|
|
|
- data += bytes_per_checksum;
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- /* For something smaller than a block */
|
|
|
|
- if (remainder) {
|
|
|
|
- crc1 = crc2 = crc3 = CRC_INITIAL_VAL;
|
|
|
|
- pipelined_crc32c(&crc1, &crc2, &crc3, data, remainder, 1);
|
|
|
|
-
|
|
|
|
- if (unlikely(!store_or_verify(sums, (crc = ntohl(crc_val(crc1))), is_verify)))
|
|
|
|
- goto return_crc_error;
|
|
|
|
|
|
+ data += bytes_per_checksum;
|
|
}
|
|
}
|
|
- return is_verify ? CHECKSUMS_VALID : 0;
|
|
|
|
}
|
|
}
|
|
-#endif
|
|
|
|
|
|
|
|
- while (likely(data_len > 0)) {
|
|
|
|
- int len = likely(data_len >= bytes_per_checksum) ? bytes_per_checksum : data_len;
|
|
|
|
- crc = CRC_INITIAL_VAL;
|
|
|
|
- crc = crc_update_func(crc, data, len);
|
|
|
|
- crc = ntohl(crc_val(crc));
|
|
|
|
- if (unlikely(!store_or_verify(sums, crc, is_verify))) {
|
|
|
|
|
|
+ /* For something smaller than a block */
|
|
|
|
+ if (remainder) {
|
|
|
|
+ crc1 = crc2 = crc3 = CRC_INITIAL_VAL;
|
|
|
|
+ crc_pipelined_func(&crc1, &crc2, &crc3, data, remainder, 1);
|
|
|
|
+
|
|
|
|
+ if (unlikely(!store_or_verify(sums, (crc = ntohl(crc_val(crc1))), is_verify)))
|
|
goto return_crc_error;
|
|
goto return_crc_error;
|
|
- }
|
|
|
|
- data += len;
|
|
|
|
- data_len -= len;
|
|
|
|
- sums++;
|
|
|
|
}
|
|
}
|
|
return is_verify ? CHECKSUMS_VALID : 0;
|
|
return is_verify ? CHECKSUMS_VALID : 0;
|
|
|
|
|
|
@@ -175,7 +148,7 @@ return_crc_error:
|
|
/**
|
|
/**
|
|
* Extract the final result of a CRC
|
|
* Extract the final result of a CRC
|
|
*/
|
|
*/
|
|
-uint32_t crc_val(uint32_t crc) {
|
|
|
|
|
|
+static uint32_t crc_val(uint32_t crc) {
|
|
return ~crc;
|
|
return ~crc;
|
|
}
|
|
}
|
|
|
|
|
|
@@ -214,6 +187,16 @@ static uint32_t crc32c_sb8(uint32_t crc, const uint8_t *buf, size_t length) {
|
|
return crc;
|
|
return crc;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+static void pipelined_crc32c_sb8(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3,
|
|
|
|
+ const uint8_t *p_buf, size_t block_size, int num_blocks) {
|
|
|
|
+ assert(num_blocks >= 1 && num_blocks <=3 && "invalid num_blocks");
|
|
|
|
+ *crc1 = crc32c_sb8(*crc1, p_buf, block_size);
|
|
|
|
+ if (num_blocks >= 2)
|
|
|
|
+ *crc2 = crc32c_sb8(*crc2, p_buf+block_size, block_size);
|
|
|
|
+ if (num_blocks >= 3)
|
|
|
|
+ *crc3 = crc32c_sb8(*crc3, p_buf+2*block_size, block_size);
|
|
|
|
+}
|
|
|
|
+
|
|
/**
|
|
/**
|
|
* Update a CRC using the "zlib" polynomial -- what Hadoop calls CHECKSUM_CRC32
|
|
* Update a CRC using the "zlib" polynomial -- what Hadoop calls CHECKSUM_CRC32
|
|
* using slicing-by-8
|
|
* using slicing-by-8
|
|
@@ -250,416 +233,12 @@ static uint32_t crc32_zlib_sb8(
|
|
return crc;
|
|
return crc;
|
|
}
|
|
}
|
|
|
|
|
|
-///////////////////////////////////////////////////////////////////////////
|
|
|
|
-// Begin code for SSE4.2 specific hardware support of CRC32C
|
|
|
|
-///////////////////////////////////////////////////////////////////////////
|
|
|
|
-
|
|
|
|
-#if (defined(__amd64__) || defined(__i386)) && defined(__GNUC__) && !defined(__FreeBSD__)
|
|
|
|
-# define SSE42_FEATURE_BIT (1 << 20)
|
|
|
|
-# define CPUID_FEATURES 1
|
|
|
|
-/**
|
|
|
|
- * Call the cpuid instruction to determine CPU feature flags.
|
|
|
|
- */
|
|
|
|
-static uint32_t cpuid(uint32_t eax_in) {
|
|
|
|
- uint32_t eax, ebx, ecx, edx;
|
|
|
|
-# if defined(__PIC__) && !defined(__LP64__)
|
|
|
|
-// 32-bit PIC code uses the ebx register for the base offset --
|
|
|
|
-// have to save and restore it on the stack
|
|
|
|
- asm("pushl %%ebx\n\t"
|
|
|
|
- "cpuid\n\t"
|
|
|
|
- "movl %%ebx, %[ebx]\n\t"
|
|
|
|
- "popl %%ebx" : "=a" (eax), [ebx] "=r"(ebx), "=c"(ecx), "=d"(edx) : "a" (eax_in)
|
|
|
|
- : "cc");
|
|
|
|
-# else
|
|
|
|
- asm("cpuid" : "=a" (eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(eax_in)
|
|
|
|
- : "cc");
|
|
|
|
-# endif
|
|
|
|
-
|
|
|
|
- return ecx;
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-/**
|
|
|
|
- * On library load, initiailize the cached value above for
|
|
|
|
- * whether the cpu supports SSE4.2's crc32 instruction.
|
|
|
|
- */
|
|
|
|
-void __attribute__ ((constructor)) init_cpu_support_flag(void) {
|
|
|
|
- uint32_t ecx = cpuid(CPUID_FEATURES);
|
|
|
|
- cached_cpu_supports_crc32 = ecx & SSE42_FEATURE_BIT;
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-//
|
|
|
|
-// Definitions of the SSE4.2 crc32 operations. Using these instead of
|
|
|
|
-// the GCC __builtin_* intrinsics allows this code to compile without
|
|
|
|
-// -msse4.2, since we do dynamic CPU detection at runtime.
|
|
|
|
-//
|
|
|
|
-
|
|
|
|
-# ifdef __LP64__
|
|
|
|
-inline uint64_t _mm_crc32_u64(uint64_t crc, uint64_t value) {
|
|
|
|
- asm("crc32q %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
|
|
|
|
- return crc;
|
|
|
|
-}
|
|
|
|
-# endif
|
|
|
|
-
|
|
|
|
-inline uint32_t _mm_crc32_u32(uint32_t crc, uint32_t value) {
|
|
|
|
- asm("crc32l %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
|
|
|
|
- return crc;
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-inline uint32_t _mm_crc32_u16(uint32_t crc, uint16_t value) {
|
|
|
|
- asm("crc32w %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
|
|
|
|
- return crc;
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-inline uint32_t _mm_crc32_u8(uint32_t crc, uint8_t value) {
|
|
|
|
- asm("crc32b %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
|
|
|
|
- return crc;
|
|
|
|
|
|
+static void pipelined_crc32_zlib_sb8(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3,
|
|
|
|
+ const uint8_t *p_buf, size_t block_size, int num_blocks) {
|
|
|
|
+ assert(num_blocks >= 1 && num_blocks <=3 && "invalid num_blocks");
|
|
|
|
+ *crc1 = crc32_zlib_sb8(*crc1, p_buf, block_size);
|
|
|
|
+ if (num_blocks >= 2)
|
|
|
|
+ *crc2 = crc32_zlib_sb8(*crc2, p_buf+block_size, block_size);
|
|
|
|
+ if (num_blocks >= 3)
|
|
|
|
+ *crc3 = crc32_zlib_sb8(*crc3, p_buf+2*block_size, block_size);
|
|
}
|
|
}
|
|
-
|
|
|
|
-
|
|
|
|
-# ifdef __LP64__
|
|
|
|
-/**
|
|
|
|
- * Hardware-accelerated CRC32C calculation using the 64-bit instructions.
|
|
|
|
- */
|
|
|
|
-static uint32_t crc32c_hardware(uint32_t crc, const uint8_t* p_buf, size_t length) {
|
|
|
|
- // start directly at p_buf, even if it's an unaligned address. According
|
|
|
|
- // to the original author of this code, doing a small run of single bytes
|
|
|
|
- // to word-align the 64-bit instructions doesn't seem to help, but
|
|
|
|
- // we haven't reconfirmed those benchmarks ourselves.
|
|
|
|
- uint64_t crc64bit = crc;
|
|
|
|
- size_t i;
|
|
|
|
- for (i = 0; i < length / sizeof(uint64_t); i++) {
|
|
|
|
- crc64bit = _mm_crc32_u64(crc64bit, *(uint64_t*) p_buf);
|
|
|
|
- p_buf += sizeof(uint64_t);
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- // This ugly switch is slightly faster for short strings than the straightforward loop
|
|
|
|
- uint32_t crc32bit = (uint32_t) crc64bit;
|
|
|
|
- length &= sizeof(uint64_t) - 1;
|
|
|
|
- switch (length) {
|
|
|
|
- case 7:
|
|
|
|
- crc32bit = _mm_crc32_u8(crc32bit, *p_buf++);
|
|
|
|
- case 6:
|
|
|
|
- crc32bit = _mm_crc32_u16(crc32bit, *(uint16_t*) p_buf);
|
|
|
|
- p_buf += 2;
|
|
|
|
- // case 5 is below: 4 + 1
|
|
|
|
- case 4:
|
|
|
|
- crc32bit = _mm_crc32_u32(crc32bit, *(uint32_t*) p_buf);
|
|
|
|
- break;
|
|
|
|
- case 3:
|
|
|
|
- crc32bit = _mm_crc32_u8(crc32bit, *p_buf++);
|
|
|
|
- case 2:
|
|
|
|
- crc32bit = _mm_crc32_u16(crc32bit, *(uint16_t*) p_buf);
|
|
|
|
- break;
|
|
|
|
- case 5:
|
|
|
|
- crc32bit = _mm_crc32_u32(crc32bit, *(uint32_t*) p_buf);
|
|
|
|
- p_buf += 4;
|
|
|
|
- case 1:
|
|
|
|
- crc32bit = _mm_crc32_u8(crc32bit, *p_buf);
|
|
|
|
- break;
|
|
|
|
- case 0:
|
|
|
|
- break;
|
|
|
|
- default:
|
|
|
|
- // This should never happen; enable in debug code
|
|
|
|
- assert(0 && "ended up with 8 or more bytes at tail of calculation");
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- return crc32bit;
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-#ifdef USE_PIPELINED
|
|
|
|
-/**
|
|
|
|
- * Pipelined version of hardware-accelerated CRC32C calculation using
|
|
|
|
- * the 64 bit crc32q instruction.
|
|
|
|
- * One crc32c instruction takes three cycles, but two more with no data
|
|
|
|
- * dependency can be in the pipeline to achieve something close to single
|
|
|
|
- * instruction/cycle. Here we feed three blocks in RR.
|
|
|
|
- *
|
|
|
|
- * crc1, crc2, crc3 : Store initial checksum for each block before
|
|
|
|
- * calling. When it returns, updated checksums are stored.
|
|
|
|
- * p_buf : The base address of the data buffer. The buffer should be
|
|
|
|
- * at least as big as block_size * num_blocks.
|
|
|
|
- * block_size : The size of each block in bytes.
|
|
|
|
- * num_blocks : The number of blocks to work on. Min = 1, Max = 3
|
|
|
|
- */
|
|
|
|
-static void pipelined_crc32c(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3, const uint8_t *p_buf, size_t block_size, int num_blocks) {
|
|
|
|
- uint64_t c1 = *crc1;
|
|
|
|
- uint64_t c2 = *crc2;
|
|
|
|
- uint64_t c3 = *crc3;
|
|
|
|
- uint64_t *data = (uint64_t*)p_buf;
|
|
|
|
- int counter = block_size / sizeof(uint64_t);
|
|
|
|
- int remainder = block_size % sizeof(uint64_t);
|
|
|
|
- uint8_t *bdata;
|
|
|
|
-
|
|
|
|
- /* We do switch here because the loop has to be tight in order
|
|
|
|
- * to fill the pipeline. Any other statement inside the loop
|
|
|
|
- * or inbetween crc32 instruction can slow things down. Calling
|
|
|
|
- * individual crc32 instructions three times from C also causes
|
|
|
|
- * gcc to insert other instructions inbetween.
|
|
|
|
- *
|
|
|
|
- * Do not rearrange the following code unless you have verified
|
|
|
|
- * the generated machine code is as efficient as before.
|
|
|
|
- */
|
|
|
|
- switch (num_blocks) {
|
|
|
|
- case 3:
|
|
|
|
- /* Do three blocks */
|
|
|
|
- while (likely(counter)) {
|
|
|
|
- __asm__ __volatile__(
|
|
|
|
- "crc32q (%7), %0;\n\t"
|
|
|
|
- "crc32q (%7,%6,1), %1;\n\t"
|
|
|
|
- "crc32q (%7,%6,2), %2;\n\t"
|
|
|
|
- : "=r"(c1), "=r"(c2), "=r"(c3)
|
|
|
|
- : "0"(c1), "1"(c2), "2"(c3), "r"(block_size), "r"(data)
|
|
|
|
- );
|
|
|
|
- data++;
|
|
|
|
- counter--;
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- /* Take care of the remainder. They are only up to seven bytes,
|
|
|
|
- * so performing byte-level crc32 won't take much time.
|
|
|
|
- */
|
|
|
|
- bdata = (uint8_t*)data;
|
|
|
|
- while (likely(remainder)) {
|
|
|
|
- __asm__ __volatile__(
|
|
|
|
- "crc32b (%7), %0;\n\t"
|
|
|
|
- "crc32b (%7,%6,1), %1;\n\t"
|
|
|
|
- "crc32b (%7,%6,2), %2;\n\t"
|
|
|
|
- : "=r"(c1), "=r"(c2), "=r"(c3)
|
|
|
|
- : "0"(c1), "1"(c2), "2"(c3), "r"(block_size), "r"(bdata)
|
|
|
|
- );
|
|
|
|
- bdata++;
|
|
|
|
- remainder--;
|
|
|
|
- }
|
|
|
|
- break;
|
|
|
|
- case 2:
|
|
|
|
- /* Do two blocks */
|
|
|
|
- while (likely(counter)) {
|
|
|
|
- __asm__ __volatile__(
|
|
|
|
- "crc32q (%5), %0;\n\t"
|
|
|
|
- "crc32q (%5,%4,1), %1;\n\t"
|
|
|
|
- : "=r"(c1), "=r"(c2)
|
|
|
|
- : "0"(c1), "1"(c2), "r"(block_size), "r"(data)
|
|
|
|
- );
|
|
|
|
- data++;
|
|
|
|
- counter--;
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- bdata = (uint8_t*)data;
|
|
|
|
- while (likely(remainder)) {
|
|
|
|
- __asm__ __volatile__(
|
|
|
|
- "crc32b (%5), %0;\n\t"
|
|
|
|
- "crc32b (%5,%4,1), %1;\n\t"
|
|
|
|
- : "=r"(c1), "=r"(c2)
|
|
|
|
- : "0"(c1), "1"(c2), "r"(block_size), "r"(bdata)
|
|
|
|
- );
|
|
|
|
- bdata++;
|
|
|
|
- remainder--;
|
|
|
|
- }
|
|
|
|
- break;
|
|
|
|
- case 1:
|
|
|
|
- /* single block */
|
|
|
|
- while (likely(counter)) {
|
|
|
|
- __asm__ __volatile__(
|
|
|
|
- "crc32q (%2), %0;\n\t"
|
|
|
|
- : "=r"(c1)
|
|
|
|
- : "0"(c1), "r"(data)
|
|
|
|
- );
|
|
|
|
- data++;
|
|
|
|
- counter--;
|
|
|
|
- }
|
|
|
|
- bdata = (uint8_t*)data;
|
|
|
|
- while (likely(remainder)) {
|
|
|
|
- __asm__ __volatile__(
|
|
|
|
- "crc32b (%2), %0;\n\t"
|
|
|
|
- : "=r"(c1)
|
|
|
|
- : "0"(c1), "r"(bdata)
|
|
|
|
- );
|
|
|
|
- bdata++;
|
|
|
|
- remainder--;
|
|
|
|
- }
|
|
|
|
- break;
|
|
|
|
- case 0:
|
|
|
|
- return;
|
|
|
|
- default:
|
|
|
|
- assert(0 && "BUG: Invalid number of checksum blocks");
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- *crc1 = c1;
|
|
|
|
- *crc2 = c2;
|
|
|
|
- *crc3 = c3;
|
|
|
|
- return;
|
|
|
|
-}
|
|
|
|
-#endif /* USE_PIPELINED */
|
|
|
|
-
|
|
|
|
-# else // 32-bit
|
|
|
|
-
|
|
|
|
-/**
|
|
|
|
- * Hardware-accelerated CRC32C calculation using the 32-bit instructions.
|
|
|
|
- */
|
|
|
|
-static uint32_t crc32c_hardware(uint32_t crc, const uint8_t* p_buf, size_t length) {
|
|
|
|
- // start directly at p_buf, even if it's an unaligned address. According
|
|
|
|
- // to the original author of this code, doing a small run of single bytes
|
|
|
|
- // to word-align the 64-bit instructions doesn't seem to help, but
|
|
|
|
- // we haven't reconfirmed those benchmarks ourselves.
|
|
|
|
- size_t i;
|
|
|
|
- for (i = 0; i < length / sizeof(uint32_t); i++) {
|
|
|
|
- crc = _mm_crc32_u32(crc, *(uint32_t*) p_buf);
|
|
|
|
- p_buf += sizeof(uint32_t);
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- // This ugly switch is slightly faster for short strings than the straightforward loop
|
|
|
|
- length &= sizeof(uint32_t) - 1;
|
|
|
|
- switch (length) {
|
|
|
|
- case 3:
|
|
|
|
- crc = _mm_crc32_u8(crc, *p_buf++);
|
|
|
|
- case 2:
|
|
|
|
- crc = _mm_crc32_u16(crc, *(uint16_t*) p_buf);
|
|
|
|
- break;
|
|
|
|
- case 1:
|
|
|
|
- crc = _mm_crc32_u8(crc, *p_buf);
|
|
|
|
- break;
|
|
|
|
- case 0:
|
|
|
|
- break;
|
|
|
|
- default:
|
|
|
|
- // This should never happen; enable in debug code
|
|
|
|
- assert(0 && "ended up with 4 or more bytes at tail of calculation");
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- return crc;
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-#ifdef USE_PIPELINED
|
|
|
|
-/**
|
|
|
|
- * Pipelined version of hardware-accelerated CRC32C calculation using
|
|
|
|
- * the 32 bit crc32l instruction.
|
|
|
|
- * One crc32c instruction takes three cycles, but two more with no data
|
|
|
|
- * dependency can be in the pipeline to achieve something close to single
|
|
|
|
- * instruction/cycle. Here we feed three blocks in RR.
|
|
|
|
- *
|
|
|
|
- * crc1, crc2, crc3 : Store initial checksum for each block before
|
|
|
|
- * calling. When it returns, updated checksums are stored.
|
|
|
|
- * data : The base address of the data buffer. The buffer should be
|
|
|
|
- * at least as big as block_size * num_blocks.
|
|
|
|
- * block_size : The size of each block in bytes.
|
|
|
|
- * num_blocks : The number of blocks to work on. Min = 1, Max = 3
|
|
|
|
- */
|
|
|
|
-static void pipelined_crc32c(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3, const uint8_t *p_buf, size_t block_size, int num_blocks) {
|
|
|
|
- uint32_t c1 = *crc1;
|
|
|
|
- uint32_t c2 = *crc2;
|
|
|
|
- uint32_t c3 = *crc3;
|
|
|
|
- int counter = block_size / sizeof(uint32_t);
|
|
|
|
- int remainder = block_size % sizeof(uint32_t);
|
|
|
|
- uint32_t *data = (uint32_t*)p_buf;
|
|
|
|
- uint8_t *bdata;
|
|
|
|
-
|
|
|
|
- /* We do switch here because the loop has to be tight in order
|
|
|
|
- * to fill the pipeline. Any other statement inside the loop
|
|
|
|
- * or inbetween crc32 instruction can slow things down. Calling
|
|
|
|
- * individual crc32 instructions three times from C also causes
|
|
|
|
- * gcc to insert other instructions inbetween.
|
|
|
|
- *
|
|
|
|
- * Do not rearrange the following code unless you have verified
|
|
|
|
- * the generated machine code is as efficient as before.
|
|
|
|
- */
|
|
|
|
- switch (num_blocks) {
|
|
|
|
- case 3:
|
|
|
|
- /* Do three blocks */
|
|
|
|
- while (likely(counter)) {
|
|
|
|
- __asm__ __volatile__(
|
|
|
|
- "crc32l (%7), %0;\n\t"
|
|
|
|
- "crc32l (%7,%6,1), %1;\n\t"
|
|
|
|
- "crc32l (%7,%6,2), %2;\n\t"
|
|
|
|
- : "=r"(c1), "=r"(c2), "=r"(c3)
|
|
|
|
- : "r"(c1), "r"(c2), "r"(c3), "r"(block_size), "r"(data)
|
|
|
|
- );
|
|
|
|
- data++;
|
|
|
|
- counter--;
|
|
|
|
- }
|
|
|
|
- /* Take care of the remainder. They are only up to three bytes,
|
|
|
|
- * so performing byte-level crc32 won't take much time.
|
|
|
|
- */
|
|
|
|
- bdata = (uint8_t*)data;
|
|
|
|
- while (likely(remainder)) {
|
|
|
|
- __asm__ __volatile__(
|
|
|
|
- "crc32b (%7), %0;\n\t"
|
|
|
|
- "crc32b (%7,%6,1), %1;\n\t"
|
|
|
|
- "crc32b (%7,%6,2), %2;\n\t"
|
|
|
|
- : "=r"(c1), "=r"(c2), "=r"(c3)
|
|
|
|
- : "r"(c1), "r"(c2), "r"(c3), "r"(block_size), "r"(bdata)
|
|
|
|
- );
|
|
|
|
- bdata++;
|
|
|
|
- remainder--;
|
|
|
|
- }
|
|
|
|
- break;
|
|
|
|
- case 2:
|
|
|
|
- /* Do two blocks */
|
|
|
|
- while (likely(counter)) {
|
|
|
|
- __asm__ __volatile__(
|
|
|
|
- "crc32l (%5), %0;\n\t"
|
|
|
|
- "crc32l (%5,%4,1), %1;\n\t"
|
|
|
|
- : "=r"(c1), "=r"(c2)
|
|
|
|
- : "r"(c1), "r"(c2), "r"(block_size), "r"(data)
|
|
|
|
- );
|
|
|
|
- data++;
|
|
|
|
- counter--;
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- bdata = (uint8_t*)data;
|
|
|
|
- while (likely(remainder)) {
|
|
|
|
- __asm__ __volatile__(
|
|
|
|
- "crc32b (%5), %0;\n\t"
|
|
|
|
- "crc32b (%5,%4,1), %1;\n\t"
|
|
|
|
- : "=r"(c1), "=r"(c2)
|
|
|
|
- : "r"(c1), "r"(c2), "r"(block_size), "r"(bdata)
|
|
|
|
- );
|
|
|
|
- bdata++;
|
|
|
|
- remainder--;
|
|
|
|
- }
|
|
|
|
- break;
|
|
|
|
- case 1:
|
|
|
|
- /* single block */
|
|
|
|
- while (likely(counter)) {
|
|
|
|
- __asm__ __volatile__(
|
|
|
|
- "crc32l (%2), %0;\n\t"
|
|
|
|
- : "=r"(c1)
|
|
|
|
- : "r"(c1), "r"(data)
|
|
|
|
- );
|
|
|
|
- data++;
|
|
|
|
- counter--;
|
|
|
|
- }
|
|
|
|
- bdata = (uint8_t*)data;
|
|
|
|
- while (likely(remainder)) {
|
|
|
|
- __asm__ __volatile__(
|
|
|
|
- "crc32b (%2), %0;\n\t"
|
|
|
|
- : "=r"(c1)
|
|
|
|
- : "r"(c1), "r"(bdata)
|
|
|
|
- );
|
|
|
|
- bdata++;
|
|
|
|
- remainder--;
|
|
|
|
- }
|
|
|
|
- break;
|
|
|
|
- case 0:
|
|
|
|
- return;
|
|
|
|
- default:
|
|
|
|
- assert(0 && "BUG: Invalid number of checksum blocks");
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- *crc1 = c1;
|
|
|
|
- *crc2 = c2;
|
|
|
|
- *crc3 = c3;
|
|
|
|
- return;
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-#endif /* USE_PIPELINED */
|
|
|
|
-
|
|
|
|
-# endif // 64-bit vs 32-bit
|
|
|
|
-
|
|
|
|
-#else // end x86 architecture
|
|
|
|
-
|
|
|
|
-static uint32_t crc32c_hardware(uint32_t crc, const uint8_t* data, size_t length) {
|
|
|
|
- // never called!
|
|
|
|
- assert(0 && "hardware crc called on an unsupported platform");
|
|
|
|
- return 0;
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-#endif
|
|
|