-
Notifications
You must be signed in to change notification settings - Fork 79
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
License, Windows and Compatibility Upgrade #18
Changes from 1 commit
aa1deea
1f5a420
2937f89
dd3e924
b3064e0
4c77bc7
4bf55a2
59c0c27
dc28f91
b73befc
df5c4fe
c99aa75
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,13 +20,12 @@ | |
|
||
DYNAMIC_CRC_TABLE and MAKECRCH can be #defined to write out crc32.h. | ||
*/ | ||
#ifdef HAS_PCLMUL | ||
|
||
#include <emmintrin.h> | ||
#include <smmintrin.h> | ||
#include <wmmintrin.h> | ||
//#include <stdio.h> | ||
#include <cpuid.h> | ||
#ifdef HAS_PCLMUL | ||
#include <emmintrin.h> | ||
#include <smmintrin.h> | ||
#include <wmmintrin.h> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think all of those can be replaced by a single There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. defines no longer in crc32, @Adenilson can decide if these can be replaced in arc_simd.c |
||
#include <cpuid.h> | ||
#endif | ||
|
||
#ifdef __aarch64__ | ||
|
@@ -279,29 +278,6 @@ local unsigned long crc32_generic(crc, buf, len) | |
|
||
#ifdef HAS_PCLMUL | ||
|
||
|
||
#ifdef HAS_GPL | ||
extern uLong crc32_pclmul_le_16(unsigned char const *buffer, size_t len, uInt crc32); | ||
#else | ||
|
||
int cpu_has_pclmul = -1; //global: will be 0 or 1 after first test | ||
|
||
int has_pclmul(void) { | ||
if (cpu_has_pclmul >= 0) | ||
return cpu_has_pclmul; | ||
cpu_has_pclmul = 0; | ||
int leaf = 1; | ||
uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0; | ||
/* %ecx */ | ||
#define crc_bit_PCLMUL (1 << 1) | ||
if (__get_cpuid(leaf, &eax, &ebx, &ecx, &edx)) { | ||
//printf("leaf=%d, eax=0x%x, ebx=0x%x, ecx=0x%x, edx=0x%x\n", leaf, eax, ebx, ecx, edx); | ||
if ((ecx & crc_bit_PCLMUL) != 0) | ||
cpu_has_pclmul = 1; | ||
} | ||
return cpu_has_pclmul; | ||
} | ||
|
||
//https://github.com/webosose/chromium68/blob/master/src/third_party/zlib/crc32_simd.c | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe instead of pointing to an outdated copy of Chromium (M68), you could reference the ToT (Top of Tree) original repository Chromium repo*? Since M68, I've added some new optimizations, refactorings and bug fixes on Chromium's zlib. |
||
/* crc32_simd.c | ||
* | ||
|
@@ -344,14 +320,14 @@ int has_pclmul(void) { | |
* "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" | ||
* V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 | ||
*/ | ||
|
||
#ifdef _MSC_VER | ||
#define zalign(x) __declspec(align(x)) | ||
#else | ||
#define zalign(x) __attribute__((aligned((x)))) | ||
#endif | ||
|
||
uLong crc32_simd(unsigned char const *buf, size_t len, uInt crc) { | ||
uint crc32_simd(unsigned char const *buf, size_t len, uInt crc) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would rather use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed (rordenlab@59c0c27) |
||
/* | ||
* Definitions of the bit-reflected domain constants k1,k2,k3, etc and | ||
* the CRC32+Barrett polynomials given at the end of the paper. | ||
|
@@ -457,23 +433,44 @@ uLong crc32_simd(unsigned char const *buf, size_t len, uInt crc) { | |
|
||
} | ||
|
||
#endif //Chromium code | ||
|
||
#define PCLMUL_MIN_LEN 64 | ||
#define PCLMUL_ALIGN 16 | ||
#define PCLMUL_ALIGN_MASK 15 | ||
|
||
int cpu_has_pclmul = -1; //global: will be 0 or 1 after first test | ||
|
||
int has_pclmul(void) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could this perhaps be called at init time with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @vkrasnov I think my solution is robust and efficient and works on all the compilers I tested it on (gcc 4.8..9.2; Clang 10, Microsoft C compiler). No other location in this library uses the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure about However, unsychronized access to a global variable is Undefined Behavior if code runs on multiple threads, and we definitely will use it in multi-threaded programs. So at very least it should be marked as atomic. I know x86 memory model is forgiving about such things, but things that are UB in the compiler, but happen to work in practice are a thin ice. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @kornelski -I have tried to address this in my fork. The value is atomic. There is also no a compiler flag |
||
if (cpu_has_pclmul >= 0) | ||
return cpu_has_pclmul; | ||
cpu_has_pclmul = 0; | ||
int leaf = 1; | ||
uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0; | ||
/* %ecx */ | ||
#define crc_bit_PCLMUL (1 << 1) | ||
if (__get_cpuid(leaf, &eax, &ebx, &ecx, &edx)) { | ||
//printf("leaf=%d, eax=0x%x, ebx=0x%x, ecx=0x%x, edx=0x%x\n", leaf, eax, ebx, ecx, edx); | ||
if ((ecx & crc_bit_PCLMUL) != 0) | ||
cpu_has_pclmul = 1; | ||
} | ||
return cpu_has_pclmul; | ||
} | ||
|
||
|
||
/* Function stolen from linux kernel 3.14. It computes the CRC over the given | ||
* buffer with initial CRC value <crc32>. The buffer is <len> byte in length, | ||
* and must be 16-byte aligned. | ||
*/ | ||
extern uint crc32_pclmul_le_16(unsigned char const *buffer, | ||
size_t len, uInt crc32); | ||
|
||
uLong crc32(crc, buf, len) | ||
uLong crc; | ||
const Bytef *buf; | ||
uInt len; | ||
{ | ||
if (len < PCLMUL_MIN_LEN + PCLMUL_ALIGN - 1) | ||
return crc32_generic(crc, buf, len); | ||
#ifndef HAS_GPL //detect whether current CPU supports PCLMUL | ||
if (!has_pclmul()) | ||
if ((len < PCLMUL_MIN_LEN + PCLMUL_ALIGN - 1) || (!has_pclmul())) | ||
return crc32_generic(crc, buf, len); | ||
#endif | ||
|
||
/* Handle the leading patial chunk */ | ||
uInt misalign = PCLMUL_ALIGN_MASK & ((unsigned long)buf); | ||
uInt sz = (PCLMUL_ALIGN - misalign) % PCLMUL_ALIGN; | ||
|
@@ -482,12 +479,11 @@ uLong crc32(crc, buf, len) | |
buf += sz; | ||
len -= sz; | ||
} | ||
|
||
/* Go over 16-byte chunks */ | ||
#ifdef HAS_GPL | ||
crc = crc32_pclmul_le_16(buf, (len & ~PCLMUL_ALIGN_MASK), crc ^ 0xffffffffUL); | ||
#else | ||
//crc = crc32_pclmul_le_16(buf, (len & ~PCLMUL_ALIGN_MASK), crc ^ 0xffffffffUL); | ||
crc = crc32_simd(buf, (len & ~PCLMUL_ALIGN_MASK), crc ^ 0xffffffffUL); | ||
#endif | ||
|
||
crc = crc ^ 0xffffffffUL; | ||
|
||
/* Handle the trailing partial chunk */ | ||
|
@@ -693,12 +689,12 @@ uLong ZEXPORT crc32_combine(crc1, crc2, len2) | |
return crc32_combine_(crc1, crc2, len2); | ||
} | ||
|
||
/*uLong ZEXPORT crc32_combine64(crc1, crc2, len2) | ||
uLong ZEXPORT crc32_combine64(crc1, crc2, len2) | ||
uLong crc1; | ||
uLong crc2; | ||
z_off64_t len2; | ||
{ | ||
return crc32_combine_(crc1, crc2, len2); | ||
}*/ | ||
} | ||
|
||
#endif | ||
#endif |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I suggest you try to follow the integration method used in Chromium's zlib i.e. keeping SIMD/optimizations in separated files.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@Adenilson I have done so in my fork. Can I suggest I withdraw this pull request, and you either comment on my fork or make your own fork. Once all your concerns are addressed, we can generate a new pull request to the CloudFlare repo. I have tested the separate files on both MacOS and Linux, so I think this addresses your concern.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@Adenilson I should also point out that my repo copies the Chromium zlib code verbatim, and still includes the ARM code, even though I only tested and included the x86 code. I think the whole community would benefit if the ARM code was enhanced, but that is outside my wheelhouse.