Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

License, Windows and Compatibility Upgrade #18

Closed
wants to merge 12 commits into from
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 41 additions & 45 deletions crc32.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,12 @@

DYNAMIC_CRC_TABLE and MAKECRCH can be #defined to write out crc32.h.
*/
#ifdef HAS_PCLMUL

#include <emmintrin.h>
#include <smmintrin.h>
#include <wmmintrin.h>
//#include <stdio.h>
#include <cpuid.h>
#ifdef HAS_PCLMUL

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suggest you try to follow the integration method used in Chromium's zlib i.e. keeping SIMD/optimizations in separated files.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Adenilson I have done so in my fork. Can I suggest I withdraw this pull request, and you either comment on my fork or make your own fork. Once all your concerns are addressed, we can generate a new pull request to the CloudFlare repo. I have tested the separate files on both MacOS and Linux, so I think this addresses your concern.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Adenilson I should also point out that my repo copies the Chromium zlib code verbatim, and still includes the ARM code, even though I only tested and included the x86 code. I think the whole community would benefit if the ARM code was enhanced, but that is outside my wheelhouse.

#include <emmintrin.h>
#include <smmintrin.h>
#include <wmmintrin.h>

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think all of those can be replaced by a single #include <immintrin.h>

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

defines no longer in crc32, @Adenilson can decide if these can be replaced in arc_simd.c

#include <cpuid.h>
#endif

#ifdef __aarch64__
Expand Down Expand Up @@ -279,29 +278,6 @@ local unsigned long crc32_generic(crc, buf, len)

#ifdef HAS_PCLMUL


#ifdef HAS_GPL
extern uLong crc32_pclmul_le_16(unsigned char const *buffer, size_t len, uInt crc32);
#else

int cpu_has_pclmul = -1; //global: will be 0 or 1 after first test

int has_pclmul(void) {
if (cpu_has_pclmul >= 0)
return cpu_has_pclmul;
cpu_has_pclmul = 0;
int leaf = 1;
uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;
/* %ecx */
#define crc_bit_PCLMUL (1 << 1)
if (__get_cpuid(leaf, &eax, &ebx, &ecx, &edx)) {
//printf("leaf=%d, eax=0x%x, ebx=0x%x, ecx=0x%x, edx=0x%x\n", leaf, eax, ebx, ecx, edx);
if ((ecx & crc_bit_PCLMUL) != 0)
cpu_has_pclmul = 1;
}
return cpu_has_pclmul;
}

//https://github.com/webosose/chromium68/blob/master/src/third_party/zlib/crc32_simd.c

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe instead of pointing to an outdated copy of Chromium (M68), you could reference the ToT (Top of Tree) original repository Chromium repo*?

Since M68, I've added some new optimizations, refactorings and bug fixes on Chromium's zlib.
:-)

*https://cs.chromium.org/chromium/src/third_party/zlib/

/* crc32_simd.c
*
Expand Down Expand Up @@ -344,14 +320,14 @@ int has_pclmul(void) {
* "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
* V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
*/

#ifdef _MSC_VER
#define zalign(x) __declspec(align(x))
#else
#define zalign(x) __attribute__((aligned((x))))
#endif

uLong crc32_simd(unsigned char const *buf, size_t len, uInt crc) {
uint crc32_simd(unsigned char const *buf, size_t len, uInt crc) {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would rather use uint32_t everywhere the size is known to be 32 bits.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/*
* Definitions of the bit-reflected domain constants k1,k2,k3, etc and
* the CRC32+Barrett polynomials given at the end of the paper.
Expand Down Expand Up @@ -457,23 +433,44 @@ uLong crc32_simd(unsigned char const *buf, size_t len, uInt crc) {

}

#endif //Chromium code

#define PCLMUL_MIN_LEN 64
#define PCLMUL_ALIGN 16
#define PCLMUL_ALIGN_MASK 15

int cpu_has_pclmul = -1; //global: will be 0 or 1 after first test

int has_pclmul(void) {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this perhaps be called at init time with __attribute__((constructor)) ?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@vkrasnov I think my solution is robust and efficient and works on all the compilers I tested it on (gcc 4.8..9.2; Clang 10, Microsoft C compiler). No other location in this library uses the __attribute__((constructor)) style, and it is unclear if it is universally accepted. This is your code, which you maintain, so ultimately should be familiar to you. Since that style is unfamiliar to me, I would be grateful if you or @Adenilson could devise the implementation you are happy with.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure about __attribute__((constructor)), e.g. Rust intentionally avoids life before main, so I'm not sure how that would work if this library was wrapped as a Rust crate.

However, unsychronized access to a global variable is Undefined Behavior if code runs on multiple threads, and we definitely will use it in multi-threaded programs. So at very least it should be marked as atomic. I know x86 memory model is forgiving about such things, but things that are UB in the compiler, but happen to work in practice are a thin ice.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kornelski -I have tried to address this in my fork. The value is atomic. There is also no a compiler flag SKIP_CPUID_CHECK that allows one to assume that the CPU supports PCLMUL. One option would be to make the default behavior be to assume PCLMUL support. This would my commit would have the same compatibility as the current code, simply using the Chromium BSD-licensed SIMD CRC in place of the GPL code. This would eliminate @Adenilson's performance concerns, albeit just like the current code it will fail if the code is compiled on a system that supports PCLMUL but is run on a computer without support.

if (cpu_has_pclmul >= 0)
return cpu_has_pclmul;
cpu_has_pclmul = 0;
int leaf = 1;
uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;
/* %ecx */
#define crc_bit_PCLMUL (1 << 1)
if (__get_cpuid(leaf, &eax, &ebx, &ecx, &edx)) {
//printf("leaf=%d, eax=0x%x, ebx=0x%x, ecx=0x%x, edx=0x%x\n", leaf, eax, ebx, ecx, edx);
if ((ecx & crc_bit_PCLMUL) != 0)
cpu_has_pclmul = 1;
}
return cpu_has_pclmul;
}


/* Function stolen from linux kernel 3.14. It computes the CRC over the given
* buffer with initial CRC value <crc32>. The buffer is <len> byte in length,
* and must be 16-byte aligned.
*/
extern uint crc32_pclmul_le_16(unsigned char const *buffer,
size_t len, uInt crc32);

uLong crc32(crc, buf, len)
uLong crc;
const Bytef *buf;
uInt len;
{
if (len < PCLMUL_MIN_LEN + PCLMUL_ALIGN - 1)
return crc32_generic(crc, buf, len);
#ifndef HAS_GPL //detect whether current CPU supports PCLMUL
if (!has_pclmul())
if ((len < PCLMUL_MIN_LEN + PCLMUL_ALIGN - 1) || (!has_pclmul()))
return crc32_generic(crc, buf, len);
#endif

/* Handle the leading patial chunk */
uInt misalign = PCLMUL_ALIGN_MASK & ((unsigned long)buf);
uInt sz = (PCLMUL_ALIGN - misalign) % PCLMUL_ALIGN;
Expand All @@ -482,12 +479,11 @@ uLong crc32(crc, buf, len)
buf += sz;
len -= sz;
}

/* Go over 16-byte chunks */
#ifdef HAS_GPL
crc = crc32_pclmul_le_16(buf, (len & ~PCLMUL_ALIGN_MASK), crc ^ 0xffffffffUL);
#else
//crc = crc32_pclmul_le_16(buf, (len & ~PCLMUL_ALIGN_MASK), crc ^ 0xffffffffUL);
crc = crc32_simd(buf, (len & ~PCLMUL_ALIGN_MASK), crc ^ 0xffffffffUL);
#endif

crc = crc ^ 0xffffffffUL;

/* Handle the trailing partial chunk */
Expand Down Expand Up @@ -693,12 +689,12 @@ uLong ZEXPORT crc32_combine(crc1, crc2, len2)
return crc32_combine_(crc1, crc2, len2);
}

/*uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
uLong crc1;
uLong crc2;
z_off64_t len2;
{
return crc32_combine_(crc1, crc2, len2);
}*/
}

#endif
#endif