Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add optimized slide_hash for Power processors #457

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ set(VERSION "1.2.12.1")

option(ASM686 "Enable building i686 assembly implementation")
option(AMD64 "Enable building amd64 assembly implementation")
option(POWER "Enable building power implementation")

set(INSTALL_BIN_DIR "${CMAKE_INSTALL_PREFIX}/bin" CACHE PATH "Installation directory for executables")
set(INSTALL_LIB_DIR "${CMAKE_INSTALL_PREFIX}/lib" CACHE PATH "Installation directory for libraries")
Expand Down Expand Up @@ -140,6 +141,73 @@ if(CMAKE_COMPILER_IS_GNUCC)
add_definitions(-DASMV)
set_source_files_properties(${ZLIB_ASMS} PROPERTIES LANGUAGE C COMPILE_FLAGS -DNO_UNDERLINE)
endif()

# Test to see if we can use a GNU indirect function (ifunc) to detect and load
# optimized code at runtime. The probe program below declares test_ifunc()
# with __attribute__((ifunc)) bound to the resolver check_ifunc_native(); the
# result of building it is stored in HAS_C_ATTR_IFUNC.
CHECK_C_SOURCE_COMPILES("
static int test_ifunc_native(void)
{
return 1;
}
static int (*(check_ifunc_native(void)))(void)
{
return test_ifunc_native;
}
int test_ifunc(void) __attribute__ ((ifunc (\"check_ifunc_native\")));
int main(void)
{
return 0;
}
" HAS_C_ATTR_IFUNC)

# When the probe succeeds, define HAVE_IFUNC for the sources and add the
# header that provides the Z_IFUNC helper macro to the private header list.
if(HAS_C_ATTR_IFUNC)
add_definitions(-DHAVE_IFUNC)
set(ZLIB_PRIVATE_HDRS ${ZLIB_PRIVATE_HDRS} contrib/gcc/zifunc.h)
endif()

if(POWER)
    # Test to see if we can use the optimizations for Power: the target must
    # be Power (_ARCH_PPC) and the compiler must advertise
    # __builtin_cpu_supports(), which the runtime resolver relies on.
    CHECK_C_SOURCE_COMPILES("
#ifndef _ARCH_PPC
#error \"Target is not Power\"
#endif
#ifndef __BUILTIN_CPU_SUPPORTS__
#error \"Target doesn't support __builtin_cpu_supports()\"
#endif
int main() { return 0; }
" HAS_POWER_SUPPORT)

    # The optimized routines are dispatched through ifunc, so both the Power
    # probe and the ifunc probe (HAS_C_ATTR_IFUNC) have to succeed.
    if(HAS_POWER_SUPPORT AND HAS_C_ATTR_IFUNC)
        add_definitions(-DZ_POWER_OPT)

        # CMAKE_REQUIRED_FLAGS is read by every subsequent check_* call, so
        # remember the current value and restore it once the -mcpu probes
        # are finished instead of leaking -mcpu=power9 into later checks.
        set(ZLIB_POWER_SAVED_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")

        set(CMAKE_REQUIRED_FLAGS -mcpu=power8)
        CHECK_C_SOURCE_COMPILES("int main(void){return 0;}" POWER8)

        if(POWER8)
            add_definitions(-DZ_POWER8)
            set(ZLIB_POWER8
                contrib/power/slide_hash_power8.c)

            # Only the POWER8-specific sources are built with -mcpu=power8;
            # the rest of the library keeps the default target flags.
            set_source_files_properties(
                ${ZLIB_POWER8}
                PROPERTIES COMPILE_FLAGS -mcpu=power8)
        endif()

        set(CMAKE_REQUIRED_FLAGS -mcpu=power9)
        CHECK_C_SOURCE_COMPILES("int main(void){return 0;}" POWER9)

        set(CMAKE_REQUIRED_FLAGS "${ZLIB_POWER_SAVED_REQUIRED_FLAGS}")
        unset(ZLIB_POWER_SAVED_REQUIRED_FLAGS)

        if(POWER9)
            add_definitions(-DZ_POWER9)
            # No POWER9-specific sources exist yet; the list is a placeholder.
            set(ZLIB_POWER9 )

            # Skip the property call while the placeholder list is empty.
            if(ZLIB_POWER9)
                set_source_files_properties(
                    ${ZLIB_POWER9}
                    PROPERTIES COMPILE_FLAGS -mcpu=power9)
            endif()
        endif()

        list(APPEND ZLIB_PRIVATE_HDRS contrib/power/power.h)
        list(APPEND ZLIB_SRCS ${ZLIB_POWER8} ${ZLIB_POWER9})
    endif()
endif()
endif()

if(MSVC)
Expand Down
8 changes: 8 additions & 0 deletions Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,9 @@ crc32.o: $(SRCDIR)crc32.c
deflate.o: $(SRCDIR)deflate.c
$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c

slide_hash_power8.o: $(SRCDIR)contrib/power/slide_hash_power8.c
$(CC) $(CFLAGS) -mcpu=power8 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/slide_hash_power8.c

infback.o: $(SRCDIR)infback.c
$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)infback.c

Expand Down Expand Up @@ -217,6 +220,11 @@ deflate.lo: $(SRCDIR)deflate.c
$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c
-@mv objs/deflate.o $@

slide_hash_power8.lo: $(SRCDIR)contrib/power/slide_hash_power8.c
-@mkdir objs 2>/dev/null || test -d objs
$(CC) $(SFLAGS) -mcpu=power8 $(ZINC) -DPIC -c -o objs/slide_hash_power8.o $(SRCDIR)contrib/power/slide_hash_power8.c
-@mv objs/slide_hash_power8.o $@

infback.lo: $(SRCDIR)infback.c
-@mkdir objs 2>/dev/null || test -d objs
$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/infback.o $(SRCDIR)infback.c
Expand Down
66 changes: 66 additions & 0 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -836,6 +836,72 @@ EOF
fi
fi

# Test to see if we can use a GNU indirect function (ifunc) to detect and load
# optimized code at runtime. The probe declares test_ifunc() with
# __attribute__((ifunc)) bound to the resolver check_ifunc_native().
echo >> configure.log
cat > $test.c <<EOF
static int test_ifunc_native(void)
{
return 1;
}

static int (*(check_ifunc_native(void)))(void)
{
return test_ifunc_native;
}

int test_ifunc(void) __attribute__ ((ifunc ("check_ifunc_native")));
EOF

# On success, -DHAVE_IFUNC is appended to both SFLAGS and CFLAGS so that every
# later compile (including the Power probe that follows) sees the macro.
if tryboth $CC -c $CFLAGS $test.c; then
SFLAGS="${SFLAGS} -DHAVE_IFUNC"
CFLAGS="${CFLAGS} -DHAVE_IFUNC"
echo "Checking for attribute(ifunc) support... Yes." | tee -a configure.log
else
echo "Checking for attribute(ifunc) support... No." | tee -a configure.log
fi

# Test to see if we can use the optimizations for Power. The probe only
# preprocesses cleanly on a Power target (_ARCH_PPC) whose compiler provides
# __builtin_cpu_supports() and when -DHAVE_IFUNC is already part of CFLAGS.
echo >> configure.log
cat > $test.c <<EOF
#ifndef _ARCH_PPC
#error "Target is not Power"
#endif
#ifndef HAVE_IFUNC
#error "Target doesn't support ifunc"
#endif
#ifndef __BUILTIN_CPU_SUPPORTS__
#error "Target doesn't support __builtin_cpu_supports()"
#endif
EOF

# Start from empty values so that POWER8/POWER9 inherited from the environment
# cannot end up in CFLAGS/SFLAGS when a -mcpu probe below fails.
POWER8=""
POWER9=""

if tryboth $CC -c $CFLAGS $test.c; then
  # From here on only the acceptance of -mcpu=<cpu> is being probed, so a
  # trivial program is enough.
  echo "int main(void){return 0;}" > $test.c

  if tryboth $CC -c $CFLAGS -mcpu=power8 $test.c; then
    POWER8="-DZ_POWER8"
    PIC_OBJC="${PIC_OBJC} slide_hash_power8.lo"
    OBJC="${OBJC} slide_hash_power8.o"
    echo "Checking for -mcpu=power8 support... Yes." | tee -a configure.log
  else
    echo "Checking for -mcpu=power8 support... No." | tee -a configure.log
  fi

  if tryboth $CC -c $CFLAGS -mcpu=power9 $test.c; then
    POWER9="-DZ_POWER9"
    # No POWER9-specific objects exist yet, so PIC_OBJC and OBJC stay as is.
    echo "Checking for -mcpu=power9 support... Yes." | tee -a configure.log
  else
    echo "Checking for -mcpu=power9 support... No." | tee -a configure.log
  fi

  # Z_POWER_OPT enables the ifunc dispatch; Z_POWER8/Z_POWER9 are only added
  # for the CPU levels whose -mcpu flag was accepted above.
  SFLAGS="${SFLAGS} ${POWER8} ${POWER9} -DZ_POWER_OPT"
  CFLAGS="${CFLAGS} ${POWER8} ${POWER9} -DZ_POWER_OPT"
  echo "Checking for Power optimizations support... Yes." | tee -a configure.log
else
  echo "Checking for Power optimizations support... No." | tee -a configure.log
fi

# show the results in the log
echo >> configure.log
echo ALL = $ALL >> configure.log
Expand Down
8 changes: 8 additions & 0 deletions contrib/README.contrib
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ ada/ by Dmitriy Anisimkov <[email protected]>
blast/ by Mark Adler <[email protected]>
Decompressor for output of PKWare Data Compression Library (DCL)

gcc/ by Matheus Castanho <[email protected]>
and Rogerio Alves <[email protected]>
Optimization helpers using GCC-specific extensions

delphi/ by Cosmin Truta <[email protected]>
Support for Delphi and C++ Builder

Expand Down Expand Up @@ -42,6 +46,10 @@ minizip/ by Gilles Vollant <[email protected]>
pascal/ by Bob Dellaca <[email protected]> et al.
Support for Pascal

power/ by Matheus Castanho <[email protected]>
and Rogerio Alves <[email protected]>
Optimized functions for Power processors

puff/ by Mark Adler <[email protected]>
Small, low memory usage inflate. Also serves to provide an
unambiguous description of the deflate format.
Expand Down
60 changes: 60 additions & 0 deletions contrib/gcc/zifunc.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/* Copyright (C) 2019 Matheus Castanho <[email protected]>, IBM
* 2019 Rogerio Alves <[email protected]>, IBM
* For conditions of distribution and use, see copyright notice in zlib.h
*/

#ifndef Z_IFUNC_H_
#define Z_IFUNC_H_

/* Helpers for arch optimizations */

/* Z_IFUNC(fname): declare fname as a GNU indirect function bound to the
 * resolver fname_resolver, then open the definition of that resolver (the
 * caller supplies the body in braces).
 *
 * __typeof__ is used instead of typeof so the macro also compiles when GCC
 * is run in a strict ISO mode, where the plain typeof keyword is disabled. */
#define Z_IFUNC(fname) \
    __typeof__(fname) fname __attribute__ ((ifunc (#fname "_resolver"))); \
    local __typeof__(fname) *fname##_resolver(void)
/* This is a helper macro to declare a resolver for an indirect function
* (ifunc). Let's say you have function
*
* int foo (int a);
*
* for which you want to provide different implementations, for example:
*
* int foo_clever (int a) {
* ... clever things ...
* }
*
* int foo_smart (int a) {
* ... smart things ...
* }
*
* You will have to declare foo() as an indirect function and also provide a
* resolver for it, to choose between foo_clever() and foo_smart() based on
* some criteria you define (e.g. processor features).
*
* Since most likely foo() has a default implementation somewhere in zlib, you
* may have to rename it so the 'foo' symbol can be used by the ifunc without
* conflicts.
*
* #define foo foo_default
* int foo (int a) {
* ...
* }
* #undef foo
*
* Now you just have to provide a resolver function to choose which function
* should be used (decided at runtime on the first call to foo()):
*
* Z_IFUNC(foo) {
* if (... some condition ...)
* return foo_clever;
*
* if (... other condition ...)
* return foo_smart;
*
* return foo_default;
* }
*
* All calls to foo() throughout the code can remain untouched, all the magic
* will be done by the linker using the resolver function.
*/

#endif /* Z_IFUNC_H_ */
8 changes: 8 additions & 0 deletions contrib/power/power.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
/* Copyright (C) 2019 Matheus Castanho <[email protected]>, IBM
* 2019 Rogerio Alves <[email protected]>, IBM
* For conditions of distribution and use, see copyright notice in zlib.h
*/

#ifndef Z_POWER_H_
#define Z_POWER_H_

#include "../../deflate.h"

/* Vectorized slide_hash() replacement for POWER8, defined in
 * contrib/power/slide_hash_power8.c. Declared with ZLIB_INTERNAL so the
 * prototype carries the same attributes as the definition. */
void ZLIB_INTERNAL _slide_hash_power8(deflate_state *s);

#endif /* Z_POWER_H_ */
63 changes: 63 additions & 0 deletions contrib/power/slide_hash_power8.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/* Copyright (C) 2019 Matheus Castanho <[email protected]>, IBM
* For conditions of distribution and use, see copyright notice in zlib.h
*/

#include <altivec.h>
#include "../../deflate.h"

local inline void slide_hash_power8_loop OF((deflate_state *s,
        unsigned n_elems, Posf *table_end)) __attribute__((always_inline));

/* Subtract s->w_size, with unsigned saturation, from each of the n_elems
 * entries that end at table_end, walking backwards 8 entries at a time.
 *
 *   s          deflate state; only s->w_size is read here
 *   n_elems    number of Posf entries in the table (a multiple of 8)
 *   table_end  pointer one past the last entry of the table
 */
local void slide_hash_power8_loop(
    deflate_state *s,
    unsigned n_elems,
    Posf *table_end)
{
    vector unsigned short vw, vm, *vp;
    unsigned chunks;

    /* Each vector register (chunk) corresponds to 128 bits == 8 Posf,
     * so instead of processing each of the n_elems in the hash table
     * individually, we can do it in chunks of 8 with vector instructions.
     *
     * This function is only called from _slide_hash_power8(), and both calls
     * pass n_elems as a power of 2 higher than 2^7, as defined by
     * deflateInit2_(), so n_elems will always be a multiple of 8. */
    chunks = n_elems >> 3;
    Assert(n_elems % 8 == 0, "Weird hash table size!");

    /* This type casting is safe since s->w_size is always <= 64KB
     * as defined by deflateInit2_() and Posf == unsigned short.
     * vec_splats() fills every lane from the scalar directly, so no lane of
     * an uninitialized vector is ever read. */
    vw = vec_splats((unsigned short) s->w_size);

    /* NOTE(review): the vector loads/stores below assume the table is
     * 16-byte aligned -- confirm against the allocator used for
     * s->head / s->prev. */
    vp = (vector unsigned short *) table_end;

    do {
        /* Processing 8 elements at a time */
        vp--;
        vm = *vp;

        /* This is equivalent to: m >= w_size ? m - w_size : 0
         * Since we are using a saturated unsigned subtraction, any
         * values that are < w_size will be set to 0, while the others
         * will have w_size subtracted from them. */
        *vp = vec_subs(vm,vw);
    } while (--chunks);
}

/* POWER8 entry point: slide the head table (s->hash_size entries) and, unless
 * built with FASTEST, the prev table (s->w_size entries) using the vector
 * loop. Each call receives a pointer one past the end of the table because
 * the loop walks backwards. */
void ZLIB_INTERNAL _slide_hash_power8(deflate_state *s)
{
    slide_hash_power8_loop(s, s->hash_size, s->head + s->hash_size);

#ifndef FASTEST
    slide_hash_power8_loop(s, s->w_size, s->prev + s->w_size);
#endif
}
15 changes: 15 additions & 0 deletions contrib/power/slide_hash_resolver.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/* Copyright (C) 2019 Matheus Castanho <[email protected]>, IBM
* For conditions of distribution and use, see copyright notice in zlib.h
*/

#include "../gcc/zifunc.h"
#include "power.h"

/* Resolver for slide_hash(): returns the implementation that calls to
 * slide_hash() should be bound to. This file is included from deflate.c,
 * where the portable implementation is compiled as slide_hash_default. */
Z_IFUNC(slide_hash) {
#ifdef Z_POWER8
/* Z_POWER8 is only defined when the build accepted -mcpu=power8; the
 * vector version is then chosen only if the running CPU reports the
 * "arch_2_07" feature. */
if (__builtin_cpu_supports("arch_2_07"))
return _slide_hash_power8;
#endif

/* Fall back to the portable C implementation. */
return slide_hash_default;
}
12 changes: 12 additions & 0 deletions deflate.c
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,13 @@ local const config configuration_table[10] = {
(unsigned)(s->hash_size-1)*sizeof(*s->head)); \
} while (0)

#ifdef Z_POWER_OPT
/* Rename function so resolver can use its symbol. The default version will be
* returned by the resolver if the host has no support for an optimized version.
*/
#define slide_hash slide_hash_default
#endif /* Z_POWER_OPT */

/* ===========================================================================
* Slide the hash table when sliding the window down (could be avoided with 32
* bit values at the expense of memory usage). We slide even when level == 0 to
Expand Down Expand Up @@ -227,6 +234,11 @@ local void slide_hash(s)
#endif
}

#ifdef Z_POWER_OPT
#undef slide_hash
#include "contrib/power/slide_hash_resolver.c"
#endif /* Z_POWER_OPT */

/* ========================================================================= */
int ZEXPORT deflateInit_(strm, level, version, stream_size)
z_streamp strm;
Expand Down