Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add multithreading support for Haswell DDOT #1491

Merged
merged 2 commits into from
Mar 27, 2018
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 66 additions & 1 deletion kernel/x86_64/ddot.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "ddot_microk_sandy-2.c"
#endif

/*
 * RETURN_TYPE is the type in which partial dot products are stored and
 * read back: double when DSDOT is defined, FLOAT otherwise.
 */
#if defined(DSDOT)
#define RETURN_TYPE double
#else
#define RETURN_TYPE FLOAT
#endif


#ifndef HAVE_KERNEL_8

Expand Down Expand Up @@ -71,7 +77,7 @@ static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)

#endif

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
static FLOAT dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
Expand Down Expand Up @@ -139,4 +145,63 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)

}

#if defined(SMP)
/*
 * Callback handed to blas_level1_thread_with_return_value() by CNAME.
 *
 * Computes the dot product of the (n, x, inc_x, y, inc_y) slice it is given
 * with dot_compute() and stores the value in *result.  The dummy0..dummy3
 * parameters are not used here.  Always returns 0.
 */
static int dot_thread_function(BLASLONG n, BLASLONG dummy0,
	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
	BLASLONG inc_y, RETURN_TYPE *result, BLASLONG dummy3)
{
	RETURN_TYPE partial = dot_compute(n, x, inc_x, y, inc_y);

	result[0] = partial;
	return 0;
}

/*
 * Threaded level-1 driver; only declared here, defined outside this file.
 * CNAME calls it with `function` set to dot_thread_function and `c` set to
 * a buffer from which it afterwards reads one partial result per thread.
 * NOTE(review): presumably this splits the m elements across nthreads
 * workers and gives each worker its own slot in c, 2 * sizeof(double)
 * bytes apart (the stride CNAME uses when summing) -- confirm against the
 * driver's definition.
 */
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
void *c, BLASLONG ldc, int (*function)(), int nthreads);
#endif

/*
 * Dot product entry point.
 *
 * Parameters:
 *   n            number of elements to process
 *   x, inc_x     first vector and its stride
 *   y, inc_y     second vector and its stride
 *
 * Returns the value produced by dot_compute() for the whole input, or, in
 * SMP builds with more than one thread, the sum of the per-thread partial
 * results produced by dot_thread_function().
 *
 * Threading is used only when SMP is defined, both strides are non-zero and
 * n is greater than 10000; every other case runs dot_compute() directly.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
#if defined(SMP)
	int nthreads;
	/* Only its address is passed on; initialised so no indeterminate
	 * value can ever be read through that pointer. */
	FLOAT dummy_alpha = 0.0;
#endif
	FLOAT dot = 0.0;

#if defined(SMP)
	/* Rule out threading first, so the CPU count is queried only when it
	 * can actually influence the result. */
	if (inc_x == 0 || inc_y == 0 || n <= 10000) {
		nthreads = 1;
	} else {
		nthreads = num_cpu_avail(1);
	}

	if (nthreads == 1) {
		dot = dot_compute(n, x, inc_x, y, inc_y);
	} else {
		int mode, i;
		/*
		 * One slot per thread.  The padding keeps each slot
		 * 2 * sizeof(double) bytes wide, the same stride the previous
		 * raw char buffer was walked with, while the union gives every
		 * slot the alignment and effective type of RETURN_TYPE.
		 * NOTE(review): this assumes the alignment of RETURN_TYPE
		 * divides 2 * sizeof(double), so that no tail padding is added.
		 */
		union {
			RETURN_TYPE value;
			char pad[sizeof(double) * 2];
		} result[MAX_CPU_NUMBER];

#if !defined(DOUBLE)
		mode = BLAS_SINGLE | BLAS_REAL;
#else
		mode = BLAS_DOUBLE | BLAS_REAL;
#endif
		/* Cast to the parameter's own function-pointer type instead of
		 * going through void *, which is not a conforming conversion. */
		blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
			x, inc_x, y, inc_y, result, 0,
			(int (*)())dot_thread_function, nthreads);

		/* NOTE(review): assumes the driver filled all nthreads slots --
		 * confirm against its definition. */
		for (i = 0; i < nthreads; i++) {
			dot = dot + result[i].value;
		}
	}
#else
	dot = dot_compute(n, x, inc_x, y, inc_y);
#endif

	return dot;
}