28 fprintf(stderr,
"gpu_allocate: ptr argument is NULL\n");
31 *ptr = malloc((
size_t) n);
33 perror(
"gpu_allocate: malloc failed");
39 if (ptr == NULL || *ptr == NULL) {
/*
 * gpu_upload: copy n bytes from cpu_ptr to gpu_ptr.
 *
 * The transfer is a plain memcpy, so both pointers are accessed as ordinary
 * host memory by this implementation.
 *
 *   cpu_ptr  source buffer (read only)
 *   gpu_ptr  destination buffer
 *   n        number of bytes to copy
 *
 * When either pointer is NULL a diagnostic is written to stderr.
 * NOTE(review): the statement that follows the diagnostic is not visible in
 * this excerpt - confirm that it leaves the function before the memcpy.
 * NOTE(review): n is a signed int64_t while memcpy takes a size_t; a negative
 * n would be converted to a very large length - confirm callers guarantee
 * n >= 0.
 */
53void gpu_upload(
const void* cpu_ptr,
void* gpu_ptr,
const int64_t n) {
54 if (cpu_ptr == NULL || gpu_ptr == NULL) {
55 fprintf(stderr,
"gpu_upload: NULL pointer argument\n");
/* memcpy requires that the two regions do not overlap. */
58 memcpy(gpu_ptr, cpu_ptr, n);
/*
 * gpu_download: copy n bytes from gpu_ptr to cpu_ptr.
 *
 * Same code shape as gpu_upload with the direction reversed: the transfer is
 * a plain memcpy, so both pointers are accessed as ordinary host memory.
 *
 *   gpu_ptr  source buffer (read only)
 *   cpu_ptr  destination buffer
 *   n        number of bytes to copy
 *
 * When either pointer is NULL a diagnostic is written to stderr.
 * NOTE(review): the statement that follows the diagnostic is not visible in
 * this excerpt - confirm that it leaves the function before the memcpy.
 * NOTE(review): n is a signed int64_t while memcpy takes a size_t; a negative
 * n would be converted to a very large length - confirm callers guarantee
 * n >= 0.
 */
61void gpu_download(
const void* gpu_ptr,
void* cpu_ptr,
const int64_t n) {
62 if (gpu_ptr == NULL || cpu_ptr == NULL) {
63 fprintf(stderr,
"gpu_download: NULL pointer argument\n");
/* memcpy requires that the two regions do not overlap. */
66 memcpy(cpu_ptr, gpu_ptr, n);
/*
 * gpu_copy: copy n bytes from gpu_ptr_src to gpu_ptr_dest.
 *
 * The transfer is a plain memcpy, so both pointers are accessed as ordinary
 * host memory by this implementation.
 *
 *   gpu_ptr_src   source buffer (read only)
 *   gpu_ptr_dest  destination buffer
 *   n             number of bytes to copy
 *
 * When either pointer is NULL a diagnostic is written to stderr.
 * NOTE(review): the statement that follows the diagnostic is not visible in
 * this excerpt - confirm that it leaves the function before the memcpy.
 * NOTE(review): memcpy has undefined behavior for overlapping regions, and
 * both arguments here are described as buffers of the same kind - confirm
 * callers never pass overlapping ranges (memmove would be needed otherwise).
 * NOTE(review): n is a signed int64_t while memcpy takes a size_t; a negative
 * n would be converted to a very large length - confirm callers guarantee
 * n >= 0.
 */
69void gpu_copy(
const void* gpu_ptr_src,
void* gpu_ptr_dest,
const int64_t n) {
70 if (gpu_ptr_src == NULL || gpu_ptr_dest == NULL) {
71 fprintf(stderr,
"gpu_copy: NULL pointer argument\n");
74 memcpy(gpu_ptr_dest, gpu_ptr_src, n);
81 *ptr = (
void*) malloc(
sizeof(
char));
111 if (value > INT32_MAX || value < INT32_MIN) {
112 fprintf(stderr,
"Integer overflow: %s value %lld exceeds int32_t range\n", name, (
long long)value);
119 *handle = (
void*) malloc(
sizeof(
char));
/*
 * Declaration of the external routine called by gpu_ddot; no definition is
 * visible in this excerpt. Every argument is passed by pointer, the length
 * and the two strides are 32-bit integers, and the value is returned as a
 * double.
 * NOTE(review): presumably the Fortran BLAS DDOT entry point (trailing
 * underscore symbol) - confirm against the library that is actually linked.
 */
129double ddot_(
const int32_t* n,
const double* x,
const int32_t* incx,
const double* y,
const int32_t* incy);
/*
 * gpu_ddot: forward a double-precision vector operation to ddot_ and store
 * the value it returns in *result.
 *
 *   handle      only checked to be non-NULL (via assert) in the visible lines
 *   n           element count, 64-bit on this interface
 *   x, incx     first input vector and its stride
 *   y, incy     second input vector and its stride
 *   result      output location, written unconditionally after the call
 *
 * ddot_ takes 32-bit integers by pointer, so the 64-bit arguments are
 * narrowed into the local n_, incx_ and incy_ first.
 * NOTE(review): the assignment of n_ and whatever range checking sits
 * between the narrowing and the call are not visible in this excerpt -
 * confirm that n_ is initialised and that out-of-range values are rejected.
 */
131void gpu_ddot(
void* handle,
const int64_t n,
const double* x,
const int64_t incx,
const double* y,
const int64_t incy,
double* result) {
132 assert (handle != NULL);
135 int32_t n_, incx_, incy_;
/* Narrow the 64-bit strides to the 32-bit integers ddot_ expects. */
138 incx_ = (int32_t) incx;
139 incy_ = (int32_t) incy;
149 *result =
ddot_(&n_, x, &incx_, y, &incy_);
/*
 * Declaration of the external routine called by gpu_sdot; no definition is
 * visible in this excerpt. Every argument is passed by pointer, the length
 * and the two strides are 32-bit integers, and the value is returned as a
 * float.
 * NOTE(review): presumably the Fortran BLAS SDOT entry point (trailing
 * underscore symbol). Some Fortran ABIs return REAL functions as double -
 * confirm the float return type against the library that is actually linked.
 */
153float sdot_(
const int32_t* n,
const float* x,
const int32_t* incx,
const float* y,
const int32_t* incy);
/*
 * gpu_sdot: forward a single-precision vector operation to sdot_ and store
 * the value it returns in *result.
 *
 *   handle      only checked to be non-NULL (via assert) in the visible lines
 *   n           element count, 64-bit on this interface
 *   x, incx     first input vector and its stride
 *   y, incy     second input vector and its stride
 *   result      output location, written unconditionally after the call
 *
 * sdot_ takes 32-bit integers by pointer, so the 64-bit arguments are
 * narrowed into n_, incx_ and incy_; the asserts then widen each value back
 * and compare it with the original to detect truncation.
 * NOTE(review): the assignment of n_ is not visible in this excerpt - confirm
 * it precedes the assert on n_.
 * NOTE(review): assert is compiled out when NDEBUG is defined, so in such
 * builds a truncated length or stride reaches sdot_ unchecked.
 */
155void gpu_sdot(
void* handle,
const int64_t n,
const float* x,
const int64_t incx,
const float* y,
const int64_t incy,
float* result) {
156 assert (handle != NULL);
159 int32_t n_, incx_, incy_;
162 incx_ = (int32_t) incx;
163 incy_ = (int32_t) incy;
/* Round-trip checks: a mismatch means the value did not fit in int32_t. */
166 assert ( (int64_t) n_ == n );
167 assert ( (int64_t) incx_ == incx);
168 assert ( (int64_t) incy_ == incy);
170 *result =
sdot_(&n_, x, &incx_, y, &incy_);
/*
 * Declaration of the external routine called by gpu_dgemv; no definition is
 * visible in this excerpt. Every argument, including the scalars alpha and
 * beta and the 32-bit dimensions and strides, is passed by pointer; y is the
 * only argument that is not const-qualified.
 * NOTE(review): presumably the Fortran BLAS DGEMV entry point (trailing
 * underscore symbol) - confirm against the library that is actually linked.
 */
174void dgemv_(
const char* transa,
const int32_t* m,
const int32_t* n,
const double* alpha,
175 const double* a,
const int32_t* lda,
const double* x,
const int32_t* incx,
const double* beta,
double* y,
const int32_t* incy);
/*
 * gpu_dgemv: forward a double-precision matrix-vector operation to dgemv_.
 *
 *   handle        only checked to be non-NULL (via assert) in the visible lines
 *   transa        passed through to dgemv_ unchanged
 *   m, n          matrix dimensions, 64-bit on this interface
 *   alpha, beta   pointers to the scalars, passed through unchanged
 *   a, lda        matrix and its leading dimension
 *   x, incx       input vector and its stride
 *   y, incy       vector written by dgemv_ and its stride
 *
 * dgemv_ takes 32-bit integers by pointer, so the 64-bit dimensions and
 * strides are narrowed into m_, n_, lda_, incx_ and incy_; the asserts then
 * widen each value back and compare it with the original to detect
 * truncation.
 * NOTE(review): the assignments of m_ and n_ are not visible in this excerpt -
 * confirm they precede the asserts.
 * NOTE(review): assert is compiled out when NDEBUG is defined, so in such
 * builds truncated dimensions reach dgemv_ unchecked.
 */
177void gpu_dgemv(
void* handle,
const char* transa,
const int64_t m,
const int64_t n,
const double* alpha,
178 const double* a,
const int64_t lda,
const double* x,
const int64_t incx,
const double* beta,
double* y,
const int64_t incy) {
180 assert (handle != NULL);
183 int32_t m_, n_, lda_, incx_, incy_;
187 lda_ = (int32_t) lda;
188 incx_ = (int32_t) incx;
189 incy_ = (int32_t) incy;
/* Round-trip checks: a mismatch means the value did not fit in int32_t. */
192 assert ( (int64_t) m_ == m );
193 assert ( (int64_t) n_ == n );
194 assert ( (int64_t) lda_ == lda );
195 assert ( (int64_t) incx_ == incx);
196 assert ( (int64_t) incy_ == incy);
198 dgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_);
/*
 * Declaration of the external routine called by gpu_sgemv; no definition is
 * visible in this excerpt. Every argument, including the scalars alpha and
 * beta and the 32-bit dimensions and strides, is passed by pointer; y is the
 * only argument that is not const-qualified.
 * NOTE(review): presumably the Fortran BLAS SGEMV entry point (trailing
 * underscore symbol) - confirm against the library that is actually linked.
 */
202void sgemv_(
const char* transa,
const int32_t* m,
const int32_t* n,
const float* alpha,
203 const float* a,
const int32_t* lda,
const float* x,
const int32_t* incx,
const float* beta,
float* y,
const int32_t* incy);
/*
 * gpu_sgemv: forward a single-precision matrix-vector operation to sgemv_.
 *
 *   handle        only checked to be non-NULL (via assert) in the visible lines
 *   transa        passed through to sgemv_ unchanged
 *   m, n          matrix dimensions, 64-bit on this interface
 *   alpha, beta   pointers to the scalars, passed through unchanged
 *   a, lda        matrix and its leading dimension
 *   x, incx       input vector and its stride
 *   y, incy       vector written by sgemv_ and its stride
 *
 * sgemv_ takes 32-bit integers by pointer, so the 64-bit dimensions and
 * strides are narrowed into m_, n_, lda_, incx_ and incy_; the asserts then
 * widen each value back and compare it with the original to detect
 * truncation.
 * NOTE(review): the assignments of m_ and n_ are not visible in this excerpt -
 * confirm they precede the asserts.
 * NOTE(review): assert is compiled out when NDEBUG is defined, so in such
 * builds truncated dimensions reach sgemv_ unchecked.
 */
205void gpu_sgemv(
void* handle,
const char* transa,
const int64_t m,
const int64_t n,
const float* alpha,
206 const float* a,
const int64_t lda,
const float* x,
const int64_t incx,
const float* beta,
float* y,
const int64_t incy) {
208 assert (handle != NULL);
211 int32_t m_, n_, lda_, incx_, incy_;
215 lda_ = (int32_t) lda;
216 incx_ = (int32_t) incx;
217 incy_ = (int32_t) incy;
/* Round-trip checks: a mismatch means the value did not fit in int32_t. */
220 assert ( (int64_t) m_ == m );
221 assert ( (int64_t) n_ == n );
222 assert ( (int64_t) lda_ == lda );
223 assert ( (int64_t) incx_ == incx);
224 assert ( (int64_t) incy_ == incy);
226 sgemv_(transa, &m_, &n_, alpha, a, &lda_, x, &incx_, beta, y, &incy_);
/*
 * Declaration of the external routine called by gpu_dgemm; no definition is
 * visible in this excerpt. Every argument, including the scalars alpha and
 * beta and the 32-bit dimensions and leading dimensions, is passed by
 * pointer; c is the only argument that is not const-qualified.
 * NOTE(review): presumably the Fortran BLAS DGEMM entry point (trailing
 * underscore symbol) - confirm against the library that is actually linked.
 */
230void dgemm_(
const char* transa,
const char* transb,
const int32_t* m,
const int32_t* n,
const int32_t* k,
const double* alpha,
231 const double* a,
const int32_t* lda,
const double* b,
const int32_t* ldb,
const double* beta,
double* c,
const int32_t* ldc);
/*
 * gpu_dgemm: forward a double-precision matrix-matrix operation to dgemm_.
 *
 *   handle          only checked to be non-NULL (via assert) in the visible lines
 *   transa, transb  passed through to dgemm_ unchanged
 *   m, n, k         problem dimensions, 64-bit on this interface
 *   alpha, beta     pointers to the scalars, passed through unchanged
 *   a, lda          first input matrix and its leading dimension
 *   b, ldb          second input matrix and its leading dimension
 *   c, ldc          matrix written by dgemm_ and its leading dimension
 *
 * dgemm_ takes 32-bit integers by pointer, so the 64-bit dimensions are
 * narrowed into m_, n_, k_, lda_, ldb_ and ldc_; the asserts then widen each
 * value back and compare it with the original to detect truncation.
 * NOTE(review): the assignments of m_, n_ and k_ are not visible in this
 * excerpt - confirm they precede the asserts.
 * NOTE(review): assert is compiled out when NDEBUG is defined, so in such
 * builds truncated dimensions reach dgemm_ unchecked.
 */
233void gpu_dgemm(
void* handle,
const char* transa,
const char* transb,
const int64_t m,
const int64_t n,
const int64_t k,
const double* alpha,
234 const double* a,
const int64_t lda,
const double* b,
const int64_t ldb,
const double* beta,
double* c,
const int64_t ldc) {
236 assert (handle != NULL);
239 int32_t m_, n_, k_, lda_, ldb_, ldc_;
244 lda_ = (int32_t) lda;
245 ldb_ = (int32_t) ldb;
246 ldc_ = (int32_t) ldc;
/* Round-trip checks: a mismatch means the value did not fit in int32_t. */
249 assert ( (int64_t) m_ == m );
250 assert ( (int64_t) n_ == n );
251 assert ( (int64_t) k_ == k );
252 assert ( (int64_t) lda_ == lda);
253 assert ( (int64_t) ldb_ == ldb);
254 assert ( (int64_t) ldc_ == ldc);
256 dgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_);
/*
 * Declaration of the external routine called by gpu_sgemm; no definition is
 * visible in this excerpt. Every argument, including the scalars alpha and
 * beta and the 32-bit dimensions and leading dimensions, is passed by
 * pointer; c is the only argument that is not const-qualified.
 * NOTE(review): presumably the Fortran BLAS SGEMM entry point (trailing
 * underscore symbol) - confirm against the library that is actually linked.
 */
261void sgemm_(
const char* transa,
const char* transb,
const int32_t* m,
const int32_t* n,
const int32_t* k,
const float* alpha,
262 const float* a,
const int32_t* lda,
const float* b,
const int32_t* ldb,
const float* beta,
float* c,
const int32_t* ldc);
/*
 * gpu_sgemm: forward a single-precision matrix-matrix operation to sgemm_.
 *
 *   handle          only checked to be non-NULL (via assert) in the visible lines
 *   transa, transb  passed through to sgemm_ unchanged
 *   m, n, k         problem dimensions, 64-bit on this interface
 *   alpha, beta     pointers to the scalars, passed through unchanged
 *   a, lda          first input matrix and its leading dimension
 *   b, ldb          second input matrix and its leading dimension
 *   c, ldc          matrix written by sgemm_ and its leading dimension
 *
 * sgemm_ takes 32-bit integers by pointer, so the 64-bit dimensions are
 * narrowed into m_, n_, k_, lda_, ldb_ and ldc_; the asserts then widen each
 * value back and compare it with the original to detect truncation.
 * NOTE(review): the assignments of m_, n_ and k_ are not visible in this
 * excerpt - confirm they precede the asserts.
 * NOTE(review): assert is compiled out when NDEBUG is defined, so in such
 * builds truncated dimensions reach sgemm_ unchecked.
 */
264void gpu_sgemm(
void* handle,
const char* transa,
const char* transb,
const int64_t m,
const int64_t n,
const int64_t k,
const float* alpha,
265 const float* a,
const int64_t lda,
const float* b,
const int64_t ldb,
const float* beta,
float* c,
const int64_t ldc) {
267 assert (handle != NULL);
270 int32_t m_, n_, k_, lda_, ldb_, ldc_;
275 lda_ = (int32_t) lda;
276 ldb_ = (int32_t) ldb;
277 ldc_ = (int32_t) ldc;
/* Round-trip checks: a mismatch means the value did not fit in int32_t. */
280 assert ( (int64_t) m_ == m );
281 assert ( (int64_t) n_ == n );
282 assert ( (int64_t) k_ == k );
283 assert ( (int64_t) lda_ == lda);
284 assert ( (int64_t) ldb_ == ldb);
285 assert ( (int64_t) ldc_ == ldc);
287 sgemm_(transa, transb, &m_, &n_, &k_, alpha, a, &lda_, b, &ldb_, beta, c, &ldc_);
/*
 * gpu_dgeam: element-wise combination of two double matrices, written with
 * plain nested loops (no external routine is called in the visible lines).
 *
 * Every visible assignment has the form
 *     c[j*ldc + i] = *alpha * A(i,j) + *beta * B(i,j)   for 0 <= i < m, 0 <= j < n
 * where A(i,j) is a[j*lda + i] when *transa is 'N' or 'n' and a[i*lda + j]
 * when it is 'T' or 't'; B(i,j) is taken from b, ldb and *transb the same
 * way. Within each transa/transb branch there are three loop nests: one
 * that keeps only the beta*b term, one (taken when *beta == 0.) that keeps
 * only the alpha*a term, and one that keeps both.
 *
 *   handle          only checked to be non-NULL (via assert) in the visible lines
 *   transa, transb  only the first character of each is examined
 *   m, n            loop extents: i runs over m, j runs over n
 *   alpha, beta     pointers to the two scalars
 *   a, lda          first input and the stride used to index it
 *   b, ldb          second input and the stride used to index it
 *   c, ldc          output and the stride used to index it
 *
 * NOTE(review): the condition guarding the first (beta-only) loop nest of
 * each branch is not visible in this excerpt; presumably a test for
 * *alpha == 0. - confirm.
 * NOTE(review): no branch for any other transa/transb character is visible -
 * confirm how unrecognised values are handled.
 */
291void gpu_dgeam(
void* handle,
const char* transa,
const char* transb,
const int64_t m,
const int64_t n,
const double* alpha,
292 const double* a,
const int64_t lda,
const double* beta,
const double* b,
const int64_t ldb,
double* c,
const int64_t ldc) {
293 assert (handle != NULL);
295 if ( (*transa ==
'N' && *transb ==
'N') ||
296 (*transa ==
'n' && *transb ==
'N') ||
297 (*transa ==
'N' && *transb ==
'n') ||
298 (*transa ==
'n' && *transb ==
'n') ) {
/* Branch 1: a and b are both indexed as stored ([j*ld + i]). */
/* beta*b term only. */
302 for (int64_t j=0 ; j<n ; ++j) {
303 for (int64_t i=0 ; i<m ; ++i) {
304 c[j*ldc+i] = *beta * b[j*ldb+i];
308 }
else if (*beta == 0.) {
/* alpha*a term only. */
310 for (int64_t j=0 ; j<n ; ++j) {
311 for (int64_t i=0 ; i<m ; ++i) {
312 c[j*ldc+i] = *alpha * a[j*lda+i];
/* Both terms. */
318 for (int64_t j=0 ; j<n ; ++j) {
319 for (int64_t i=0 ; i<m ; ++i) {
320 c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[j*ldb+i];
326 }
else if ( (*transa ==
'N' && *transb ==
'T') ||
327 (*transa ==
'n' && *transb ==
'T') ||
328 (*transa ==
'N' && *transb ==
't') ||
329 (*transa ==
'n' && *transb ==
't') ) {
/* Branch 2: a indexed as stored, b read with swapped indices (b[i*ldb + j]). */
/* beta*b term only. */
333 for (int64_t j=0 ; j<n ; ++j) {
334 for (int64_t i=0 ; i<m ; ++i) {
335 c[j*ldc+i] = *beta * b[i*ldb+j];
339 }
else if (*beta == 0.) {
/* alpha*a term only. */
341 for (int64_t j=0 ; j<n ; ++j) {
342 for (int64_t i=0 ; i<m ; ++i) {
343 c[j*ldc+i] = *alpha * a[j*lda+i];
/* Both terms. */
349 for (int64_t j=0 ; j<n ; ++j) {
350 for (int64_t i=0 ; i<m ; ++i) {
351 c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[i*ldb+j];
357 }
else if ( (*transa ==
'T' && *transb ==
'N') ||
358 (*transa ==
't' && *transb ==
'N') ||
359 (*transa ==
'T' && *transb ==
'n') ||
360 (*transa ==
't' && *transb ==
'n') ) {
/* Branch 3: a read with swapped indices (a[i*lda + j]), b indexed as stored. */
/* beta*b term only. */
364 for (int64_t j=0 ; j<n ; ++j) {
365 for (int64_t i=0 ; i<m ; ++i) {
366 c[j*ldc+i] = *beta * b[j*ldb+i];
370 }
else if (*beta == 0.) {
/* alpha*a term only. */
372 for (int64_t j=0 ; j<n ; ++j) {
373 for (int64_t i=0 ; i<m ; ++i) {
374 c[j*ldc+i] = *alpha * a[i*lda+j];
/* Both terms. */
380 for (int64_t j=0 ; j<n ; ++j) {
381 for (int64_t i=0 ; i<m ; ++i) {
382 c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[j*ldb+i];
388 }
else if ( (*transa ==
'T' && *transb ==
'T') ||
389 (*transa ==
't' && *transb ==
'T') ||
390 (*transa ==
'T' && *transb ==
't') ||
391 (*transa ==
't' && *transb ==
't') ) {
/* Branch 4: a and b are both read with swapped indices ([i*ld + j]). */
/* beta*b term only. */
395 for (int64_t j=0 ; j<n ; ++j) {
396 for (int64_t i=0 ; i<m ; ++i) {
397 c[j*ldc+i] = *beta * b[i*ldb+j];
401 }
else if (*beta == 0.) {
/* alpha*a term only. */
403 for (int64_t j=0 ; j<n ; ++j) {
404 for (int64_t i=0 ; i<m ; ++i) {
405 c[j*ldc+i] = *alpha * a[i*lda+j];
/* Both terms. */
411 for (int64_t j=0 ; j<n ; ++j) {
412 for (int64_t i=0 ; i<m ; ++i) {
413 c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[i*ldb+j];
/*
 * gpu_sgeam: element-wise combination of two float matrices, written with
 * plain nested loops (no external routine is called in the visible lines).
 *
 * Every visible assignment has the form
 *     c[j*ldc + i] = *alpha * A(i,j) + *beta * B(i,j)   for 0 <= i < m, 0 <= j < n
 * where A(i,j) is a[j*lda + i] when *transa is 'N' or 'n' and a[i*lda + j]
 * when it is 'T' or 't'; B(i,j) is taken from b, ldb and *transb the same
 * way. Within each transa/transb branch there are three loop nests: one
 * that keeps only the beta*b term, one (taken when *beta == 0.) that keeps
 * only the alpha*a term, and one that keeps both.
 *
 *   handle          only checked to be non-NULL (via assert) in the visible lines
 *   transa, transb  only the first character of each is examined
 *   m, n            loop extents: i runs over m, j runs over n
 *   alpha, beta     pointers to the two scalars
 *   a, lda          first input and the stride used to index it
 *   b, ldb          second input and the stride used to index it
 *   c, ldc          output and the stride used to index it
 *
 * The test *beta == 0. compares a float with a double literal, so *beta is
 * promoted to double for the comparison.
 * NOTE(review): the condition guarding the first (beta-only) loop nest of
 * each branch is not visible in this excerpt; presumably a test for
 * *alpha == 0. - confirm.
 * NOTE(review): no branch for any other transa/transb character is visible -
 * confirm how unrecognised values are handled.
 */
423void gpu_sgeam(
void* handle,
const char* transa,
const char* transb,
const int64_t m,
const int64_t n,
const float* alpha,
424 const float* a,
const int64_t lda,
const float* beta,
const float* b,
const int64_t ldb,
float* c,
const int64_t ldc) {
425 assert (handle != NULL);
427 if ( (*transa ==
'N' && *transb ==
'N') ||
428 (*transa ==
'n' && *transb ==
'N') ||
429 (*transa ==
'N' && *transb ==
'n') ||
430 (*transa ==
'n' && *transb ==
'n') ) {
/* Branch 1: a and b are both indexed as stored ([j*ld + i]). */
/* beta*b term only. */
434 for (int64_t j=0 ; j<n ; ++j) {
435 for (int64_t i=0 ; i<m ; ++i) {
436 c[j*ldc+i] = *beta * b[j*ldb+i];
440 }
else if (*beta == 0.) {
/* alpha*a term only. */
442 for (int64_t j=0 ; j<n ; ++j) {
443 for (int64_t i=0 ; i<m ; ++i) {
444 c[j*ldc+i] = *alpha * a[j*lda+i];
/* Both terms. */
450 for (int64_t j=0 ; j<n ; ++j) {
451 for (int64_t i=0 ; i<m ; ++i) {
452 c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[j*ldb+i];
458 }
else if ( (*transa ==
'N' && *transb ==
'T') ||
459 (*transa ==
'n' && *transb ==
'T') ||
460 (*transa ==
'N' && *transb ==
't') ||
461 (*transa ==
'n' && *transb ==
't') ) {
/* Branch 2: a indexed as stored, b read with swapped indices (b[i*ldb + j]). */
/* beta*b term only. */
465 for (int64_t j=0 ; j<n ; ++j) {
466 for (int64_t i=0 ; i<m ; ++i) {
467 c[j*ldc+i] = *beta * b[i*ldb+j];
471 }
else if (*beta == 0.) {
/* alpha*a term only. */
473 for (int64_t j=0 ; j<n ; ++j) {
474 for (int64_t i=0 ; i<m ; ++i) {
475 c[j*ldc+i] = *alpha * a[j*lda+i];
/* Both terms. */
481 for (int64_t j=0 ; j<n ; ++j) {
482 for (int64_t i=0 ; i<m ; ++i) {
483 c[j*ldc+i] = *alpha * a[j*lda+i] + *beta * b[i*ldb+j];
489 }
else if ( (*transa ==
'T' && *transb ==
'N') ||
490 (*transa ==
't' && *transb ==
'N') ||
491 (*transa ==
'T' && *transb ==
'n') ||
492 (*transa ==
't' && *transb ==
'n') ) {
/* Branch 3: a read with swapped indices (a[i*lda + j]), b indexed as stored. */
/* beta*b term only. */
496 for (int64_t j=0 ; j<n ; ++j) {
497 for (int64_t i=0 ; i<m ; ++i) {
498 c[j*ldc+i] = *beta * b[j*ldb+i];
502 }
else if (*beta == 0.) {
/* alpha*a term only. */
504 for (int64_t j=0 ; j<n ; ++j) {
505 for (int64_t i=0 ; i<m ; ++i) {
506 c[j*ldc+i] = *alpha * a[i*lda+j];
/* Both terms. */
512 for (int64_t j=0 ; j<n ; ++j) {
513 for (int64_t i=0 ; i<m ; ++i) {
514 c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[j*ldb+i];
520 }
else if ( (*transa ==
'T' && *transb ==
'T') ||
521 (*transa ==
't' && *transb ==
'T') ||
522 (*transa ==
'T' && *transb ==
't') ||
523 (*transa ==
't' && *transb ==
't') ) {
/* Branch 4: a and b are both read with swapped indices ([i*ld + j]). */
/* beta*b term only. */
527 for (int64_t j=0 ; j<n ; ++j) {
528 for (int64_t i=0 ; i<m ; ++i) {
529 c[j*ldc+i] = *beta * b[i*ldb+j];
533 }
else if (*beta == 0.) {
/* alpha*a term only. */
535 for (int64_t j=0 ; j<n ; ++j) {
536 for (int64_t i=0 ; i<m ; ++i) {
537 c[j*ldc+i] = *alpha * a[i*lda+j];
/* Both terms. */
543 for (int64_t j=0 ; j<n ; ++j) {
544 for (int64_t i=0 ; i<m ; ++i) {
545 c[j*ldc+i] = *alpha * a[i*lda+j] + *beta * b[i*ldb+j];
void gpu_set_stream(void *handle, void *stream)
void gpu_blas_destroy(void **handle)
void gpu_stream_create(void **ptr)
void gpu_allocate(void **ptr, const int64_t n)
void gpu_sgeam(void *handle, const char *transa, const char *transb, const int64_t m, const int64_t n, const float *alpha, const float *a, const int64_t lda, const float *beta, const float *b, const int64_t ldb, float *c, const int64_t ldc)
void dgemm_(const char *transa, const char *transb, const int32_t *m, const int32_t *n, const int32_t *k, const double *alpha, const double *a, const int32_t *lda, const double *b, const int32_t *ldb, const double *beta, double *c, const int32_t *ldc)
void gpu_free(void **ptr)
void gpu_copy(const void *gpu_ptr_src, void *gpu_ptr_dest, const int64_t n)
double ddot_(const int32_t *n, const double *x, const int32_t *incx, const double *y, const int32_t *incy)
void gpu_set_device(int32_t i)
void gpu_upload(const void *cpu_ptr, void *gpu_ptr, const int64_t n)
void gpu_dgemm(void *handle, const char *transa, const char *transb, const int64_t m, const int64_t n, const int64_t k, const double *alpha, const double *a, const int64_t lda, const double *b, const int64_t ldb, const double *beta, double *c, const int64_t ldc)
void gpu_get_memory(size_t *free, size_t *total)
void gpu_sgemv(void *handle, const char *transa, const int64_t m, const int64_t n, const float *alpha, const float *a, const int64_t lda, const float *x, const int64_t incx, const float *beta, float *y, const int64_t incy)
void gpu_sdot(void *handle, const int64_t n, const float *x, const int64_t incx, const float *y, const int64_t incy, float *result)
void gpu_deallocate(void **ptr)
void gpu_sgemm(void *handle, const char *transa, const char *transb, const int64_t m, const int64_t n, const int64_t k, const float *alpha, const float *a, const int64_t lda, const float *b, const int64_t ldb, const float *beta, float *c, const int64_t ldc)
void gpu_dgeam(void *handle, const char *transa, const char *transb, const int64_t m, const int64_t n, const double *alpha, const double *a, const int64_t lda, const double *beta, const double *b, const int64_t ldb, double *c, const int64_t ldc)
void gpu_stream_synchronize(void *stream)
void gpu_dgemv(void *handle, const char *transa, const int64_t m, const int64_t n, const double *alpha, const double *a, const int64_t lda, const double *x, const int64_t incx, const double *beta, double *y, const int64_t incy)
void sgemv_(const char *transa, const int32_t *m, const int32_t *n, const float *alpha, const float *a, const int32_t *lda, const float *x, const int32_t *incx, const float *beta, float *y, const int32_t *incy)
void gpu_download(const void *gpu_ptr, void *cpu_ptr, const int64_t n)
void gpu_ddot(void *handle, const int64_t n, const double *x, const int64_t incx, const double *y, const int64_t incy, double *result)
void gpu_blas_create(void **handle)
float sdot_(const int32_t *n, const float *x, const int32_t *incx, const float *y, const int32_t *incy)
static bool check_int32_overflow(int64_t value, const char *name)
Check if an int64_t value can be safely converted to int32_t.
void gpu_stream_destroy(void **ptr)
void sgemm_(const char *transa, const char *transb, const int32_t *m, const int32_t *n, const int32_t *k, const float *alpha, const float *a, const int32_t *lda, const float *b, const int32_t *ldb, const float *beta, float *c, const int32_t *ldc)
void dgemv_(const char *transa, const int32_t *m, const int32_t *n, const double *alpha, const double *a, const int32_t *lda, const double *x, const int32_t *incx, const double *beta, double *y, const int32_t *incy)