9#include <cuda_runtime.h>
16 cudaGetDeviceCount(&ngpus);
22 fprintf(stderr,
"gpu_set_device: invalid device index %d (must be >= 0)\n", igpu);
27 fprintf(stderr,
"gpu_set_device: invalid device index %d (only %d devices available)\n", igpu, ngpus);
30 cudaError_t rc = cudaSetDevice((
int) igpu);
31 if (rc != cudaSuccess) {
32 fprintf(stderr,
"cudaSetDevice(%d) failed: %s\n", igpu, cudaGetErrorString(rc));
33 assert (rc == cudaSuccess);
38 cudaError_t rc = cudaMemGetInfo( free, total );
39 if (rc != cudaSuccess) {
49 fprintf(stderr,
"gpu_allocate: ptr argument is NULL\n");
54 cudaError_t rc = cudaMemGetInfo( &free, &total );
55 if (rc != cudaSuccess) {
59 rc = cudaMalloc(ptr, size);
61 if (rc != cudaSuccess) {
62 fprintf(stderr,
"cudaMalloc failed: %s\n", cudaGetErrorString(rc));
63 assert (rc == cudaSuccess);
68 if (ptr == NULL || *ptr == NULL) {
/*
 * gpu_upload: hands cpu_ptr (source) and gpu_ptr (destination) to cudaMemcpy
 * with direction cudaMemcpyHostToDevice, using n as the byte count.
 *
 * NOTE(review): this listing skips some original lines, so the statement that
 * follows the NULL-pointer message and the closing braces are not visible here.
 */
82void gpu_upload(
const void* cpu_ptr,
void* gpu_ptr,
const int64_t n) {
/* Both pointers are tested before the copy; a NULL in either one is reported on stderr. */
83 if (cpu_ptr == NULL || gpu_ptr == NULL) {
84 fprintf(stderr,
"gpu_upload: NULL pointer argument\n");
/* n is a signed int64_t converted to cudaMemcpy's size_t count; no n >= 0 check is visible here. */
87 cudaError_t rc = cudaMemcpy (gpu_ptr, cpu_ptr, n, cudaMemcpyHostToDevice);
88 if (rc != cudaSuccess) {
89 fprintf(stderr,
"cudaMemcpy (upload) failed: %s\n", cudaGetErrorString(rc));
/* Inside this branch rc != cudaSuccess, so the assert always fires unless NDEBUG compiles it out. */
90 assert (rc == cudaSuccess);
/*
 * gpu_download: hands gpu_ptr (source) and cpu_ptr (destination) to cudaMemcpy
 * with direction cudaMemcpyDeviceToHost, using n as the byte count.
 *
 * NOTE(review): this listing skips some original lines, so the statement that
 * follows the NULL-pointer message and the closing braces are not visible here.
 */
94void gpu_download(
const void* gpu_ptr,
void* cpu_ptr,
const int64_t n) {
/* Both pointers are tested before the copy; a NULL in either one is reported on stderr. */
95 if (gpu_ptr == NULL || cpu_ptr == NULL) {
96 fprintf(stderr,
"gpu_download: NULL pointer argument\n");
/* n is a signed int64_t converted to cudaMemcpy's size_t count; no n >= 0 check is visible here. */
99 cudaError_t rc = cudaMemcpy (cpu_ptr, gpu_ptr, n, cudaMemcpyDeviceToHost);
100 if (rc != cudaSuccess) {
101 fprintf(stderr,
"cudaMemcpy (download) failed: %s\n", cudaGetErrorString(rc));
/* Inside this branch rc != cudaSuccess, so the assert always fires unless NDEBUG compiles it out. */
102 assert (rc == cudaSuccess);
/*
 * gpu_copy: hands gpu_ptr_src (source) and gpu_ptr_dest (destination) to
 * cudaMemcpy with direction cudaMemcpyDeviceToDevice, using n as the byte count.
 *
 * NOTE(review): this listing skips some original lines, so the statement that
 * follows the NULL-pointer message and the closing braces are not visible here.
 */
106void gpu_copy(
const void* gpu_ptr_src,
void* gpu_ptr_dest,
const int64_t n) {
/* Both pointers are tested before the copy; a NULL in either one is reported on stderr. */
107 if (gpu_ptr_src == NULL || gpu_ptr_dest == NULL) {
108 fprintf(stderr,
"gpu_copy: NULL pointer argument\n");
/* n is a signed int64_t converted to cudaMemcpy's size_t count; no n >= 0 check is visible here. */
111 cudaError_t rc = cudaMemcpy (gpu_ptr_dest, gpu_ptr_src, n, cudaMemcpyDeviceToDevice);
112 if (rc != cudaSuccess) {
113 fprintf(stderr,
"cudaMemcpy (copy) failed: %s\n", cudaGetErrorString(rc));
/* Inside this branch rc != cudaSuccess, so the assert always fires unless NDEBUG compiles it out. */
114 assert (rc == cudaSuccess);
122 cudaError_t rc = cudaStreamCreate(ptr);
123 if (rc != cudaSuccess) {
124 fprintf(stderr,
"cudaStreamCreate failed: %s\n", cudaGetErrorString(rc));
125 assert (rc == cudaSuccess);
130 assert (ptr != NULL);
131 cudaError_t rc = cudaStreamDestroy(*ptr);
132 if (rc != cudaSuccess) {
133 fprintf(stderr,
"cudaStreamDestroy failed: %s\n", cudaGetErrorString(rc));
134 assert (rc == cudaSuccess);
140 cublasStatus_t rc = cublasSetStream(handle, stream);
141 if (rc != CUBLAS_STATUS_SUCCESS) {
142 fprintf(stderr,
"cublasSetStream failed\n");
143 assert (rc == CUBLAS_STATUS_SUCCESS);
148 cudaError_t rc = cudaDeviceSynchronize();
149 if (rc != cudaSuccess) {
150 fprintf(stderr,
"cudaDeviceSynchronize failed: %s\n", cudaGetErrorString(rc));
151 assert (rc == cudaSuccess);
156 cudaError_t rc = cudaStreamSynchronize(stream);
157 if (rc != cudaSuccess) {
158 fprintf(stderr,
"cudaStreamSynchronize failed: %s\n", cudaGetErrorString(rc));
159 assert (rc == cudaSuccess);
167 cublasStatus_t rc = cublasCreate(ptr);
168 assert (rc == CUBLAS_STATUS_SUCCESS);
173 assert (ptr != NULL);
174 cublasStatus_t rc = cublasDestroy(*ptr);
175 assert (rc == CUBLAS_STATUS_SUCCESS);
/*
 * gpu_ddot: forwards x, y and result to cublasDdot on the given handle.
 * cublasDdot receives the int-typed n_, incx_ and incy_ instead of the int64_t
 * parameters; the asserts require each int value to equal its int64_t original.
 *
 * NOTE(review): the lines that assign n_, incx_ and incy_ are not part of this
 * listing - presumably casts from n/incx/incy; confirm against the full file.
 */
180void gpu_ddot(cublasHandle_t handle,
const int64_t n,
const double* x,
const int64_t incx,
const double* y,
const int64_t incy,
double* result) {
181 assert (handle != NULL);
183 int n_, incx_, incy_;
/* Round-trip checks: fail when an int64_t argument does not survive the conversion to int. */
/* NOTE(review): these are asserts, so they are compiled out when NDEBUG is defined. */
189 assert ( (int64_t) n_ == n );
190 assert ( (int64_t) incx_ == incx);
191 assert ( (int64_t) incy_ == incy);
/* The cuBLAS status is captured and checked only through assert. */
193 cublasStatus_t rc = cublasDdot(handle, n_, x, incx_, y, incy_, result);
194 assert (rc == CUBLAS_STATUS_SUCCESS);
/*
 * gpu_sdot: calls cublasSdot on the given handle with x, y and the int-typed
 * n_, incx_ and incy_; the asserts require each int value to equal its int64_t
 * original.
 *
 * NOTE(review): the lines that assign n_, incx_ and incy_, the declaration of
 * result_, and whatever transfers result_ into *result are not part of this
 * listing - confirm against the full file.
 */
199void gpu_sdot(cublasHandle_t handle,
const int64_t n,
const float* x,
const int64_t incx,
const float* y,
const int64_t incy,
float* result) {
200 assert (handle != NULL);
203 int n_, incx_, incy_;
/* Round-trip checks: fail when an int64_t argument does not survive the conversion to int. */
/* NOTE(review): these are asserts, so they are compiled out when NDEBUG is defined. */
210 assert ( (int64_t) n_ == n );
211 assert ( (int64_t) incx_ == incx);
212 assert ( (int64_t) incy_ == incy);
/* cublasSdot writes through &result_ here, not through the result parameter. */
215 cublasStatus_t rc = cublasSdot(handle, n_, x, incx_, y, incy_, &result_);
216 assert (rc == CUBLAS_STATUS_SUCCESS);
/*
 * gpu_dgemv: calls cublasDgemv on the given handle, passing alpha, a, x, beta
 * and y through unchanged and using int values (m_, n_, lda_, incx_, incy_)
 * in place of the int64_t size arguments.
 * The first character of transa selects the operation: 'T' or 't' gives
 * CUBLAS_OP_T; any other character leaves CUBLAS_OP_N.
 *
 * NOTE(review): the lines that assign m_, n_, lda_, incx_ and incy_ are not
 * part of this listing - presumably casts from the int64_t arguments; confirm.
 * NOTE(review): on the visible line the cublasDgemv status is not captured,
 * whereas gpu_ddot stores and asserts its status - check whether a failure
 * can go unnoticed here.
 */
222void gpu_dgemv(cublasHandle_t handle,
const char* transa,
const int64_t m,
const int64_t n,
const double* alpha,
223 const double* a,
const int64_t lda,
const double* x,
const int64_t incx,
const double* beta,
double* y,
const int64_t incy) {
225 assert (handle != NULL);
228 int m_, n_, lda_, incx_, incy_;
/* Round-trip checks: fail when an int64_t argument does not survive the conversion to int. */
237 assert ( (int64_t) m_ == m );
238 assert ( (int64_t) n_ == n );
239 assert ( (int64_t) lda_ == lda );
240 assert ( (int64_t) incx_ == incx);
241 assert ( (int64_t) incy_ == incy);
/* transa is dereferenced directly; no NULL check on it is visible in this listing. */
243 cublasOperation_t transa_ = CUBLAS_OP_N;
244 if (*transa ==
'T' || *transa ==
't') transa_ = CUBLAS_OP_T;
246 cublasDgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_);
/*
 * gpu_sgemv: calls cublasSgemv on the given handle, passing alpha, a, x, beta
 * and y through unchanged and using int values (m_, n_, lda_, incx_, incy_)
 * in place of the int64_t size arguments.
 * The first character of transa selects the operation: 'T' or 't' gives
 * CUBLAS_OP_T; any other character leaves CUBLAS_OP_N.
 *
 * NOTE(review): the lines that assign m_, n_, lda_, incx_ and incy_ are not
 * part of this listing - presumably casts from the int64_t arguments; confirm.
 * NOTE(review): on the visible line the cublasSgemv status is not captured,
 * whereas gpu_sdot stores and asserts its status - check whether a failure
 * can go unnoticed here.
 */
251void gpu_sgemv(cublasHandle_t handle,
const char* transa,
const int64_t m,
const int64_t n,
const float* alpha,
252 const float* a,
const int64_t lda,
const float* x,
const int64_t incx,
const float* beta,
float* y,
const int64_t incy) {
254 assert (handle != NULL);
257 int m_, n_, lda_, incx_, incy_;
/* Round-trip checks: fail when an int64_t argument does not survive the conversion to int. */
266 assert ( (int64_t) m_ == m );
267 assert ( (int64_t) n_ == n );
268 assert ( (int64_t) lda_ == lda );
269 assert ( (int64_t) incx_ == incx);
270 assert ( (int64_t) incy_ == incy);
/* transa is dereferenced directly; no NULL check on it is visible in this listing. */
272 cublasOperation_t transa_ = CUBLAS_OP_N;
273 if (*transa ==
'T' || *transa ==
't') transa_ = CUBLAS_OP_T;
275 cublasSgemv(handle, transa_, m_, n_, alpha, a, lda_, x, incx_, beta, y, incy_);
/*
 * gpu_dgemm: calls cublasDgemm on the given handle, passing alpha, a, b, beta
 * and c through unchanged and using int values (m_, n_, k_, lda_, ldb_, ldc_)
 * in place of the int64_t size arguments.
 * The first character of transa / transb selects each operation: 'T' or 't'
 * gives CUBLAS_OP_T; any other character leaves CUBLAS_OP_N.
 *
 * NOTE(review): the lines that assign m_, n_, k_, lda_, ldb_ and ldc_ are not
 * part of this listing - presumably casts from the int64_t arguments; confirm.
 * NOTE(review): on the visible line the cublasDgemm status is not captured,
 * whereas gpu_ddot stores and asserts its status - check whether a failure
 * can go unnoticed here.
 */
279void gpu_dgemm(cublasHandle_t handle,
const char* transa,
const char* transb,
const int64_t m,
const int64_t n,
const int64_t k,
const double* alpha,
280 const double* a,
const int64_t lda,
const double* b,
const int64_t ldb,
const double* beta,
double* c,
const int64_t ldc) {
282 assert (handle != NULL);
285 int m_, n_, k_, lda_, ldb_, ldc_;
/* Round-trip checks: fail when an int64_t argument does not survive the conversion to int. */
295 assert ( (int64_t) m_ == m );
296 assert ( (int64_t) n_ == n );
297 assert ( (int64_t) k_ == k );
298 assert ( (int64_t) lda_ == lda);
299 assert ( (int64_t) ldb_ == ldb);
300 assert ( (int64_t) ldc_ == ldc);
/* transa and transb are dereferenced directly; no NULL check on them is visible in this listing. */
302 cublasOperation_t transa_ = CUBLAS_OP_N;
303 cublasOperation_t transb_ = CUBLAS_OP_N;
304 if (*transa ==
'T' || *transa ==
't') transa_ = CUBLAS_OP_T;
305 if (*transb ==
'T' || *transb ==
't') transb_ = CUBLAS_OP_T;
307 cublasDgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_);
/*
 * gpu_sgemm: calls cublasSgemm on the given handle, passing alpha, a, b, beta
 * and c through unchanged and using int values (m_, n_, k_, lda_, ldb_, ldc_)
 * in place of the int64_t size arguments.
 * The first character of transa / transb selects each operation: 'T' or 't'
 * gives CUBLAS_OP_T; any other character leaves CUBLAS_OP_N.
 *
 * NOTE(review): the lines that assign m_, n_, k_, lda_, ldb_ and ldc_ are not
 * part of this listing - presumably casts from the int64_t arguments; confirm.
 * NOTE(review): on the visible line the cublasSgemm status is not captured,
 * whereas gpu_sdot stores and asserts its status - check whether a failure
 * can go unnoticed here.
 */
312void gpu_sgemm(cublasHandle_t handle,
const char* transa,
const char* transb,
const int64_t m,
const int64_t n,
const int64_t k,
const float* alpha,
313 const float* a,
const int64_t lda,
const float* b,
const int64_t ldb,
const float* beta,
float* c,
const int64_t ldc) {
315 assert (handle != NULL);
318 int m_, n_, k_, lda_, ldb_, ldc_;
/* Round-trip checks: fail when an int64_t argument does not survive the conversion to int. */
328 assert ( (int64_t) m_ == m );
329 assert ( (int64_t) n_ == n );
330 assert ( (int64_t) k_ == k );
331 assert ( (int64_t) lda_ == lda);
332 assert ( (int64_t) ldb_ == ldb);
333 assert ( (int64_t) ldc_ == ldc);
/* transa and transb are dereferenced directly; no NULL check on them is visible in this listing. */
335 cublasOperation_t transa_ = CUBLAS_OP_N;
336 cublasOperation_t transb_ = CUBLAS_OP_N;
337 if (*transa ==
'T' || *transa ==
't') transa_ = CUBLAS_OP_T;
338 if (*transb ==
'T' || *transb ==
't') transb_ = CUBLAS_OP_T;
340 cublasSgemm(handle, transa_, transb_, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_);
/*
 * gpu_dgeam: calls cublasDgeam on the given handle, passing alpha, a, beta, b
 * and c through unchanged and using int values (m_, n_, lda_, ldb_, ldc_)
 * in place of the int64_t size arguments.
 * The first character of transa / transb selects each operation: 'T' or 't'
 * gives CUBLAS_OP_T; any other character leaves CUBLAS_OP_N.
 *
 * NOTE(review): the lines that assign m_, n_, lda_, ldb_ and ldc_ are not
 * part of this listing - presumably casts from the int64_t arguments; confirm.
 * NOTE(review): on the visible line the cublasDgeam status is not captured,
 * whereas gpu_ddot stores and asserts its status - check whether a failure
 * can go unnoticed here.
 */
344void gpu_dgeam(cublasHandle_t handle,
const char* transa,
const char* transb,
const int64_t m,
const int64_t n,
const double* alpha,
345 const double* a,
const int64_t lda,
const double* beta,
const double* b,
const int64_t ldb,
double* c,
const int64_t ldc) {
346 assert (handle != NULL);
349 int m_, n_, lda_, ldb_, ldc_;
/* Round-trip checks: fail when an int64_t argument does not survive the conversion to int. */
358 assert ( (int64_t) m_ == m );
359 assert ( (int64_t) n_ == n );
360 assert ( (int64_t) lda_ == lda);
361 assert ( (int64_t) ldb_ == ldb);
362 assert ( (int64_t) ldc_ == ldc);
/* transa and transb are dereferenced directly; no NULL check on them is visible in this listing. */
364 cublasOperation_t transa_ = CUBLAS_OP_N;
365 cublasOperation_t transb_ = CUBLAS_OP_N;
366 if (*transa ==
'T' || *transa ==
't') transa_ = CUBLAS_OP_T;
367 if (*transb ==
'T' || *transb ==
't') transb_ = CUBLAS_OP_T;
369 cublasDgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_);
/*
 * gpu_sgeam: calls cublasSgeam on the given handle, passing alpha, a, beta, b
 * and c through unchanged and using int values (m_, n_, lda_, ldb_, ldc_)
 * in place of the int64_t size arguments.
 * The first character of transa / transb selects each operation: 'T' or 't'
 * gives CUBLAS_OP_T; any other character leaves CUBLAS_OP_N.
 *
 * NOTE(review): the lines that assign m_, n_, lda_, ldb_ and ldc_ are not
 * part of this listing - presumably casts from the int64_t arguments; confirm.
 * NOTE(review): on the visible line the cublasSgeam status is not captured,
 * whereas gpu_sdot stores and asserts its status - check whether a failure
 * can go unnoticed here.
 */
374void gpu_sgeam(cublasHandle_t handle,
const char* transa,
const char* transb,
const int64_t m,
const int64_t n,
const float* alpha,
375 const float* a,
const int64_t lda,
const float* beta,
const float* b,
const int64_t ldb,
float* c,
const int64_t ldc) {
376 assert (handle != NULL);
379 int m_, n_, lda_, ldb_, ldc_;
/* Round-trip checks: fail when an int64_t argument does not survive the conversion to int. */
388 assert ( (int64_t) m_ == m );
389 assert ( (int64_t) n_ == n );
390 assert ( (int64_t) lda_ == lda);
391 assert ( (int64_t) ldb_ == ldb);
392 assert ( (int64_t) ldc_ == ldc);
/* transa and transb are dereferenced directly; no NULL check on them is visible in this listing. */
394 cublasOperation_t transa_ = CUBLAS_OP_N;
395 cublasOperation_t transb_ = CUBLAS_OP_N;
396 if (*transa ==
'T' || *transa ==
't') transa_ = CUBLAS_OP_T;
397 if (*transb ==
'T' || *transb ==
't') transb_ = CUBLAS_OP_T;
399 cublasSgeam(handle, transa_, transb_, m_, n_, alpha, a, lda_, beta, b, ldb_, c, ldc_);
void gpu_ddot(cublasHandle_t handle, const int64_t n, const double *x, const int64_t incx, const double *y, const int64_t incy, double *result)
void gpu_dgemv(cublasHandle_t handle, const char *transa, const int64_t m, const int64_t n, const double *alpha, const double *a, const int64_t lda, const double *x, const int64_t incx, const double *beta, double *y, const int64_t incy)
void gpu_stream_create(cudaStream_t *ptr)
void gpu_free(void **ptr)
void gpu_copy(const void *gpu_ptr_src, void *gpu_ptr_dest, const int64_t n)
void gpu_upload(const void *cpu_ptr, void *gpu_ptr, const int64_t n)
void gpu_get_memory(size_t *free, size_t *total)
void gpu_dgeam(cublasHandle_t handle, const char *transa, const char *transb, const int64_t m, const int64_t n, const double *alpha, const double *a, const int64_t lda, const double *beta, const double *b, const int64_t ldb, double *c, const int64_t ldc)
void gpu_deallocate(void **ptr)
void gpu_set_stream(cublasHandle_t handle, cudaStream_t stream)
void gpu_blas_create(cublasHandle_t *ptr)
void gpu_sgeam(cublasHandle_t handle, const char *transa, const char *transb, const int64_t m, const int64_t n, const float *alpha, const float *a, const int64_t lda, const float *beta, const float *b, const int64_t ldb, float *c, const int64_t ldc)
void gpu_stream_synchronize(void *stream)
void gpu_allocate(void **ptr, const int64_t size)
void gpu_blas_destroy(cublasHandle_t *ptr)
void gpu_stream_destroy(cudaStream_t *ptr)
void gpu_download(const void *gpu_ptr, void *cpu_ptr, const int64_t n)
void gpu_sdot(cublasHandle_t handle, const int64_t n, const float *x, const int64_t incx, const float *y, const int64_t incy, float *result)
void gpu_sgemm(cublasHandle_t handle, const char *transa, const char *transb, const int64_t m, const int64_t n, const int64_t k, const float *alpha, const float *a, const int64_t lda, const float *b, const int64_t ldb, const float *beta, float *c, const int64_t ldc)
void gpu_sgemv(cublasHandle_t handle, const char *transa, const int64_t m, const int64_t n, const float *alpha, const float *a, const int64_t lda, const float *x, const int64_t incx, const float *beta, float *y, const int64_t incy)
void gpu_dgemm(cublasHandle_t handle, const char *transa, const char *transb, const int64_t m, const int64_t n, const int64_t k, const double *alpha, const double *a, const int64_t lda, const double *b, const int64_t ldb, const double *beta, double *c, const int64_t ldc)
void gpu_set_device(int32_t igpu)