Simple GPU 1.0
Fortran GPU Computing Library with transparent CPU/GPU support
Loading...
Searching...
No Matches
simple_gpu.F90
Go to the documentation of this file.
1
17module gpu
18 use, intrinsic :: iso_c_binding
19 implicit none
20
21!=============================================================================
22! Data Types
23!=============================================================================
24!
25! GPU array types for double precision (1D through 6D)
26! Each type contains:
27! - c: C pointer to GPU/CPU memory
28! - f: Fortran pointer for accessing data with Fortran array syntax
29!
30! Usage:
31! type(gpu_double1) :: x
32! call gpu_allocate(x, n)
33! x%f(i) = value ! Access using Fortran syntax
34! call gpu_deallocate(x)
35!
36
47 type(c_ptr) :: c
48 double precision, pointer :: f(:)
49 end type
50
61 type(c_ptr) :: c
62 double precision, pointer :: f(:,:)
63 end type
64
67 type(c_ptr) :: c
68 double precision, pointer :: f(:,:,:)
69 end type
70
73 type(c_ptr) :: c
74 double precision, pointer :: f(:,:,:,:)
75 end type
76
79 type(c_ptr) :: c
80 double precision, pointer :: f(:,:,:,:,:)
81 end type
82
85 type(c_ptr) :: c
86 double precision, pointer :: f(:,:,:,:,:,:)
87 end type
88
89!
90! GPU array types for single precision (1D through 6D)
91!
92
95 type(c_ptr) :: c
96 real, pointer :: f(:)
97 end type
98
101 type(c_ptr) :: c
102 real, pointer :: f(:,:)
103 end type
104
107 type(c_ptr) :: c
108 real, pointer :: f(:,:,:)
109 end type
110
113 type(c_ptr) :: c
114 real, pointer :: f(:,:,:,:)
115 end type
116
119 type(c_ptr) :: c
120 real, pointer :: f(:,:,:,:,:)
121 end type
122
125 type(c_ptr) :: c
126 real, pointer :: f(:,:,:,:,:,:)
127 end type
128
129!
130! Handle types
131!
132
144 type(c_ptr) :: c
145 end type
146
159 type(c_ptr) :: c
160 end type
161
162
163!=============================================================================
164! C interfaces
165!=============================================================================
166! Low-level C interface bindings - not intended for direct use
167! These are wrapped by higher-level Fortran subroutines in the contains section
168!
169
170 interface
171
172 integer function gpu_ndevices() bind(C)
173 import
174 end function
175
177 subroutine gpu_set_device(id) bind(C)
178 import
179 integer(c_int32_t), value :: id
180 end subroutine
181
183 subroutine gpu_get_memory(free, total) bind(C)
184 import
185 integer(c_size_t) :: free, total
186 end subroutine
187
189 subroutine gpu_allocate_c(ptr, n) bind(C, name='gpu_allocate')
190 import
191 type(c_ptr) :: ptr
192 integer(c_int64_t), value :: n
193 end subroutine
194
196 subroutine gpu_deallocate_c(ptr) bind(C, name='gpu_deallocate')
197 import
198 type(c_ptr) :: ptr
199 end subroutine
200
202 subroutine gpu_upload_c(cpu_ptr, gpu_ptr, n) bind(C, name='gpu_upload')
203 import
204 type(c_ptr), value :: cpu_ptr
205 type(c_ptr), value :: gpu_ptr
206 integer(c_int64_t), value :: n
207 end subroutine
208
209 subroutine gpu_download_c(gpu_ptr, cpu_ptr, n) bind(C, name='gpu_download')
210 import
211 type(c_ptr), value :: gpu_ptr
212 type(c_ptr), value :: cpu_ptr
213 integer(c_int64_t), value :: n
214 end subroutine
215
216 subroutine gpu_copy_c(gpu_ptr_src, gpu_ptr_dest, n) bind(C, name='gpu_copy')
217 import
218 type(c_ptr), value :: gpu_ptr_src
219 type(c_ptr), value :: gpu_ptr_dest
220 integer(c_int64_t), value :: n
221 end subroutine
222
223 subroutine gpu_stream_create_c(stream) bind(C, name='gpu_stream_create')
224 import
225 type(c_ptr) :: stream
226 end subroutine
227
228 subroutine gpu_stream_destroy_c(stream) bind(C, name='gpu_stream_destroy')
229 import
230 type(c_ptr) :: stream
231 end subroutine
232
233 subroutine gpu_set_stream_c(handle, stream) bind(C, name='gpu_set_stream')
234 import
235 type(c_ptr), value :: handle, stream
236 end subroutine
237
238 subroutine gpu_stream_synchronize(stream) bind(C)
239 import
240 type(c_ptr), value :: stream
241 end subroutine
242
243 subroutine gpu_synchronize() bind(C)
244 import
245 end subroutine
246
247 subroutine gpu_blas_create_c(handle) bind(C, name='gpu_blas_create')
248 import
249 type(c_ptr) :: handle
250 end subroutine
251
252 subroutine gpu_blas_destroy_c(handle) bind(C, name='gpu_blas_destroy')
253 import
254 type(c_ptr) :: handle
255 end subroutine
256
257 subroutine gpu_ddot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_ddot')
258 import
259 type(c_ptr), value, intent(in) :: handle
260 integer(c_int64_t), value :: n, incx, incy
261 type(c_ptr), value :: dx, dy
262 real(c_double), intent(out) :: res
263 end subroutine
264
265 subroutine gpu_sdot_c(handle, n, dx, incx, dy, incy, res) bind(C, name='gpu_sdot')
266 import
267 type(c_ptr), value, intent(in) :: handle
268 integer(c_int64_t), value :: n, incx, incy
269 type(c_ptr), intent(in), value :: dx, dy
270 real(c_float), intent(out) :: res
271 end subroutine
272
273 subroutine gpu_dgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
274 b, ldb, c, ldc) bind(C, name='gpu_dgeam')
275 import
276 type(c_ptr), value, intent(in) :: handle
277 character(c_char), intent(in) :: transa, transb
278 integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
279 real(c_double), intent(in) :: alpha, beta
280 real(c_double) :: a, b, c
281 end subroutine
282
283 subroutine gpu_sgeam_c(handle, transa, transb, m, n, alpha, a, lda, beta, &
284 b, ldb, c, ldc) bind(C, name='gpu_sgeam')
285 import
286 type(c_ptr), value, intent(in) :: handle
287 character(c_char), intent(in) :: transa, transb
288 integer(c_int64_t), intent(in), value :: m, n, lda, ldb, ldc
289 real(c_float), intent(in) :: alpha, beta
290 real(c_float) :: a, b, c
291 end subroutine
292
293 subroutine gpu_dgemv_c(handle, transa, m, n, alpha, a, lda, &
294 x, incx, beta, y, incy) bind(C, name='gpu_dgemv')
295 import
296 type(c_ptr), value, intent(in) :: handle
297 character(c_char), intent(in) :: transa
298 integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy
299 real(c_double), intent(in) :: alpha, beta
300 real(c_double) :: a, x, y
301 end subroutine
302
303 subroutine gpu_sgemv_c(handle, transa, m, n, alpha, a, lda, &
304 x, incx, beta, y, incy) bind(C, name='gpu_sgemv')
305 import
306 type(c_ptr), value, intent(in) :: handle
307 character(c_char), intent(in) :: transa
308 integer(c_int64_t), intent(in), value :: m, n, lda, incx, incy
309 real(c_float), intent(in) :: alpha, beta
310 real(c_float) :: a, x, y
311 end subroutine
312
313
314 subroutine gpu_dgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
315 b, ldb, beta, c, ldc) bind(C, name='gpu_dgemm')
316 import
317 type(c_ptr), value, intent(in) :: handle
318 character(c_char), intent(in) :: transa, transb
319 integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
320 real(c_double), intent(in) :: alpha, beta
321 real(c_double) :: a, b, c
322 end subroutine
323
324 subroutine gpu_sgemm_c(handle, transa, transb, m, n, k, alpha, a, lda, &
325 b, ldb, beta, c, ldc) bind(C, name='gpu_sgemm')
326 import
327 type(c_ptr), value, intent(in) :: handle
328 character(c_char), intent(in) :: transa, transb
329 integer(c_int64_t), intent(in), value :: m, n, k, lda, ldb, ldc
330 real(c_float), intent(in) :: alpha, beta
331 real(c_float) :: a, b, c
332 end subroutine
333
334 end interface
335
336
337
338!=============================================================================
339! Polymorphic interfaces
340!=============================================================================
341!
342! These interfaces are overloaded to work with different array types and
343! dimensions. The appropriate implementation is selected automatically based
344! on the argument types.
345!
346
371 interface gpu_allocate
372 procedure gpu_allocate_double1 &
373 ,gpu_allocate_double2 &
374 ,gpu_allocate_double3 &
375 ,gpu_allocate_double4 &
376 ,gpu_allocate_double5 &
377 ,gpu_allocate_double6 &
378 ,gpu_allocate_double1_64 &
379 ,gpu_allocate_double2_64 &
380 ,gpu_allocate_double3_64 &
381 ,gpu_allocate_double4_64 &
382 ,gpu_allocate_double5_64 &
383 ,gpu_allocate_double6_64 &
384 ,gpu_allocate_real1 &
385 ,gpu_allocate_real2 &
386 ,gpu_allocate_real3 &
387 ,gpu_allocate_real4 &
388 ,gpu_allocate_real5 &
389 ,gpu_allocate_real6 &
390 ,gpu_allocate_real1_64 &
391 ,gpu_allocate_real2_64 &
392 ,gpu_allocate_real3_64 &
393 ,gpu_allocate_real4_64 &
394 ,gpu_allocate_real5_64 &
395 ,gpu_allocate_real6_64
396 end interface gpu_allocate
397
409 procedure gpu_deallocate_double1 &
410 ,gpu_deallocate_double2 &
411 ,gpu_deallocate_double3 &
412 ,gpu_deallocate_double4 &
413 ,gpu_deallocate_double5 &
414 ,gpu_deallocate_double6 &
415 ,gpu_deallocate_real1 &
416 ,gpu_deallocate_real2 &
417 ,gpu_deallocate_real3 &
418 ,gpu_deallocate_real4 &
419 ,gpu_deallocate_real5 &
420 ,gpu_deallocate_real6
421 end interface gpu_deallocate
422
438 interface gpu_upload
439 procedure gpu_upload_double0 &
440 ,gpu_upload_double1 &
441 ,gpu_upload_double2 &
442 ,gpu_upload_double3 &
443 ,gpu_upload_double4 &
444 ,gpu_upload_double5 &
445 ,gpu_upload_double6 &
446 ,gpu_upload_real0 &
447 ,gpu_upload_real1 &
448 ,gpu_upload_real2 &
449 ,gpu_upload_real3 &
450 ,gpu_upload_real4 &
451 ,gpu_upload_real5 &
452 ,gpu_upload_real6
453 end interface gpu_upload
454
468 interface gpu_download
469 procedure gpu_download_double0 &
470 ,gpu_download_double1 &
471 ,gpu_download_double2 &
472 ,gpu_download_double3 &
473 ,gpu_download_double4 &
474 ,gpu_download_double5 &
475 ,gpu_download_double6 &
476 ,gpu_download_real0 &
477 ,gpu_download_real1 &
478 ,gpu_download_real2 &
479 ,gpu_download_real3 &
480 ,gpu_download_real4 &
481 ,gpu_download_real5 &
482 ,gpu_download_real6
483 end interface gpu_download
484
500 interface gpu_copy
501 procedure gpu_copy_double0 &
502 ,gpu_copy_double1 &
503 ,gpu_copy_double2 &
504 ,gpu_copy_double3 &
505 ,gpu_copy_double4 &
506 ,gpu_copy_double5 &
507 ,gpu_copy_double6 &
508 ,gpu_copy_real0 &
509 ,gpu_copy_real1 &
510 ,gpu_copy_real2 &
511 ,gpu_copy_real3 &
512 ,gpu_copy_real4 &
513 ,gpu_copy_real5 &
514 ,gpu_copy_real6
515 end interface gpu_copy
516
517
518 contains
519
520!=============================================================================
521! Implementation of polymorphic interfaces
522!=============================================================================
523
524! gpu_allocate implementations
525! ----------------------------
526! These subroutines allocate GPU/CPU memory and set up Fortran pointers
527! for convenient array access.
528
530 subroutine gpu_allocate_double1(ptr, s)
531 implicit none
532 type(gpu_double1), intent(inout) :: ptr
533 integer, intent(in) :: s
534 integer*8 :: s_8, n
535
536 s_8 = s
537 n = s_8 * 8_8
538
539 call gpu_allocate_c(ptr%c, n)
540 call c_f_pointer(ptr%c, ptr%f, (/ s /))
541 end subroutine
542
543 subroutine gpu_allocate_double2(ptr, s1, s2)
544 implicit none
545 type(gpu_double2), intent(inout) :: ptr
546 integer, intent(in) :: s1, s2
547 integer*8 :: s1_8, s2_8, n
548
549 s1_8 = s1
550 s2_8 = s2
551 n = s1_8 * s2_8 * 8_8
552
553 call gpu_allocate_c(ptr%c, n)
554 call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
555 end subroutine
556
557 subroutine gpu_allocate_double3(ptr, s1, s2, s3)
558 implicit none
559 type(gpu_double3), intent(inout) :: ptr
560 integer, intent(in) :: s1, s2, s3
561 integer*8 :: s1_8, s2_8, s3_8, n
562
563 s1_8 = s1
564 s2_8 = s2
565 s3_8 = s3
566 n = s1_8 * s2_8 * s3_8 * 8_8
567
568 call gpu_allocate_c(ptr%c, n)
569 call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
570 end subroutine
571
572 subroutine gpu_allocate_double4(ptr, s1, s2, s3, s4)
573 implicit none
574 type(gpu_double4), intent(inout) :: ptr
575 integer, intent(in) :: s1, s2, s3, s4
576 integer*8 :: s1_8, s2_8, s3_8, s4_8, n
577
578 s1_8 = s1
579 s2_8 = s2
580 s3_8 = s3
581 s4_8 = s4
582 n = s1_8 * s2_8 * s3_8 * s4_8 * 8_8
583
584 call gpu_allocate_c(ptr%c, n)
585 call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
586 end subroutine
587
588 subroutine gpu_allocate_double5(ptr, s1, s2, s3, s4, s5)
589 implicit none
590 type(gpu_double5), intent(inout) :: ptr
591 integer, intent(in) :: s1, s2, s3, s4, s5
592 integer*8 :: s1_8, s2_8, s3_8, s4_8, s5_8, n
593
594 s1_8 = s1
595 s2_8 = s2
596 s3_8 = s3
597 s4_8 = s4
598 s5_8 = s5
599 n = s1_8 * s2_8 * s3_8 * s4_8 * s5_8 * 8_8
600
601 call gpu_allocate_c(ptr%c, n)
602 call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
603 end subroutine
604
605 subroutine gpu_allocate_double6(ptr, s1, s2, s3, s4, s5, s6)
606 implicit none
607 type(gpu_double6), intent(inout) :: ptr
608 integer, intent(in) :: s1, s2, s3, s4, s5, s6
609 integer*8 :: s1_8, s2_8, s3_8, s4_8, s5_8, s6_8, n
610
611 s1_8 = s1
612 s2_8 = s2
613 s3_8 = s3
614 s4_8 = s4
615 s5_8 = s5
616 s6_8 = s6
617 n = s1_8 * s2_8 * s3_8 * s4_8 * s5_8 * s6_8 * 8_8
618
619 call gpu_allocate_c(ptr%c, n)
620 call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
621 end subroutine
622
623
624 subroutine gpu_allocate_double1_64(ptr, s)
625 implicit none
626 type(gpu_double1), intent(inout) :: ptr
627 integer*8, intent(in) :: s
628
629 call gpu_allocate_c(ptr%c, s)
630 call c_f_pointer(ptr%c, ptr%f, (/ s /))
631 end subroutine
632
633 subroutine gpu_allocate_double2_64(ptr, s1, s2)
634 implicit none
635 type(gpu_double2), intent(inout) :: ptr
636 integer*8, intent(in) :: s1, s2
637
638 call gpu_allocate_c(ptr%c, s1*s2*8_8)
639 call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
640 end subroutine
641
642 subroutine gpu_allocate_double3_64(ptr, s1, s2, s3)
643 implicit none
644 type(gpu_double3), intent(inout) :: ptr
645 integer*8, intent(in) :: s1, s2, s3
646
647 call gpu_allocate_c(ptr%c, s1*s2*s3*8_8)
648 call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
649 end subroutine
650
651 subroutine gpu_allocate_double4_64(ptr, s1, s2, s3, s4)
652 implicit none
653 type(gpu_double4), intent(inout) :: ptr
654 integer*8, intent(in) :: s1, s2, s3, s4
655
656 call gpu_allocate_c(ptr%c, s1*s2*s3*s4*8_8)
657 call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
658 end subroutine
659
660 subroutine gpu_allocate_double5_64(ptr, s1, s2, s3, s4, s5)
661 implicit none
662 type(gpu_double5), intent(inout) :: ptr
663 integer*8, intent(in) :: s1, s2, s3, s4, s5
664
665 call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*8_8)
666 call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
667 end subroutine
668
669 subroutine gpu_allocate_double6_64(ptr, s1, s2, s3, s4, s5, s6)
670 implicit none
671 type(gpu_double6), intent(inout) :: ptr
672 integer*8, intent(in) :: s1, s2, s3, s4, s5, s6
673
674 call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*8_8)
675 call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
676 end subroutine
677
678 subroutine gpu_allocate_real1(ptr, s)
679 implicit none
680 type(gpu_real1), intent(inout) :: ptr
681 integer, intent(in) :: s
682 integer*8 :: s_8, n
683
684 s_8 = s
685 n = s_8 * 4_8
686
687 call gpu_allocate_c(ptr%c, n)
688 call c_f_pointer(ptr%c, ptr%f, (/ s /))
689 end subroutine
690
691 subroutine gpu_allocate_real2(ptr, s1, s2)
692 implicit none
693 type(gpu_real2), intent(inout) :: ptr
694 integer, intent(in) :: s1, s2
695 integer*8 :: s1_8, s2_8, n
696
697 s1_8 = s1
698 s2_8 = s2
699 n = s1_8 * s2_8 * 4_8
700
701 call gpu_allocate_c(ptr%c, n)
702 call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
703 end subroutine
704
705 subroutine gpu_allocate_real3(ptr, s1, s2, s3)
706 implicit none
707 type(gpu_real3), intent(inout) :: ptr
708 integer, intent(in) :: s1, s2, s3
709 integer*8 :: s1_8, s2_8, s3_8, n
710
711 s1_8 = s1
712 s2_8 = s2
713 s3_8 = s3
714 n = s1_8 * s2_8 * s3_8 * 4_8
715
716 call gpu_allocate_c(ptr%c, n)
717 call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
718 end subroutine
719
720 subroutine gpu_allocate_real4(ptr, s1, s2, s3, s4)
721 implicit none
722 type(gpu_real4), intent(inout) :: ptr
723 integer, intent(in) :: s1, s2, s3, s4
724 integer*8 :: s1_8, s2_8, s3_8, s4_8, n
725
726 s1_8 = s1
727 s2_8 = s2
728 s3_8 = s3
729 s4_8 = s4
730 n = s1_8 * s2_8 * s3_8 * s4_8 * 4_8
731
732 call gpu_allocate_c(ptr%c, n)
733 call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
734 end subroutine
735
736 subroutine gpu_allocate_real5(ptr, s1, s2, s3, s4, s5)
737 implicit none
738 type(gpu_real5), intent(inout) :: ptr
739 integer, intent(in) :: s1, s2, s3, s4, s5
740 integer*8 :: s1_8, s2_8, s3_8, s4_8, s5_8, n
741
742 s1_8 = s1
743 s2_8 = s2
744 s3_8 = s3
745 s4_8 = s4
746 s5_8 = s5
747 n = s1_8 * s2_8 * s3_8 * s4_8 * s5_8 * 4_8
748
749 call gpu_allocate_c(ptr%c, n)
750 call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
751 end subroutine
752
753 subroutine gpu_allocate_real6(ptr, s1, s2, s3, s4, s5, s6)
754 implicit none
755 type(gpu_real6), intent(inout) :: ptr
756 integer, intent(in) :: s1, s2, s3, s4, s5, s6
757 integer*8 :: s1_8, s2_8, s3_8, s4_8, s5_8, s6_8, n
758
759 s1_8 = s1
760 s2_8 = s2
761 s3_8 = s3
762 s4_8 = s4
763 s5_8 = s5
764 s6_8 = s6
765 n = s1_8 * s2_8 * s3_8 * s4_8 * s5_8 * s6_8 * 4_8
766
767 call gpu_allocate_c(ptr%c, n)
768 call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
769 end subroutine
770
771
772 subroutine gpu_allocate_real1_64(ptr, s)
773 implicit none
774 type(gpu_real1), intent(inout) :: ptr
775 integer*8, intent(in) :: s
776
777 call gpu_allocate_c(ptr%c, s)
778 call c_f_pointer(ptr%c, ptr%f, (/ s /))
779 end subroutine
780
781 subroutine gpu_allocate_real2_64(ptr, s1, s2)
782 implicit none
783 type(gpu_real2), intent(inout) :: ptr
784 integer*8, intent(in) :: s1, s2
785
786 call gpu_allocate_c(ptr%c, s1*s2*4_8)
787 call c_f_pointer(ptr%c, ptr%f, (/ s1, s2 /))
788 end subroutine
789
790 subroutine gpu_allocate_real3_64(ptr, s1, s2, s3)
791 implicit none
792 type(gpu_real3), intent(inout) :: ptr
793 integer*8, intent(in) :: s1, s2, s3
794
795 call gpu_allocate_c(ptr%c, s1*s2*s3*4_8)
796 call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3 /))
797 end subroutine
798
799 subroutine gpu_allocate_real4_64(ptr, s1, s2, s3, s4)
800 implicit none
801 type(gpu_real4), intent(inout) :: ptr
802 integer*8, intent(in) :: s1, s2, s3, s4
803
804 call gpu_allocate_c(ptr%c, s1*s2*s3*s4*4_8)
805 call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4 /))
806 end subroutine
807
808 subroutine gpu_allocate_real5_64(ptr, s1, s2, s3, s4, s5)
809 implicit none
810 type(gpu_real5), intent(inout) :: ptr
811 integer*8, intent(in) :: s1, s2, s3, s4, s5
812
813 call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*4_8)
814 call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5 /))
815 end subroutine
816
817 subroutine gpu_allocate_real6_64(ptr, s1, s2, s3, s4, s5, s6)
818 implicit none
819 type(gpu_real6), intent(inout) :: ptr
820 integer*8, intent(in) :: s1, s2, s3, s4, s5, s6
821
822 call gpu_allocate_c(ptr%c, s1*s2*s3*s4*s5*s6*4_8)
823 call c_f_pointer(ptr%c, ptr%f, (/ s1, s2, s3, s4, s5, s6 /))
824 end subroutine
825
826! gpu_deallocate
827! --------------
828
830 implicit none
831 type(gpu_double1), intent(inout) :: ptr
832 call gpu_deallocate_c(ptr%c)
833 NULLIFY(ptr%f)
834 end subroutine
835
837 implicit none
838 type(gpu_double2), intent(inout) :: ptr
839 call gpu_deallocate_c(ptr%c)
840 NULLIFY(ptr%f)
841 end subroutine
842
844 implicit none
845 type(gpu_double3), intent(inout) :: ptr
846 call gpu_deallocate_c(ptr%c)
847 NULLIFY(ptr%f)
848 end subroutine
849
851 implicit none
852 type(gpu_double4), intent(inout) :: ptr
853 call gpu_deallocate_c(ptr%c)
854 NULLIFY(ptr%f)
855 end subroutine
856
858 implicit none
859 type(gpu_double5), intent(inout) :: ptr
860 call gpu_deallocate_c(ptr%c)
861 NULLIFY(ptr%f)
862 end subroutine
863
865 implicit none
866 type(gpu_double6), intent(inout) :: ptr
867 call gpu_deallocate_c(ptr%c)
868 NULLIFY(ptr%f)
869 end subroutine
870
871
872 subroutine gpu_deallocate_real1(ptr)
873 implicit none
874 type(gpu_real1), intent(inout) :: ptr
875 call gpu_deallocate_c(ptr%c)
876 NULLIFY(ptr%f)
877 end subroutine
878
879 subroutine gpu_deallocate_real2(ptr)
880 implicit none
881 type(gpu_real2), intent(inout) :: ptr
882 call gpu_deallocate_c(ptr%c)
883 NULLIFY(ptr%f)
884 end subroutine
885
886 subroutine gpu_deallocate_real3(ptr)
887 implicit none
888 type(gpu_real3), intent(inout) :: ptr
889 call gpu_deallocate_c(ptr%c)
890 NULLIFY(ptr%f)
891 end subroutine
892
893 subroutine gpu_deallocate_real4(ptr)
894 implicit none
895 type(gpu_real4), intent(inout) :: ptr
896 call gpu_deallocate_c(ptr%c)
897 NULLIFY(ptr%f)
898 end subroutine
899
900 subroutine gpu_deallocate_real5(ptr)
901 implicit none
902 type(gpu_real5), intent(inout) :: ptr
903 call gpu_deallocate_c(ptr%c)
904 NULLIFY(ptr%f)
905 end subroutine
906
907 subroutine gpu_deallocate_real6(ptr)
908 implicit none
909 type(gpu_real6), intent(inout) :: ptr
910 call gpu_deallocate_c(ptr%c)
911 NULLIFY(ptr%f)
912 end subroutine
913
914
915! gpu_upload
916! ----------
917
918 subroutine gpu_upload_double0(cpu_ptr, gpu_ptr, n)
919 implicit none
920 double precision, target, intent(in) :: cpu_ptr
921 double precision, target, intent(in) :: gpu_ptr
922 integer, intent(in) :: n
923 call gpu_upload_c(c_loc(cpu_ptr), c_loc(gpu_ptr), 8_8*n)
924 end subroutine
925
926 subroutine gpu_upload_double1(cpu_ptr, gpu_ptr)
927 implicit none
928 double precision, target, intent(in) :: cpu_ptr(*)
929 type(gpu_double1), intent(in) :: gpu_ptr
930 call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 8_8*size(gpu_ptr%f))
931 end subroutine
932
933 subroutine gpu_upload_double2(cpu_ptr, gpu_ptr)
934 implicit none
935 double precision, target, intent(in) :: cpu_ptr(:,:)
936 type(gpu_double2), intent(in) :: gpu_ptr
937 call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
938 end subroutine
939
940 subroutine gpu_upload_double3(cpu_ptr, gpu_ptr)
941 implicit none
942 double precision, target, intent(in) :: cpu_ptr(:,:,:)
943 type(gpu_double3), intent(in) :: gpu_ptr
944 call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
945 end subroutine
946
947 subroutine gpu_upload_double4(cpu_ptr, gpu_ptr)
948 implicit none
949 double precision, target, intent(in) :: cpu_ptr(:,:,:,:)
950 type(gpu_double4), intent(in) :: gpu_ptr
951 call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
952 end subroutine
953
954 subroutine gpu_upload_double5(cpu_ptr, gpu_ptr)
955 implicit none
956 double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:)
957 type(gpu_double5), intent(in) :: gpu_ptr
958 call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
959 end subroutine
960
961 subroutine gpu_upload_double6(cpu_ptr, gpu_ptr)
962 implicit none
963 double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:,:)
964 type(gpu_double6), intent(in) :: gpu_ptr
965 call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*8_8)
966 end subroutine
967
968
969 subroutine gpu_upload_real0(cpu_ptr, gpu_ptr, n)
970 implicit none
971 real, target, intent(in) :: cpu_ptr
972 real, target, intent(in) :: gpu_ptr
973 integer, intent(in) :: n
974 call gpu_upload_c(c_loc(cpu_ptr), c_loc(gpu_ptr), 4_8*n)
975 end subroutine
976
977 subroutine gpu_upload_real1(cpu_ptr, gpu_ptr)
978 implicit none
979 real, target, intent(in) :: cpu_ptr(*)
980 type(gpu_real1), intent(in) :: gpu_ptr
981 call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, 4_8*size(gpu_ptr%f))
982 end subroutine
983
984 subroutine gpu_upload_real2(cpu_ptr, gpu_ptr)
985 implicit none
986 real, target, intent(in) :: cpu_ptr(:,:)
987 type(gpu_real2), intent(in) :: gpu_ptr
988 call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*4_8)
989 end subroutine
990
991 subroutine gpu_upload_real3(cpu_ptr, gpu_ptr)
992 implicit none
993 real, target, intent(in) :: cpu_ptr(:,:,:)
994 type(gpu_real3), intent(in) :: gpu_ptr
995 call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*4_8)
996 end subroutine
997
998 subroutine gpu_upload_real4(cpu_ptr, gpu_ptr)
999 implicit none
1000 real, target, intent(in) :: cpu_ptr(:,:,:,:)
1001 type(gpu_real4), intent(in) :: gpu_ptr
1002 call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*4_8)
1003 end subroutine
1004
1005 subroutine gpu_upload_real5(cpu_ptr, gpu_ptr)
1006 implicit none
1007 real, target, intent(in) :: cpu_ptr(:,:,:,:,:)
1008 type(gpu_real5), intent(in) :: gpu_ptr
1009 call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*4_8)
1010 end subroutine
1011
1012 subroutine gpu_upload_real6(cpu_ptr, gpu_ptr)
1013 implicit none
1014 real, target, intent(in) :: cpu_ptr(:,:,:,:,:,:)
1015 type(gpu_real6), intent(in) :: gpu_ptr
1016 call gpu_upload_c(c_loc(cpu_ptr), gpu_ptr%c, product(shape(gpu_ptr%f)*1_8)*4_8)
1017 end subroutine
1018
1019
1020! gpu_download
1021! ------------
1022
1023 subroutine gpu_download_double0(gpu_ptr, cpu_ptr, n)
1024 implicit none
1025 double precision, target, intent(in) :: gpu_ptr
1026 double precision, target, intent(in) :: cpu_ptr
1027 integer, intent(in) :: n
1028 call gpu_download_c(c_loc(gpu_ptr), c_loc(cpu_ptr), 8_8*n)
1029 end subroutine
1030
1031 subroutine gpu_download_double1(gpu_ptr, cpu_ptr)
1032 implicit none
1033 type(gpu_double1), intent(in) :: gpu_ptr
1034 double precision, target, intent(in) :: cpu_ptr(:)
1035 call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*size(gpu_ptr%f))
1036 end subroutine
1037
1038 subroutine gpu_download_double2(gpu_ptr, cpu_ptr)
1039 implicit none
1040 type(gpu_double2), intent(in) :: gpu_ptr
1041 double precision, target, intent(in) :: cpu_ptr(:,:)
1042 call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
1043 end subroutine
1044
1045 subroutine gpu_download_double3(gpu_ptr, cpu_ptr)
1046 implicit none
1047 type(gpu_double3), intent(in) :: gpu_ptr
1048 double precision, target, intent(in) :: cpu_ptr(:,:,:)
1049 call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
1050 end subroutine
1051
1052 subroutine gpu_download_double4(gpu_ptr, cpu_ptr)
1053 implicit none
1054 type(gpu_double4), intent(in) :: gpu_ptr
1055 double precision, target, intent(in) :: cpu_ptr(:,:,:,:)
1056 call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
1057 end subroutine
1058
1059 subroutine gpu_download_double5(gpu_ptr, cpu_ptr)
1060 implicit none
1061 type(gpu_double5), intent(in) :: gpu_ptr
1062 double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:)
1063 call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
1064 end subroutine
1065
1066 subroutine gpu_download_double6(gpu_ptr, cpu_ptr)
1067 implicit none
1068 type(gpu_double6), intent(in) :: gpu_ptr
1069 double precision, target, intent(in) :: cpu_ptr(:,:,:,:,:,:)
1070 call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 8_8*product(shape(gpu_ptr%f)*1_8))
1071 end subroutine
1072
1073 subroutine gpu_download_real0(gpu_ptr, cpu_ptr, n)
1074 implicit none
1075 real, target, intent(in) :: gpu_ptr
1076 real, target, intent(in) :: cpu_ptr
1077 integer, intent(in) :: n
1078 call gpu_download_c(c_loc(gpu_ptr), c_loc(cpu_ptr), 4_8*n)
1079 end subroutine
1080
1081 subroutine gpu_download_real1(gpu_ptr, cpu_ptr)
1082 implicit none
1083 type(gpu_real1), intent(in) :: gpu_ptr
1084 real, target, intent(in) :: cpu_ptr(:)
1085 call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 4_8*size(gpu_ptr%f))
1086 end subroutine
1087
1088 subroutine gpu_download_real2(gpu_ptr, cpu_ptr)
1089 implicit none
1090 type(gpu_real2), intent(in) :: gpu_ptr
1091 real, target, intent(in) :: cpu_ptr(:,:)
1092 call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 4_8*product(shape(gpu_ptr%f)*1_8))
1093 end subroutine
1094
1095 subroutine gpu_download_real3(gpu_ptr, cpu_ptr)
1096 implicit none
1097 type(gpu_real3), intent(in) :: gpu_ptr
1098 real, target, intent(in) :: cpu_ptr(:,:,:)
1099 call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 4_8*product(shape(gpu_ptr%f)*1_8))
1100 end subroutine
1101
1102 subroutine gpu_download_real4(gpu_ptr, cpu_ptr)
1103 implicit none
1104 type(gpu_real4), intent(in) :: gpu_ptr
1105 real, target, intent(in) :: cpu_ptr(:,:,:,:)
1106 call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 4_8*product(shape(gpu_ptr%f)*1_8))
1107 end subroutine
1108
1109 subroutine gpu_download_real5(gpu_ptr, cpu_ptr)
1110 implicit none
1111 type(gpu_real5), intent(in) :: gpu_ptr
1112 real, target, intent(in) :: cpu_ptr(:,:,:,:,:)
1113 call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 4_8*product(shape(gpu_ptr%f)*1_8))
1114 end subroutine
1115
1116 subroutine gpu_download_real6(gpu_ptr, cpu_ptr)
1117 implicit none
1118 type(gpu_real6), intent(in) :: gpu_ptr
1119 real, target, intent(in) :: cpu_ptr(:,:,:,:,:,:)
1120 call gpu_download_c(gpu_ptr%c, c_loc(cpu_ptr), 4_8*product(shape(gpu_ptr%f)*1_8))
1121 end subroutine
1122
1123! gpu_copy
1124! --------
1125
1126 subroutine gpu_copy_double0(gpu_ptr_src, gpu_ptr_dest, n)
1127 implicit none
1128 double precision, target, intent(in) :: gpu_ptr_src
1129 double precision, target, intent(in) :: gpu_ptr_dest
1130 integer, intent(in) :: n
1131 call gpu_copy_c(c_loc(gpu_ptr_src), c_loc(gpu_ptr_dest), 8_8*n)
1132 end subroutine
1133
1134 subroutine gpu_copy_double1(gpu_ptr_src, gpu_ptr_dest)
1135 implicit none
1136 type(gpu_double1), intent(in) :: gpu_ptr_src
1137 type(gpu_double1), intent(in) :: gpu_ptr_dest
1138 call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*size(gpu_ptr_dest%f))
1139 end subroutine
1140
1141 subroutine gpu_copy_double2(gpu_ptr_src, gpu_ptr_dest)
1142 implicit none
1143 type(gpu_double2), intent(in) :: gpu_ptr_src
1144 type(gpu_double2), intent(in) :: gpu_ptr_dest
1145 call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
1146 end subroutine
1147
1148 subroutine gpu_copy_double3(gpu_ptr_src, gpu_ptr_dest)
1149 implicit none
1150 type(gpu_double3), intent(in) :: gpu_ptr_src
1151 type(gpu_double3), intent(in) :: gpu_ptr_dest
1152 call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
1153 end subroutine
1154
1155 subroutine gpu_copy_double4(gpu_ptr_src, gpu_ptr_dest)
1156 implicit none
1157 type(gpu_double4), intent(in) :: gpu_ptr_src
1158 type(gpu_double4), intent(in) :: gpu_ptr_dest
1159 call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
1160 end subroutine
1161
1162 subroutine gpu_copy_double5(gpu_ptr_src, gpu_ptr_dest)
1163 implicit none
1164 type(gpu_double5), intent(in) :: gpu_ptr_src
1165 type(gpu_double5), intent(in) :: gpu_ptr_dest
1166 call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
1167 end subroutine
1168
1169 subroutine gpu_copy_double6(gpu_ptr_src, gpu_ptr_dest)
1170 implicit none
1171 type(gpu_double6), intent(in) :: gpu_ptr_src
1172 type(gpu_double6), intent(in) :: gpu_ptr_dest
1173 call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 8_8*product(shape(gpu_ptr_dest%f)*1_8))
1174 end subroutine
1175
1176 subroutine gpu_copy_real0(gpu_ptr_src, gpu_ptr_dest, n)
1177 implicit none
1178 real, target, intent(in) :: gpu_ptr_src
1179 real, target, intent(in) :: gpu_ptr_dest
1180 integer, intent(in) :: n
1181 call gpu_copy_c(c_loc(gpu_ptr_src), c_loc(gpu_ptr_dest), 4_8*n)
1182 end subroutine
1183
1184 subroutine gpu_copy_real1(gpu_ptr_src, gpu_ptr_dest)
1185 implicit none
1186 type(gpu_real1), intent(in) :: gpu_ptr_src
1187 type(gpu_real1), intent(in) :: gpu_ptr_dest
1188 call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 4_8*size(gpu_ptr_dest%f))
1189 end subroutine
1190
1191 subroutine gpu_copy_real2(gpu_ptr_src, gpu_ptr_dest)
1192 implicit none
1193 type(gpu_real2), intent(in) :: gpu_ptr_src
1194 type(gpu_real2), intent(in) :: gpu_ptr_dest
1195 call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 4_8*product(shape(gpu_ptr_dest%f)*1_8))
1196 end subroutine
1197
1198 subroutine gpu_copy_real3(gpu_ptr_src, gpu_ptr_dest)
1199 implicit none
1200 type(gpu_real3), intent(in) :: gpu_ptr_src
1201 type(gpu_real3), intent(in) :: gpu_ptr_dest
1202 call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 4_8*product(shape(gpu_ptr_dest%f)*1_8))
1203 end subroutine
1204
1205 subroutine gpu_copy_real4(gpu_ptr_src, gpu_ptr_dest)
1206 implicit none
1207 type(gpu_real4), intent(in) :: gpu_ptr_src
1208 type(gpu_real4), intent(in) :: gpu_ptr_dest
1209 call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 4_8*product(shape(gpu_ptr_dest%f)*1_8))
1210 end subroutine
1211
1212 subroutine gpu_copy_real5(gpu_ptr_src, gpu_ptr_dest)
1213 implicit none
1214 type(gpu_real5), intent(in) :: gpu_ptr_src
1215 type(gpu_real5), intent(in) :: gpu_ptr_dest
1216 call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 4_8*product(shape(gpu_ptr_dest%f)*1_8))
1217 end subroutine
1218
1219 subroutine gpu_copy_real6(gpu_ptr_src, gpu_ptr_dest)
1220 implicit none
1221 type(gpu_real6), intent(in) :: gpu_ptr_src
1222 type(gpu_real6), intent(in) :: gpu_ptr_dest
1223 call gpu_copy_c(gpu_ptr_src%c, gpu_ptr_dest%c, 4_8*product(shape(gpu_ptr_dest%f)*1_8))
1224 end subroutine
1225
1226
1227! gpu_stream
1228! ----------
1229
1230 subroutine gpu_stream_create(stream)
1231 type(gpu_stream) :: stream
1232 call gpu_stream_create_c(stream%c)
1233 end subroutine
1234
1235 subroutine gpu_stream_destroy(stream)
1236 type(gpu_stream) :: stream
1237 call gpu_stream_destroy_c(stream%c)
1238 end subroutine
1239
1240 subroutine gpu_set_stream(handle, stream)
1241 type(gpu_blas) :: handle
1242 type(gpu_stream) :: stream
1243 call gpu_set_stream_c(handle%c, stream%c)
1244 end subroutine
1245
1246
1247! gpu_blas
1248! BLAS handle management
1249! ----------------------
1250
1258 subroutine gpu_blas_create(handle)
1259 type(gpu_blas) :: handle
1260 call gpu_blas_create_c(handle%c)
1261 end subroutine
1262
1269 subroutine gpu_blas_destroy(handle)
1270 type(gpu_blas) :: handle
1271 call gpu_blas_destroy_c(handle%c)
1272 end subroutine
1273
1274
1275
1276
1277! BLAS Level 1: Vector operations
1278! --------------------------------
1279
1293subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res)
1294 type(gpu_blas), intent(in) :: handle
1295 integer*4 :: n, incx, incy
1296 double precision, target :: dx, dy
1297 double precision, intent(out) :: res
1298 call gpu_ddot_c(handle%c, int(n,c_int64_t), c_loc(dx), int(incx,c_int64_t), c_loc(dy), int(incy,c_int64_t), res)
1299end subroutine
1300
1301
1314subroutine gpu_ddot_64(handle, n, dx, incx, dy, incy, res)
1315 type(gpu_blas), intent(in) :: handle
1316 integer*8 :: n, incx, incy
1317 double precision, target :: dx, dy
1318 double precision, intent(out) :: res
1319 call gpu_ddot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
1320end subroutine
1321
1322
1323subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res)
1324 type(gpu_blas), intent(in) :: handle
1325 integer*4 :: n, incx, incy
1326 real, target :: dx, dy
1327 real, intent(out) :: res
1328 call gpu_sdot_c(handle%c, int(n,c_int64_t), c_loc(dx), int(incx,c_int64_t), c_loc(dy), int(incy,c_int64_t), res)
1329end subroutine
1330
1331
1332subroutine gpu_sdot_64(handle, n, dx, incx, dy, incy, res)
1333 type(gpu_blas), intent(in) :: handle
1334 integer*8 :: n, incx, incy
1335 real, target :: dx, dy
1336 real, intent(out) :: res
1337 call gpu_sdot_c(handle%c, n, c_loc(dx), incx, c_loc(dy), incy, res)
1338end subroutine
1339
1340
1341! geam
1342! ----
1343
1344subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, &
1345 b, ldb, c, ldc)
1346 type(gpu_blas), intent(in) :: handle
1347 character, intent(in) :: transa, transb
1348 integer*4, intent(in) :: m, n, lda, ldb, ldc
1349 double precision, intent(in) :: alpha, beta
1350 double precision :: a, b, c
1351 call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a, int(lda,c_int64_t), beta, &
1352 b, int(ldb,c_int64_t), c, int(ldc,c_int64_t))
1353end subroutine
1354
1355
1356subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, &
1357 b, ldb, c, ldc)
1358 type(gpu_blas), intent(in) :: handle
1359 character, intent(in) :: transa, transb
1360 integer*8, intent(in) :: m, n, lda, ldb, ldc
1361 double precision, intent(in) :: alpha, beta
1362 double precision :: a, b, c
1363 call gpu_dgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a, int(lda,c_int64_t), beta, &
1364 b, int(ldb,c_int64_t), c, int(ldc,c_int64_t))
1365end subroutine
1366
1367
1368subroutine gpu_sgeam(handle, transa, transb, m, n, alpha, a, lda, beta, &
1369 b, ldb, c, ldc)
1370 type(gpu_blas), intent(in) :: handle
1371 character, intent(in) :: transa, transb
1372 integer*4, intent(in) :: m, n, lda, ldb, ldc
1373 real, intent(in) :: alpha, beta
1374 real :: a, b, c
1375 call gpu_sgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a, int(lda,c_int64_t), beta, &
1376 b, int(ldb,c_int64_t), c, int(ldc,c_int64_t))
1377end subroutine
1378
1379
1380subroutine gpu_sgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, &
1381 b, ldb, c, ldc)
1382 type(gpu_blas), intent(in) :: handle
1383 character, intent(in) :: transa, transb
1384 integer*8, intent(in) :: m, n, lda, ldb, ldc
1385 real, intent(in) :: alpha, beta
1386 real :: a, b, c
1387 call gpu_sgeam_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), alpha, a, int(lda,c_int64_t), beta, &
1388 b, int(ldb,c_int64_t), c, int(ldc,c_int64_t))
1389end subroutine
1390
1391
1392! gemv
1393! ----
1394
1395subroutine gpu_dgemv(handle, transa, m, n, alpha, a, lda, &
1396 x, incx, beta, y, incy)
1397 type(gpu_blas), intent(in) :: handle
1398 character, intent(in) :: transa
1399 integer*4, intent(in) :: m, n, lda, incx, incy
1400 double precision, intent(in) :: alpha, beta
1401 double precision :: a, x, y
1402 call gpu_dgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), &
1403 alpha, a, int(lda,c_int64_t), &
1404 x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t))
1405end subroutine
1406
1407subroutine gpu_dgemv_64(handle, transa, m, n, alpha, a, lda, &
1408 x, incx, beta, y, incy)
1409 type(gpu_blas), intent(in) :: handle
1410 character, intent(in) :: transa
1411 integer*8, intent(in) :: m, n, lda, incx, incy
1412 double precision, intent(in) :: alpha, beta
1413 double precision :: a, x, y
1414 call gpu_dgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), &
1415 alpha, a, int(lda,c_int64_t), &
1416 x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t))
1417end subroutine
1418
1419
1420subroutine gpu_sgemv(handle, transa, m, n, alpha, a, lda, &
1421 x, incx, beta, y, incy)
1422 type(gpu_blas), intent(in) :: handle
1423 character, intent(in) :: transa
1424 integer*4, intent(in) :: m, n, lda, incx, incy
1425 real, intent(in) :: alpha, beta
1426 real:: a, x, y
1427 call gpu_sgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), &
1428 alpha, a, int(lda,c_int64_t), &
1429 x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t))
1430end subroutine
1431
1432subroutine gpu_sgemv_64(handle, transa, m, n, alpha, a, lda, &
1433 x, incx, beta, y, incy)
1434 type(gpu_blas), intent(in) :: handle
1435 character, intent(in) :: transa
1436 integer*8, intent(in) :: m, n, lda, incx, incy
1437 real, intent(in) :: alpha, beta
1438 real:: a, x, y
1439 call gpu_sgemv_c(handle%c, transa, int(m,c_int64_t), int(n,c_int64_t), &
1440 alpha, a, int(lda,c_int64_t), &
1441 x, int(incx,c_int64_t), beta, y, int(incy,c_int64_t))
1442end subroutine
1443
1444
1445! gemm
1446! ----
1447
1448subroutine gpu_dgemm(handle, transa, transb, m, n, k, alpha, a, lda, &
1449 b, ldb, beta, c, ldc)
1450 type(gpu_blas), intent(in) :: handle
1451 character, intent(in) :: transa, transb
1452 integer*4, intent(in) :: m, n, k, lda, ldb, ldc
1453 double precision, intent(in) :: alpha, beta
1454 double precision :: a, b, c
1455 call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), &
1456 alpha, a, int(lda,c_int64_t), &
1457 b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t))
1458end subroutine
1459
1460subroutine gpu_dgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, &
1461 b, ldb, beta, c, ldc)
1462 type(gpu_blas), intent(in) :: handle
1463 character, intent(in) :: transa, transb
1464 integer*8, intent(in) :: m, n, k, lda, ldb, ldc
1465 double precision, intent(in) :: alpha, beta
1466 double precision :: a, b, c
1467 call gpu_dgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), &
1468 alpha, a, int(lda,c_int64_t), b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t))
1469end subroutine
1470
1471subroutine gpu_sgemm(handle, transa, transb, m, n, k, alpha, a, lda, &
1472 b, ldb, beta, c, ldc)
1473 type(gpu_blas), intent(in) :: handle
1474 character, intent(in) :: transa, transb
1475 integer*4, intent(in) :: m, n, k, lda, ldb, ldc
1476 real, intent(in) :: alpha, beta
1477 real:: a, b, c
1478 call gpu_sgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), &
1479 alpha, a, int(lda,c_int64_t), &
1480 b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t))
1481end subroutine
1482
1483subroutine gpu_sgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, &
1484 b, ldb, beta, c, ldc)
1485 type(gpu_blas), intent(in) :: handle
1486 character, intent(in) :: transa, transb
1487 integer*8, intent(in) :: m, n, k, lda, ldb, ldc
1488 real, intent(in) :: alpha, beta
1489 real:: a, b, c
1490 call gpu_sgemm_c(handle%c, transa, transb, int(m,c_int64_t), int(n,c_int64_t), int(k,c_int64_t), &
1491 alpha, a, int(lda,c_int64_t), b, int(ldb,c_int64_t), beta, c, int(ldc,c_int64_t))
1492end subroutine
1493
1494end module
Allocate GPU/CPU memory (C binding)
Allocate GPU/CPU memory for arrays.
Copy data between GPU arrays.
Free GPU/CPU memory (C binding)
Free GPU/CPU memory.
Download data from device (GPU) to host (CPU)
Query GPU memory usage (C binding)
Get number of GPU devices (C binding)
Set active GPU device (C binding)
Upload data to GPU (C binding)
Upload data from host (CPU) to device (GPU)
Simple GPU - Fortran GPU Computing Library.
subroutine gpu_deallocate_double5(ptr)
subroutine gpu_allocate_double1(ptr, s)
Allocate 1D double precision array (32-bit dimensions)
subroutine gpu_upload_real6(cpu_ptr, gpu_ptr)
subroutine gpu_sgeam(handle, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc)
subroutine gpu_download_real1(gpu_ptr, cpu_ptr)
subroutine gpu_upload_real3(cpu_ptr, gpu_ptr)
subroutine gpu_copy_real4(gpu_ptr_src, gpu_ptr_dest)
subroutine gpu_blas_destroy(handle)
Destroy a BLAS handle.
subroutine gpu_dgemv_64(handle, transa, m, n, alpha, a, lda, x, incx, beta, y, incy)
subroutine gpu_allocate_real5_64(ptr, s1, s2, s3, s4, s5)
subroutine gpu_download_real2(gpu_ptr, cpu_ptr)
subroutine gpu_copy_real5(gpu_ptr_src, gpu_ptr_dest)
subroutine gpu_download_double2(gpu_ptr, cpu_ptr)
subroutine gpu_copy_real0(gpu_ptr_src, gpu_ptr_dest, n)
subroutine gpu_deallocate_real1(ptr)
subroutine gpu_sgemv(handle, transa, m, n, alpha, a, lda, x, incx, beta, y, incy)
subroutine gpu_deallocate_real3(ptr)
subroutine gpu_deallocate_double3(ptr)
subroutine gpu_copy_real3(gpu_ptr_src, gpu_ptr_dest)
subroutine gpu_download_double3(gpu_ptr, cpu_ptr)
subroutine gpu_download_real4(gpu_ptr, cpu_ptr)
subroutine gpu_allocate_real2_64(ptr, s1, s2)
subroutine gpu_allocate_double3(ptr, s1, s2, s3)
subroutine gpu_upload_double0(cpu_ptr, gpu_ptr, n)
subroutine gpu_upload_real5(cpu_ptr, gpu_ptr)
subroutine gpu_download_double1(gpu_ptr, cpu_ptr)
subroutine gpu_allocate_real1(ptr, s)
subroutine gpu_upload_double6(cpu_ptr, gpu_ptr)
subroutine gpu_allocate_double5_64(ptr, s1, s2, s3, s4, s5)
subroutine gpu_allocate_double2(ptr, s1, s2)
subroutine gpu_upload_double4(cpu_ptr, gpu_ptr)
subroutine gpu_ddot(handle, n, dx, incx, dy, incy, res)
Double precision dot product (32-bit dimensions)
subroutine gpu_allocate_real4_64(ptr, s1, s2, s3, s4)
subroutine gpu_copy_double2(gpu_ptr_src, gpu_ptr_dest)
subroutine gpu_download_double6(gpu_ptr, cpu_ptr)
subroutine gpu_copy_real1(gpu_ptr_src, gpu_ptr_dest)
subroutine gpu_set_stream(handle, stream)
subroutine gpu_download_real5(gpu_ptr, cpu_ptr)
subroutine gpu_allocate_double4_64(ptr, s1, s2, s3, s4)
subroutine gpu_allocate_real4(ptr, s1, s2, s3, s4)
subroutine gpu_upload_double5(cpu_ptr, gpu_ptr)
subroutine gpu_deallocate_real4(ptr)
subroutine gpu_sgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc)
subroutine gpu_allocate_double1_64(ptr, s)
subroutine gpu_allocate_double2_64(ptr, s1, s2)
subroutine gpu_stream_create(stream)
subroutine gpu_allocate_real6(ptr, s1, s2, s3, s4, s5, s6)
subroutine gpu_download_real0(gpu_ptr, cpu_ptr, n)
subroutine gpu_sgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)
subroutine gpu_download_real3(gpu_ptr, cpu_ptr)
subroutine gpu_dgemm(handle, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)
subroutine gpu_dgemv(handle, transa, m, n, alpha, a, lda, x, incx, beta, y, incy)
subroutine gpu_download_double4(gpu_ptr, cpu_ptr)
subroutine gpu_allocate_real6_64(ptr, s1, s2, s3, s4, s5, s6)
subroutine gpu_download_double5(gpu_ptr, cpu_ptr)
subroutine gpu_allocate_real2(ptr, s1, s2)
subroutine gpu_copy_double0(gpu_ptr_src, gpu_ptr_dest, n)
subroutine gpu_dgemm_64(handle, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)
subroutine gpu_sdot_64(handle, n, dx, incx, dy, incy, res)
subroutine gpu_deallocate_real5(ptr)
subroutine gpu_dgeam(handle, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc)
subroutine gpu_deallocate_real6(ptr)
subroutine gpu_ddot_64(handle, n, dx, incx, dy, incy, res)
Double precision dot product (64-bit dimensions)
subroutine gpu_upload_double2(cpu_ptr, gpu_ptr)
subroutine gpu_upload_real0(cpu_ptr, gpu_ptr, n)
subroutine gpu_deallocate_real2(ptr)
subroutine gpu_copy_real6(gpu_ptr_src, gpu_ptr_dest)
subroutine gpu_allocate_real3(ptr, s1, s2, s3)
subroutine gpu_download_double0(gpu_ptr, cpu_ptr, n)
subroutine gpu_allocate_double4(ptr, s1, s2, s3, s4)
subroutine gpu_upload_double3(cpu_ptr, gpu_ptr)
subroutine gpu_deallocate_double1(ptr)
subroutine gpu_deallocate_double2(ptr)
subroutine gpu_deallocate_double4(ptr)
subroutine gpu_sdot(handle, n, dx, incx, dy, incy, res)
subroutine gpu_sgemv_64(handle, transa, m, n, alpha, a, lda, x, incx, beta, y, incy)
subroutine gpu_upload_real1(cpu_ptr, gpu_ptr)
subroutine gpu_allocate_double6_64(ptr, s1, s2, s3, s4, s5, s6)
subroutine gpu_allocate_real3_64(ptr, s1, s2, s3)
subroutine gpu_copy_double5(gpu_ptr_src, gpu_ptr_dest)
subroutine gpu_allocate_double6(ptr, s1, s2, s3, s4, s5, s6)
subroutine gpu_copy_double4(gpu_ptr_src, gpu_ptr_dest)
subroutine gpu_deallocate_double6(ptr)
subroutine gpu_allocate_double5(ptr, s1, s2, s3, s4, s5)
subroutine gpu_copy_real2(gpu_ptr_src, gpu_ptr_dest)
subroutine gpu_allocate_real5(ptr, s1, s2, s3, s4, s5)
subroutine gpu_download_real6(gpu_ptr, cpu_ptr)
subroutine gpu_sgemm(handle, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)
subroutine gpu_copy_double3(gpu_ptr_src, gpu_ptr_dest)
subroutine gpu_upload_double1(cpu_ptr, gpu_ptr)
subroutine gpu_dgeam_64(handle, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc)
subroutine gpu_copy_double6(gpu_ptr_src, gpu_ptr_dest)
subroutine gpu_upload_real2(cpu_ptr, gpu_ptr)
subroutine gpu_upload_real4(cpu_ptr, gpu_ptr)
subroutine gpu_stream_destroy(stream)
subroutine gpu_allocate_real1_64(ptr, s)
subroutine gpu_blas_create(handle)
Create a BLAS handle.
subroutine gpu_allocate_double3_64(ptr, s1, s2, s3)
subroutine gpu_copy_double1(gpu_ptr_src, gpu_ptr_dest)
Handle for BLAS operations.
1-dimensional array of double precision values
2-dimensional array of double precision values
3-dimensional array of double precision values
4-dimensional array of double precision values
5-dimensional array of double precision values
6-dimensional array of double precision values
1-dimensional array of single precision values
2-dimensional array of single precision values
3-dimensional array of single precision values
4-dimensional array of single precision values
5-dimensional array of single precision values
6-dimensional array of single precision values
Handle for CUDA streams.