Slide 22
Slide 22 text
@nyghtowl
Ex: CUDA GPU Code
allocate memory: cudaMalloc((void**)&dA,
sizeof(double) * size * size);
data in: cublasSetMatrix (size, size, sizeof(double), B,
size, dB, size);
run kernel: cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
size, size, size, &one, dB, size, dB, size,
&zero, dA, size );
data out: cublasGetMatrix (size, size, sizeof(double),
dA, size, A, size);
sync: cudaDeviceSynchronize();