Slide 12
Dot product four ways
• CUDA (parallel):
- Step 2: vector reduction … is somewhat complex
// Shim that exposes a kernel's dynamically sized shared memory
// (declared "extern __shared__") as a float*.  The underlying array is
// declared as int[] and cast, the usual trick to avoid conflicting
// extern declarations of the same shared symbol across instantiations.
// (Stray '!' tokens from the slide extraction removed; they were not
// valid C++ and split the two conversion operators.)
struct SharedMemory
{
    // Mutable view of the dynamic shared-memory block.
    __device__ inline operator float *()
    {
        extern __shared__ int __smem[];
        return (float *)__smem;
    }

    // Read-only view of the dynamic shared-memory block.
    __device__ inline operator const float *() const
    {
        extern __shared__ int __smem[];
        return (float *)__smem;
    }
};
!
// Tree-based sum reduction over g_idata[0..n); each block writes one
// partial sum to g_odata[blockIdx.x].
//
// Template parameters (the slide text dropped the list after
// "template", leaving blockSize/nIsPow2 undeclared -- restored here):
//   blockSize - must equal blockDim.x, a power of two
//   nIsPow2   - true when n is a power of two, which makes the second
//               load in the grid-stride loop provably in-bounds
//
// Launch with dynamic shared memory of at least blockSize floats
// (2*blockSize when blockSize <= 32, so the warp tail below never
// reads past the allocation).
template <unsigned int blockSize, bool nIsPow2>
__global__ void reduce_kernel(float *g_idata, float *g_odata, unsigned int n)
{
    float *sdata = SharedMemory();

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;

    float sum = 0;

    // Grid-stride loop: each thread accumulates two elements per trip
    // (i and i+blockSize), halving the number of blocks required.
    while (i < n)
    {
        sum += g_idata[i];

        // When n is a power of two the second read is always in range,
        // so the bounds check compiles away.
        if (nIsPow2 || i + blockSize < n)
            sum += g_idata[i + blockSize];

        i += gridSize;
    }

    sdata[tid] = sum;
    __syncthreads();

    // Shared-memory tree reduction.  blockSize is a compile-time
    // constant, so each "if (blockSize >= X)" is uniform across the
    // block and every __syncthreads() is reached by all threads.
    if (blockSize >= 512)
    {
        if (tid < 256)
            sdata[tid] = sum = sum + sdata[tid + 256];
        __syncthreads();
    }

    if (blockSize >= 256)
    {
        if (tid < 128)
            sdata[tid] = sum = sum + sdata[tid + 128];
        __syncthreads();
    }

    if (blockSize >= 128)
    {
        if (tid < 64)
            sdata[tid] = sum = sum + sdata[tid + 64];
        __syncthreads();
    }

    // Final warp: classic volatile trick, which assumes implicit
    // lock-step execution within a warp.
    // NOTE(review): on Volta+ (independent thread scheduling) this is
    // no longer guaranteed; a portable version needs __syncwarp() or
    // __shfl_down_sync between steps.  Kept as-is to match the slide.
    if (tid < 32)
    {
        volatile float *smem = sdata;

        if (blockSize >= 64) smem[tid] = sum = sum + smem[tid + 32];
        if (blockSize >= 32) smem[tid] = sum = sum + smem[tid + 16];
        if (blockSize >= 16) smem[tid] = sum = sum + smem[tid + 8];
        if (blockSize >= 8)  smem[tid] = sum = sum + smem[tid + 4];
        if (blockSize >= 4)  smem[tid] = sum = sum + smem[tid + 2];
        if (blockSize >= 2)  smem[tid] = sum = sum + smem[tid + 1];
    }

    // Thread 0 publishes this block's partial sum.
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}
!
// Choose a launch configuration for one reduction pass over n elements.
//
//   threads (out) - next power of two >= ceil(n/2), capped at
//                   maxThreads (each thread reads two elements)
//   blocks  (out) - ceil(n / (threads*2)), clamped to the device's
//                   grid-dimension limit and to maxBlocks
//
// (The slide text had a mis-encoded minus sign in the ceil-division;
// restored to a plain '-'.)
void getNumBlocksAndThreads(int n, int maxBlocks, int maxThreads,
                            int &blocks, int &threads)
{
    cudaDeviceProp prop;
    int device;
    checkCudaErrors(cudaGetDevice(&device));
    checkCudaErrors(cudaGetDeviceProperties(&prop, device));

    // Each thread sums two inputs, so only about n/2 threads are needed.
    threads = (n < maxThreads * 2) ? nextPow2((n + 1) / 2) : maxThreads;
    blocks = (n + (threads * 2 - 1)) / (threads * 2);

    // If the grid would exceed the hardware limit, trade blocks for
    // threads (halve one, double the other).
    if (blocks > prop.maxGridSize[0])
    {
        blocks /= 2;
        threads *= 2;
    }

    blocks = min(maxBlocks, blocks);
}
!
// Sum-reduce the n floats in d_idata on the GPU and return the total.
// d_odata is a device scratch buffer with room for one partial sum per
// launched block.  d_idata is not modified.
//
// Fixes relative to the slide text:
//   - restored the missing ';' after "int size = n"
//   - later passes must reduce the partial sums in d_odata, not the
//     original d_idata (the slide re-launched on d_idata every pass)
//   - restored the missing "return sum;" (declared to return float)
//   - mis-encoded minus sign in the ceil-division restored to '-'
//   - added a cudaGetLastError() check after each launch
float reduce(int n, float *d_idata, float *d_odata)
{
    int threads = 0;
    int blocks = 0;
    int maxThreads = 256;
    int maxBlocks = 64;
    int size = n;

    // Input for the current pass: the caller's data on the first pass,
    // then the partial sums written by the previous pass.
    float *d_in = d_idata;

// Dispatch helper: the block size must be a compile-time template
// argument, hence the switch over instantiations below.
#define LAUNCH_REDUCE(BS, POW2) \
    reduce_kernel<BS, POW2><<<dimGrid, dimBlock, smemSize>>>(d_in, d_odata, size)

    while (size > 1)
    {
        getNumBlocksAndThreads(size, maxBlocks, maxThreads, blocks, threads);

        // Block sizes <= 32 still index sdata[tid + 32] in the kernel's
        // warp tail, so allocate a double-sized buffer in that case.
        int smemSize = (threads <= 32) ? 2 * threads * sizeof(float)
                                       : threads * sizeof(float);
        dim3 dimBlock(threads, 1, 1);
        dim3 dimGrid(blocks, 1, 1);

        if (isPow2(size))
        {
            switch (threads)
            {
                case 512: LAUNCH_REDUCE(512, true); break;
                case 256: LAUNCH_REDUCE(256, true); break;
                case 128: LAUNCH_REDUCE(128, true); break;
                case  64: LAUNCH_REDUCE( 64, true); break;
                case  32: LAUNCH_REDUCE( 32, true); break;
                case  16: LAUNCH_REDUCE( 16, true); break;
                case   8: LAUNCH_REDUCE(  8, true); break;
                case   4: LAUNCH_REDUCE(  4, true); break;
                case   2: LAUNCH_REDUCE(  2, true); break;
                case   1: LAUNCH_REDUCE(  1, true); break;
            }
        }
        else
        {
            switch (threads)
            {
                case 512: LAUNCH_REDUCE(512, false); break;
                case 256: LAUNCH_REDUCE(256, false); break;
                case 128: LAUNCH_REDUCE(128, false); break;
                case  64: LAUNCH_REDUCE( 64, false); break;
                case  32: LAUNCH_REDUCE( 32, false); break;
                case  16: LAUNCH_REDUCE( 16, false); break;
                case   8: LAUNCH_REDUCE(  8, false); break;
                case   4: LAUNCH_REDUCE(  4, false); break;
                case   2: LAUNCH_REDUCE(  2, false); break;
                case   1: LAUNCH_REDUCE(  1, false); break;
            }
        }

        // Kernel launches do not return errors; catch bad configs here.
        checkCudaErrors(cudaGetLastError());

        // The partial sums just written become the next pass's input.
        d_in = d_odata;
        size = (size + (threads * 2 - 1)) / (threads * 2);
    }

#undef LAUNCH_REDUCE

    // Copy the result back.  Reading d_in (not d_odata) also handles
    // n <= 1, where no pass ran and the answer is still in d_idata.
    // cudaMemcpy is blocking, so no explicit synchronization is needed.
    float sum;
    checkCudaErrors(cudaMemcpy(&sum, d_in, sizeof(float),
                               cudaMemcpyDeviceToHost));
    return sum;
}
!