// Vector reduction (sum) — the classic multi-pass tree reduction from the
// CUDA SDK "reduction" sample.  Each block accumulates a strided slice of the
// input into shared memory, collapses it with a tree reduction, and writes
// one partial sum per block; the host loops until a single value remains.
//
// External helpers assumed from the surrounding project:
//   checkCudaErrors(), nextPow2(), isPow2()

// Exposes the kernel's dynamically-allocated shared memory as float*.
// (The int[] backing store with a cast avoids redeclaring the same
// extern __shared__ symbol with different types in one translation unit.)
struct SharedMemory {
  __device__ inline operator float *() {
    extern __shared__ int __smem[];
    return (float *)__smem;
  }

  __device__ inline operator const float *() const {
    extern __shared__ int __smem[];
    return (float *)__smem;
  }
};

// Block-level sum reduction.
//   blockSize: threads per block (compile-time constant, power of two, <= 512)
//   nIsPow2:   when true, n is a power of two and the i+blockSize bounds
//              check can be skipped
// Launch: <<<blocks, blockSize, smemBytes>>> with smemBytes of at least
// blockSize*sizeof(float) (2*blockSize*sizeof(float) when blockSize <= 32,
// so the unrolled warp reduce below never reads past the allocation).
// Each block writes its partial sum to g_odata[blockIdx.x].
template <unsigned int blockSize, bool nIsPow2>
__global__ void reduce_kernel(float *g_idata, float *g_odata, unsigned int n) {
  float *sdata = SharedMemory();

  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
  unsigned int gridSize = blockSize * 2 * gridDim.x;

  // Grid-stride accumulation: each thread sums elements gridSize apart,
  // reading two per iteration so half as many blocks are needed.
  float sum = 0;

  while (i < n) {
    sum += g_idata[i];

    // The second element is guaranteed in range when n is a power of two.
    if (nIsPow2 || i + blockSize < n) sum += g_idata[i + blockSize];

    i += gridSize;
  }

  sdata[tid] = sum;
  __syncthreads();

  // Tree reduction in shared memory.  The blockSize comparisons are resolved
  // at compile time, so dead branches cost nothing.
  if (blockSize >= 512) {
    if (tid < 256) {
      sdata[tid] = sum = sum + sdata[tid + 256];
    }
    __syncthreads();
  }

  if (blockSize >= 256) {
    if (tid < 128) {
      sdata[tid] = sum = sum + sdata[tid + 128];
    }
    __syncthreads();
  }

  if (blockSize >= 128) {
    if (tid < 64) {
      sdata[tid] = sum = sum + sdata[tid + 64];
    }
    __syncthreads();
  }

  // Final warp: volatile accesses stand in for explicit barriers.
  // NOTE(review): this relies on implicit warp-synchronous execution, which
  // is NOT guaranteed under independent thread scheduling (Volta / SM70+);
  // the modern sample uses __shfl_down_sync here instead — confirm the
  // target architectures before shipping.
  if (tid < 32) {
    volatile float *smem = sdata;

    if (blockSize >= 64) {
      smem[tid] = sum = sum + smem[tid + 32];
    }

    if (blockSize >= 32) {
      smem[tid] = sum = sum + smem[tid + 16];
    }

    if (blockSize >= 16) {
      smem[tid] = sum = sum + smem[tid + 8];
    }

    if (blockSize >= 8) {
      smem[tid] = sum = sum + smem[tid + 4];
    }

    if (blockSize >= 4) {
      smem[tid] = sum = sum + smem[tid + 2];
    }

    if (blockSize >= 2) {
      smem[tid] = sum = sum + smem[tid + 1];
    }
  }

  // Thread 0 publishes this block's partial sum.
  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

// Choose a launch configuration for reducing n elements.
//   threads: next power of two >= ceil(n/2), capped at maxThreads
//            (each thread reads two elements per pass)
//   blocks:  ceil(n / (threads*2)), capped at the device grid limit
//            and at maxBlocks
void getNumBlocksAndThreads(int n, int maxBlocks, int maxThreads, int &blocks,
                            int &threads) {
  cudaDeviceProp prop;
  int device;
  checkCudaErrors(cudaGetDevice(&device));
  checkCudaErrors(cudaGetDeviceProperties(&prop, device));

  threads = (n < maxThreads * 2) ? nextPow2((n + 1) / 2) : maxThreads;
  blocks = (n + (threads * 2 - 1)) / (threads * 2);  // fixed: mangled '-' char

  // If even the capped thread count overflows the grid, trade blocks for
  // threads (threads stays a power of two).
  if (blocks > prop.maxGridSize[0]) {
    blocks /= 2;
    threads *= 2;
  }

  blocks = min(maxBlocks, blocks);
}

// One switch arm per supported block size; 'input' lets later passes read the
// partial sums instead of the raw data.
#define REDUCE_LAUNCH_CASE(T, POW2)                                     \
  case T:                                                               \
    reduce_kernel<T, POW2>                                              \
        <<<dimGrid, dimBlock, smemSize>>>(input, d_odata, size);        \
    break;

// Sum the n floats in d_idata, using d_odata as scratch for per-block partial
// sums (it must hold at least maxBlocks floats).  Both pointers are device
// memory.  Blocking: the final cudaMemcpy synchronizes with the kernels.
// Returns the total; 0.0f for n <= 0.
float reduce(int n, float *d_idata, float *d_odata) {
  if (n <= 0) return 0.0f;  // fixed: n<=0 previously read uninitialized scratch

  int threads = 0;
  int blocks = 0;
  int maxThreads = 256;
  int maxBlocks = 64;
  int size = n;             // fixed: missing ';' in original
  float *input = d_idata;   // fixed: passes 2+ must consume d_odata, not d_idata

  while (size > 1) {
    getNumBlocksAndThreads(size, maxBlocks, maxThreads, blocks, threads);

    // With threads <= 32 the warp reduce reads up to index 2*threads-1, so
    // double the allocation to stay in bounds.
    int smemSize =
        (threads <= 32) ? 2 * threads * sizeof(float) : threads * sizeof(float);
    dim3 dimBlock(threads, 1, 1);
    dim3 dimGrid(blocks, 1, 1);

    if (isPow2(size)) {
      switch (threads) {
        REDUCE_LAUNCH_CASE(512, true)
        REDUCE_LAUNCH_CASE(256, true)
        REDUCE_LAUNCH_CASE(128, true)
        REDUCE_LAUNCH_CASE(64, true)
        REDUCE_LAUNCH_CASE(32, true)
        REDUCE_LAUNCH_CASE(16, true)
        REDUCE_LAUNCH_CASE(8, true)
        REDUCE_LAUNCH_CASE(4, true)
        REDUCE_LAUNCH_CASE(2, true)
        REDUCE_LAUNCH_CASE(1, true)
      }
    } else {
      switch (threads) {
        REDUCE_LAUNCH_CASE(512, false)
        REDUCE_LAUNCH_CASE(256, false)
        REDUCE_LAUNCH_CASE(128, false)
        REDUCE_LAUNCH_CASE(64, false)
        REDUCE_LAUNCH_CASE(32, false)
        REDUCE_LAUNCH_CASE(16, false)
        REDUCE_LAUNCH_CASE(8, false)
        REDUCE_LAUNCH_CASE(4, false)
        REDUCE_LAUNCH_CASE(2, false)
        REDUCE_LAUNCH_CASE(1, false)
      }
    }

    checkCudaErrors(cudaGetLastError());  // catch launch-config errors

    // Remaining work = the per-block partial sums now sitting in d_odata.
    size = (size + (threads * 2 - 1)) / (threads * 2);  // fixed: mangled '-'
    input = d_odata;
  }

  float sum = 0.0f;
  // n == 1: the loop never ran, so the single value is still in d_idata.
  const float *result = (n > 1) ? d_odata : d_idata;
  checkCudaErrors(
      cudaMemcpy(&sum, result, sizeof(float), cudaMemcpyDeviceToHost));

  return sum;  // fixed: original fell off the end of a float-returning function
}

#undef REDUCE_LAUNCH_CASE