Diffstat (limited to 'src/com/mbien/opencl')
-rw-r--r--  src/com/mbien/opencl/demos/radixsort/RadixSort.cl       | 358
-rw-r--r--  src/com/mbien/opencl/demos/radixsort/RadixSort.java     | 183
-rw-r--r--  src/com/mbien/opencl/demos/radixsort/RadixSortDemo.java | 122
-rw-r--r--  src/com/mbien/opencl/demos/radixsort/Scan.java          | 131
-rw-r--r--  src/com/mbien/opencl/demos/radixsort/Scan_b.cl          | 190
5 files changed, 984 insertions, 0 deletions
diff --git a/src/com/mbien/opencl/demos/radixsort/RadixSort.cl b/src/com/mbien/opencl/demos/radixsort/RadixSort.cl
new file mode 100644
index 0000000..d014692
--- /dev/null
+++ b/src/com/mbien/opencl/demos/radixsort/RadixSort.cl
@@ -0,0 +1,358 @@
+/*
+* Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
+*
+* NVIDIA Corporation and its licensors retain all intellectual property and
+* proprietary rights in and to this software and related documentation.
+* Any use, reproduction, disclosure, or distribution of this software
+* and related documentation without an express license agreement from
+* NVIDIA Corporation is strictly prohibited.
+*
+* Please refer to the applicable NVIDIA end user license agreement (EULA)
+* associated with this source code for terms and conditions that govern
+* your use of this NVIDIA software.
+*
+*/
+
+//----------------------------------------------------------------------------
+// Scans each warp in parallel ("warp-scan"), one element per thread.
+// Uses 2 elements of shared memory per thread (64 elements per warp).
+//----------------------------------------------------------------------------
+//#define WARP_SIZE 32
+uint scanwarp(uint val, __local uint* sData, int maxlevel)
+{
+    // The following is the same as 2 * RadixSort::WARP_SIZE * warpId + threadInWarp =
+    // 64*(threadIdx.x >> 5) + (threadIdx.x & (RadixSort::WARP_SIZE - 1))
+    int localId = get_local_id(0);
+    int idx = 2 * localId - (localId & (WARP_SIZE - 1));
+    sData[idx] = 0;
+    idx += WARP_SIZE;
+    sData[idx] = val;
+
+    if (0 <= maxlevel) { sData[idx] += sData[idx - 1]; }
+    if (1 <= maxlevel) { sData[idx] += sData[idx - 2]; }
+    if (2 <= maxlevel) { sData[idx] += sData[idx - 4]; }
+    if (3 <= maxlevel) { sData[idx] += sData[idx - 8]; }
+    if (4 <= maxlevel) { sData[idx] += sData[idx - 16]; }
+
+    return sData[idx] - val;  // convert inclusive -> exclusive
+}
+
+//----------------------------------------------------------------------------
+// scan4 scans 4*RadixSort::CTA_SIZE elements in a block (4 per thread), using
+// a warp-scan algorithm
+//----------------------------------------------------------------------------
+uint4 scan4(uint4 idata, __local uint* ptr)
+{
+    uint idx = get_local_id(0);
+
+    uint4 val4 = idata;
+    uint sum[3];
+    sum[0] = val4.x;
+    sum[1] = val4.y + sum[0];
+    sum[2] = val4.z + sum[1];
+
+    uint val = val4.w + sum[2];
+
+    val = scanwarp(val, ptr, 4);
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if ((idx & (WARP_SIZE - 1)) == WARP_SIZE - 1)
+    {
+        ptr[idx >> 5] = val + val4.w + sum[2];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (idx < WARP_SIZE)
+        ptr[idx] = scanwarp(ptr[idx], ptr, 2);
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    val += ptr[idx >> 5];
+
+    val4.x = val;
+    val4.y = val + sum[0];
+    val4.z = val + sum[1];
+    val4.w = val + sum[2];
+
+    return val4;
+}
+
+#ifdef MAC
+__kernel uint4 rank4(uint4 preds, __local uint* sMem)
+#else
+uint4 rank4(uint4 preds, __local uint* sMem)
+#endif
+{
+    int localId = get_local_id(0);
+    int localSize = get_local_size(0);
+
+    uint4 address = scan4(preds, sMem);
+
+    __local uint numtrue;
+    if (localId == localSize - 1)
+    {
+        numtrue = address.w + preds.w;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    uint4 rank;
+    int idx = localId * 4;
+    rank.x = (preds.x) ? address.x : numtrue + idx     - address.x;
+    rank.y = (preds.y) ? address.y : numtrue + idx + 1 - address.y;
+    rank.z = (preds.z) ? address.z : numtrue + idx + 2 - address.z;
+    rank.w = (preds.w) ? address.w : numtrue + idx + 3 - address.w;
+
+    return rank;
+}
+
+void radixSortBlockKeysOnly(uint4 *key, uint nbits, uint startbit, __local uint* sMem)
+{
+    int localId = get_local_id(0);
+    int localSize = get_local_size(0);
+
+    for(uint shift = startbit; shift < (startbit + nbits); ++shift)
+    {
+        uint4 lsb;
+        lsb.x = !(((*key).x >> shift) & 0x1);
+        lsb.y = !(((*key).y >> shift) & 0x1);
+        lsb.z = !(((*key).z >> shift) & 0x1);
+        lsb.w = !(((*key).w >> shift) & 0x1);
+
+        uint4 r;
+
+        r = rank4(lsb, sMem);
+
+        // This arithmetic strides the ranks across 4 CTA_SIZE regions
+        sMem[(r.x & 3) * localSize + (r.x >> 2)] = (*key).x;
+        sMem[(r.y & 3) * localSize + (r.y >> 2)] = (*key).y;
+        sMem[(r.z & 3) * localSize + (r.z >> 2)] = (*key).z;
+        sMem[(r.w & 3) * localSize + (r.w >> 2)] = (*key).w;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // The above allows us to read without 4-way bank conflicts:
+        (*key).x = sMem[localId];
+        (*key).y = sMem[localId +     localSize];
+        (*key).z = sMem[localId + 2 * localSize];
+        (*key).w = sMem[localId + 3 * localSize];
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+}
+
+__kernel void radixSortBlocksKeysOnly(__global uint4* keysIn,
+                                      __global uint4* keysOut,
+                                      uint nbits,
+                                      uint startbit,
+                                      uint numElements,
+                                      uint totalBlocks,
+                                      __local uint* sMem)
+{
+    int globalId = get_global_id(0);
+
+    uint4 key;
+    key = keysIn[globalId];
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    radixSortBlockKeysOnly(&key, nbits, startbit, sMem);
+
+    keysOut[globalId] = key;
+}
+
+//----------------------------------------------------------------------------
+// Given an array with blocks sorted according to a 4-bit radix group, each
+// block counts the number of keys that fall into each radix in the group, and
+// finds the starting offset of each radix in the block. It then writes the radix
+// counts to the counters array, and the starting offsets to the blockOffsets array.
+//
+// Template parameters are used to generate efficient code for various special cases.
+// For example, we have to handle arrays that are a multiple of the block size
+// (fullBlocks) differently than arrays that are not. "loop" is used when persistent
+// CTAs are used.
+//
+// By persistent CTAs we mean that we launch only as many thread blocks as can
+// be resident in the GPU and no more, rather than launching as many threads as
+// we have elements. Persistent CTAs loop over blocks of elements until all work
+// is complete. This can be faster in some cases. In our tests it is faster
+// for large sorts (and the threshold is higher on compute version 1.1 and earlier
+// GPUs than it is on compute version 1.2 GPUs).
+//----------------------------------------------------------------------------
+__kernel void findRadixOffsets(__global uint2* keys,
+                               __global uint* counters,
+                               __global uint* blockOffsets,
+                               uint startbit,
+                               uint numElements,
+                               uint totalBlocks,
+                               __local uint* sRadix1)
+{
+    __local uint sStartPointers[16];
+
+    uint groupId = get_group_id(0);
+    uint localId = get_local_id(0);
+    uint groupSize = get_local_size(0);
+
+    uint2 radix2;
+
+    radix2 = keys[get_global_id(0)];
+
+    sRadix1[2 * localId]     = (radix2.x >> startbit) & 0xF;
+    sRadix1[2 * localId + 1] = (radix2.y >> startbit) & 0xF;
+
+    // Finds the position where the sRadix1 entries differ and stores the start
+    // index for each radix.
+    if(localId < 16)
+    {
+        sStartPointers[localId] = 0;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]))
+    {
+        sStartPointers[sRadix1[localId]] = localId;
+    }
+    if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1])
+    {
+        sStartPointers[sRadix1[localId + groupSize]] = localId + groupSize;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(localId < 16)
+    {
+        blockOffsets[groupId * 16 + localId] = sStartPointers[localId];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Compute the sizes of each block.
+    if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]))
+    {
+        sStartPointers[sRadix1[localId - 1]] =
+            localId - sStartPointers[sRadix1[localId - 1]];
+    }
+    if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1])
+    {
+        sStartPointers[sRadix1[localId + groupSize - 1]] =
+            localId + groupSize - sStartPointers[sRadix1[localId + groupSize - 1]];
+    }
+
+    if(localId == groupSize - 1)
+    {
+        sStartPointers[sRadix1[2 * groupSize - 1]] =
+            2 * groupSize - sStartPointers[sRadix1[2 * groupSize - 1]];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(localId < 16)
+    {
+        counters[localId * totalBlocks + groupId] = sStartPointers[localId];
+    }
+}
+
+// A naive scan routine that works only for arrays that can fit into a single
+// block; included for debugging purposes, not used in the sort.
+__kernel void scanNaive(__global uint *g_odata,
+                        __global uint *g_idata,
+                        uint n,
+                        __local uint* temp)
+{
+    int localId = get_local_id(0);
+
+    int pout = 0;
+    int pin = 1;
+
+    // Cache the computational window in shared memory
+    temp[pout * n + localId] = (localId > 0) ? g_idata[localId - 1] : 0;
+
+    for (int offset = 1; offset < n; offset *= 2)
+    {
+        pout = 1 - pout;
+        pin = 1 - pout;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        temp[pout * n + localId] = temp[pin * n + localId];
+
+        if (localId >= offset)
+            temp[pout * n + localId] += temp[pin * n + localId - offset];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    g_odata[localId] = temp[pout * n + localId];
+}
+
+//----------------------------------------------------------------------------
+// reorderData shuffles data in the array globally after the radix offsets
+// have been found. On compute version 1.1 and earlier GPUs, this code depends
+// on RadixSort::CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits).
+//
+// On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures
+// that all writes are coalesced using extra work in the kernel. On later
+// GPUs coalescing rules have been relaxed, so this extra overhead hurts
+// performance. On these GPUs we set manualCoalesce=false and directly store
+// the results.
+//
+// Template parameters are used to generate efficient code for various special cases.
+// For example, we have to handle arrays that are a multiple of the block size
+// (fullBlocks) differently than arrays that are not. "loop" is used when persistent
+// CTAs are used.
+//
+// By persistent CTAs we mean that we launch only as many thread blocks as can
+// be resident in the GPU and no more, rather than launching as many threads as
+// we have elements. Persistent CTAs loop over blocks of elements until all work
+// is complete. This can be faster in some cases. In our tests it is faster
+// for large sorts (and the threshold is higher on compute version 1.1 and earlier
+// GPUs than it is on compute version 1.2 GPUs).
+//----------------------------------------------------------------------------
+__kernel void reorderDataKeysOnly(__global uint *outKeys,
+                                  __global uint2 *keys,
+                                  __global uint *blockOffsets,
+                                  __global uint *offsets,
+                                  __global uint *sizes,
+                                  uint startbit,
+                                  uint numElements,
+                                  uint totalBlocks,
+                                  __local uint2* sKeys2)
+{
+    __local uint sOffsets[16];
+    __local uint sBlockOffsets[16];
+
+    __local uint *sKeys1 = (__local uint*)sKeys2;
+
+    uint groupId = get_group_id(0);
+
+    uint globalId = get_global_id(0);
+    uint localId = get_local_id(0);
+    uint groupSize = get_local_size(0);
+
+    sKeys2[localId] = keys[globalId];
+
+    if(localId < 16)
+    {
+        sOffsets[localId]      = offsets[localId * totalBlocks + groupId];
+        sBlockOffsets[localId] = blockOffsets[groupId * 16 + localId];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    uint radix = (sKeys1[localId] >> startbit) & 0xF;
+    uint globalOffset = sOffsets[radix] + localId - sBlockOffsets[radix];
+
+    if (globalOffset < numElements)
+    {
+        outKeys[globalOffset] = sKeys1[localId];
+    }
+
+    radix = (sKeys1[localId + groupSize] >> startbit) & 0xF;
+    globalOffset = sOffsets[radix] + localId + groupSize - sBlockOffsets[radix];
+
+    if (globalOffset < numElements)
+    {
+        outKeys[globalOffset] = sKeys1[localId + groupSize];
+    }
+}
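
The block-level kernel above is easiest to check against a sequential reference. The following plain-Java sketch (a hypothetical helper, not part of this commit) performs the same stable one-bit "split" that radixSortBlockKeysOnly applies once per bit of the radix group: keys whose current bit is 0 keep their relative order and are packed in front, and each key's destination is an exclusive scan of the inverted bit predicate, mirroring how rank4 uses numtrue.

    // CPU reference for the per-bit "split" step (illustration only, not in the commit).
    static int[] splitByBit(int[] keys, int shift) {
        int[] out = new int[keys.length];
        int numZeros = 0;                        // plays the role of the kernel's numtrue
        for (int key : keys)
            if (((key >>> shift) & 1) == 0) numZeros++;
        int zeroRank = 0, oneRank = numZeros;    // exclusive-scan results, computed on the fly
        for (int key : keys) {
            if (((key >>> shift) & 1) == 0) out[zeroRank++] = key;
            else                            out[oneRank++]  = key;
        }
        return out;
    }
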
diff --git a/src/com/mbien/opencl/demos/radixsort/RadixSort.java b/src/com/mbien/opencl/demos/radixsort/RadixSort.java
new file mode 100644
index 0000000..783c094
--- /dev/null
+++ b/src/com/mbien/opencl/demos/radixsort/RadixSort.java
@@ -0,0 +1,183 @@
+/*
+ * 20:38 Sunday, February 28 2010
+ */
+
+package com.mbien.opencl.demos.radixsort;
+
+import com.mbien.opencl.CLBuffer;
+import com.mbien.opencl.CLCommandQueue;
+import com.mbien.opencl.CLContext;
+import com.mbien.opencl.CLKernel;
+import com.mbien.opencl.CLProgram;
+import com.mbien.opencl.CLResource;
+import java.io.IOException;
+import java.nio.IntBuffer;
+
+import static com.mbien.opencl.CLMemory.Mem.*;
+import static com.mbien.opencl.CLProgram.*;
+import static com.mbien.opencl.CLProgram.CompilerOptions.*;
+import static java.lang.System.*;
+
+/**
+ *
+ * @author Michael Bien
+ */
+public class RadixSort implements CLResource {
+
+    private static final int NUM_BANKS = 16;
+    private static final int WARP_SIZE = 32;
+    private static final int bitStep = 4;
+
+    private final int CTA_SIZE;
+
+    private final CLKernel ckRadixSortBlocksKeysOnly;
+    private final CLKernel ckFindRadixOffsets;
+    private final CLKernel ckScanNaive;
+    private final CLKernel ckReorderDataKeysOnly;
+
+    private final CLBuffer<?> tempKeys;
+    private final CLBuffer<?> mCounters;
+    private final CLBuffer<?> mCountersSum;
+    private final CLBuffer<?> mBlockOffsets;
+
+    private final CLCommandQueue queue;
+    private final Scan scan;
+    private final CLProgram program;
+
+    public RadixSort(CLCommandQueue queue, int maxElements, int CTA_SIZE) throws IOException {
+
+        this.CTA_SIZE = CTA_SIZE;
+        scan = new Scan(queue, maxElements / 2 / CTA_SIZE * 16);
+
+        int numBlocks = ((maxElements % (CTA_SIZE * 4)) == 0)
+                ? (maxElements / (CTA_SIZE * 4)) : (maxElements / (CTA_SIZE * 4) + 1);
+
+        this.queue = queue;
+
+        CLContext context = queue.getContext();
+        this.tempKeys      = context.createBuffer(4 * maxElements, READ_WRITE);
+        this.mCounters     = context.createBuffer(4 * WARP_SIZE * numBlocks, READ_WRITE);
+        this.mCountersSum  = context.createBuffer(4 * WARP_SIZE * numBlocks, READ_WRITE);
+        this.mBlockOffsets = context.createBuffer(4 * WARP_SIZE * numBlocks, READ_WRITE);
+
+        program = context.createProgram(getClass().getResourceAsStream("RadixSort.cl"))
+                         .build(ENABLE_MAD, define("WARP_SIZE", WARP_SIZE));
+
+//        out.println(program.getBuildLog());
+
+        ckRadixSortBlocksKeysOnly = program.createCLKernel("radixSortBlocksKeysOnly");
+        ckFindRadixOffsets        = program.createCLKernel("findRadixOffsets");
+        ckScanNaive               = program.createCLKernel("scanNaive");
+        ckReorderDataKeysOnly     = program.createCLKernel("reorderDataKeysOnly");
+    }
+
+    void sort(CLBuffer<IntBuffer> d_keys, int numElements, int keyBits) {
+        radixSortKeysOnly(d_keys, numElements, keyBits);
+    }
+
+    //----------------------------------------------------------------------------
+    // Main key-only radix sort function. Sorts in place in the keys and values
+    // arrays, but uses the other device arrays as temporary storage. All pointer
+    // parameters are device pointers. Uses cudppScan() for the prefix sum of
+    // radix counters.
+    //----------------------------------------------------------------------------
+    void radixSortKeysOnly(CLBuffer<IntBuffer> keys, int numElements, int keyBits) {
+        int i = 0;
+        while (keyBits > i * bitStep) {
+            radixSortStepKeysOnly(keys, bitStep, i * bitStep, numElements);
+            i++;
+        }
+    }
+
+    //----------------------------------------------------------------------------
+    // Perform one step of the radix sort. Sorts by nbits key bits per step,
+    // starting at startbit.
+    //----------------------------------------------------------------------------
+    void radixSortStepKeysOnly(CLBuffer<IntBuffer> keys, int nbits, int startbit, int numElements) {
+
+        // Four-step algorithm from Satish, Harris & Garland
+        radixSortBlocksKeysOnlyOCL(keys, nbits, startbit, numElements);
+
+        findRadixOffsetsOCL(startbit, numElements);
+
+        scan.scanExclusiveLarge(mCountersSum, mCounters, 1, numElements / 2 / CTA_SIZE * 16);
+
+        reorderDataKeysOnlyOCL(keys, startbit, numElements);
+    }
+
+    //----------------------------------------------------------------------------
+    // Wrappers for the kernels of the four steps
+    //----------------------------------------------------------------------------
+    void radixSortBlocksKeysOnlyOCL(CLBuffer<IntBuffer> keys, int nbits, int startbit, int numElements) {
+
+        int totalBlocks = numElements / 4 / CTA_SIZE;
+        int globalWorkSize = CTA_SIZE * totalBlocks;
+        int localWorkSize = CTA_SIZE;
+
+        ckRadixSortBlocksKeysOnly.putArg(keys).putArg(tempKeys).putArg(nbits).putArg(startbit)
+                                 .putArg(numElements).putArg(totalBlocks).putNullArg(4 * CTA_SIZE * 4)
+                                 .rewind();
+
+        queue.put1DRangeKernel(ckRadixSortBlocksKeysOnly, 0, globalWorkSize, localWorkSize);
+    }
+
+    void findRadixOffsetsOCL(int startbit, int numElements) {
+
+        int totalBlocks = numElements / 2 / CTA_SIZE;
+        int globalWorkSize = CTA_SIZE * totalBlocks;
+        int localWorkSize = CTA_SIZE;
+
+        ckFindRadixOffsets.putArg(tempKeys).putArg(mCounters).putArg(mBlockOffsets)
+                          .putArg(startbit).putArg(numElements).putArg(totalBlocks).putNullArg(2 * CTA_SIZE * 4)
+                          .rewind();
+
+        queue.put1DRangeKernel(ckFindRadixOffsets, 0, globalWorkSize, localWorkSize);
+    }
+
+    void scanNaiveOCL(int numElements) {
+
+        int nHist = numElements / 2 / CTA_SIZE * 16;
+        int globalWorkSize = nHist;
+        int localWorkSize = nHist;
+        int extra_space = nHist / NUM_BANKS;
+        int shared_mem_size = 4 * (nHist + extra_space);
+
+        ckScanNaive.putArg(mCountersSum).putArg(mCounters).putArg(nHist).putNullArg(2 * shared_mem_size).rewind();
+
+        queue.put1DRangeKernel(ckScanNaive, 0, globalWorkSize, localWorkSize);
+    }
+
+    void reorderDataKeysOnlyOCL(CLBuffer<IntBuffer> keys, int startbit, int numElements) {
+
+        int totalBlocks = numElements / 2 / CTA_SIZE;
+        int globalWorkSize = CTA_SIZE * totalBlocks;
+        int localWorkSize = CTA_SIZE;
+
+        ckReorderDataKeysOnly.putArg(keys).putArg(tempKeys).putArg(mBlockOffsets).putArg(mCountersSum).putArg(mCounters)
+                             .putArg(startbit).putArg(numElements).putArg(totalBlocks).putNullArg(2 * CTA_SIZE * 4).rewind();
+
+        queue.put1DRangeKernel(ckReorderDataKeysOnly, 0, globalWorkSize, localWorkSize);
+    }
+
+    public void release() {
+
+        scan.release();
+
+        // program & kernels
+        program.release();
+
+        // buffers
+        tempKeys.release();
+        mCounters.release();
+        mCountersSum.release();
+        mBlockOffsets.release();
+    }
+
+    public void close() {
+        release();
+    }
+}
diff --git a/src/com/mbien/opencl/demos/radixsort/RadixSortDemo.java b/src/com/mbien/opencl/demos/radixsort/RadixSortDemo.java
new file mode 100644
index 0000000..a46eb22
--- /dev/null
+++ b/src/com/mbien/opencl/demos/radixsort/RadixSortDemo.java
@@ -0,0 +1,122 @@
+/*
+ * 20:48 Sunday, February 28 2010
+ */
+
+package com.mbien.opencl.demos.radixsort;
+
+import com.mbien.opencl.CLBuffer;
+import com.mbien.opencl.CLCommandQueue;
+import com.mbien.opencl.CLContext;
+import com.mbien.opencl.CLPlatform;
+import java.io.IOException;
+import java.nio.IntBuffer;
+import java.util.Random;
+
+import static com.mbien.opencl.CLMemory.Mem.*;
+import static java.lang.System.*;
+import static com.mbien.opencl.CLDevice.Type.*;
+
+/**
+ * GPU radix sort demo.
+ * @author Michael Bien
+ */
+public class RadixSortDemo {
+
+    public RadixSortDemo() throws IOException {
+
+        CLContext context = null;
+        try {
+            // single GPU setup
+            context = CLContext.create(CLPlatform.getDefault().getMaxFlopsDevice(GPU));
+            CLCommandQueue queue = context.getDevices()[0].createCommandQueue();
+
+            int maxValue = 10000000;
+
+            int[] workgroupSizes = new int[] {128, 256};
+
+            int[] runs = new int[] {  131072,
+                                      262144,
+                                      524288,
+                                     1048576,
+                                     2097152,
+                                     4194304,
+                                     8388608 };
+
+            for (int i = 0; i < workgroupSizes.length; i++) {
+
+                int workgroupSize = workgroupSizes[i];
+
+                out.println("\n = = = workgroup size: " + workgroupSize + " = = = ");
+
+                for(int run = 0; run < runs.length; run++) {
+
+                    if(workgroupSize == 128 && run == runs.length - 1) {
+                        break; // we can only sort up to 4M elements with a workgroup size of 128
+                    }
+
+                    int numElements = runs[run];
+
+                    CLBuffer<IntBuffer> array = context.createIntBuffer(numElements, READ_WRITE);
+                    out.print("array size: " + array.getCLSize() / 1000000.0f + "MB; ");
+                    out.println("elements: " + array.getCapacity() / 1000 + "K");
+
+                    fillBuffer(array, maxValue);
+
+                    RadixSort radixSort = new RadixSort(queue, numElements, workgroupSize);
+                    queue.finish();
+
+                    long time = currentTimeMillis();
+
+                    queue.putWriteBuffer(array, false);
+                    radixSort.sort(array, numElements, 32);
+                    queue.putReadBuffer(array, true);
+
+                    out.println("time: " + (currentTimeMillis() - time) + "ms");
+
+                    out.print("snapshot: ");
+                    printSnapshot(array.getBuffer(), 20);
+
+                    out.println("validating...");
+                    checkIfSorted(array.getBuffer());
+                    out.println("values sorted");
+
+                    array.release();
+                    radixSort.release();
+                }
+            }
+
+        } finally {
+            if(context != null) {
+                context.release();
+            }
+        }
+    }
+
+    private void fillBuffer(CLBuffer<IntBuffer> array, int maxValue) {
+        Random random = new Random(42);
+        for (int n = 0; n < array.getBuffer().capacity(); n++) {
+            int rnd = random.nextInt(maxValue);
+            array.getBuffer().put(n, rnd);
+        }
+    }
+
+    private void printSnapshot(IntBuffer buffer, int snapshot) {
+        for(int i = 0; i < snapshot; i++)
+            out.print(buffer.get() + ", ");
+        out.println("...; " + buffer.remaining() + " more");
+        buffer.rewind();
+    }
+
+    private void checkIfSorted(IntBuffer keys) {
+        for (int i = 1; i < keys.capacity(); i++) {
+            if (keys.get(i - 1) > keys.get(i)) {
+                throw new RuntimeException("not sorted: " + keys.get(i - 1) + " > " + keys.get(i));
+            }
+        }
+    }
+
+    public static void main(String[] args) throws IOException {
+        new RadixSortDemo();
+    }
+}
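
Note that checkIfSorted only verifies ordering. A stricter check (hypothetical; checkAgainstReference is not part of the demo) compares against java.util.Arrays.sort on a copy of the input, which also catches lost or duplicated keys:

    // Compare GPU output with a host-side reference sort (illustration only).
    static void checkAgainstReference(int[] input, java.nio.IntBuffer sorted) {
        int[] expected = input.clone();
        java.util.Arrays.sort(expected);
        for (int i = 0; i < expected.length; i++) {
            if (sorted.get(i) != expected[i]) {
                throw new RuntimeException("mismatch at index " + i);
            }
        }
    }
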
diff --git a/src/com/mbien/opencl/demos/radixsort/Scan.java b/src/com/mbien/opencl/demos/radixsort/Scan.java
new file mode 100644
index 0000000..1426e9b
--- /dev/null
+++ b/src/com/mbien/opencl/demos/radixsort/Scan.java
@@ -0,0 +1,131 @@
+/*
+ * 22:12 Sunday, February 28 2010
+ */
+package com.mbien.opencl.demos.radixsort;
+
+import com.mbien.opencl.CLBuffer;
+import com.mbien.opencl.CLCommandQueue;
+import com.mbien.opencl.CLContext;
+import com.mbien.opencl.CLKernel;
+import com.mbien.opencl.CLProgram;
+import com.mbien.opencl.CLResource;
+import java.io.IOException;
+
+import static com.mbien.opencl.CLMemory.Mem.*;
+import static com.mbien.opencl.CLProgram.CompilerOptions.*;
+
+/**
+ *
+ * @author Michael Bien
+ */
+public class Scan implements CLResource {
+
+    private final static int MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE = 1024;
+    private final static int MAX_LOCAL_GROUP_SIZE = 256;
+    private final static int WORKGROUP_SIZE = 256;
+    private final static int MAX_BATCH_ELEMENTS = 64 * 1048576;
+    private final static int MIN_SHORT_ARRAY_SIZE = 4;
+    private final static int MAX_SHORT_ARRAY_SIZE = 4 * WORKGROUP_SIZE;
+    private final static int MIN_LARGE_ARRAY_SIZE = 8 * WORKGROUP_SIZE;
+    private final static int MAX_LARGE_ARRAY_SIZE = 4 * WORKGROUP_SIZE * WORKGROUP_SIZE;
+
+    private final CLKernel ckScanExclusiveLocal1;
+    private final CLKernel ckScanExclusiveLocal2;
+    private final CLKernel ckUniformUpdate;
+
+    private final CLCommandQueue queue;
+    private final CLProgram program;
+    private CLBuffer<?> buffer;
+
+    public Scan(CLCommandQueue queue, int numElements) throws IOException {
+
+        this.queue = queue;
+
+        CLContext context = queue.getContext();
+        if (numElements > MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE) {
+            buffer = context.createBuffer(numElements / MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE * 4, READ_WRITE);
+        }
+        program = context.createProgram(getClass().getResourceAsStream("Scan_b.cl"))
+                         .build(ENABLE_MAD);
+
+        ckScanExclusiveLocal1 = program.createCLKernel("scanExclusiveLocal1");
+        ckScanExclusiveLocal2 = program.createCLKernel("scanExclusiveLocal2");
+        ckUniformUpdate       = program.createCLKernel("uniformUpdate");
+    }
+
+    // main exclusive scan routine
+    void scanExclusiveLarge(CLBuffer<?> dst, CLBuffer<?> src, int batchSize, int arrayLength) {
+
+        // Check power-of-two factorization
+        if(!isPowerOf2(arrayLength)) {
+            throw new RuntimeException("array length must be a power of two");
+        }
+
+        // Check supported size range
+        if (!((arrayLength >= MIN_LARGE_ARRAY_SIZE) && (arrayLength <= MAX_LARGE_ARRAY_SIZE))) {
+            throw new RuntimeException("array length out of supported range");
+        }
+
+        // Check total batch size limit
+        if (!((batchSize * arrayLength) <= MAX_BATCH_ELEMENTS)) {
+            throw new RuntimeException("batch size exceeds limit");
+        }
+
+        scanExclusiveLocal1(dst, src, (batchSize * arrayLength) / (4 * WORKGROUP_SIZE), 4 * WORKGROUP_SIZE);
+        scanExclusiveLocal2(buffer, dst, src, batchSize, arrayLength / (4 * WORKGROUP_SIZE));
+        uniformUpdate(dst, buffer, (batchSize * arrayLength) / (4 * WORKGROUP_SIZE));
+    }
+
+    void scanExclusiveLocal1(CLBuffer<?> dst, CLBuffer<?> src, int n, int size) {
+
+        ckScanExclusiveLocal1.putArg(dst).putArg(src).putNullArg(2 * WORKGROUP_SIZE * 4).putArg(size)
+                             .rewind();
+
+        int localWorkSize = WORKGROUP_SIZE;
+        int globalWorkSize = (n * size) / 4;
+
+        queue.put1DRangeKernel(ckScanExclusiveLocal1, 0, globalWorkSize, localWorkSize);
+    }
+
+    void scanExclusiveLocal2(CLBuffer<?> buffer, CLBuffer<?> dst, CLBuffer<?> src, int n, int size) {
+
+        int elements = n * size;
+        ckScanExclusiveLocal2.putArg(buffer).putArg(dst).putArg(src).putNullArg(2 * WORKGROUP_SIZE * 4)
+                             .putArg(elements).putArg(size).rewind();
+
+        int localWorkSize = WORKGROUP_SIZE;
+        int globalWorkSize = iSnapUp(elements, WORKGROUP_SIZE);
+
+        queue.put1DRangeKernel(ckScanExclusiveLocal2, 0, globalWorkSize, localWorkSize);
+    }
+
+    void uniformUpdate(CLBuffer<?> dst, CLBuffer<?> buffer, int n) {
+
+        ckUniformUpdate.setArgs(dst, buffer);
+
+        int localWorkSize = WORKGROUP_SIZE;
+        int globalWorkSize = n * WORKGROUP_SIZE;
+
+        queue.put1DRangeKernel(ckUniformUpdate, 0, globalWorkSize, localWorkSize);
+    }
+
+    private int iSnapUp(int dividend, int divisor) {
+        return ((dividend % divisor) == 0) ? dividend : (dividend - dividend % divisor + divisor);
+    }
+
+    public static boolean isPowerOf2(int x) {
+        return ((x - 1) & x) == 0;
+    }
+
+    public void release() {
+        program.release();
+
+        if(buffer != null) {
+            buffer.release();
+        }
+    }
+
+    public void close() {
+        release();
+    }
+}
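
The three kernel launches in scanExclusiveLarge implement a classic hierarchical scan. A sequential model (hypothetical, for illustration): exclusive-scan each chunk of 4 * WORKGROUP_SIZE elements independently, exclusive-scan the per-chunk totals, then add each chunk's scanned total back into its elements.

    // CPU model of scanExclusiveLocal1 / scanExclusiveLocal2 / uniformUpdate.
    static void scanExclusiveLargeCPU(int[] data, int chunk) {
        int numChunks = data.length / chunk;   // assumes data.length is a multiple of chunk
        int[] chunkSums = new int[numChunks];
        for (int c = 0; c < numChunks; c++) {  // phase 1: local exclusive scans
            int sum = 0;
            for (int i = 0; i < chunk; i++) {
                int v = data[c * chunk + i];
                data[c * chunk + i] = sum;
                sum += v;
            }
            chunkSums[c] = sum;
        }
        int running = 0;                       // phase 2: exclusive scan of the chunk totals
        for (int c = 0; c < numChunks; c++) {
            int s = chunkSums[c];
            chunkSums[c] = running;
            running += s;
        }
        for (int c = 0; c < numChunks; c++)    // phase 3: uniform update
            for (int i = 0; i < chunk; i++)
                data[c * chunk + i] += chunkSums[c];
    }
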
diff --git a/src/com/mbien/opencl/demos/radixsort/Scan_b.cl b/src/com/mbien/opencl/demos/radixsort/Scan_b.cl
new file mode 100644
index 0000000..32fd4dd
--- /dev/null
+++ b/src/com/mbien/opencl/demos/radixsort/Scan_b.cl
@@ -0,0 +1,190 @@
+/*
+ * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and
+ * proprietary rights in and to this software and related documentation.
+ * Any use, reproduction, disclosure, or distribution of this software
+ * and related documentation without an express license agreement from
+ * NVIDIA Corporation is strictly prohibited.
+ *
+ * Please refer to the applicable NVIDIA end user license agreement (EULA)
+ * associated with this source code for terms and conditions that govern
+ * your use of this NVIDIA software.
+ *
+ */
+
+
+
+//All three kernels run WORKGROUP_SIZE threads per workgroup
+//Must be a power of two
+#define WORKGROUP_SIZE 256
+
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Scan codelets
+////////////////////////////////////////////////////////////////////////////////
+#if(1)
+    //Naive inclusive scan: O(N * log2(N)) operations
+    //Allocate 2 * 'size' local memory, initialize the first half
+    //with 'size' zeros avoiding if(pos >= offset) condition evaluation
+    //and saving instructions
+    inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){
+        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1));
+        l_Data[pos] = 0;
+        pos += size;
+        l_Data[pos] = idata;
+
+        for(uint offset = 1; offset < size; offset <<= 1){
+            barrier(CLK_LOCAL_MEM_FENCE);
+            uint t = l_Data[pos] + l_Data[pos - offset];
+            barrier(CLK_LOCAL_MEM_FENCE);
+            l_Data[pos] = t;
+        }
+
+        return l_Data[pos];
+    }
+
+    inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){
+        return scan1Inclusive(idata, l_Data, size) - idata;
+    }
+
+#else
+    #define LOG2_WARP_SIZE 5U
+    #define WARP_SIZE (1U << LOG2_WARP_SIZE)
+
+    //Almost the same as the naive scan1Inclusive above, but doesn't need barriers,
+    //assuming size <= WARP_SIZE
+    inline uint warpScanInclusive(uint idata, __local uint *l_Data, uint size){
+        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1));
+        l_Data[pos] = 0;
+        pos += size;
+        l_Data[pos] = idata;
+
+        for(uint offset = 1; offset < size; offset <<= 1)
+            l_Data[pos] += l_Data[pos - offset];
+
+        return l_Data[pos];
+    }
+
+    inline uint warpScanExclusive(uint idata, __local uint *l_Data, uint size){
+        return warpScanInclusive(idata, l_Data, size) - idata;
+    }
+
+    inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){
+        if(size > WARP_SIZE){
+            //Bottom-level inclusive warp scan
+            uint warpResult = warpScanInclusive(idata, l_Data, WARP_SIZE);
+
+            //Save top elements of each warp for exclusive warp scan;
+            //sync to wait for warp scans to complete (because l_Data is being overwritten)
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if( (get_local_id(0) & (WARP_SIZE - 1)) == (WARP_SIZE - 1) )
+                l_Data[get_local_id(0) >> LOG2_WARP_SIZE] = warpResult;
+
+            //wait for warp scans to complete
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if( get_local_id(0) < (WORKGROUP_SIZE / WARP_SIZE) ){
+                //grab top warp elements
+                uint val = l_Data[get_local_id(0)];
+                //calculate exclusive scan and write back to shared memory
+                l_Data[get_local_id(0)] = warpScanExclusive(val, l_Data, size >> LOG2_WARP_SIZE);
+            }
+
+            //return updated warp scans with exclusive scan results
+            barrier(CLK_LOCAL_MEM_FENCE);
+            return warpResult + l_Data[get_local_id(0) >> LOG2_WARP_SIZE];
+        }else{
+            return warpScanInclusive(idata, l_Data, size);
+        }
+    }
+
+    inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){
+        return scan1Inclusive(idata, l_Data, size) - idata;
+    }
+#endif
+
+
+//Vector scan: the array to be scanned is stored
+//in work-item private memory as uint4
+inline uint4 scan4Inclusive(uint4 data4, __local uint *l_Data, uint size){
+    //Level-0 inclusive scan
+    data4.y += data4.x;
+    data4.z += data4.y;
+    data4.w += data4.z;
+
+    //Level-1 exclusive scan
+    uint val = scan1Inclusive(data4.w, l_Data, size / 4) - data4.w;
+
+    return (data4 + (uint4)val);
+}
+
+inline uint4 scan4Exclusive(uint4 data4, __local uint *l_Data, uint size){
+    return scan4Inclusive(data4, l_Data, size) - data4;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Scan kernels
+////////////////////////////////////////////////////////////////////////////////
+__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
+void scanExclusiveLocal1(
+    __global uint4 *d_Dst,
+    __global uint4 *d_Src,
+    __local uint* l_Data,
+    uint size
+){
+    //Load data
+    uint4 idata4 = d_Src[get_global_id(0)];
+
+    //Calculate exclusive scan
+    uint4 odata4 = scan4Exclusive(idata4, l_Data, size);
+
+    //Write back
+    d_Dst[get_global_id(0)] = odata4;
+}
+
+//Exclusive scan of top elements of bottom-level scans (4 * THREADBLOCK_SIZE)
+__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
+void scanExclusiveLocal2(
+    __global uint *d_Buf,
+    __global uint *d_Dst,
+    __global uint *d_Src,
+    __local uint* l_Data,
+    uint N,
+    uint arrayLength
+){
+    //Load top elements.
+    //Convert results of bottom-level scan back to inclusive.
+    //Skip loads and stores for inactive work-items of the work-group with highest index (pos >= N).
+    uint data = 0;
+    if(get_global_id(0) < N)
+        data =
+            d_Dst[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)] +
+            d_Src[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)];
+
+    //Compute
+    uint odata = scan1Exclusive(data, l_Data, arrayLength);
+
+    //Avoid out-of-bound access
+    if(get_global_id(0) < N)
+        d_Buf[get_global_id(0)] = odata;
+}
+
+//Final step of large-array scan: combine basic inclusive scan with exclusive scan of top elements of input arrays
+__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
+void uniformUpdate(
+    __global uint4 *d_Data,
+    __global uint *d_Buf
+){
+    __local uint buf[1];
+
+    uint4 data4 = d_Data[get_global_id(0)];
+
+    if(get_local_id(0) == 0)
+        buf[0] = d_Buf[get_group_id(0)];
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    data4 += (uint4)buf[0];
+    d_Data[get_global_id(0)] = data4;
+}
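
The padding trick in scan1Inclusive (zero-fill the first half of a doubled buffer so that l_Data[pos - offset] is always a valid read) can be modeled sequentially as follows (hypothetical Java, for illustration only). The cloned array stands in for the pair of barriers that separate reads from writes in the kernel:

    // Sequential model of the naive Hillis-Steele inclusive scan with zero padding.
    static int[] scan1InclusiveCPU(int[] idata) {
        int size = idata.length;               // must be a power of two
        int[] l = new int[2 * size];           // first half stays zero: no bounds checks needed
        System.arraycopy(idata, 0, l, size, size);
        for (int offset = 1; offset < size; offset <<= 1) {
            int[] next = l.clone();            // models the barrier-separated update
            for (int pos = size; pos < 2 * size; pos++)
                next[pos] = l[pos] + l[pos - offset];
            l = next;
        }
        return java.util.Arrays.copyOfRange(l, size, 2 * size);
    }
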