initial import of OpenCL RadixSort GPU example.

author: Michael Bien <[email protected]> 2010-03-01 02:17:29 +0100
committer: Michael Bien <[email protected]> 2010-03-01 02:17:29 +0100
commit: 48a46a6084878803acadd44b0e96e237cf5768c1 (patch)
tree: d7f25e2500bcad62ae9e11963d8eeb3c3194ef05 /src
parent: 238d41376cde8d1f0426d50e9b2fdebbe31cee74 (diff)
5 files changed, 984 insertions, 0 deletions
diff --git a/src/com/mbien/opencl/demos/radixsort/RadixSort.cl b/src/com/mbien/opencl/demos/radixsort/RadixSort.cl
new file mode 100644
index 0000000..d014692
--- /dev/null
+++ b/src/com/mbien/opencl/demos/radixsort/RadixSort.cl
@@ -0,0 +1,358 @@
+/*
+* Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+*
+* NVIDIA Corporation and its licensors retain all intellectual property and 
+* proprietary rights in and to this software and related documentation.
+* Any use, reproduction, disclosure, or distribution of this software
+* and related documentation without an express license agreement from 
+* NVIDIA Corporation is strictly prohibited.
+* 
+* Please refer to the applicable NVIDIA end user license agreement (EULA) 
+* associated with this source code for terms and conditions that govern 
+* your use of this NVIDIA software.
+* 
+*/
+
+//----------------------------------------------------------------------------
+// Scans each warp in parallel ("warp-scan"), one element per thread.
+// uses 2 numElements of shared memory per thread (64 = elements per warp)
+//----------------------------------------------------------------------------
+//#define WARP_SIZE 32
+uint scanwarp(uint val, __local uint* sData, int maxlevel)
+{
+    // The following is the same as 2 * RadixSort::WARP_SIZE * warpId + threadInWarp = 
+    // 64*(threadIdx.x >> 5) + (threadIdx.x & (RadixSort::WARP_SIZE - 1))
+    int localId = get_local_id(0);
+    int idx = 2 * localId - (localId & (WARP_SIZE - 1));
+    sData[idx] = 0;
+    idx += WARP_SIZE;
+    sData[idx] = val;     
+
+    if (0 <= maxlevel) { sData[idx] += sData[idx - 1]; }
+    if (1 <= maxlevel) { sData[idx] += sData[idx - 2]; }
+    if (2 <= maxlevel) { sData[idx] += sData[idx - 4]; }
+    if (3 <= maxlevel) { sData[idx] += sData[idx - 8]; }
+    if (4 <= maxlevel) { sData[idx] += sData[idx -16]; }
+
+    return sData[idx] - val;  // convert inclusive -> exclusive
+}
+
+//----------------------------------------------------------------------------
+// scan4 scans 4*RadixSort::CTA_SIZE numElements in a block (4 per thread), using 
+// a warp-scan algorithm
+//----------------------------------------------------------------------------
+uint4 scan4(uint4 idata, __local uint* ptr)
+{    
+    
+    uint idx = get_local_id(0);
+
+    uint4 val4 = idata;
+    uint sum[3];
+    sum[0] = val4.x;
+    sum[1] = val4.y + sum[0];
+    sum[2] = val4.z + sum[1];
+    
+    uint val = val4.w + sum[2];
+    
+    val = scanwarp(val, ptr, 4);
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if ((idx & (WARP_SIZE - 1)) == WARP_SIZE - 1)
+    {
+        ptr[idx >> 5] = val + val4.w + sum[2];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+	if (idx < WARP_SIZE)
+		ptr[idx] = scanwarp(ptr[idx], ptr, 2);
+    
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    val += ptr[idx >> 5];
+
+    val4.x = val;
+    val4.y = val + sum[0];
+    val4.z = val + sum[1];
+    val4.w = val + sum[2];
+
+    return val4;
+}
+
+#ifdef MAC
+__kernel uint4 rank4(uint4 preds, __local uint* sMem)
+#else
+uint4 rank4(uint4 preds, __local uint* sMem)
+#endif
+{
+	int localId = get_local_id(0);
+	int localSize = get_local_size(0);
+
+	uint4 address = scan4(preds, sMem);
+	
+	__local uint numtrue;
+	if (localId == localSize - 1) 
+	{
+		numtrue = address.w + preds.w;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	uint4 rank;
+	int idx = localId*4;
+	rank.x = (preds.x) ? address.x : numtrue + idx - address.x;
+	rank.y = (preds.y) ? address.y : numtrue + idx + 1 - address.y;
+	rank.z = (preds.z) ? address.z : numtrue + idx + 2 - address.z;
+	rank.w = (preds.w) ? address.w : numtrue + idx + 3 - address.w;
+	
+	return rank;
+}
+
+void radixSortBlockKeysOnly(uint4 *key, uint nbits, uint startbit, __local uint* sMem)
+{
+	int localId = get_local_id(0);
+    int localSize = get_local_size(0);
+	
+	for(uint shift = startbit; shift < (startbit + nbits); ++shift)
+	{
+		uint4 lsb;
+		lsb.x = !(((*key).x >> shift) & 0x1);
+		lsb.y = !(((*key).y >> shift) & 0x1);
+        lsb.z = !(((*key).z >> shift) & 0x1);
+        lsb.w = !(((*key).w >> shift) & 0x1);
+        
+		uint4 r;
+		
+		r = rank4(lsb, sMem);
+
+        // This arithmetic strides the ranks across 4 CTA_SIZE regions
+        sMem[(r.x & 3) * localSize + (r.x >> 2)] = (*key).x;
+        sMem[(r.y & 3) * localSize + (r.y >> 2)] = (*key).y;
+        sMem[(r.z & 3) * localSize + (r.z >> 2)] = (*key).z;
+        sMem[(r.w & 3) * localSize + (r.w >> 2)] = (*key).w;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // The above allows us to read without 4-way bank conflicts:
+        (*key).x = sMem[localId];
+        (*key).y = sMem[localId +     localSize];
+        (*key).z = sMem[localId + 2 * localSize];
+        (*key).w = sMem[localId + 3 * localSize];
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+	}
+}
+
+__kernel void radixSortBlocksKeysOnly(__global uint4* keysIn, 
+                                      __global uint4* keysOut,
+                                      uint nbits,
+                                      uint startbit,
+                                      uint numElements,
+                                      uint totalBlocks,
+                                      __local uint* sMem)
+{
+	int globalId = get_global_id(0);
+	
+	uint4 key;
+	key = keysIn[globalId];
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	radixSortBlockKeysOnly(&key, nbits, startbit, sMem);
+	
+	keysOut[globalId] = key;
+}
+
+//----------------------------------------------------------------------------
+// Given an array with blocks sorted according to a 4-bit radix group, each 
+// block counts the number of keys that fall into each radix in the group, and 
+// finds the starting offset of each radix in the block.  It then writes the radix 
+// counts to the counters array, and the starting offsets to the blockOffsets array.
+//
+// Template parameters are used to generate efficient code for various special cases
+// For example, we have to handle arrays that are a multiple of the block size 
+// (fullBlocks) differently than arrays that are not. "loop" is used when persistent 
+// CTAs are used. 
+//
+// By persistent CTAs we mean that we launch only as many thread blocks as can 
+// be resident in the GPU and no more, rather than launching as many threads as
+// we have elements. Persistent CTAs loop over blocks of elements until all work
+// is complete.  This can be faster in some cases.  In our tests it is faster
+// for large sorts (and the threshold is higher on compute version 1.1 and earlier
+// GPUs than it is on compute version 1.2 GPUs.
+//                                
+//----------------------------------------------------------------------------
+__kernel void findRadixOffsets(__global uint2* keys,
+                               __global uint* counters,
+                               __global uint* blockOffsets,
+                               uint startbit,
+                               uint numElements,
+                               uint totalBlocks,
+                               __local uint* sRadix1)
+{
+	__local uint  sStartPointers[16];
+
+    uint groupId = get_group_id(0);
+    uint localId = get_local_id(0);
+    uint groupSize = get_local_size(0);
+
+    uint2 radix2;
+
+    radix2 = keys[get_global_id(0)];
+        
+
+    sRadix1[2 * localId]     = (radix2.x >> startbit) & 0xF;
+    sRadix1[2 * localId + 1] = (radix2.y >> startbit) & 0xF;
+
+    // Finds the position where the sRadix1 entries differ and stores start 
+    // index for each radix.
+    if(localId < 16) 
+    {
+        sStartPointers[localId] = 0; 
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) ) 
+    {
+        sStartPointers[sRadix1[localId]] = localId;
+    }
+    if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1]) 
+    {
+        sStartPointers[sRadix1[localId + groupSize]] = localId + groupSize;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(localId < 16) 
+    {
+        blockOffsets[groupId*16 + localId] = sStartPointers[localId];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Compute the sizes of each block.
+    if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) ) 
+    {
+        sStartPointers[sRadix1[localId - 1]] = 
+            localId - sStartPointers[sRadix1[localId - 1]];
+    }
+    if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1] ) 
+    {
+        sStartPointers[sRadix1[localId + groupSize - 1]] = 
+            localId + groupSize - sStartPointers[sRadix1[localId + groupSize - 1]];
+    }
+        
+
+    if(localId == groupSize - 1) 
+    {
+        sStartPointers[sRadix1[2 * groupSize - 1]] = 
+            2 * groupSize - sStartPointers[sRadix1[2 * groupSize - 1]];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(localId < 16) 
+    {
+        counters[localId * totalBlocks + groupId] = sStartPointers[localId];
+    }
+}
+
+// a naive scan routine that works only for array that
+// can fit into a single block, just for debugging purpose,
+// not used in the sort now
+__kernel void scanNaive(__global uint *g_odata, 
+                        __global uint *g_idata, 
+                        uint n,
+                        __local uint* temp)
+{
+
+    int localId = get_local_id(0);
+
+    int pout = 0;
+    int pin = 1;
+
+    // Cache the computational window in shared memory
+    temp[pout*n + localId] = (localId > 0) ? g_idata[localId-1] : 0;
+
+    for (int offset = 1; offset < n; offset *= 2)
+    {
+        pout = 1 - pout;
+        pin  = 1 - pout;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        temp[pout*n+localId] = temp[pin*n+localId];
+
+        if (localId >= offset)
+            temp[pout*n+localId] += temp[pin*n+localId - offset];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    g_odata[localId] = temp[pout*n+localId];
+}
+
+//----------------------------------------------------------------------------
+// reorderData shuffles data in the array globally after the radix offsets 
+// have been found. On compute version 1.1 and earlier GPUs, this code depends 
+// on RadixSort::CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits).
+// 
+// On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures
+// that all writes are coalesced using extra work in the kernel.  On later
+// GPUs coalescing rules have been relaxed, so this extra overhead hurts 
+// performance.  On these GPUs we set manualCoalesce=false and directly store
+// the results.
+//
+// Template parameters are used to generate efficient code for various special cases
+// For example, we have to handle arrays that are a multiple of the block size 
+// (fullBlocks) differently than arrays that are not.  "loop" is used when persistent 
+// CTAs are used. 
+//
+// By persistent CTAs we mean that we launch only as many thread blocks as can 
+// be resident in the GPU and no more, rather than launching as many threads as
+// we have elements. Persistent CTAs loop over blocks of elements until all work
+// is complete.  This can be faster in some cases.  In our tests it is faster
+// for large sorts (and the threshold is higher on compute version 1.1 and earlier
+// GPUs than it is on compute version 1.2 GPUs.
+//----------------------------------------------------------------------------
+__kernel void reorderDataKeysOnly(__global uint  *outKeys, 
+                                  __global uint2  *keys, 
+                                  __global uint  *blockOffsets, 
+                                  __global uint  *offsets, 
+                                  __global uint  *sizes, 
+                                  uint startbit,
+                                  uint numElements,
+                                  uint totalBlocks,
+                                  __local uint2* sKeys2)
+{
+    __local uint sOffsets[16];
+    __local uint sBlockOffsets[16];
+
+    __local uint *sKeys1 = (__local uint*)sKeys2; 
+
+    uint groupId = get_group_id(0);
+
+	uint globalId = get_global_id(0);
+    uint localId = get_local_id(0);
+    uint groupSize = get_local_size(0);
+
+    sKeys2[localId]   = keys[globalId];
+
+    if(localId < 16)  
+    {
+        sOffsets[localId]      = offsets[localId * totalBlocks + groupId];
+        sBlockOffsets[localId] = blockOffsets[groupId * 16 + localId];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    uint radix = (sKeys1[localId] >> startbit) & 0xF;
+    uint globalOffset = sOffsets[radix] + localId - sBlockOffsets[radix];
+
+    if (globalOffset < numElements)
+    {
+        outKeys[globalOffset]   = sKeys1[localId];
+    }
+
+    radix = (sKeys1[localId + groupSize] >> startbit) & 0xF;
+    globalOffset = sOffsets[radix] + localId + groupSize - sBlockOffsets[radix];
+
+    if (globalOffset < numElements)
+    {
+        outKeys[globalOffset]   = sKeys1[localId + groupSize];
+    }
+ 
+
+}
diff --git a/src/com/mbien/opencl/demos/radixsort/RadixSort.java b/src/com/mbien/opencl/demos/radixsort/RadixSort.java
new file mode 100644
index 0000000..783c094
--- /dev/null
+++ b/src/com/mbien/opencl/demos/radixsort/RadixSort.java
@@ -0,0 +1,183 @@
+/*
+ * 20:38 Sunday, February 28 2010
+ */
+
+package com.mbien.opencl.demos.radixsort;
+
+import com.mbien.opencl.CLBuffer;
+import com.mbien.opencl.CLCommandQueue;
+import com.mbien.opencl.CLContext;
+import com.mbien.opencl.CLKernel;
+import com.mbien.opencl.CLProgram;
+import com.mbien.opencl.CLResource;
+import java.io.IOException;
+import java.nio.IntBuffer;
+
+import static com.mbien.opencl.CLMemory.Mem.*;
+import static com.mbien.opencl.CLProgram.*;
+import static com.mbien.opencl.CLProgram.CompilerOptions.*;
+import static java.lang.System.*;
+
+/**
+ *
+ * @author Michael Bien
+ */
+public class RadixSort implements CLResource {
+
+    private static final int NUM_BANKS = 16;
+    private static final int WARP_SIZE = 32;
+    private static final int bitStep   = 4;
+
+    private final int CTA_SIZE;
+
+    private final CLKernel ckRadixSortBlocksKeysOnly;
+    private final CLKernel ckFindRadixOffsets;
+    private final CLKernel ckScanNaive;
+    private final CLKernel ckReorderDataKeysOnly;
+
+    private final CLBuffer<?> tempKeys;
+    private final CLBuffer<?> mCounters;
+    private final CLBuffer<?> mCountersSum;
+    private final CLBuffer<?> mBlockOffsets;
+
+    private final CLCommandQueue queue;
+    private final Scan scan;
+    private final CLProgram program;
+
+    public RadixSort(CLCommandQueue queue, int maxElements, int CTA_SIZE) throws IOException {
+
+        this.CTA_SIZE = CTA_SIZE;
+        scan = new Scan(queue, maxElements / 2 / CTA_SIZE * 16);
+
+        int numBlocks = ((maxElements % (CTA_SIZE * 4)) == 0)
+                ? (maxElements / (CTA_SIZE * 4)) : (maxElements / (CTA_SIZE * 4) + 1);
+
+        this.queue = queue;
+
+        CLContext context  = queue.getContext();
+        this.tempKeys      = context.createBuffer(4 * maxElements,           READ_WRITE);
+        this.mCounters     = context.createBuffer(4 * WARP_SIZE * numBlocks, READ_WRITE);
+        this.mCountersSum  = context.createBuffer(4 * WARP_SIZE * numBlocks, READ_WRITE);
+        this.mBlockOffsets = context.createBuffer(4 * WARP_SIZE * numBlocks, READ_WRITE);
+
+        program = context.createProgram(getClass().getResourceAsStream("RadixSort.cl"))
+                         .build(ENABLE_MAD, define("WARP_SIZE", WARP_SIZE));
+
+//        out.println(program.getBuildLog());
+
+        ckRadixSortBlocksKeysOnly  = program.createCLKernel("radixSortBlocksKeysOnly");
+        ckFindRadixOffsets         = program.createCLKernel("findRadixOffsets");
+        ckScanNaive                = program.createCLKernel("scanNaive");
+        ckReorderDataKeysOnly      = program.createCLKernel("reorderDataKeysOnly");
+
+    }
+
+    void sort(CLBuffer<IntBuffer> d_keys, int numElements, int keyBits) {
+        radixSortKeysOnly(d_keys, numElements, keyBits);
+    }
+
+    //----------------------------------------------------------------------------
+    // Main key-only radix sort function.  Sorts in place in the keys and values
+    // arrays, but uses the other device arrays as temporary storage.  All pointer
+    // parameters are device pointers.  Uses cudppScan() for the prefix sum of
+    // radix counters.
+    //----------------------------------------------------------------------------
+    void radixSortKeysOnly(CLBuffer<IntBuffer> keys, int numElements, int keyBits) {
+        int i = 0;
+        while (keyBits > i * bitStep) {
+            radixSortStepKeysOnly(keys, bitStep, i * bitStep, numElements);
+            i++;
+        }
+    }
+
+    //----------------------------------------------------------------------------
+    // Perform one step of the radix sort.  Sorts by nbits key bits per step,
+    // starting at startbit.
+    //----------------------------------------------------------------------------
+    void radixSortStepKeysOnly(CLBuffer<IntBuffer> keys, int nbits, int startbit, int numElements) {
+
+        // Four step algorithms from Satish, Harris & Garland
+        radixSortBlocksKeysOnlyOCL(keys, nbits, startbit, numElements);
+
+        findRadixOffsetsOCL(startbit, numElements);
+
+        scan.scanExclusiveLarge(mCountersSum, mCounters, 1, numElements / 2 / CTA_SIZE * 16);
+
+        reorderDataKeysOnlyOCL(keys, startbit, numElements);
+    }
+
+    //----------------------------------------------------------------------------
+    // Wrapper for the kernels of the four steps
+    //----------------------------------------------------------------------------
+    void radixSortBlocksKeysOnlyOCL(CLBuffer<IntBuffer> keys, int nbits, int startbit, int numElements) {
+
+        int totalBlocks = numElements / 4 / CTA_SIZE;
+        int globalWorkSize = CTA_SIZE * totalBlocks;
+        int localWorkSize = CTA_SIZE;
+
+        ckRadixSortBlocksKeysOnly.putArg(keys).putArg(tempKeys).putArg(nbits).putArg(startbit)
+                                 .putArg(numElements).putArg(totalBlocks).putNullArg(4 * CTA_SIZE * 4)
+                                 .rewind();
+
+        queue.put1DRangeKernel(ckRadixSortBlocksKeysOnly, 0, globalWorkSize, localWorkSize);
+    }
+
+    void findRadixOffsetsOCL(int startbit, int numElements) {
+
+        int totalBlocks = numElements / 2 / CTA_SIZE;
+        int globalWorkSize = CTA_SIZE * totalBlocks;
+        int localWorkSize = CTA_SIZE;
+
+        ckFindRadixOffsets.putArg(tempKeys).putArg(mCounters).putArg(mBlockOffsets)
+                          .putArg(startbit).putArg(numElements).putArg(totalBlocks).putNullArg(2 * CTA_SIZE * 4)
+                          .rewind();
+
+        queue.put1DRangeKernel(ckFindRadixOffsets, 0, globalWorkSize, localWorkSize);
+    }
+
+    void scanNaiveOCL(int numElements) {
+        
+        int nHist = numElements / 2 / CTA_SIZE * 16;
+        int globalWorkSize = nHist;
+        int localWorkSize = nHist;
+        int extra_space = nHist / NUM_BANKS;
+        int shared_mem_size = 4 * (nHist + extra_space);
+
+        ckScanNaive.putArg(mCountersSum).putArg(mCounters).putArg(nHist).putNullArg(2 * shared_mem_size).rewind();
+
+        queue.put1DRangeKernel(ckScanNaive, 0, globalWorkSize, localWorkSize);
+    }
+
+    void reorderDataKeysOnlyOCL(CLBuffer<IntBuffer> keys, int startbit, int numElements) {
+
+        int totalBlocks = numElements / 2 / CTA_SIZE;
+        int globalWorkSize = CTA_SIZE * totalBlocks;
+        int localWorkSize = CTA_SIZE;
+
+        ckReorderDataKeysOnly.putArg(keys).putArg(tempKeys).putArg(mBlockOffsets).putArg(mCountersSum).putArg(mCounters)
+                             .putArg(startbit).putArg(numElements).putArg(totalBlocks).putNullArg(2 * CTA_SIZE * 4).rewind();
+
+        queue.put1DRangeKernel(ckReorderDataKeysOnly, 0, globalWorkSize, localWorkSize);
+    }
+
+    public void release() {
+
+        scan.release();
+
+        //program & kernels
+        program.release();
+
+        //buffers
+        tempKeys.release();
+        mCounters.release();
+        mCountersSum.release();
+        mBlockOffsets.release();
+    }
+
+    public void close() {
+        release();
+    }
+
+
+
+}
diff --git a/src/com/mbien/opencl/demos/radixsort/RadixSortDemo.java b/src/com/mbien/opencl/demos/radixsort/RadixSortDemo.java
new file mode 100644
index 0000000..a46eb22
--- /dev/null
+++ b/src/com/mbien/opencl/demos/radixsort/RadixSortDemo.java
@@ -0,0 +1,122 @@
+/*
+ * 20:48 Sunday, February 28 2010
+ */
+
+package com.mbien.opencl.demos.radixsort;
+
+import com.mbien.opencl.CLBuffer;
+import com.mbien.opencl.CLCommandQueue;
+import com.mbien.opencl.CLContext;
+import com.mbien.opencl.CLPlatform;
+import java.io.IOException;
+import java.nio.IntBuffer;
+import java.util.Random;
+
+import static com.mbien.opencl.CLMemory.Mem.*;
+import static java.lang.System.*;
+import static com.mbien.opencl.CLDevice.Type.*;
+
+/**
+ * GPU radix sort demo.
+ * @author Michael Bien
+ */
+public class RadixSortDemo {
+
+    public RadixSortDemo() throws IOException {
+
+        CLContext context = null;
+        try{
+            //single GPU setup
+            context = CLContext.create(CLPlatform.getDefault().getMaxFlopsDevice(GPU));
+            CLCommandQueue queue = context.getDevices()[0].createCommandQueue();
+
+            int maxValue = 10000000;
+
+            int[] workgroupSizes = new int[] {128, 256};
+
+            int[] runs = new int[] {  131072,
+                                      262144,
+                                      524288,
+                                     1048576,
+                                     2097152,
+                                     4194304,
+                                     8388608 };
+
+            for (int i = 0; i < workgroupSizes.length; i++) {
+
+                int workgroupSize = workgroupSizes[i];
+
+                out.println("\n = = = workgroup size: "+workgroupSize+" = = = ");
+
+                for(int run = 0; run < runs.length; run++) {
+
+                    if(workgroupSize==128 && run == runs.length-1) {
+                        break; // we can only sort up to 4MB with wg size of 128
+                    }
+
+                    int numElements = runs[run];
+
+                    CLBuffer<IntBuffer> array = context.createIntBuffer(numElements, READ_WRITE);
+                    out.print("array size: " + array.getCLSize()/1000000.0f+"MB; ");
+                    out.println("elements: " + array.getCapacity()/1000+"K");
+
+                    fillBuffer(array, maxValue);
+
+                    RadixSort radixSort = new RadixSort(queue, numElements, workgroupSize);
+                    queue.finish();
+
+                    long time = currentTimeMillis();
+
+                        queue.putWriteBuffer(array, false);
+                        radixSort.sort(array, numElements, 32);
+                        queue.putReadBuffer(array, true);
+
+                    out.println("time: " + (currentTimeMillis() - time)+"ms");
+
+                    out.print("snapshot: ");
+                    printSnapshot(array.getBuffer(), 20);
+
+                    out.println("validating...");
+                    checkIfSorted(array.getBuffer());
+                    out.println("values sorted");
+
+                    array.release();
+                    radixSort.release();
+                }
+            }
+            
+        }finally{
+            if(context != null) {
+                context.release();
+            }
+        }
+
+    }
+
+    private void fillBuffer(CLBuffer<IntBuffer> array, int maxValue) {
+        Random random = new Random(42);
+        for (int n = 0; n < array.getBuffer().capacity(); n++) {
+            int rnd = random.nextInt(maxValue);
+            array.getBuffer().put(n, rnd);
+        }
+    }
+
+    private void printSnapshot(IntBuffer buffer, int snapshot) {
+        for(int i = 0; i < snapshot; i++)
+            out.print(buffer.get() + ", ");
+        out.println("...; " + buffer.remaining() + " more");
+        buffer.rewind();
+    }
+
+    private void checkIfSorted(IntBuffer keys) {
+        for (int i = 1; i < keys.capacity(); i++) {
+            if (keys.get(i - 1) > keys.get(i)) {
+                throw new RuntimeException("not sorted "+ keys.get(i - 1) +" !> "+ keys.get(i));
+            }
+        }
+    }
+
+    public static void main(String[] args) throws IOException {
+        new RadixSortDemo();
+    }
+}
diff --git a/src/com/mbien/opencl/demos/radixsort/Scan.java b/src/com/mbien/opencl/demos/radixsort/Scan.java
new file mode 100644
index 0000000..1426e9b
--- /dev/null
+++ b/src/com/mbien/opencl/demos/radixsort/Scan.java
@@ -0,0 +1,131 @@
+/*
+ * 22:12 Sunday, February 28 2010
+ */
+package com.mbien.opencl.demos.radixsort;
+
+import com.mbien.opencl.CLBuffer;
+import com.mbien.opencl.CLCommandQueue;
+import com.mbien.opencl.CLContext;
+import com.mbien.opencl.CLKernel;
+import com.mbien.opencl.CLProgram;
+import com.mbien.opencl.CLResource;
+import java.io.IOException;
+
+import static com.mbien.opencl.CLMemory.Mem.*;
+import static com.mbien.opencl.CLProgram.CompilerOptions.*;
+
+/**
+ *
+ * @author Michael Bien
+ */
+public class Scan implements CLResource {
+
+    private final static int MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE = 1024;
+    private final static int MAX_LOCAL_GROUP_SIZE = 256;
+    private final static int WORKGROUP_SIZE = 256;
+    private final static int MAX_BATCH_ELEMENTS = 64 * 1048576;
+    private final static int MIN_SHORT_ARRAY_SIZE = 4;
+    private final static int MAX_SHORT_ARRAY_SIZE = 4 * WORKGROUP_SIZE;
+    private final static int MIN_LARGE_ARRAY_SIZE = 8 * WORKGROUP_SIZE;
+    private final static int MAX_LARGE_ARRAY_SIZE = 4 * WORKGROUP_SIZE * WORKGROUP_SIZE;
+
+    private final CLKernel ckScanExclusiveLocal1;
+    private final CLKernel ckScanExclusiveLocal2;
+    private final CLKernel ckUniformUpdate;
+
+    private final CLCommandQueue queue;
+    private final CLProgram program;
+    private CLBuffer<?> buffer;
+
+    public Scan(CLCommandQueue queue, int numElements) throws IOException {
+
+        this.queue = queue;
+
+        CLContext context = queue.getContext();
+        if (numElements > MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE) {
+            buffer = context.createBuffer(numElements / MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE * 4, READ_WRITE);
+        }
+        program = context.createProgram(getClass().getResourceAsStream("Scan_b.cl"))
+                         .build(ENABLE_MAD);
+
+        ckScanExclusiveLocal1 = program.createCLKernel("scanExclusiveLocal1");
+        ckScanExclusiveLocal2 = program.createCLKernel("scanExclusiveLocal2");
+        ckUniformUpdate       = program.createCLKernel("uniformUpdate");
+    }
+
+    // main exclusive scan routine
+    void scanExclusiveLarge(CLBuffer<?> dst, CLBuffer<?> src, int batchSize, int arrayLength) {
+
+        //Check power-of-two factorization
+        if(!isPowerOf2(arrayLength)) {
+            throw new RuntimeException();
+        }
+
+        //Check supported size range
+        if (!((arrayLength >= MIN_LARGE_ARRAY_SIZE) && (arrayLength <= MAX_LARGE_ARRAY_SIZE))) {
+            throw new RuntimeException();
+        }
+
+        //Check total batch size limit
+        if (!((batchSize * arrayLength) <= MAX_BATCH_ELEMENTS)) {
+            throw new RuntimeException();
+        }
+
+        scanExclusiveLocal1(dst, src, (batchSize * arrayLength) / (4 * WORKGROUP_SIZE), 4 * WORKGROUP_SIZE);
+        scanExclusiveLocal2(buffer, dst, src, batchSize, arrayLength / (4 * WORKGROUP_SIZE));
+        uniformUpdate(dst, buffer, (batchSize * arrayLength) / (4 * WORKGROUP_SIZE));
+    }
+
+    void scanExclusiveLocal1(CLBuffer<?> dst, CLBuffer<?> src, int n, int size) {
+
+        ckScanExclusiveLocal1.putArg(dst).putArg(src).putNullArg(2 * WORKGROUP_SIZE * 4).putArg(size)
+                             .rewind();
+
+        int localWorkSize = WORKGROUP_SIZE;
+        int globalWorkSize = (n * size) / 4;
+
+        queue.put1DRangeKernel(ckScanExclusiveLocal1, 0, globalWorkSize, localWorkSize);
+    }
+
+    void scanExclusiveLocal2(CLBuffer<?> buffer, CLBuffer<?> dst, CLBuffer<?> src, int n, int size) {
+
+        int elements = n * size;
+        ckScanExclusiveLocal2.putArg(buffer).putArg(dst).putArg(src).putNullArg(2 * WORKGROUP_SIZE * 4)
+                             .putArg(elements).putArg(size).rewind();
+
+        int localWorkSize = WORKGROUP_SIZE;
+        int globalWorkSize = iSnapUp(elements, WORKGROUP_SIZE);
+
+        queue.put1DRangeKernel(ckScanExclusiveLocal2, 0, globalWorkSize, localWorkSize);
+    }
+
+    void uniformUpdate(CLBuffer<?> dst, CLBuffer<?> buffer, int n) {
+
+        ckUniformUpdate.setArgs(dst, buffer);
+
+        int localWorkSize  = WORKGROUP_SIZE;
+        int globalWorkSize = n * WORKGROUP_SIZE;
+
+        queue.put1DRangeKernel(ckUniformUpdate, 0, globalWorkSize, localWorkSize);
+    }
+
+    private int iSnapUp(int dividend, int divisor) {
+        return ((dividend % divisor) == 0) ? dividend : (dividend - dividend % divisor + divisor);
+    }
+
+    public static boolean isPowerOf2(int x) {
+        return ((x - 1) & x) == 0;
+    }
+
+    public void release() {
+        program.release();
+
+        if(buffer!=null) {
+            buffer.release();
+        }
+    }
+
+    public void close() {
+        release();
+    }
+}
diff --git a/src/com/mbien/opencl/demos/radixsort/Scan_b.cl b/src/com/mbien/opencl/demos/radixsort/Scan_b.cl
new file mode 100644
index 0000000..32fd4dd
--- /dev/null
+++ b/src/com/mbien/opencl/demos/radixsort/Scan_b.cl
@@ -0,0 +1,190 @@
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation. 
+ * Any use, reproduction, disclosure, or distribution of this software 
+ * and related documentation without an express license agreement from
+ * NVIDIA Corporation is strictly prohibited.
+ *
+ * Please refer to the applicable NVIDIA end user license agreement (EULA) 
+ * associated with this source code for terms and conditions that govern 
+ * your use of this NVIDIA software.
+ * 
+ */
+
+
+
+//All three kernels run 512 threads per workgroup
+//Must be a power of two
+#define WORKGROUP_SIZE 256
+
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Scan codelets
+////////////////////////////////////////////////////////////////////////////////
+#if(1)
+    //Naive inclusive scan: O(N * log2(N)) operations
+    //Allocate 2 * 'size' local memory, initialize the first half
+    //with 'size' zeros avoiding if(pos >= offset) condition evaluation
+    //and saving instructions
+    inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){
+        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1));
+        l_Data[pos] = 0;
+        pos += size;
+        l_Data[pos] = idata;
+
+        for(uint offset = 1; offset < size; offset <<= 1){
+            barrier(CLK_LOCAL_MEM_FENCE);
+            uint t = l_Data[pos] + l_Data[pos - offset];
+            barrier(CLK_LOCAL_MEM_FENCE);
+            l_Data[pos] = t;
+        }
+
+        return l_Data[pos];
+    }
+
+    inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){
+        return scan1Inclusive(idata, l_Data, size) - idata;
+    }
+
+#else
+    #define LOG2_WARP_SIZE 5U
+    #define      WARP_SIZE (1U << LOG2_WARP_SIZE)
+
+    //Almost the same as naiveScan1 but doesn't need barriers
+    //assuming size <= WARP_SIZE
+    inline uint warpScanInclusive(uint idata, __local uint *l_Data, uint size){
+        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1));
+        l_Data[pos] = 0;
+        pos += size;
+        l_Data[pos] = idata;
+
+        for(uint offset = 1; offset < size; offset <<= 1)
+            l_Data[pos] += l_Data[pos - offset];
+
+        return l_Data[pos];
+    }
+
+    inline uint warpScanExclusive(uint idata, __local uint *l_Data, uint size){
+        return warpScanInclusive(idata, l_Data, size) - idata;
+    }
+
+    inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){
+        if(size > WARP_SIZE){
+            //Bottom-level inclusive warp scan
+            uint warpResult = warpScanInclusive(idata, l_Data, WARP_SIZE);
+
+            //Save top elements of each warp for exclusive warp scan
+            //sync to wait for warp scans to complete (because l_Data is being overwritten)
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if( (get_local_id(0) & (WARP_SIZE - 1)) == (WARP_SIZE - 1) )
+                l_Data[get_local_id(0) >> LOG2_WARP_SIZE] = warpResult;
+
+            //wait for warp scans to complete
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if( get_local_id(0) < (WORKGROUP_SIZE / WARP_SIZE) ){
+                //grab top warp elements
+                uint val = l_Data[get_local_id(0)];
+                //calculate exclsive scan and write back to shared memory
+                l_Data[get_local_id(0)] = warpScanExclusive(val, l_Data, size >> LOG2_WARP_SIZE);
+            }
+
+            //return updated warp scans with exclusive scan results
+            barrier(CLK_LOCAL_MEM_FENCE);
+            return warpResult + l_Data[get_local_id(0) >> LOG2_WARP_SIZE];
+        }else{
+            return warpScanInclusive(idata, l_Data, size);
+        }
+    }
+
+    inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){
+        return scan1Inclusive(idata, l_Data, size) - idata;
+    }
+#endif
+
+
+//Vector scan: the array to be scanned is stored
+//in work-item private memory as uint4
+inline uint4 scan4Inclusive(uint4 data4, __local uint *l_Data, uint size){
+    //Level-0 inclusive scan
+    data4.y += data4.x;
+    data4.z += data4.y;
+    data4.w += data4.z;
+
+    //Level-1 exclusive scan
+    uint val = scan1Inclusive(data4.w, l_Data, size / 4) - data4.w;
+
+    return (data4 + (uint4)val);
+}
+
+inline uint4 scan4Exclusive(uint4 data4, __local uint *l_Data, uint size){
+    return scan4Inclusive(data4, l_Data, size) - data4;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Scan kernels
+////////////////////////////////////////////////////////////////////////////////
+__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
+void scanExclusiveLocal1(
+    __global uint4 *d_Dst,
+    __global uint4 *d_Src,
+    __local uint* l_Data,
+    uint size
+){
+    //Load data
+    uint4 idata4 = d_Src[get_global_id(0)];
+
+    //Calculate exclusive scan
+    uint4 odata4  = scan4Exclusive(idata4, l_Data, size);
+
+    //Write back
+    d_Dst[get_global_id(0)] = odata4;
+}
+
+//Exclusive scan of top elements of bottom-level scans (4 * THREADBLOCK_SIZE)
+__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
+void scanExclusiveLocal2(
+    __global uint *d_Buf,
+    __global uint *d_Dst,
+    __global uint *d_Src,
+    __local uint* l_Data,
+    uint N,
+    uint arrayLength
+){
+    //Load top elements
+    //Convert results of bottom-level scan back to inclusive
+    //Skip loads and stores for inactive work-items of the work-group with highest index(pos >= N)
+    uint data = 0;
+    if(get_global_id(0) < N)
+    data =
+        d_Dst[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)] + 
+        d_Src[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)];
+
+    //Compute
+    uint odata = scan1Exclusive(data, l_Data, arrayLength);
+
+    //Avoid out-of-bound access
+    if(get_global_id(0) < N)
+        d_Buf[get_global_id(0)] = odata;
+}
+
+//Final step of large-array scan: combine basic inclusive scan with exclusive scan of top elements of input arrays
+__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
+void uniformUpdate(
+    __global uint4 *d_Data,
+    __global uint *d_Buf
+){
+    __local uint buf[1];
+
+    uint4 data4 = d_Data[get_global_id(0)];
+
+    if(get_local_id(0) == 0)
+        buf[0] = d_Buf[get_group_id(0)];
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    data4 += (uint4)buf[0];
+    d_Data[get_global_id(0)] = data4;
+}
author	Michael Bien <[email protected]>	2010-03-01 02:17:29 +0100
committer	Michael Bien <[email protected]>	2010-03-01 02:17:29 +0100
commit	48a46a6084878803acadd44b0e96e237cf5768c1 (patch)
tree	d7f25e2500bcad62ae9e11963d8eeb3c3194ef05 /src
parent	238d41376cde8d1f0426d50e9b2fdebbe31cee74 (diff)