2 files changed, 461 insertions, 0 deletions
diff --git a/src/com/mbien/opencl/demos/sort/BitonicSort.cl b/src/com/mbien/opencl/demos/sort/BitonicSort.cl
new file mode 100644
index 0000000..a89b06b
--- /dev/null
+++ b/src/com/mbien/opencl/demos/sort/BitonicSort.cl
@@ -0,0 +1,253 @@
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation. 
+ * Any use, reproduction, disclosure, or distribution of this software 
+ * and related documentation without an express license agreement from
+ * NVIDIA Corporation is strictly prohibited.
+ *
+ * Please refer to the applicable NVIDIA end user license agreement (EULA) 
+ * associated with this source code for terms and conditions that govern 
+ * your use of this NVIDIA software.
+ * 
+ */
+
+
+
+//Passed down by clBuildProgram
+//#define LOCAL_SIZE_LIMIT 1024
+
+
+
+inline void ComparatorPrivate(
+    uint *keyA,
+    uint *valA,
+    uint *keyB,
+    uint *valB,
+    uint arrowDir
+){
+    if( (*keyA > *keyB) == arrowDir ){
+        uint t;
+        t = *keyA; *keyA = *keyB; *keyB = t;
+        t = *valA; *valA = *valB; *valB = t;
+    }
+}
+
+inline void ComparatorLocal(
+    __local uint *keyA,
+    __local uint *valA,
+    __local uint *keyB,
+    __local uint *valB,
+    uint arrowDir
+){
+    if( (*keyA > *keyB) == arrowDir ){
+        uint t;
+        t = *keyA; *keyA = *keyB; *keyB = t;
+        t = *valA; *valA = *valB; *valB = t;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Monolithic bitonic sort kernel for short arrays fitting into local memory
+////////////////////////////////////////////////////////////////////////////////
+__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
+void bitonicSortLocal(
+    __global uint *d_DstKey,
+    __global uint *d_DstVal,
+    __global uint *d_SrcKey,
+    __global uint *d_SrcVal,
+    uint arrayLength,
+    uint sortDir
+){
+    __local  uint l_key[LOCAL_SIZE_LIMIT];
+    __local  uint l_val[LOCAL_SIZE_LIMIT];
+
+    //Offset to the beginning of subbatch and load data
+    d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    l_key[get_local_id(0) +                      0] = d_SrcKey[                     0];
+    l_val[get_local_id(0) +                      0] = d_SrcVal[                     0];
+    l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
+    l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)];
+
+    for(uint size = 2; size < arrayLength; size <<= 1){
+        //Bitonic merge
+        uint dir = ( (get_local_id(0) & (size / 2)) != 0 );
+        for(uint stride = size / 2; stride > 0; stride >>= 1){
+            barrier(CLK_LOCAL_MEM_FENCE);
+            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+            ComparatorLocal(
+                &l_key[pos +      0], &l_val[pos +      0],
+                &l_key[pos + stride], &l_val[pos + stride],
+                dir
+            );
+        }
+    }
+
+    //dir == sortDir for the last bitonic merge step
+    {
+        for(uint stride = arrayLength / 2; stride > 0; stride >>= 1){
+            barrier(CLK_LOCAL_MEM_FENCE);
+            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+            ComparatorLocal(
+                &l_key[pos +      0], &l_val[pos +      0],
+                &l_key[pos + stride], &l_val[pos + stride],
+                sortDir
+            );
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    d_DstKey[                     0] = l_key[get_local_id(0) +                      0];
+    d_DstVal[                     0] = l_val[get_local_id(0) +                      0];
+    d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
+    d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Bitonic sort kernel for large arrays (not fitting into local memory)
+////////////////////////////////////////////////////////////////////////////////
+//Bottom-level bitonic sort
+//Almost the same as bitonicSortLocal with the only exception
+//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being
+//sorted in opposite directions
+__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
+void bitonicSortLocal1(
+    __global uint *d_DstKey,
+    __global uint *d_DstVal,
+    __global uint *d_SrcKey,
+    __global uint *d_SrcVal
+){
+    __local uint l_key[LOCAL_SIZE_LIMIT];
+    __local uint l_val[LOCAL_SIZE_LIMIT];
+
+    //Offset to the beginning of subarray and load data
+    d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    l_key[get_local_id(0) +                      0] = d_SrcKey[                     0];
+    l_val[get_local_id(0) +                      0] = d_SrcVal[                     0];
+    l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
+    l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)];
+
+    uint comparatorI = get_global_id(0) & ((LOCAL_SIZE_LIMIT / 2) - 1);
+
+    for(uint size = 2; size < LOCAL_SIZE_LIMIT; size <<= 1){
+        //Bitonic merge
+        uint dir = (comparatorI & (size / 2)) != 0;
+        for(uint stride = size / 2; stride > 0; stride >>= 1){
+            barrier(CLK_LOCAL_MEM_FENCE);
+            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+            ComparatorLocal(
+                &l_key[pos +      0], &l_val[pos +      0],
+                &l_key[pos + stride], &l_val[pos + stride],
+                dir
+            );
+        }
+    }
+
+    //Odd / even arrays of LOCAL_SIZE_LIMIT elements
+    //sorted in opposite directions
+    {
+        uint dir = (get_group_id(0) & 1);
+        for(uint stride = LOCAL_SIZE_LIMIT / 2; stride > 0; stride >>= 1){
+            barrier(CLK_LOCAL_MEM_FENCE);
+            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+            ComparatorLocal(
+                &l_key[pos +      0], &l_val[pos +      0],
+                &l_key[pos + stride], &l_val[pos + stride],
+               dir
+            );
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    d_DstKey[                     0] = l_key[get_local_id(0) +                      0];
+    d_DstVal[                     0] = l_val[get_local_id(0) +                      0];
+    d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
+    d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
+}
+
+//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
+__kernel void bitonicMergeGlobal(
+    __global uint *d_DstKey,
+    __global uint *d_DstVal,
+    __global uint *d_SrcKey,
+    __global uint *d_SrcVal,
+    uint arrayLength,
+    uint size,
+    uint stride,
+    uint sortDir
+){
+    uint global_comparatorI = get_global_id(0);
+    uint        comparatorI = global_comparatorI & (arrayLength / 2 - 1);
+
+    //Bitonic merge
+    uint dir = sortDir ^ ( (comparatorI & (size / 2)) != 0 );
+    uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
+
+    uint keyA = d_SrcKey[pos +      0];
+    uint valA = d_SrcVal[pos +      0];
+    uint keyB = d_SrcKey[pos + stride];
+    uint valB = d_SrcVal[pos + stride];
+
+    ComparatorPrivate(
+        &keyA, &valA,
+        &keyB, &valB,
+        dir
+    );
+
+    d_DstKey[pos +      0] = keyA;
+    d_DstVal[pos +      0] = valA;
+    d_DstKey[pos + stride] = keyB;
+    d_DstVal[pos + stride] = valB;
+}
+
+//Combined bitonic merge steps for
+//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2]
+__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
+void bitonicMergeLocal(
+    __global uint *d_DstKey,
+    __global uint *d_DstVal,
+    __global uint *d_SrcKey,
+    __global uint *d_SrcVal,
+    uint arrayLength,
+    uint stride,
+    uint size,
+    uint sortDir
+){
+    __local uint l_key[LOCAL_SIZE_LIMIT];
+    __local uint l_val[LOCAL_SIZE_LIMIT];
+
+    d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    l_key[get_local_id(0) +                      0] = d_SrcKey[                     0];
+    l_val[get_local_id(0) +                      0] = d_SrcVal[                     0];
+    l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
+    l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)];
+
+    //Bitonic merge
+    uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1);
+    uint         dir = sortDir ^ ( (comparatorI & (size / 2)) != 0 );
+    for(; stride > 0; stride >>= 1){
+        barrier(CLK_LOCAL_MEM_FENCE);
+        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+        ComparatorLocal(
+            &l_key[pos +      0], &l_val[pos +      0],
+            &l_key[pos + stride], &l_val[pos + stride],
+            dir
+        );
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    d_DstKey[                     0] = l_key[get_local_id(0) +                      0];
+    d_DstVal[                     0] = l_val[get_local_id(0) +                      0];
+    d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
+    d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
+}
diff --git a/src/com/mbien/opencl/demos/sort/BitonicSort.java b/src/com/mbien/opencl/demos/sort/BitonicSort.java
new file mode 100644
index 0000000..be28409
--- /dev/null
+++ b/src/com/mbien/opencl/demos/sort/BitonicSort.java
@@ -0,0 +1,208 @@
+/*
+ * 18:42 Saturday, February 27 2010
+ */
+package com.mbien.opencl.demos.sort;
+
+import com.mbien.opencl.CLBuffer;
+import com.mbien.opencl.CLCommandQueue;
+import com.mbien.opencl.CLContext;
+import com.mbien.opencl.CLDevice;
+import com.mbien.opencl.CLKernel;
+import com.mbien.opencl.CLProgram;
+import java.io.IOException;
+import java.nio.IntBuffer;
+import java.util.Map;
+import java.util.Random;
+
+import static java.lang.System.*;
+import static com.mbien.opencl.CLMemory.Mem.*;
+import static com.mbien.opencl.CLProgram.*;
+
+/**
+ * Bitonic sort optimized for GPUs.
+ * Uses NVIDIA's bitonic merge sort kernel.
+ * @author Michael Bien
+ */
+public class BitonicSort {
+
+    private static final String BITONIC_MERGE_LOCAL = "bitonicMergeLocal";
+    private static final String BITONIC_SORT_LOCAL  = "bitonicSortLocal";
+    private static final String BITONIC_SORT_LOCAL1 = "bitonicSortLocal1";
+
+    private final static int LOCAL_SIZE_LIMIT = 1024;
+    private final Map<String, CLKernel> kernels;
+
+    public BitonicSort() throws IOException {
+
+        final int sortDir  = 1;
+        final int elements = 1024;
+        final int maxvalue = 1000000000;
+
+        System.out.println("Initializing OpenCL...");
+
+        //Create the context
+        CLContext context = CLContext.create();
+        CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue();
+
+        System.out.println("Initializing OpenCL bitonic sorter...");
+        kernels = initBitonicSort(context, queue);
+
+
+        System.out.println("Creating OpenCL memory objects...");
+        CLBuffer<IntBuffer> keyBuffer = context.createIntBuffer(elements, READ_ONLY, USE_BUFFER);
+
+        // in case of key/value pairs
+//        CLBuffer<IntBuffer> valueBuffer  = context.createIntBuffer(elements, READ_ONLY, USE_BUFFER);
+
+        System.out.println("Initializing data...\n");
+        Random random = new Random();
+        for (int i = 0; i < elements; i++) {
+            int rnd = random.nextInt(maxvalue);
+            keyBuffer.getBuffer().put(i, rnd);
+//            valueBuffer.getBuffer().put(i, rnd); // value can be arbitary
+        }
+
+        int arrayLength = elements;
+        int batch = elements / arrayLength;
+
+        System.out.printf("Test array length %d (%d arrays in the batch)...\n", arrayLength, batch);
+
+//            long time = System.currentTimeMillis();
+
+        bitonicSort(queue, keyBuffer, batch, arrayLength, sortDir);
+
+        queue.putReadBuffer(keyBuffer, true);
+//        queue.putReadBuffer(valueBuffer, true);
+//            System.out.println(System.currentTimeMillis() - time);
+
+        IntBuffer keys = keyBuffer.getBuffer();
+        printSnapshot(keys, 10);
+        checkIfSorted(keys);
+
+//        IntBuffer values = valueBuffer.getBuffer();
+//        printSnapshot(values, 10);
+//        checkIfSorted(values);
+
+        System.out.println();
+
+        System.out.println("TEST PASSED");
+        
+        context.release();
+
+    }
+    
+    private Map<String, CLKernel> initBitonicSort(CLContext context, CLCommandQueue queue) throws IOException {
+
+        System.out.println("    creating bitonic sort program");
+
+        CLProgram program = context.createProgram(getClass().getResourceAsStream("BitonicSort.cl"))
+                                   .build(define("LOCAL_SIZE_LIMIT", LOCAL_SIZE_LIMIT));
+
+        Map<String, CLKernel> kernels = program.createCLKernels();
+
+        System.out.println("    checking minimum supported workgroup size");
+        //Check for work group size
+        CLDevice device = queue.getDevice();
+        long szBitonicSortLocal  = kernels.get(BITONIC_SORT_LOCAL).getWorkGroupSize(device);
+        long szBitonicSortLocal1 = kernels.get(BITONIC_SORT_LOCAL1).getWorkGroupSize(device);
+        long szBitonicMergeLocal = kernels.get(BITONIC_MERGE_LOCAL).getWorkGroupSize(device);
+
+        if (    (szBitonicSortLocal < (LOCAL_SIZE_LIMIT / 2))
+             || (szBitonicSortLocal1 < (LOCAL_SIZE_LIMIT / 2))
+             || (szBitonicMergeLocal < (LOCAL_SIZE_LIMIT / 2))  ) {
+            throw new RuntimeException("Minimum work-group size "+LOCAL_SIZE_LIMIT/2
+                    +" required by this application is not supported on this device.");
+        }
+
+        return kernels;
+
+    }
+
+    public void bitonicSort(CLCommandQueue queue, CLBuffer<?> keys, int batch, int arrayLength, int dir) {
+        this.bitonicSort(queue, keys, keys, keys, keys, batch, arrayLength, dir);
+    }
+
+    public void bitonicSort(CLCommandQueue queue, CLBuffer<?> keys, CLBuffer<?> values, int batch, int arrayLength, int dir) {
+        this.bitonicSort(queue, keys, values, keys, values, batch, arrayLength, dir);
+    }
+
+    public void bitonicSort(CLCommandQueue queue, CLBuffer<?> dstKey, CLBuffer<?> dstVal, CLBuffer<?> srcKey, CLBuffer<?> srcVal, int batch, int arrayLength, int dir) {
+
+        if (arrayLength < 2) {
+            throw new IllegalArgumentException("arrayLength was "+arrayLength);
+        }
+
+        // TODO Only power-of-two array lengths are supported so far
+
+        dir = (dir != 0) ? 1 : 0;
+
+        if (arrayLength <= LOCAL_SIZE_LIMIT) {
+
+            //        oclCheckError( (batch * arrayLength) % LOCAL_SIZE_LIMIT == 0, shrTRUE );
+
+            //Launch bitonicSortLocal
+            CLKernel kernel = kernels.get(BITONIC_SORT_LOCAL)
+                    .putArgs(dstKey, dstVal, srcKey, srcVal)
+                    .putArg(arrayLength).putArg(dir).rewind();
+
+            int localWorkSize = LOCAL_SIZE_LIMIT / 2;
+            int globalWorkSize = batch * arrayLength / 2;
+            queue.put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize);
+
+        } else {
+
+            //Launch bitonicSortLocal1
+            CLKernel kernel = kernels.get(BITONIC_SORT_LOCAL1)
+                    .setArgs(dstKey, dstVal, srcKey, srcVal);
+
+            int localWorkSize = LOCAL_SIZE_LIMIT / 2;
+            int globalWorkSize = batch * arrayLength / 2;
+
+            queue.put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize);
+
+            for (int size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1) {
+                for (int stride = size / 2; stride > 0; stride >>= 1) {
+                    if (stride >= LOCAL_SIZE_LIMIT) {
+                        //Launch bitonicMergeGlobal
+                        kernel = kernels.get("bitonicMergeGlobal")
+                                .putArgs(dstKey, dstVal, dstKey, dstVal)
+                                .putArg(arrayLength).putArg(size).putArg(stride).putArg(dir).rewind();
+
+                        globalWorkSize = batch * arrayLength / 2;
+                        queue.put1DRangeKernel(kernel, 0, globalWorkSize, 0);
+                    } else {
+                        //Launch bitonicMergeLocal
+                        kernel = kernels.get(BITONIC_MERGE_LOCAL)
+                                .putArgs(dstKey, dstVal, dstKey, dstVal)
+                                .putArg(arrayLength).putArg(stride).putArg(size).putArg(dir).rewind();
+
+                        localWorkSize = LOCAL_SIZE_LIMIT / 2;
+                        globalWorkSize = batch * arrayLength / 2;
+
+                        queue.put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize);
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    private void printSnapshot(IntBuffer buffer, int snapshot) {
+        for(int i = 0; i < snapshot; i++)
+            out.print(buffer.get() + ", ");
+        out.println("...; " + buffer.remaining() + " more");
+        buffer.rewind();
+    }
+
+    private void checkIfSorted(IntBuffer keys) {
+        for (int i = 1; i < keys.capacity(); i++) {
+            if (keys.get(i - 1) > keys.get(i)) {
+                throw new RuntimeException("not sorted "+ keys.get(i - 1) +"!> "+ keys.get(i));
+            }
+        }
+    }
+
+    public static void main(String[] args) throws IOException {
+        new BitonicSort();
+    }
+}