moved bitonic sort into seperate package.

author: Michael Bien <[email protected]> 2010-03-01 02:37:16 +0100
committer: Michael Bien <[email protected]> 2010-03-01 02:37:16 +0100
commit: 85a5c5f455aec2062f12d1a4c0a1a3c10beeefb7 (patch)
tree: 659759b91f9b0c7992dd6dfedbd45fe385d331cb /src/com/mbien/opencl/demos/bitonicsort
parent: 04fac379b2bb5876b21d047294bd3e029e6ebe3c (diff)
2 files changed, 415 insertions, 0 deletions
diff --git a/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.cl b/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.cl
new file mode 100644
index 0000000..a8d0e1d
--- /dev/null
+++ b/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.cl
@@ -0,0 +1,214 @@
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation. 
+ * Any use, reproduction, disclosure, or distribution of this software 
+ * and related documentation without an express license agreement from
+ * NVIDIA Corporation is strictly prohibited.
+ *
+ * Please refer to the applicable NVIDIA end user license agreement (EULA) 
+ * associated with this source code for terms and conditions that govern 
+ * your use of this NVIDIA software.
+ * 
+ */
+
+
+
+//Passed down by clBuildProgram
+//#define LOCAL_SIZE_LIMIT 1024
+
+
+
+inline void ComparatorPrivate(
+    uint *keyA,
+    uint *keyB,
+    uint arrowDir
+){
+    if( (*keyA > *keyB) == arrowDir ){
+        uint t;
+        t = *keyA; *keyA = *keyB; *keyB = t;
+    }
+}
+
+inline void ComparatorLocal(
+    __local uint *keyA,
+    __local uint *keyB,
+    uint arrowDir
+){
+    if( (*keyA > *keyB) == arrowDir ){
+        uint t;
+        t = *keyA; *keyA = *keyB; *keyB = t;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Monolithic bitonic sort kernel for short arrays fitting into local memory
+////////////////////////////////////////////////////////////////////////////////
+__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
+void bitonicSortLocal(
+    __global uint *d_DstKey,
+    __global uint *d_SrcKey,
+    uint arrayLength,
+    uint sortDir
+){
+    __local  uint l_key[LOCAL_SIZE_LIMIT];
+
+    //Offset to the beginning of subbatch and load data
+    d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    l_key[get_local_id(0) +                      0] = d_SrcKey[                     0];
+    l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
+
+    for(uint size = 2; size < arrayLength; size <<= 1){
+        //Bitonic merge
+        uint dir = ( (get_local_id(0) & (size / 2)) != 0 );
+        for(uint stride = size / 2; stride > 0; stride >>= 1){
+            barrier(CLK_LOCAL_MEM_FENCE);
+            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+            ComparatorLocal(
+                &l_key[pos +      0],
+                &l_key[pos + stride],
+                dir
+            );
+        }
+    }
+
+    //dir == sortDir for the last bitonic merge step
+    {
+        for(uint stride = arrayLength / 2; stride > 0; stride >>= 1){
+            barrier(CLK_LOCAL_MEM_FENCE);
+            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+            ComparatorLocal(
+                &l_key[pos +      0],
+                &l_key[pos + stride],
+                sortDir
+            );
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    d_DstKey[                     0] = l_key[get_local_id(0) +                      0];
+    d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Bitonic sort kernel for large arrays (not fitting into local memory)
+////////////////////////////////////////////////////////////////////////////////
+//Bottom-level bitonic sort
+//Almost the same as bitonicSortLocal with the only exception
+//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being
+//sorted in opposite directions
+__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
+void bitonicSortLocal1(
+    __global uint *d_DstKey,
+    __global uint *d_SrcKey
+){
+    __local uint l_key[LOCAL_SIZE_LIMIT];
+
+    //Offset to the beginning of subarray and load data
+    d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    l_key[get_local_id(0) +                      0] = d_SrcKey[                     0];
+    l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
+
+    uint comparatorI = get_global_id(0) & ((LOCAL_SIZE_LIMIT / 2) - 1);
+
+    for(uint size = 2; size < LOCAL_SIZE_LIMIT; size <<= 1){
+        //Bitonic merge
+        uint dir = (comparatorI & (size / 2)) != 0;
+        for(uint stride = size / 2; stride > 0; stride >>= 1){
+            barrier(CLK_LOCAL_MEM_FENCE);
+            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+            ComparatorLocal(
+                &l_key[pos +      0],
+                &l_key[pos + stride],
+                dir
+            );
+        }
+    }
+
+    //Odd / even arrays of LOCAL_SIZE_LIMIT elements
+    //sorted in opposite directions
+    {
+        uint dir = (get_group_id(0) & 1);
+        for(uint stride = LOCAL_SIZE_LIMIT / 2; stride > 0; stride >>= 1){
+            barrier(CLK_LOCAL_MEM_FENCE);
+            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+            ComparatorLocal(
+                &l_key[pos +      0],
+                &l_key[pos + stride],
+               dir
+            );
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    d_DstKey[                     0] = l_key[get_local_id(0) +                      0];
+    d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
+}
+
+//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
+__kernel void bitonicMergeGlobal(
+    __global uint *d_DstKey,
+    __global uint *d_SrcKey,
+    uint arrayLength,
+    uint size,
+    uint stride,
+    uint sortDir
+){
+    uint global_comparatorI = get_global_id(0);
+    uint        comparatorI = global_comparatorI & (arrayLength / 2 - 1);
+
+    //Bitonic merge
+    uint dir = sortDir ^ ( (comparatorI & (size / 2)) != 0 );
+    uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
+
+    uint keyA = d_SrcKey[pos +      0];
+    uint keyB = d_SrcKey[pos + stride];
+
+    ComparatorPrivate(
+        &keyA, 
+        &keyB, 
+        dir
+    );
+
+    d_DstKey[pos +      0] = keyA;
+    d_DstKey[pos + stride] = keyB;
+}
+
+//Combined bitonic merge steps for
+//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2]
+__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
+void bitonicMergeLocal(
+    __global uint *d_DstKey,
+    __global uint *d_SrcKey,
+    uint arrayLength,
+    uint stride,
+    uint size,
+    uint sortDir
+){
+    __local uint l_key[LOCAL_SIZE_LIMIT];
+
+    d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    l_key[get_local_id(0) +                      0] = d_SrcKey[                     0];
+    l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
+
+    //Bitonic merge
+    uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1);
+    uint         dir = sortDir ^ ( (comparatorI & (size / 2)) != 0 );
+    for(; stride > 0; stride >>= 1){
+        barrier(CLK_LOCAL_MEM_FENCE);
+        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+        ComparatorLocal(
+            &l_key[pos +      0],
+            &l_key[pos + stride],
+            dir
+        );
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    d_DstKey[                     0] = l_key[get_local_id(0) +                      0];
+    d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
+}
diff --git a/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.java b/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.java
new file mode 100644
index 0000000..ed5b32e
--- /dev/null
+++ b/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.java
@@ -0,0 +1,201 @@
+/*
+ * 18:42 Saturday, February 27 2010
+ */
+package com.mbien.opencl.demos.bitonicsort;
+
+import com.mbien.opencl.CLBuffer;
+import com.mbien.opencl.CLCommandQueue;
+import com.mbien.opencl.CLContext;
+import com.mbien.opencl.CLDevice;
+import com.mbien.opencl.CLKernel;
+import com.mbien.opencl.CLProgram;
+import java.io.IOException;
+import java.nio.IntBuffer;
+import java.util.Map;
+import java.util.Random;
+
+import static java.lang.System.*;
+import static com.mbien.opencl.CLMemory.Mem.*;
+import static com.mbien.opencl.CLProgram.*;
+
+/**
+ * Bitonic sort optimized for GPUs.
+ * Uses NVIDIA's bitonic merge sort kernel.
+ * @author Michael Bien
+ */
+public class BitonicSort {
+
+    private static final String BITONIC_MERGE_GLOBAL = "bitonicMergeGlobal";
+    private static final String BITONIC_MERGE_LOCAL  = "bitonicMergeLocal";
+    private static final String BITONIC_SORT_LOCAL   = "bitonicSortLocal";
+    private static final String BITONIC_SORT_LOCAL1  = "bitonicSortLocal1";
+
+    private final static int LOCAL_SIZE_LIMIT = 1024;
+    private final Map<String, CLKernel> kernels;
+
+    public BitonicSort() throws IOException {
+
+        final int sortDir  = 1;
+        final int elements = 1048576;
+        final int maxvalue = 1000000;
+
+        out.println("Initializing OpenCL...");
+
+        //Create the context
+        CLContext context = null;
+
+        try{
+
+            context = CLContext.create();
+            CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue();
+
+            out.println("Initializing OpenCL bitonic sorter...");
+            kernels = initBitonicSort(queue);
+
+            out.println("Creating OpenCL memory objects...");
+            CLBuffer<IntBuffer> keyBuffer = context.createIntBuffer(elements, READ_ONLY, USE_BUFFER);
+            System.out.println(keyBuffer.getCLSize()/1000000.0f);
+
+            out.println("Initializing data...\n");
+            Random random = new Random();
+            for (int i = 0; i < elements; i++) {
+                int rnd = random.nextInt(maxvalue);
+                keyBuffer.getBuffer().put(i, rnd);
+            }
+
+            int arrayLength = elements;
+            int batch = elements / arrayLength;
+
+            out.printf("Test array length %d (%d arrays in the batch)...\n", arrayLength, batch);
+
+            long time = currentTimeMillis();
+
+            bitonicSort(queue, keyBuffer, keyBuffer, batch, arrayLength, sortDir);
+            queue.putReadBuffer(keyBuffer, true);
+
+            out.println(currentTimeMillis() - time+"ms");
+
+            IntBuffer keys = keyBuffer.getBuffer();
+            printSnapshot(keys, 20);
+            checkIfSorted(keys);
+
+            out.println("\nTEST PASSED");
+        
+        }finally{
+            if(context!=null) {
+                context.release();
+            }
+        }
+
+    }
+    
+    private Map<String, CLKernel> initBitonicSort(CLCommandQueue queue) throws IOException {
+
+        out.println("    creating bitonic sort program");
+
+        CLContext context = queue.getContext();
+
+        CLProgram program = context.createProgram(getClass().getResourceAsStream("BitonicSort.cl"))
+                                   .build(define("LOCAL_SIZE_LIMIT", LOCAL_SIZE_LIMIT));
+
+        Map<String, CLKernel> kernelMap = program.createCLKernels();
+
+        out.println("    checking minimum supported workgroup size");
+        //Check for work group size
+        CLDevice device = queue.getDevice();
+        long szBitonicSortLocal  = kernelMap.get(BITONIC_SORT_LOCAL).getWorkGroupSize(device);
+        long szBitonicSortLocal1 = kernelMap.get(BITONIC_SORT_LOCAL1).getWorkGroupSize(device);
+        long szBitonicMergeLocal = kernelMap.get(BITONIC_MERGE_LOCAL).getWorkGroupSize(device);
+
+        if (    (szBitonicSortLocal < (LOCAL_SIZE_LIMIT / 2))
+             || (szBitonicSortLocal1 < (LOCAL_SIZE_LIMIT / 2))
+             || (szBitonicMergeLocal < (LOCAL_SIZE_LIMIT / 2))  ) {
+            throw new RuntimeException("Minimum work-group size "+LOCAL_SIZE_LIMIT/2
+                    +" required by this application is not supported on this device.");
+        }
+
+        return kernelMap;
+
+    }
+
+    public void bitonicSort(CLCommandQueue queue, CLBuffer<?> dstKey, CLBuffer<?> srcKey, int batch, int arrayLength, int dir) {
+
+        if (arrayLength < 2) {
+            throw new IllegalArgumentException("arrayLength was "+arrayLength);
+        }
+
+        // TODO Only power-of-two array lengths are supported so far
+
+        dir = (dir != 0) ? 1 : 0;
+
+        CLKernel sortlocal1  = kernels.get(BITONIC_SORT_LOCAL1);
+        CLKernel sortlocal   = kernels.get(BITONIC_SORT_LOCAL);
+        CLKernel mergeGlobal = kernels.get(BITONIC_MERGE_GLOBAL);
+        CLKernel mergeLocal  = kernels.get(BITONIC_MERGE_LOCAL);
+
+        if (arrayLength <= LOCAL_SIZE_LIMIT) {
+
+            //        oclCheckError( (batch * arrayLength) % LOCAL_SIZE_LIMIT == 0, shrTRUE );
+
+            //Launch bitonicSortLocal
+            sortlocal.putArgs(dstKey, srcKey)
+                     .putArg(arrayLength).putArg(dir).rewind();
+
+            int localWorkSize = LOCAL_SIZE_LIMIT / 2;
+            int globalWorkSize = batch * arrayLength / 2;
+            queue.put1DRangeKernel(sortlocal, 0, globalWorkSize, localWorkSize);
+
+        } else {
+
+            //Launch bitonicSortLocal1
+            sortlocal1.setArgs(dstKey, srcKey);
+
+            int localWorkSize = LOCAL_SIZE_LIMIT / 2;
+            int globalWorkSize = batch * arrayLength / 2;
+
+            queue.put1DRangeKernel(sortlocal1, 0, globalWorkSize, localWorkSize);
+
+            for (int size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1) {
+                for (int stride = size / 2; stride > 0; stride >>= 1) {
+                    if (stride >= LOCAL_SIZE_LIMIT) {
+                        //Launch bitonicMergeGlobal
+                        mergeGlobal.putArgs(dstKey, dstKey)
+                                   .putArg(arrayLength).putArg(size).putArg(stride).putArg(dir).rewind();
+
+                        globalWorkSize = batch * arrayLength / 2;
+                        queue.put1DRangeKernel(mergeGlobal, 0, globalWorkSize, 0);
+                    } else {
+                        //Launch bitonicMergeLocal
+                        mergeLocal.putArgs(dstKey, dstKey)
+                                  .putArg(arrayLength).putArg(stride).putArg(size).putArg(dir).rewind();
+
+                        localWorkSize = LOCAL_SIZE_LIMIT / 2;
+                        globalWorkSize = batch * arrayLength / 2;
+
+                        queue.put1DRangeKernel(mergeLocal, 0, globalWorkSize, localWorkSize);
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    private void printSnapshot(IntBuffer buffer, int snapshot) {
+        for(int i = 0; i < snapshot; i++)
+            out.print(buffer.get() + ", ");
+        out.println("...; " + buffer.remaining() + " more");
+        buffer.rewind();
+    }
+
+    private void checkIfSorted(IntBuffer keys) {
+        for (int i = 1; i < keys.capacity(); i++) {
+            if (keys.get(i - 1) > keys.get(i)) {
+                throw new RuntimeException("not sorted "+ keys.get(i - 1) +"!> "+ keys.get(i));
+            }
+        }
+    }
+
+    public static void main(String[] args) throws IOException {
+        new BitonicSort();
+    }
+}
author	Michael Bien <[email protected]>	2010-03-01 02:37:16 +0100
committer	Michael Bien <[email protected]>	2010-03-01 02:37:16 +0100
commit	85a5c5f455aec2062f12d1a4c0a1a3c10beeefb7 (patch)
tree	659759b91f9b0c7992dd6dfedbd45fe385d331cb /src/com/mbien/opencl/demos/bitonicsort
parent	04fac379b2bb5876b21d047294bd3e029e6ebe3c (diff)