aboutsummaryrefslogtreecommitdiffstats
path: root/src/com/mbien/opencl/demos/bitonicsort
diff options
context:
space:
mode:
authorMichael Bien <[email protected]>2010-03-01 02:37:16 +0100
committerMichael Bien <[email protected]>2010-03-01 02:37:16 +0100
commit85a5c5f455aec2062f12d1a4c0a1a3c10beeefb7 (patch)
tree659759b91f9b0c7992dd6dfedbd45fe385d331cb /src/com/mbien/opencl/demos/bitonicsort
parent04fac379b2bb5876b21d047294bd3e029e6ebe3c (diff)
moved bitonic sort into seperate package.
Diffstat (limited to 'src/com/mbien/opencl/demos/bitonicsort')
-rw-r--r--src/com/mbien/opencl/demos/bitonicsort/BitonicSort.cl214
-rw-r--r--src/com/mbien/opencl/demos/bitonicsort/BitonicSort.java201
2 files changed, 415 insertions, 0 deletions
diff --git a/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.cl b/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.cl
new file mode 100644
index 0000000..a8d0e1d
--- /dev/null
+++ b/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.cl
@@ -0,0 +1,214 @@
+/*
+ * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and
+ * proprietary rights in and to this software and related documentation.
+ * Any use, reproduction, disclosure, or distribution of this software
+ * and related documentation without an express license agreement from
+ * NVIDIA Corporation is strictly prohibited.
+ *
+ * Please refer to the applicable NVIDIA end user license agreement (EULA)
+ * associated with this source code for terms and conditions that govern
+ * your use of this NVIDIA software.
+ *
+ */
+
+
+
+//Passed down by clBuildProgram
+//#define LOCAL_SIZE_LIMIT 1024
+
+
+
+inline void ComparatorPrivate(
+ uint *keyA,
+ uint *keyB,
+ uint arrowDir
+){
+ if( (*keyA > *keyB) == arrowDir ){
+ uint t;
+ t = *keyA; *keyA = *keyB; *keyB = t;
+ }
+}
+
+inline void ComparatorLocal(
+ __local uint *keyA,
+ __local uint *keyB,
+ uint arrowDir
+){
+ if( (*keyA > *keyB) == arrowDir ){
+ uint t;
+ t = *keyA; *keyA = *keyB; *keyB = t;
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Monolithic bitonic sort kernel for short arrays fitting into local memory
+////////////////////////////////////////////////////////////////////////////////
+__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
+void bitonicSortLocal(
+ __global uint *d_DstKey,
+ __global uint *d_SrcKey,
+ uint arrayLength,
+ uint sortDir
+){
+ __local uint l_key[LOCAL_SIZE_LIMIT];
+
+ //Offset to the beginning of subbatch and load data
+ d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+ d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+ l_key[get_local_id(0) + 0] = d_SrcKey[ 0];
+ l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
+
+ for(uint size = 2; size < arrayLength; size <<= 1){
+ //Bitonic merge
+ uint dir = ( (get_local_id(0) & (size / 2)) != 0 );
+ for(uint stride = size / 2; stride > 0; stride >>= 1){
+ barrier(CLK_LOCAL_MEM_FENCE);
+ uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+ ComparatorLocal(
+ &l_key[pos + 0],
+ &l_key[pos + stride],
+ dir
+ );
+ }
+ }
+
+ //dir == sortDir for the last bitonic merge step
+ {
+ for(uint stride = arrayLength / 2; stride > 0; stride >>= 1){
+ barrier(CLK_LOCAL_MEM_FENCE);
+ uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+ ComparatorLocal(
+ &l_key[pos + 0],
+ &l_key[pos + stride],
+ sortDir
+ );
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ d_DstKey[ 0] = l_key[get_local_id(0) + 0];
+ d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Bitonic sort kernel for large arrays (not fitting into local memory)
+////////////////////////////////////////////////////////////////////////////////
+//Bottom-level bitonic sort
+//Almost the same as bitonicSortLocal with the only exception
+//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being
+//sorted in opposite directions
+__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
+void bitonicSortLocal1(
+ __global uint *d_DstKey,
+ __global uint *d_SrcKey
+){
+ __local uint l_key[LOCAL_SIZE_LIMIT];
+
+ //Offset to the beginning of subarray and load data
+ d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+ d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+ l_key[get_local_id(0) + 0] = d_SrcKey[ 0];
+ l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
+
+ uint comparatorI = get_global_id(0) & ((LOCAL_SIZE_LIMIT / 2) - 1);
+
+ for(uint size = 2; size < LOCAL_SIZE_LIMIT; size <<= 1){
+ //Bitonic merge
+ uint dir = (comparatorI & (size / 2)) != 0;
+ for(uint stride = size / 2; stride > 0; stride >>= 1){
+ barrier(CLK_LOCAL_MEM_FENCE);
+ uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+ ComparatorLocal(
+ &l_key[pos + 0],
+ &l_key[pos + stride],
+ dir
+ );
+ }
+ }
+
+ //Odd / even arrays of LOCAL_SIZE_LIMIT elements
+ //sorted in opposite directions
+ {
+ uint dir = (get_group_id(0) & 1);
+ for(uint stride = LOCAL_SIZE_LIMIT / 2; stride > 0; stride >>= 1){
+ barrier(CLK_LOCAL_MEM_FENCE);
+ uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+ ComparatorLocal(
+ &l_key[pos + 0],
+ &l_key[pos + stride],
+ dir
+ );
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ d_DstKey[ 0] = l_key[get_local_id(0) + 0];
+ d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
+}
+
+//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
+__kernel void bitonicMergeGlobal(
+ __global uint *d_DstKey,
+ __global uint *d_SrcKey,
+ uint arrayLength,
+ uint size,
+ uint stride,
+ uint sortDir
+){
+ uint global_comparatorI = get_global_id(0);
+ uint comparatorI = global_comparatorI & (arrayLength / 2 - 1);
+
+ //Bitonic merge
+ uint dir = sortDir ^ ( (comparatorI & (size / 2)) != 0 );
+ uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
+
+ uint keyA = d_SrcKey[pos + 0];
+ uint keyB = d_SrcKey[pos + stride];
+
+ ComparatorPrivate(
+ &keyA,
+ &keyB,
+ dir
+ );
+
+ d_DstKey[pos + 0] = keyA;
+ d_DstKey[pos + stride] = keyB;
+}
+
+//Combined bitonic merge steps for
+//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2]
+__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
+void bitonicMergeLocal(
+ __global uint *d_DstKey,
+ __global uint *d_SrcKey,
+ uint arrayLength,
+ uint stride,
+ uint size,
+ uint sortDir
+){
+ __local uint l_key[LOCAL_SIZE_LIMIT];
+
+ d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+ d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+ l_key[get_local_id(0) + 0] = d_SrcKey[ 0];
+ l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
+
+ //Bitonic merge
+ uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1);
+ uint dir = sortDir ^ ( (comparatorI & (size / 2)) != 0 );
+ for(; stride > 0; stride >>= 1){
+ barrier(CLK_LOCAL_MEM_FENCE);
+ uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+ ComparatorLocal(
+ &l_key[pos + 0],
+ &l_key[pos + stride],
+ dir
+ );
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ d_DstKey[ 0] = l_key[get_local_id(0) + 0];
+ d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
+}
diff --git a/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.java b/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.java
new file mode 100644
index 0000000..ed5b32e
--- /dev/null
+++ b/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.java
@@ -0,0 +1,201 @@
+/*
+ * 18:42 Saturday, February 27 2010
+ */
+package com.mbien.opencl.demos.bitonicsort;
+
+import com.mbien.opencl.CLBuffer;
+import com.mbien.opencl.CLCommandQueue;
+import com.mbien.opencl.CLContext;
+import com.mbien.opencl.CLDevice;
+import com.mbien.opencl.CLKernel;
+import com.mbien.opencl.CLProgram;
+import java.io.IOException;
+import java.nio.IntBuffer;
+import java.util.Map;
+import java.util.Random;
+
+import static java.lang.System.*;
+import static com.mbien.opencl.CLMemory.Mem.*;
+import static com.mbien.opencl.CLProgram.*;
+
+/**
+ * Bitonic sort optimized for GPUs.
+ * Uses NVIDIA's bitonic merge sort kernel.
+ * @author Michael Bien
+ */
+public class BitonicSort {
+
+ private static final String BITONIC_MERGE_GLOBAL = "bitonicMergeGlobal";
+ private static final String BITONIC_MERGE_LOCAL = "bitonicMergeLocal";
+ private static final String BITONIC_SORT_LOCAL = "bitonicSortLocal";
+ private static final String BITONIC_SORT_LOCAL1 = "bitonicSortLocal1";
+
+ private final static int LOCAL_SIZE_LIMIT = 1024;
+ private final Map<String, CLKernel> kernels;
+
+ public BitonicSort() throws IOException {
+
+ final int sortDir = 1;
+ final int elements = 1048576;
+ final int maxvalue = 1000000;
+
+ out.println("Initializing OpenCL...");
+
+ //Create the context
+ CLContext context = null;
+
+ try{
+
+ context = CLContext.create();
+ CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue();
+
+ out.println("Initializing OpenCL bitonic sorter...");
+ kernels = initBitonicSort(queue);
+
+ out.println("Creating OpenCL memory objects...");
+ CLBuffer<IntBuffer> keyBuffer = context.createIntBuffer(elements, READ_ONLY, USE_BUFFER);
+ System.out.println(keyBuffer.getCLSize()/1000000.0f);
+
+ out.println("Initializing data...\n");
+ Random random = new Random();
+ for (int i = 0; i < elements; i++) {
+ int rnd = random.nextInt(maxvalue);
+ keyBuffer.getBuffer().put(i, rnd);
+ }
+
+ int arrayLength = elements;
+ int batch = elements / arrayLength;
+
+ out.printf("Test array length %d (%d arrays in the batch)...\n", arrayLength, batch);
+
+ long time = currentTimeMillis();
+
+ bitonicSort(queue, keyBuffer, keyBuffer, batch, arrayLength, sortDir);
+ queue.putReadBuffer(keyBuffer, true);
+
+ out.println(currentTimeMillis() - time+"ms");
+
+ IntBuffer keys = keyBuffer.getBuffer();
+ printSnapshot(keys, 20);
+ checkIfSorted(keys);
+
+ out.println("\nTEST PASSED");
+
+ }finally{
+ if(context!=null) {
+ context.release();
+ }
+ }
+
+ }
+
+ private Map<String, CLKernel> initBitonicSort(CLCommandQueue queue) throws IOException {
+
+ out.println(" creating bitonic sort program");
+
+ CLContext context = queue.getContext();
+
+ CLProgram program = context.createProgram(getClass().getResourceAsStream("BitonicSort.cl"))
+ .build(define("LOCAL_SIZE_LIMIT", LOCAL_SIZE_LIMIT));
+
+ Map<String, CLKernel> kernelMap = program.createCLKernels();
+
+ out.println(" checking minimum supported workgroup size");
+ //Check for work group size
+ CLDevice device = queue.getDevice();
+ long szBitonicSortLocal = kernelMap.get(BITONIC_SORT_LOCAL).getWorkGroupSize(device);
+ long szBitonicSortLocal1 = kernelMap.get(BITONIC_SORT_LOCAL1).getWorkGroupSize(device);
+ long szBitonicMergeLocal = kernelMap.get(BITONIC_MERGE_LOCAL).getWorkGroupSize(device);
+
+ if ( (szBitonicSortLocal < (LOCAL_SIZE_LIMIT / 2))
+ || (szBitonicSortLocal1 < (LOCAL_SIZE_LIMIT / 2))
+ || (szBitonicMergeLocal < (LOCAL_SIZE_LIMIT / 2)) ) {
+ throw new RuntimeException("Minimum work-group size "+LOCAL_SIZE_LIMIT/2
+ +" required by this application is not supported on this device.");
+ }
+
+ return kernelMap;
+
+ }
+
+ public void bitonicSort(CLCommandQueue queue, CLBuffer<?> dstKey, CLBuffer<?> srcKey, int batch, int arrayLength, int dir) {
+
+ if (arrayLength < 2) {
+ throw new IllegalArgumentException("arrayLength was "+arrayLength);
+ }
+
+ // TODO Only power-of-two array lengths are supported so far
+
+ dir = (dir != 0) ? 1 : 0;
+
+ CLKernel sortlocal1 = kernels.get(BITONIC_SORT_LOCAL1);
+ CLKernel sortlocal = kernels.get(BITONIC_SORT_LOCAL);
+ CLKernel mergeGlobal = kernels.get(BITONIC_MERGE_GLOBAL);
+ CLKernel mergeLocal = kernels.get(BITONIC_MERGE_LOCAL);
+
+ if (arrayLength <= LOCAL_SIZE_LIMIT) {
+
+ // oclCheckError( (batch * arrayLength) % LOCAL_SIZE_LIMIT == 0, shrTRUE );
+
+ //Launch bitonicSortLocal
+ sortlocal.putArgs(dstKey, srcKey)
+ .putArg(arrayLength).putArg(dir).rewind();
+
+ int localWorkSize = LOCAL_SIZE_LIMIT / 2;
+ int globalWorkSize = batch * arrayLength / 2;
+ queue.put1DRangeKernel(sortlocal, 0, globalWorkSize, localWorkSize);
+
+ } else {
+
+ //Launch bitonicSortLocal1
+ sortlocal1.setArgs(dstKey, srcKey);
+
+ int localWorkSize = LOCAL_SIZE_LIMIT / 2;
+ int globalWorkSize = batch * arrayLength / 2;
+
+ queue.put1DRangeKernel(sortlocal1, 0, globalWorkSize, localWorkSize);
+
+ for (int size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1) {
+ for (int stride = size / 2; stride > 0; stride >>= 1) {
+ if (stride >= LOCAL_SIZE_LIMIT) {
+ //Launch bitonicMergeGlobal
+ mergeGlobal.putArgs(dstKey, dstKey)
+ .putArg(arrayLength).putArg(size).putArg(stride).putArg(dir).rewind();
+
+ globalWorkSize = batch * arrayLength / 2;
+ queue.put1DRangeKernel(mergeGlobal, 0, globalWorkSize, 0);
+ } else {
+ //Launch bitonicMergeLocal
+ mergeLocal.putArgs(dstKey, dstKey)
+ .putArg(arrayLength).putArg(stride).putArg(size).putArg(dir).rewind();
+
+ localWorkSize = LOCAL_SIZE_LIMIT / 2;
+ globalWorkSize = batch * arrayLength / 2;
+
+ queue.put1DRangeKernel(mergeLocal, 0, globalWorkSize, localWorkSize);
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ private void printSnapshot(IntBuffer buffer, int snapshot) {
+ for(int i = 0; i < snapshot; i++)
+ out.print(buffer.get() + ", ");
+ out.println("...; " + buffer.remaining() + " more");
+ buffer.rewind();
+ }
+
+ private void checkIfSorted(IntBuffer keys) {
+ for (int i = 1; i < keys.capacity(); i++) {
+ if (keys.get(i - 1) > keys.get(i)) {
+ throw new RuntimeException("not sorted "+ keys.get(i - 1) +"!> "+ keys.get(i));
+ }
+ }
+ }
+
+ public static void main(String[] args) throws IOException {
+ new BitonicSort();
+ }
+}