summaryrefslogtreecommitdiffstats
path: root/src/com
diff options
context:
space:
mode:
Diffstat (limited to 'src/com')
-rw-r--r--src/com/mbien/opencl/demos/sort/BitonicSort.cl253
-rw-r--r--src/com/mbien/opencl/demos/sort/BitonicSort.java208
2 files changed, 461 insertions, 0 deletions
diff --git a/src/com/mbien/opencl/demos/sort/BitonicSort.cl b/src/com/mbien/opencl/demos/sort/BitonicSort.cl
new file mode 100644
index 0000000..a89b06b
--- /dev/null
+++ b/src/com/mbien/opencl/demos/sort/BitonicSort.cl
@@ -0,0 +1,253 @@
+/*
+ * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and
+ * proprietary rights in and to this software and related documentation.
+ * Any use, reproduction, disclosure, or distribution of this software
+ * and related documentation without an express license agreement from
+ * NVIDIA Corporation is strictly prohibited.
+ *
+ * Please refer to the applicable NVIDIA end user license agreement (EULA)
+ * associated with this source code for terms and conditions that govern
+ * your use of this NVIDIA software.
+ *
+ */
+
+
+
+//Passed down by clBuildProgram
+//#define LOCAL_SIZE_LIMIT 1024
+
+
+
+inline void ComparatorPrivate(
+ uint *keyA,
+ uint *valA,
+ uint *keyB,
+ uint *valB,
+ uint arrowDir
+){
+ if( (*keyA > *keyB) == arrowDir ){
+ uint t;
+ t = *keyA; *keyA = *keyB; *keyB = t;
+ t = *valA; *valA = *valB; *valB = t;
+ }
+}
+
+inline void ComparatorLocal(
+ __local uint *keyA,
+ __local uint *valA,
+ __local uint *keyB,
+ __local uint *valB,
+ uint arrowDir
+){
+ if( (*keyA > *keyB) == arrowDir ){
+ uint t;
+ t = *keyA; *keyA = *keyB; *keyB = t;
+ t = *valA; *valA = *valB; *valB = t;
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Monolithic bitonic sort kernel for short arrays fitting into local memory
+////////////////////////////////////////////////////////////////////////////////
+__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
+void bitonicSortLocal(
+ __global uint *d_DstKey,
+ __global uint *d_DstVal,
+ __global uint *d_SrcKey,
+ __global uint *d_SrcVal,
+ uint arrayLength,
+ uint sortDir
+){
+ __local uint l_key[LOCAL_SIZE_LIMIT];
+ __local uint l_val[LOCAL_SIZE_LIMIT];
+
+ //Offset to the beginning of subbatch and load data
+ d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+ d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+ d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+ d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+ l_key[get_local_id(0) + 0] = d_SrcKey[ 0];
+ l_val[get_local_id(0) + 0] = d_SrcVal[ 0];
+ l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
+ l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)];
+
+ for(uint size = 2; size < arrayLength; size <<= 1){
+ //Bitonic merge
+ uint dir = ( (get_local_id(0) & (size / 2)) != 0 );
+ for(uint stride = size / 2; stride > 0; stride >>= 1){
+ barrier(CLK_LOCAL_MEM_FENCE);
+ uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+ ComparatorLocal(
+ &l_key[pos + 0], &l_val[pos + 0],
+ &l_key[pos + stride], &l_val[pos + stride],
+ dir
+ );
+ }
+ }
+
+ //dir == sortDir for the last bitonic merge step
+ {
+ for(uint stride = arrayLength / 2; stride > 0; stride >>= 1){
+ barrier(CLK_LOCAL_MEM_FENCE);
+ uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+ ComparatorLocal(
+ &l_key[pos + 0], &l_val[pos + 0],
+ &l_key[pos + stride], &l_val[pos + stride],
+ sortDir
+ );
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ d_DstKey[ 0] = l_key[get_local_id(0) + 0];
+ d_DstVal[ 0] = l_val[get_local_id(0) + 0];
+ d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
+ d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Bitonic sort kernel for large arrays (not fitting into local memory)
+////////////////////////////////////////////////////////////////////////////////
+//Bottom-level bitonic sort
+//Almost the same as bitonicSortLocal with the only exception
+//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being
+//sorted in opposite directions
+__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
+void bitonicSortLocal1(
+ __global uint *d_DstKey,
+ __global uint *d_DstVal,
+ __global uint *d_SrcKey,
+ __global uint *d_SrcVal
+){
+ __local uint l_key[LOCAL_SIZE_LIMIT];
+ __local uint l_val[LOCAL_SIZE_LIMIT];
+
+ //Offset to the beginning of subarray and load data
+ d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+ d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+ d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+ d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+ l_key[get_local_id(0) + 0] = d_SrcKey[ 0];
+ l_val[get_local_id(0) + 0] = d_SrcVal[ 0];
+ l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
+ l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)];
+
+ uint comparatorI = get_global_id(0) & ((LOCAL_SIZE_LIMIT / 2) - 1);
+
+ for(uint size = 2; size < LOCAL_SIZE_LIMIT; size <<= 1){
+ //Bitonic merge
+ uint dir = (comparatorI & (size / 2)) != 0;
+ for(uint stride = size / 2; stride > 0; stride >>= 1){
+ barrier(CLK_LOCAL_MEM_FENCE);
+ uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+ ComparatorLocal(
+ &l_key[pos + 0], &l_val[pos + 0],
+ &l_key[pos + stride], &l_val[pos + stride],
+ dir
+ );
+ }
+ }
+
+ //Odd / even arrays of LOCAL_SIZE_LIMIT elements
+ //sorted in opposite directions
+ {
+ uint dir = (get_group_id(0) & 1);
+ for(uint stride = LOCAL_SIZE_LIMIT / 2; stride > 0; stride >>= 1){
+ barrier(CLK_LOCAL_MEM_FENCE);
+ uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+ ComparatorLocal(
+ &l_key[pos + 0], &l_val[pos + 0],
+ &l_key[pos + stride], &l_val[pos + stride],
+ dir
+ );
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ d_DstKey[ 0] = l_key[get_local_id(0) + 0];
+ d_DstVal[ 0] = l_val[get_local_id(0) + 0];
+ d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
+ d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
+}
+
+//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
+__kernel void bitonicMergeGlobal(
+ __global uint *d_DstKey,
+ __global uint *d_DstVal,
+ __global uint *d_SrcKey,
+ __global uint *d_SrcVal,
+ uint arrayLength,
+ uint size,
+ uint stride,
+ uint sortDir
+){
+ uint global_comparatorI = get_global_id(0);
+ uint comparatorI = global_comparatorI & (arrayLength / 2 - 1);
+
+ //Bitonic merge
+ uint dir = sortDir ^ ( (comparatorI & (size / 2)) != 0 );
+ uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
+
+ uint keyA = d_SrcKey[pos + 0];
+ uint valA = d_SrcVal[pos + 0];
+ uint keyB = d_SrcKey[pos + stride];
+ uint valB = d_SrcVal[pos + stride];
+
+ ComparatorPrivate(
+ &keyA, &valA,
+ &keyB, &valB,
+ dir
+ );
+
+ d_DstKey[pos + 0] = keyA;
+ d_DstVal[pos + 0] = valA;
+ d_DstKey[pos + stride] = keyB;
+ d_DstVal[pos + stride] = valB;
+}
+
+//Combined bitonic merge steps for
+//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2]
+__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
+void bitonicMergeLocal(
+ __global uint *d_DstKey,
+ __global uint *d_DstVal,
+ __global uint *d_SrcKey,
+ __global uint *d_SrcVal,
+ uint arrayLength,
+ uint stride,
+ uint size,
+ uint sortDir
+){
+ __local uint l_key[LOCAL_SIZE_LIMIT];
+ __local uint l_val[LOCAL_SIZE_LIMIT];
+
+ d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+ d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+ d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+ d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+ l_key[get_local_id(0) + 0] = d_SrcKey[ 0];
+ l_val[get_local_id(0) + 0] = d_SrcVal[ 0];
+ l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
+ l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)];
+
+ //Bitonic merge
+ uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1);
+ uint dir = sortDir ^ ( (comparatorI & (size / 2)) != 0 );
+ for(; stride > 0; stride >>= 1){
+ barrier(CLK_LOCAL_MEM_FENCE);
+ uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+ ComparatorLocal(
+ &l_key[pos + 0], &l_val[pos + 0],
+ &l_key[pos + stride], &l_val[pos + stride],
+ dir
+ );
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ d_DstKey[ 0] = l_key[get_local_id(0) + 0];
+ d_DstVal[ 0] = l_val[get_local_id(0) + 0];
+ d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
+ d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
+}
diff --git a/src/com/mbien/opencl/demos/sort/BitonicSort.java b/src/com/mbien/opencl/demos/sort/BitonicSort.java
new file mode 100644
index 0000000..be28409
--- /dev/null
+++ b/src/com/mbien/opencl/demos/sort/BitonicSort.java
@@ -0,0 +1,208 @@
+/*
+ * 18:42 Saturday, February 27 2010
+ */
+package com.mbien.opencl.demos.sort;
+
+import com.mbien.opencl.CLBuffer;
+import com.mbien.opencl.CLCommandQueue;
+import com.mbien.opencl.CLContext;
+import com.mbien.opencl.CLDevice;
+import com.mbien.opencl.CLKernel;
+import com.mbien.opencl.CLProgram;
+import java.io.IOException;
+import java.nio.IntBuffer;
+import java.util.Map;
+import java.util.Random;
+
+import static java.lang.System.*;
+import static com.mbien.opencl.CLMemory.Mem.*;
+import static com.mbien.opencl.CLProgram.*;
+
+/**
+ * Bitonic sort optimized for GPUs.
+ * Uses NVIDIA's bitonic merge sort kernel.
+ * @author Michael Bien
+ */
+public class BitonicSort {
+
+ private static final String BITONIC_MERGE_LOCAL = "bitonicMergeLocal";
+ private static final String BITONIC_SORT_LOCAL = "bitonicSortLocal";
+ private static final String BITONIC_SORT_LOCAL1 = "bitonicSortLocal1";
+
+ private final static int LOCAL_SIZE_LIMIT = 1024;
+ private final Map<String, CLKernel> kernels;
+
+ public BitonicSort() throws IOException {
+
+ final int sortDir = 1;
+ final int elements = 1024;
+ final int maxvalue = 1000000000;
+
+ System.out.println("Initializing OpenCL...");
+
+ //Create the context
+ CLContext context = CLContext.create();
+ CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue();
+
+ System.out.println("Initializing OpenCL bitonic sorter...");
+ kernels = initBitonicSort(context, queue);
+
+
+ System.out.println("Creating OpenCL memory objects...");
+ CLBuffer<IntBuffer> keyBuffer = context.createIntBuffer(elements, READ_ONLY, USE_BUFFER);
+
+ // in case of key/value pairs
+// CLBuffer<IntBuffer> valueBuffer = context.createIntBuffer(elements, READ_ONLY, USE_BUFFER);
+
+ System.out.println("Initializing data...\n");
+ Random random = new Random();
+ for (int i = 0; i < elements; i++) {
+ int rnd = random.nextInt(maxvalue);
+ keyBuffer.getBuffer().put(i, rnd);
+// valueBuffer.getBuffer().put(i, rnd); // value can be arbitary
+ }
+
+ int arrayLength = elements;
+ int batch = elements / arrayLength;
+
+ System.out.printf("Test array length %d (%d arrays in the batch)...\n", arrayLength, batch);
+
+// long time = System.currentTimeMillis();
+
+ bitonicSort(queue, keyBuffer, batch, arrayLength, sortDir);
+
+ queue.putReadBuffer(keyBuffer, true);
+// queue.putReadBuffer(valueBuffer, true);
+// System.out.println(System.currentTimeMillis() - time);
+
+ IntBuffer keys = keyBuffer.getBuffer();
+ printSnapshot(keys, 10);
+ checkIfSorted(keys);
+
+// IntBuffer values = valueBuffer.getBuffer();
+// printSnapshot(values, 10);
+// checkIfSorted(values);
+
+ System.out.println();
+
+ System.out.println("TEST PASSED");
+
+ context.release();
+
+ }
+
+ private Map<String, CLKernel> initBitonicSort(CLContext context, CLCommandQueue queue) throws IOException {
+
+ System.out.println(" creating bitonic sort program");
+
+ CLProgram program = context.createProgram(getClass().getResourceAsStream("BitonicSort.cl"))
+ .build(define("LOCAL_SIZE_LIMIT", LOCAL_SIZE_LIMIT));
+
+ Map<String, CLKernel> kernels = program.createCLKernels();
+
+ System.out.println(" checking minimum supported workgroup size");
+ //Check for work group size
+ CLDevice device = queue.getDevice();
+ long szBitonicSortLocal = kernels.get(BITONIC_SORT_LOCAL).getWorkGroupSize(device);
+ long szBitonicSortLocal1 = kernels.get(BITONIC_SORT_LOCAL1).getWorkGroupSize(device);
+ long szBitonicMergeLocal = kernels.get(BITONIC_MERGE_LOCAL).getWorkGroupSize(device);
+
+ if ( (szBitonicSortLocal < (LOCAL_SIZE_LIMIT / 2))
+ || (szBitonicSortLocal1 < (LOCAL_SIZE_LIMIT / 2))
+ || (szBitonicMergeLocal < (LOCAL_SIZE_LIMIT / 2)) ) {
+ throw new RuntimeException("Minimum work-group size "+LOCAL_SIZE_LIMIT/2
+ +" required by this application is not supported on this device.");
+ }
+
+ return kernels;
+
+ }
+
+ public void bitonicSort(CLCommandQueue queue, CLBuffer<?> keys, int batch, int arrayLength, int dir) {
+ this.bitonicSort(queue, keys, keys, keys, keys, batch, arrayLength, dir);
+ }
+
+ public void bitonicSort(CLCommandQueue queue, CLBuffer<?> keys, CLBuffer<?> values, int batch, int arrayLength, int dir) {
+ this.bitonicSort(queue, keys, values, keys, values, batch, arrayLength, dir);
+ }
+
+ public void bitonicSort(CLCommandQueue queue, CLBuffer<?> dstKey, CLBuffer<?> dstVal, CLBuffer<?> srcKey, CLBuffer<?> srcVal, int batch, int arrayLength, int dir) {
+
+ if (arrayLength < 2) {
+ throw new IllegalArgumentException("arrayLength was "+arrayLength);
+ }
+
+ // TODO Only power-of-two array lengths are supported so far
+
+ dir = (dir != 0) ? 1 : 0;
+
+ if (arrayLength <= LOCAL_SIZE_LIMIT) {
+
+ // oclCheckError( (batch * arrayLength) % LOCAL_SIZE_LIMIT == 0, shrTRUE );
+
+ //Launch bitonicSortLocal
+ CLKernel kernel = kernels.get(BITONIC_SORT_LOCAL)
+ .putArgs(dstKey, dstVal, srcKey, srcVal)
+ .putArg(arrayLength).putArg(dir).rewind();
+
+ int localWorkSize = LOCAL_SIZE_LIMIT / 2;
+ int globalWorkSize = batch * arrayLength / 2;
+ queue.put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize);
+
+ } else {
+
+ //Launch bitonicSortLocal1
+ CLKernel kernel = kernels.get(BITONIC_SORT_LOCAL1)
+ .setArgs(dstKey, dstVal, srcKey, srcVal);
+
+ int localWorkSize = LOCAL_SIZE_LIMIT / 2;
+ int globalWorkSize = batch * arrayLength / 2;
+
+ queue.put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize);
+
+ for (int size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1) {
+ for (int stride = size / 2; stride > 0; stride >>= 1) {
+ if (stride >= LOCAL_SIZE_LIMIT) {
+ //Launch bitonicMergeGlobal
+ kernel = kernels.get("bitonicMergeGlobal")
+ .putArgs(dstKey, dstVal, dstKey, dstVal)
+ .putArg(arrayLength).putArg(size).putArg(stride).putArg(dir).rewind();
+
+ globalWorkSize = batch * arrayLength / 2;
+ queue.put1DRangeKernel(kernel, 0, globalWorkSize, 0);
+ } else {
+ //Launch bitonicMergeLocal
+ kernel = kernels.get(BITONIC_MERGE_LOCAL)
+ .putArgs(dstKey, dstVal, dstKey, dstVal)
+ .putArg(arrayLength).putArg(stride).putArg(size).putArg(dir).rewind();
+
+ localWorkSize = LOCAL_SIZE_LIMIT / 2;
+ globalWorkSize = batch * arrayLength / 2;
+
+ queue.put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize);
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ private void printSnapshot(IntBuffer buffer, int snapshot) {
+ for(int i = 0; i < snapshot; i++)
+ out.print(buffer.get() + ", ");
+ out.println("...; " + buffer.remaining() + " more");
+ buffer.rewind();
+ }
+
+ private void checkIfSorted(IntBuffer keys) {
+ for (int i = 1; i < keys.capacity(); i++) {
+ if (keys.get(i - 1) > keys.get(i)) {
+ throw new RuntimeException("not sorted "+ keys.get(i - 1) +"!> "+ keys.get(i));
+ }
+ }
+ }
+
+ public static void main(String[] args) throws IOException {
+ new BitonicSort();
+ }
+}