diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/com/mbien/opencl/demos/sort/BitonicSort.cl | 253 | ||||
-rw-r--r-- | src/com/mbien/opencl/demos/sort/BitonicSort.java | 208 |
2 files changed, 461 insertions, 0 deletions
diff --git a/src/com/mbien/opencl/demos/sort/BitonicSort.cl b/src/com/mbien/opencl/demos/sort/BitonicSort.cl
new file mode 100644
index 0000000..a89b06b
--- /dev/null
+++ b/src/com/mbien/opencl/demos/sort/BitonicSort.cl
@@ -0,0 +1,253 @@
/*
 * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */



//Passed down by clBuildProgram
//#define LOCAL_SIZE_LIMIT 1024



//Exchanges the (key, value) pair behind A with the pair behind B when the
//outcome of (*keyA > *keyB) equals arrowDir; otherwise nothing is written.
//Operands are reached through unqualified pointers.
inline void ComparatorPrivate(
    uint *keyA,
    uint *valA,
    uint *keyB,
    uint *valB,
    uint arrowDir
){
    const uint aIsGreater = (*keyA > *keyB);
    if(aIsGreater != arrowDir)
        return;

    //Keys first, then values - same order as a classic temp-variable swap
    const uint keyTmp = *keyA;
    *keyA = *keyB;
    *keyB = keyTmp;

    const uint valTmp = *valA;
    *valA = *valB;
    *valB = valTmp;
}

//Same exchange rule as ComparatorPrivate, for operands that live in
//__local memory (the address space is part of the pointer type, hence
//the separate function).
inline void ComparatorLocal(
    __local uint *keyA,
    __local uint *valA,
    __local uint *keyB,
    __local uint *valB,
    uint arrowDir
){
    const uint aIsGreater = (*keyA > *keyB);
    if(aIsGreater != arrowDir)
        return;

    const uint keyTmp = *keyA;
    *keyA = *keyB;
    *keyB = keyTmp;

    const uint valTmp = *valA;
    *valA = *valB;
    *valB = valTmp;
}

////////////////////////////////////////////////////////////////////////////////
// Monolithic bitonic sort kernel for short arrays fitting into local memory
////////////////////////////////////////////////////////////////////////////////
//Every work-group (LOCAL_SIZE_LIMIT / 2 work-items) stages LOCAL_SIZE_LIMIT
//consecutive key/value pairs in local memory - two pairs per work-item -,
//runs the comparator network on the staged copy and finally writes the
//pairs to the destination buffers at the same offsets they were read from.
//  d_DstKey / d_DstVal : output keys / values
//  d_SrcKey / d_SrcVal : input keys / values
//  arrayLength         : upper bound of the merge sizes processed here
//  sortDir             : arrow direction used by the final merge
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
void bitonicSortLocal(
    __global uint *d_DstKey,
    __global uint *d_DstVal,
    __global uint *d_SrcKey,
    __global uint *d_SrcVal,
    uint arrayLength,
    uint sortDir
){
    __local uint l_key[LOCAL_SIZE_LIMIT];
    __local uint l_val[LOCAL_SIZE_LIMIT];

    //Local slots and matching global offsets handled by this work-item
    const size_t lid       = get_local_id(0);
    const size_t lidUpper  = lid + (LOCAL_SIZE_LIMIT / 2);
    const size_t gidxLower = get_group_id(0) * LOCAL_SIZE_LIMIT + lid;
    const size_t gidxUpper = gidxLower + (LOCAL_SIZE_LIMIT / 2);

    //Stage the subbatch in local memory
    l_key[lid]      = d_SrcKey[gidxLower];
    l_val[lid]      = d_SrcVal[gidxLower];
    l_key[lidUpper] = d_SrcKey[gidxUpper];
    l_val[lidUpper] = d_SrcVal[gidxUpper];

    //Merges of growing size; the arrow direction alternates with the
    //(size / 2) bit of the local id
    for(uint size = 2; size < arrayLength; size <<= 1){
        const uint dir = ( (lid & (size / 2)) != 0 );

        for(uint stride = size / 2; stride > 0; stride >>= 1){
            //Local writes of the previous pass must be visible first
            barrier(CLK_LOCAL_MEM_FENCE);

            const uint pos = 2 * lid - (lid & (stride - 1));
            ComparatorLocal(
                &l_key[pos],          &l_val[pos],
                &l_key[pos + stride], &l_val[pos + stride],
                dir
            );
        }
    }

    //Last merge: every comparator uses sortDir
    for(uint stride = arrayLength / 2; stride > 0; stride >>= 1){
        barrier(CLK_LOCAL_MEM_FENCE);

        const uint pos = 2 * lid - (lid & (stride - 1));
        ComparatorLocal(
            &l_key[pos],          &l_val[pos],
            &l_key[pos + stride], &l_val[pos + stride],
            sortDir
        );
    }

    //Wait for the final pass, then write the staged pairs back
    barrier(CLK_LOCAL_MEM_FENCE);
    d_DstKey[gidxLower] = l_key[lid];
    d_DstVal[gidxLower] = l_val[lid];
    d_DstKey[gidxUpper] = l_key[lidUpper];
    d_DstVal[gidxUpper] = l_val[lidUpper];
}

////////////////////////////////////////////////////////////////////////////////
// Bitonic sort kernel for large arrays (not fitting into local memory)
////////////////////////////////////////////////////////////////////////////////
//Bottom-level bitonic sort
//Almost the same as bitonicSortLocal with the only exception
//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being
//sorted in opposite directions
//Each work-group stages LOCAL_SIZE_LIMIT key/value pairs in local memory
//(two per work-item), runs the comparator network on them and writes them
//to the destination buffers at the offsets they were read from.
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
void bitonicSortLocal1(
    __global uint *d_DstKey,
    __global uint *d_DstVal,
    __global uint *d_SrcKey,
    __global uint *d_SrcVal
){
    __local uint l_key[LOCAL_SIZE_LIMIT];
    __local uint l_val[LOCAL_SIZE_LIMIT];

    //Local slots and matching global offsets handled by this work-item
    const size_t lid       = get_local_id(0);
    const size_t lidUpper  = lid + (LOCAL_SIZE_LIMIT / 2);
    const size_t gidxLower = get_group_id(0) * LOCAL_SIZE_LIMIT + lid;
    const size_t gidxUpper = gidxLower + (LOCAL_SIZE_LIMIT / 2);

    //Stage the subarray in local memory
    l_key[lid]      = d_SrcKey[gidxLower];
    l_val[lid]      = d_SrcVal[gidxLower];
    l_key[lidUpper] = d_SrcKey[gidxUpper];
    l_val[lidUpper] = d_SrcVal[gidxUpper];

    //Comparator index, taken from the low bits of the global id
    const uint comparatorI = get_global_id(0) & ((LOCAL_SIZE_LIMIT / 2) - 1);

    //Merges of growing size; direction follows the (size / 2) bit of
    //the comparator index
    for(uint size = 2; size < LOCAL_SIZE_LIMIT; size <<= 1){
        const uint dir = (comparatorI & (size / 2)) != 0;

        for(uint stride = size / 2; stride > 0; stride >>= 1){
            //Local writes of the previous pass must be visible first
            barrier(CLK_LOCAL_MEM_FENCE);

            const uint pos = 2 * lid - (lid & (stride - 1));
            ComparatorLocal(
                &l_key[pos],          &l_val[pos],
                &l_key[pos + stride], &l_val[pos + stride],
                dir
            );
        }
    }

    //Odd / even arrays of LOCAL_SIZE_LIMIT elements
    //sorted in opposite directions: the direction of the last merge is the
    //lowest bit of the work-group id
    const uint groupDir = (get_group_id(0) & 1);
    for(uint stride = LOCAL_SIZE_LIMIT / 2; stride > 0; stride >>= 1){
        barrier(CLK_LOCAL_MEM_FENCE);

        const uint pos = 2 * lid - (lid & (stride - 1));
        ComparatorLocal(
            &l_key[pos],          &l_val[pos],
            &l_key[pos + stride], &l_val[pos + stride],
            groupDir
        );
    }

    //Wait for the final pass, then write the staged pairs back
    barrier(CLK_LOCAL_MEM_FENCE);
    d_DstKey[gidxLower] = l_key[lid];
    d_DstVal[gidxLower] = l_val[lid];
    d_DstKey[gidxUpper] = l_key[lidUpper];
    d_DstVal[gidxUpper] = l_val[lidUpper];
}

//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
//One work-item performs exactly one compare-exchange: it reads the two
//pairs at 'pos' and 'pos + stride' from the source buffers, orders them in
//private variables and writes both pairs to the destination buffers.
__kernel void bitonicMergeGlobal(
    __global uint *d_DstKey,
    __global uint *d_DstVal,
    __global uint *d_SrcKey,
    __global uint *d_SrcVal,
    uint arrayLength,
    uint size,
    uint stride,
    uint sortDir
){
    const uint globalComparator = get_global_id(0);
    const uint arrayComparator  = globalComparator & (arrayLength / 2 - 1);

    //Bitonic merge: sortDir is flipped where the (size / 2) bit is set
    const uint dir  = sortDir ^ ( (arrayComparator & (size / 2)) != 0 );
    const uint posA = 2 * globalComparator - (globalComparator & (stride - 1));
    const uint posB = posA + stride;

    uint keyA = d_SrcKey[posA];
    uint valA = d_SrcVal[posA];
    uint keyB = d_SrcKey[posB];
    uint valB = d_SrcVal[posB];

    ComparatorPrivate(&keyA, &valA, &keyB, &valB, dir);

    d_DstKey[posA] = keyA;
    d_DstVal[posA] = valA;
    d_DstKey[posB] = keyB;
    d_DstVal[posB] = valB;
}

//Combined bitonic merge steps for
//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 ..
//LOCAL_SIZE_LIMIT / 2]
//Each work-group stages LOCAL_SIZE_LIMIT key/value pairs in local memory
//(two per work-item), applies the merge passes from the given 'stride'
//down to 1 and writes the pairs to the destination buffers at the offsets
//they were read from.
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
void bitonicMergeLocal(
    __global uint *d_DstKey,
    __global uint *d_DstVal,
    __global uint *d_SrcKey,
    __global uint *d_SrcVal,
    uint arrayLength,
    uint stride,
    uint size,
    uint sortDir
){
    __local uint l_key[LOCAL_SIZE_LIMIT];
    __local uint l_val[LOCAL_SIZE_LIMIT];

    //Local slots and matching global offsets handled by this work-item
    const size_t lid       = get_local_id(0);
    const size_t lidUpper  = lid + (LOCAL_SIZE_LIMIT / 2);
    const size_t gidxLower = get_group_id(0) * LOCAL_SIZE_LIMIT + lid;
    const size_t gidxUpper = gidxLower + (LOCAL_SIZE_LIMIT / 2);

    l_key[lid]      = d_SrcKey[gidxLower];
    l_val[lid]      = d_SrcVal[gidxLower];
    l_key[lidUpper] = d_SrcKey[gidxUpper];
    l_val[lidUpper] = d_SrcVal[gidxUpper];

    //Bitonic merge: sortDir is flipped where the (size / 2) bit of the
    //comparator index is set
    const uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1);
    const uint dir         = sortDir ^ ( (comparatorI & (size / 2)) != 0 );

    for(uint step = stride; step > 0; step >>= 1){
        //Local writes of the previous pass must be visible first
        barrier(CLK_LOCAL_MEM_FENCE);

        const uint pos = 2 * lid - (lid & (step - 1));
        ComparatorLocal(
            &l_key[pos],        &l_val[pos],
            &l_key[pos + step], &l_val[pos + step],
            dir
        );
    }

    //Wait for the final pass, then write the staged pairs back
    barrier(CLK_LOCAL_MEM_FENCE);
    d_DstKey[gidxLower] = l_key[lid];
    d_DstVal[gidxLower] = l_val[lid];
    d_DstKey[gidxUpper] = l_key[lidUpper];
    d_DstVal[gidxUpper] = l_val[lidUpper];
}
diff --git a/src/com/mbien/opencl/demos/sort/BitonicSort.java b/src/com/mbien/opencl/demos/sort/BitonicSort.java
new file mode 100644
index 0000000..be28409
--- /dev/null
+++ b/src/com/mbien/opencl/demos/sort/BitonicSort.java
@@ -0,0 +1,208 @@
/*
 * 18:42 Saturday, February 27 2010
 */
package com.mbien.opencl.demos.sort;

import com.mbien.opencl.CLBuffer;
import
com.mbien.opencl.CLCommandQueue;
import com.mbien.opencl.CLContext;
import com.mbien.opencl.CLDevice;
import com.mbien.opencl.CLKernel;
import com.mbien.opencl.CLProgram;
import java.io.IOException;
import java.nio.IntBuffer;
import java.util.Map;
import java.util.Random;

import static java.lang.System.*;
import static com.mbien.opencl.CLMemory.Mem.*;
import static com.mbien.opencl.CLProgram.*;

/**
 * Bitonic sort optimized for GPUs.
 * Uses NVIDIA's bitonic merge sort kernel.
 * <p>
 * Constructing an instance runs the whole demo: it builds the kernels,
 * fills a buffer with random keys, sorts it on the device and verifies the
 * result on the host.
 * @author Michael Bien
 */
public class BitonicSort {

    private static final String BITONIC_MERGE_GLOBAL = "bitonicMergeGlobal";
    private static final String BITONIC_MERGE_LOCAL  = "bitonicMergeLocal";
    private static final String BITONIC_SORT_LOCAL   = "bitonicSortLocal";
    private static final String BITONIC_SORT_LOCAL1  = "bitonicSortLocal1";

    /** Elements handled per work-group; also passed to the kernels as a build-time define. */
    private static final int LOCAL_SIZE_LIMIT = 1024;

    /** Kernels of the compiled program, keyed by kernel function name. */
    private final Map<String, CLKernel> kernels;

    /**
     * Runs the demo: sorts 1024 random keys and checks the result.
     *
     * @throws IOException if the kernel source can not be read.
     * @throws RuntimeException if the device does not support the required
     *         work-group size or the result is not sorted.
     */
    public BitonicSort() throws IOException {

        final int sortDir  = 1;
        final int elements = 1024;
        final int maxvalue = 1000000000;

        System.out.println("Initializing OpenCL...");

        //Create the context
        CLContext context = CLContext.create();
        try {
            CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue();

            System.out.println("Initializing OpenCL bitonic sorter...");
            kernels = initBitonicSort(context, queue);

            System.out.println("Creating OpenCL memory objects...");
            // The sort runs in place (source == destination), so the kernels
            // write to this buffer; it must not be created READ_ONLY.
            CLBuffer<IntBuffer> keyBuffer = context.createIntBuffer(elements, READ_WRITE, USE_BUFFER);

            // in case of key/value pairs
//            CLBuffer<IntBuffer> valueBuffer = context.createIntBuffer(elements, READ_WRITE, USE_BUFFER);

            System.out.println("Initializing data...\n");
            Random random = new Random();
            for (int i = 0; i < elements; i++) {
                int rnd = random.nextInt(maxvalue);
                keyBuffer.getBuffer().put(i, rnd);
//                valueBuffer.getBuffer().put(i, rnd); // value can be arbitrary
            }

            int arrayLength = elements;
            int batch       = elements / arrayLength;

            System.out.printf("Test array length %d (%d arrays in the batch)...\n", arrayLength, batch);

//            long time = System.currentTimeMillis();

            bitonicSort(queue, keyBuffer, batch, arrayLength, sortDir);

            queue.putReadBuffer(keyBuffer, true);
//            queue.putReadBuffer(valueBuffer, true);
//            System.out.println(System.currentTimeMillis() - time);

            IntBuffer keys = keyBuffer.getBuffer();
            printSnapshot(keys, 10);
            checkIfSorted(keys);

//            IntBuffer values = valueBuffer.getBuffer();
//            printSnapshot(values, 10);
//            checkIfSorted(values);

            System.out.println();

            System.out.println("TEST PASSED");
        } finally {
            // release the context even when setup, sorting or verification failed
            context.release();
        }

    }

    /**
     * Builds the bitonic sort program and checks that the queue's device can
     * run the local-memory kernels with the work-group size they require.
     *
     * @return the program's kernels keyed by name.
     * @throws IOException if the kernel source can not be read.
     * @throws RuntimeException if a kernel supports fewer than
     *         {@code LOCAL_SIZE_LIMIT / 2} work-items per group on the device.
     */
    private Map<String, CLKernel> initBitonicSort(CLContext context, CLCommandQueue queue) throws IOException {

        System.out.println("    creating bitonic sort program");

        CLProgram program = context.createProgram(getClass().getResourceAsStream("BitonicSort.cl"))
                                   .build(define("LOCAL_SIZE_LIMIT", LOCAL_SIZE_LIMIT));

        Map<String, CLKernel> programKernels = program.createCLKernels();

        System.out.println("    checking minimum supported workgroup size");
        //Check for work group size
        CLDevice device = queue.getDevice();
        long szBitonicSortLocal  = programKernels.get(BITONIC_SORT_LOCAL).getWorkGroupSize(device);
        long szBitonicSortLocal1 = programKernels.get(BITONIC_SORT_LOCAL1).getWorkGroupSize(device);
        long szBitonicMergeLocal = programKernels.get(BITONIC_MERGE_LOCAL).getWorkGroupSize(device);

        if (      (szBitonicSortLocal < (LOCAL_SIZE_LIMIT / 2))
               || (szBitonicSortLocal1 < (LOCAL_SIZE_LIMIT / 2))
               || (szBitonicMergeLocal < (LOCAL_SIZE_LIMIT / 2))  ) {
            throw new RuntimeException("Minimum work-group size "+LOCAL_SIZE_LIMIT/2
                    +" required by this application is not supported on this device.");
        }

        return programKernels;

    }

    /**
     * Sorts keys in place; the key buffer is also passed as the value buffer.
     *
     * @see #bitonicSort(CLCommandQueue, CLBuffer, CLBuffer, CLBuffer, CLBuffer, int, int, int)
     */
    public void bitonicSort(CLCommandQueue queue, CLBuffer<?> keys, int batch, int arrayLength, int dir) {
        this.bitonicSort(queue, keys, keys, keys, keys, batch, arrayLength, dir);
    }

    /**
     * Sorts key/value pairs in place.
     *
     * @see #bitonicSort(CLCommandQueue, CLBuffer, CLBuffer, CLBuffer, CLBuffer, int, int, int)
     */
    public void bitonicSort(CLCommandQueue queue, CLBuffer<?> keys, CLBuffer<?> values, int batch, int arrayLength, int dir) {
        this.bitonicSort(queue, keys, values, keys, values, batch, arrayLength, dir);
    }

    /**
     * Enqueues the kernels which sort {@code batch} arrays of
     * {@code arrayLength} key/value pairs each. The call only enqueues work;
     * it does not read results back.
     *
     * @param queue       queue the kernels are enqueued on.
     * @param dstKey      receives the sorted keys.
     * @param dstVal      receives the values, reordered together with the keys.
     * @param srcKey      keys to sort.
     * @param srcVal      values belonging to the keys.
     * @param batch       number of arrays to sort, at least 1.
     * @param arrayLength length of one array; a power of two, at least 2.
     * @param dir         0 or non-zero; any non-zero value is treated as 1
     *                    (ascending order).
     * @throws IllegalArgumentException if {@code arrayLength} is smaller than 2
     *         or not a power of two, if {@code batch} is smaller than 1, or if
     *         {@code batch * arrayLength} is not a multiple of the work-group
     *         element count or too large for an int work size.
     */
    public void bitonicSort(CLCommandQueue queue, CLBuffer<?> dstKey, CLBuffer<?> dstVal, CLBuffer<?> srcKey, CLBuffer<?> srcVal, int batch, int arrayLength, int dir) {

        if (arrayLength < 2) {
            throw new IllegalArgumentException("arrayLength was "+arrayLength);
        }

        // Only power-of-two array lengths are supported so far
        if ((arrayLength & (arrayLength - 1)) != 0) {
            throw new IllegalArgumentException("arrayLength must be a power of two but was "+arrayLength);
        }

        if (batch < 1) {
            throw new IllegalArgumentException("batch was "+batch);
        }

        // every work-group loads and stores exactly LOCAL_SIZE_LIMIT elements
        final long totalElements = (long) batch * arrayLength;
        if (totalElements % LOCAL_SIZE_LIMIT != 0) {
            throw new IllegalArgumentException("batch * arrayLength must be a multiple of "
                    +LOCAL_SIZE_LIMIT+" but was "+totalElements);
        }
        if (totalElements / 2 > Integer.MAX_VALUE) {
            throw new IllegalArgumentException("batch * arrayLength is too large: "+totalElements);
        }

        dir = (dir != 0) ? 1 : 0;

        // one work-item per comparator, i.e. per pair of elements
        final int localWorkSize  = LOCAL_SIZE_LIMIT / 2;
        final int globalWorkSize = (int) (totalElements / 2);

        if (arrayLength <= LOCAL_SIZE_LIMIT) {

            //Launch bitonicSortLocal
            CLKernel kernel = kernels.get(BITONIC_SORT_LOCAL)
                                     .putArgs(dstKey, dstVal, srcKey, srcVal)
                                     .putArg(arrayLength).putArg(dir).rewind();

            queue.put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize);

        } else {

            //Launch bitonicSortLocal1
            CLKernel kernel = kernels.get(BITONIC_SORT_LOCAL1)
                                     .setArgs(dstKey, dstVal, srcKey, srcVal);

            queue.put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize);

            // all following merge steps run in place on the destination buffers
            for (int size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1) {
                for (int stride = size / 2; stride > 0; stride >>= 1) {
                    if (stride >= LOCAL_SIZE_LIMIT) {
                        //Launch bitonicMergeGlobal
                        kernel = kernels.get(BITONIC_MERGE_GLOBAL)
                                        .putArgs(dstKey, dstVal, dstKey, dstVal)
                                        .putArg(arrayLength).putArg(size).putArg(stride).putArg(dir).rewind();

                        // local work size 0, exactly as before
                        queue.put1DRangeKernel(kernel, 0, globalWorkSize, 0);
                    } else {
                        //Launch bitonicMergeLocal
                        kernel = kernels.get(BITONIC_MERGE_LOCAL)
                                        .putArgs(dstKey, dstVal, dstKey, dstVal)
                                        .putArg(arrayLength).putArg(stride).putArg(size).putArg(dir).rewind();

                        queue.put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize);
                        // this kernel handles all remaining strides down to 1 itself
                        break;
                    }
                }
            }
        }
    }

    /**
     * Prints up to {@code snapshot} leading elements of the buffer followed by
     * the number of elements not printed, then rewinds the buffer.
     */
    private void printSnapshot(IntBuffer buffer, int snapshot) {
        // never read past the end of the buffer
        final int count = Math.min(snapshot, buffer.remaining());
        for (int i = 0; i < count; i++) {
            out.print(buffer.get() + ", ");
        }
        out.println("...; " + buffer.remaining() + " more");
        buffer.rewind();
    }

    /**
     * Verifies that the keys are in ascending order.
     *
     * @throws RuntimeException on the first adjacent pair which is out of order.
     */
    private void checkIfSorted(IntBuffer keys) {
        for (int i = 1; i < keys.capacity(); i++) {
            final int previous = keys.get(i - 1);
            final int current  = keys.get(i);
            if (previous > current) {
                throw new RuntimeException("not sorted "+ previous +"!> "+ current);
            }
        }
    }

    public static void main(String[] args) throws IOException {
        new BitonicSort();
    }
}