diff options
author | Michael Bien <[email protected]> | 2010-03-01 02:37:16 +0100 |
---|---|---|
committer | Michael Bien <[email protected]> | 2010-03-01 02:37:16 +0100 |
commit | 85a5c5f455aec2062f12d1a4c0a1a3c10beeefb7 (patch) | |
tree | 659759b91f9b0c7992dd6dfedbd45fe385d331cb /src/com/mbien/opencl/demos/bitonicsort | |
parent | 04fac379b2bb5876b21d047294bd3e029e6ebe3c (diff) |
moved bitonic sort into seperate package.
Diffstat (limited to 'src/com/mbien/opencl/demos/bitonicsort')
-rw-r--r-- | src/com/mbien/opencl/demos/bitonicsort/BitonicSort.cl | 214 | ||||
-rw-r--r-- | src/com/mbien/opencl/demos/bitonicsort/BitonicSort.java | 201 |
2 files changed, 415 insertions, 0 deletions
diff --git a/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.cl b/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.cl new file mode 100644 index 0000000..a8d0e1d --- /dev/null +++ b/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.cl @@ -0,0 +1,214 @@ +/* + * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. + * + * NVIDIA Corporation and its licensors retain all intellectual property and + * proprietary rights in and to this software and related documentation. + * Any use, reproduction, disclosure, or distribution of this software + * and related documentation without an express license agreement from + * NVIDIA Corporation is strictly prohibited. + * + * Please refer to the applicable NVIDIA end user license agreement (EULA) + * associated with this source code for terms and conditions that govern + * your use of this NVIDIA software. + * + */ + + + +//Passed down by clBuildProgram +//#define LOCAL_SIZE_LIMIT 1024 + + + +inline void ComparatorPrivate( + uint *keyA, + uint *keyB, + uint arrowDir +){ + if( (*keyA > *keyB) == arrowDir ){ + uint t; + t = *keyA; *keyA = *keyB; *keyB = t; + } +} + +inline void ComparatorLocal( + __local uint *keyA, + __local uint *keyB, + uint arrowDir +){ + if( (*keyA > *keyB) == arrowDir ){ + uint t; + t = *keyA; *keyA = *keyB; *keyB = t; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Monolithic bitonic sort kernel for short arrays fitting into local memory +//////////////////////////////////////////////////////////////////////////////// +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1))) +void bitonicSortLocal( + __global uint *d_DstKey, + __global uint *d_SrcKey, + uint arrayLength, + uint sortDir +){ + __local uint l_key[LOCAL_SIZE_LIMIT]; + + //Offset to the beginning of subbatch and load data + d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; + l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; + + for(uint size = 2; size < arrayLength; size <<= 1){ + //Bitonic merge + uint dir = ( (get_local_id(0) & (size / 2)) != 0 ); + for(uint stride = size / 2; stride > 0; stride >>= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal( + &l_key[pos + 0], + &l_key[pos + stride], + dir + ); + } + } + + //dir == sortDir for the last bitonic merge step + { + for(uint stride = arrayLength / 2; stride > 0; stride >>= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal( + &l_key[pos + 0], + &l_key[pos + stride], + sortDir + ); + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + d_DstKey[ 0] = l_key[get_local_id(0) + 0]; + d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; +} + +//////////////////////////////////////////////////////////////////////////////// +// Bitonic sort kernel for large arrays (not fitting into local memory) +//////////////////////////////////////////////////////////////////////////////// +//Bottom-level bitonic sort +//Almost the same as bitonicSortLocal with the only exception +//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being +//sorted in opposite directions +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1))) +void bitonicSortLocal1( + __global uint *d_DstKey, + __global uint *d_SrcKey +){ + __local uint l_key[LOCAL_SIZE_LIMIT]; + + //Offset to the beginning of subarray and load data + d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; + l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; + + uint comparatorI = get_global_id(0) & ((LOCAL_SIZE_LIMIT / 2) - 1); + + for(uint size = 2; size < LOCAL_SIZE_LIMIT; size <<= 1){ + //Bitonic merge + uint dir = (comparatorI & (size / 2)) != 0; + for(uint stride = size / 2; stride > 0; stride >>= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal( + &l_key[pos + 0], + &l_key[pos + stride], + dir + ); + } + } + + //Odd / even arrays of LOCAL_SIZE_LIMIT elements + //sorted in opposite directions + { + uint dir = (get_group_id(0) & 1); + for(uint stride = LOCAL_SIZE_LIMIT / 2; stride > 0; stride >>= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal( + &l_key[pos + 0], + &l_key[pos + stride], + dir + ); + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + d_DstKey[ 0] = l_key[get_local_id(0) + 0]; + d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; +} + +//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT +__kernel void bitonicMergeGlobal( + __global uint *d_DstKey, + __global uint *d_SrcKey, + uint arrayLength, + uint size, + uint stride, + uint sortDir +){ + uint global_comparatorI = get_global_id(0); + uint comparatorI = global_comparatorI & (arrayLength / 2 - 1); + + //Bitonic merge + uint dir = sortDir ^ ( (comparatorI & (size / 2)) != 0 ); + uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1)); + + uint keyA = d_SrcKey[pos + 0]; + uint keyB = d_SrcKey[pos + stride]; + + ComparatorPrivate( + &keyA, + &keyB, + dir + ); + + d_DstKey[pos + 0] = keyA; + d_DstKey[pos + stride] = keyB; +} + +//Combined bitonic merge steps for +//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2] +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1))) +void bitonicMergeLocal( + __global uint *d_DstKey, + __global uint *d_SrcKey, + uint arrayLength, + uint stride, + uint size, + uint sortDir +){ + __local uint l_key[LOCAL_SIZE_LIMIT]; + + d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; + l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; + + //Bitonic merge + uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1); + uint dir = sortDir ^ ( (comparatorI & (size / 2)) != 0 ); + for(; stride > 0; stride >>= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal( + &l_key[pos + 0], + &l_key[pos + stride], + dir + ); + } + + barrier(CLK_LOCAL_MEM_FENCE); + d_DstKey[ 0] = l_key[get_local_id(0) + 0]; + d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; +} diff --git a/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.java b/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.java new file mode 100644 index 0000000..ed5b32e --- /dev/null +++ b/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.java @@ -0,0 +1,201 @@ +/* + * 18:42 Saturday, February 27 2010 + */ +package com.mbien.opencl.demos.bitonicsort; + +import com.mbien.opencl.CLBuffer; +import com.mbien.opencl.CLCommandQueue; +import com.mbien.opencl.CLContext; +import com.mbien.opencl.CLDevice; +import com.mbien.opencl.CLKernel; +import com.mbien.opencl.CLProgram; +import java.io.IOException; +import java.nio.IntBuffer; +import java.util.Map; +import java.util.Random; + +import static java.lang.System.*; +import static com.mbien.opencl.CLMemory.Mem.*; +import static com.mbien.opencl.CLProgram.*; + +/** + * Bitonic sort optimized for GPUs. + * Uses NVIDIA's bitonic merge sort kernel. + * @author Michael Bien + */ +public class BitonicSort { + + private static final String BITONIC_MERGE_GLOBAL = "bitonicMergeGlobal"; + private static final String BITONIC_MERGE_LOCAL = "bitonicMergeLocal"; + private static final String BITONIC_SORT_LOCAL = "bitonicSortLocal"; + private static final String BITONIC_SORT_LOCAL1 = "bitonicSortLocal1"; + + private final static int LOCAL_SIZE_LIMIT = 1024; + private final Map<String, CLKernel> kernels; + + public BitonicSort() throws IOException { + + final int sortDir = 1; + final int elements = 1048576; + final int maxvalue = 1000000; + + out.println("Initializing OpenCL..."); + + //Create the context + CLContext context = null; + + try{ + + context = CLContext.create(); + CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue(); + + out.println("Initializing OpenCL bitonic sorter..."); + kernels = initBitonicSort(queue); + + out.println("Creating OpenCL memory objects..."); + CLBuffer<IntBuffer> keyBuffer = context.createIntBuffer(elements, READ_ONLY, USE_BUFFER); + System.out.println(keyBuffer.getCLSize()/1000000.0f); + + out.println("Initializing data...\n"); + Random random = new Random(); + for (int i = 0; i < elements; i++) { + int rnd = random.nextInt(maxvalue); + keyBuffer.getBuffer().put(i, rnd); + } + + int arrayLength = elements; + int batch = elements / arrayLength; + + out.printf("Test array length %d (%d arrays in the batch)...\n", arrayLength, batch); + + long time = currentTimeMillis(); + + bitonicSort(queue, keyBuffer, keyBuffer, batch, arrayLength, sortDir); + queue.putReadBuffer(keyBuffer, true); + + out.println(currentTimeMillis() - time+"ms"); + + IntBuffer keys = keyBuffer.getBuffer(); + printSnapshot(keys, 20); + checkIfSorted(keys); + + out.println("\nTEST PASSED"); + + }finally{ + if(context!=null) { + context.release(); + } + } + + } + + private Map<String, CLKernel> initBitonicSort(CLCommandQueue queue) throws IOException { + + out.println(" creating bitonic sort program"); + + CLContext context = queue.getContext(); + + CLProgram program = context.createProgram(getClass().getResourceAsStream("BitonicSort.cl")) + .build(define("LOCAL_SIZE_LIMIT", LOCAL_SIZE_LIMIT)); + + Map<String, CLKernel> kernelMap = program.createCLKernels(); + + out.println(" checking minimum supported workgroup size"); + //Check for work group size + CLDevice device = queue.getDevice(); + long szBitonicSortLocal = kernelMap.get(BITONIC_SORT_LOCAL).getWorkGroupSize(device); + long szBitonicSortLocal1 = kernelMap.get(BITONIC_SORT_LOCAL1).getWorkGroupSize(device); + long szBitonicMergeLocal = kernelMap.get(BITONIC_MERGE_LOCAL).getWorkGroupSize(device); + + if ( (szBitonicSortLocal < (LOCAL_SIZE_LIMIT / 2)) + || (szBitonicSortLocal1 < (LOCAL_SIZE_LIMIT / 2)) + || (szBitonicMergeLocal < (LOCAL_SIZE_LIMIT / 2)) ) { + throw new RuntimeException("Minimum work-group size "+LOCAL_SIZE_LIMIT/2 + +" required by this application is not supported on this device."); + } + + return kernelMap; + + } + + public void bitonicSort(CLCommandQueue queue, CLBuffer<?> dstKey, CLBuffer<?> srcKey, int batch, int arrayLength, int dir) { + + if (arrayLength < 2) { + throw new IllegalArgumentException("arrayLength was "+arrayLength); + } + + // TODO Only power-of-two array lengths are supported so far + + dir = (dir != 0) ? 1 : 0; + + CLKernel sortlocal1 = kernels.get(BITONIC_SORT_LOCAL1); + CLKernel sortlocal = kernels.get(BITONIC_SORT_LOCAL); + CLKernel mergeGlobal = kernels.get(BITONIC_MERGE_GLOBAL); + CLKernel mergeLocal = kernels.get(BITONIC_MERGE_LOCAL); + + if (arrayLength <= LOCAL_SIZE_LIMIT) { + + // oclCheckError( (batch * arrayLength) % LOCAL_SIZE_LIMIT == 0, shrTRUE ); + + //Launch bitonicSortLocal + sortlocal.putArgs(dstKey, srcKey) + .putArg(arrayLength).putArg(dir).rewind(); + + int localWorkSize = LOCAL_SIZE_LIMIT / 2; + int globalWorkSize = batch * arrayLength / 2; + queue.put1DRangeKernel(sortlocal, 0, globalWorkSize, localWorkSize); + + } else { + + //Launch bitonicSortLocal1 + sortlocal1.setArgs(dstKey, srcKey); + + int localWorkSize = LOCAL_SIZE_LIMIT / 2; + int globalWorkSize = batch * arrayLength / 2; + + queue.put1DRangeKernel(sortlocal1, 0, globalWorkSize, localWorkSize); + + for (int size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1) { + for (int stride = size / 2; stride > 0; stride >>= 1) { + if (stride >= LOCAL_SIZE_LIMIT) { + //Launch bitonicMergeGlobal + mergeGlobal.putArgs(dstKey, dstKey) + .putArg(arrayLength).putArg(size).putArg(stride).putArg(dir).rewind(); + + globalWorkSize = batch * arrayLength / 2; + queue.put1DRangeKernel(mergeGlobal, 0, globalWorkSize, 0); + } else { + //Launch bitonicMergeLocal + mergeLocal.putArgs(dstKey, dstKey) + .putArg(arrayLength).putArg(stride).putArg(size).putArg(dir).rewind(); + + localWorkSize = LOCAL_SIZE_LIMIT / 2; + globalWorkSize = batch * arrayLength / 2; + + queue.put1DRangeKernel(mergeLocal, 0, globalWorkSize, localWorkSize); + break; + } + } + } + } + } + + private void printSnapshot(IntBuffer buffer, int snapshot) { + for(int i = 0; i < snapshot; i++) + out.print(buffer.get() + ", "); + out.println("...; " + buffer.remaining() + " more"); + buffer.rewind(); + } + + private void checkIfSorted(IntBuffer keys) { + for (int i = 1; i < keys.capacity(); i++) { + if (keys.get(i - 1) > keys.get(i)) { + throw new RuntimeException("not sorted "+ keys.get(i - 1) +"!> "+ keys.get(i)); + } + } + } + + public static void main(String[] args) throws IOException { + new BitonicSort(); + } +} |