diff options
Diffstat (limited to 'src/com/mbien')
-rw-r--r-- | src/com/mbien/opencl/demos/sort/BitonicSort.cl | 65 | ||||
-rw-r--r-- | src/com/mbien/opencl/demos/sort/BitonicSort.java | 137 |
2 files changed, 78 insertions, 124 deletions
diff --git a/src/com/mbien/opencl/demos/sort/BitonicSort.cl b/src/com/mbien/opencl/demos/sort/BitonicSort.cl index a89b06b..a8d0e1d 100644 --- a/src/com/mbien/opencl/demos/sort/BitonicSort.cl +++ b/src/com/mbien/opencl/demos/sort/BitonicSort.cl @@ -22,29 +22,23 @@ inline void ComparatorPrivate( uint *keyA, - uint *valA, uint *keyB, - uint *valB, uint arrowDir ){ if( (*keyA > *keyB) == arrowDir ){ uint t; t = *keyA; *keyA = *keyB; *keyB = t; - t = *valA; *valA = *valB; *valB = t; } } inline void ComparatorLocal( __local uint *keyA, - __local uint *valA, __local uint *keyB, - __local uint *valB, uint arrowDir ){ if( (*keyA > *keyB) == arrowDir ){ uint t; t = *keyA; *keyA = *keyB; *keyB = t; - t = *valA; *valA = *valB; *valB = t; } } @@ -54,24 +48,17 @@ inline void ComparatorLocal( __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1))) void bitonicSortLocal( __global uint *d_DstKey, - __global uint *d_DstVal, __global uint *d_SrcKey, - __global uint *d_SrcVal, uint arrayLength, uint sortDir ){ __local uint l_key[LOCAL_SIZE_LIMIT]; - __local uint l_val[LOCAL_SIZE_LIMIT]; //Offset to the beginning of subbatch and load data d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); - d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); - d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; - l_val[get_local_id(0) + 0] = d_SrcVal[ 0]; l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; - l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)]; for(uint size = 2; size < arrayLength; size <<= 1){ //Bitonic merge @@ -80,8 +67,8 @@ void bitonicSortLocal( barrier(CLK_LOCAL_MEM_FENCE); uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); ComparatorLocal( - &l_key[pos + 0], &l_val[pos + 0], - &l_key[pos + stride], &l_val[pos + stride], + &l_key[pos + 0], + &l_key[pos + stride], dir ); } @@ -93,8 +80,8 @@ void bitonicSortLocal( barrier(CLK_LOCAL_MEM_FENCE); uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); ComparatorLocal( - &l_key[pos + 0], &l_val[pos + 0], - &l_key[pos + stride], &l_val[pos + stride], + &l_key[pos + 0], + &l_key[pos + stride], sortDir ); } @@ -102,9 +89,7 @@ void bitonicSortLocal( barrier(CLK_LOCAL_MEM_FENCE); d_DstKey[ 0] = l_key[get_local_id(0) + 0]; - d_DstVal[ 0] = l_val[get_local_id(0) + 0]; d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; - d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; } //////////////////////////////////////////////////////////////////////////////// @@ -117,22 +102,15 @@ void bitonicSortLocal( __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1))) void bitonicSortLocal1( __global uint *d_DstKey, - __global uint *d_DstVal, - __global uint *d_SrcKey, - __global uint *d_SrcVal + __global uint *d_SrcKey ){ __local uint l_key[LOCAL_SIZE_LIMIT]; - __local uint l_val[LOCAL_SIZE_LIMIT]; //Offset to the beginning of subarray and load data d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); - d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); - d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; - l_val[get_local_id(0) + 0] = d_SrcVal[ 0]; l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; - l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)]; uint comparatorI = get_global_id(0) & ((LOCAL_SIZE_LIMIT / 2) - 1); @@ -143,8 +121,8 @@ void bitonicSortLocal1( barrier(CLK_LOCAL_MEM_FENCE); uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); ComparatorLocal( - &l_key[pos + 0], &l_val[pos + 0], - &l_key[pos + stride], &l_val[pos + stride], + &l_key[pos + 0], + &l_key[pos + stride], dir ); } @@ -158,8 +136,8 @@ void bitonicSortLocal1( barrier(CLK_LOCAL_MEM_FENCE); uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); ComparatorLocal( - &l_key[pos + 0], &l_val[pos + 0], - &l_key[pos + stride], &l_val[pos + stride], + &l_key[pos + 0], + &l_key[pos + stride], dir ); } @@ -167,17 +145,13 @@ void bitonicSortLocal1( barrier(CLK_LOCAL_MEM_FENCE); d_DstKey[ 0] = l_key[get_local_id(0) + 0]; - d_DstVal[ 0] = l_val[get_local_id(0) + 0]; d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; - d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; } //Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT __kernel void bitonicMergeGlobal( __global uint *d_DstKey, - __global uint *d_DstVal, __global uint *d_SrcKey, - __global uint *d_SrcVal, uint arrayLength, uint size, uint stride, @@ -191,20 +165,16 @@ __kernel void bitonicMergeGlobal( uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1)); uint keyA = d_SrcKey[pos + 0]; - uint valA = d_SrcVal[pos + 0]; uint keyB = d_SrcKey[pos + stride]; - uint valB = d_SrcVal[pos + stride]; ComparatorPrivate( - &keyA, &valA, - &keyB, &valB, + &keyA, + &keyB, dir ); d_DstKey[pos + 0] = keyA; - d_DstVal[pos + 0] = valA; d_DstKey[pos + stride] = keyB; - d_DstVal[pos + stride] = valB; } //Combined bitonic merge steps for @@ -212,25 +182,18 @@ __kernel void bitonicMergeGlobal( __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1))) void bitonicMergeLocal( __global uint *d_DstKey, - __global uint *d_DstVal, __global uint *d_SrcKey, - __global uint *d_SrcVal, uint arrayLength, uint stride, uint size, uint sortDir ){ __local uint l_key[LOCAL_SIZE_LIMIT]; - __local uint l_val[LOCAL_SIZE_LIMIT]; d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); - d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); - d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; - l_val[get_local_id(0) + 0] = d_SrcVal[ 0]; l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; - l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)]; //Bitonic merge uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1); @@ -239,15 +202,13 @@ void bitonicMergeLocal( barrier(CLK_LOCAL_MEM_FENCE); uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); ComparatorLocal( - &l_key[pos + 0], &l_val[pos + 0], - &l_key[pos + stride], &l_val[pos + stride], + &l_key[pos + 0], + &l_key[pos + stride], dir ); } barrier(CLK_LOCAL_MEM_FENCE); d_DstKey[ 0] = l_key[get_local_id(0) + 0]; - d_DstVal[ 0] = l_val[get_local_id(0) + 0]; d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; - d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; } diff --git a/src/com/mbien/opencl/demos/sort/BitonicSort.java b/src/com/mbien/opencl/demos/sort/BitonicSort.java index be28409..207ad1e 100644 --- a/src/com/mbien/opencl/demos/sort/BitonicSort.java +++ b/src/com/mbien/opencl/demos/sort/BitonicSort.java @@ -25,9 +25,10 @@ import static com.mbien.opencl.CLProgram.*; */ public class BitonicSort { - private static final String BITONIC_MERGE_LOCAL = "bitonicMergeLocal"; - private static final String BITONIC_SORT_LOCAL = "bitonicSortLocal"; - private static final String BITONIC_SORT_LOCAL1 = "bitonicSortLocal1"; + private static final String BITONIC_MERGE_GLOBAL = "bitonicMergeGlobal"; + private static final String BITONIC_MERGE_LOCAL = "bitonicMergeLocal"; + private static final String BITONIC_SORT_LOCAL = "bitonicSortLocal"; + private static final String BITONIC_SORT_LOCAL1 = "bitonicSortLocal1"; private final static int LOCAL_SIZE_LIMIT = 1024; private final Map<String, CLKernel> kernels; @@ -35,77 +36,76 @@ public class BitonicSort { public BitonicSort() throws IOException { final int sortDir = 1; - final int elements = 1024; - final int maxvalue = 1000000000; + final int elements = 1048576; + final int maxvalue = 100000; - System.out.println("Initializing OpenCL..."); + out.println("Initializing OpenCL..."); //Create the context - CLContext context = CLContext.create(); - CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue(); + CLContext context = null; - System.out.println("Initializing OpenCL bitonic sorter..."); - kernels = initBitonicSort(context, queue); + try{ + context = CLContext.create(); + CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue(); - System.out.println("Creating OpenCL memory objects..."); - CLBuffer<IntBuffer> keyBuffer = context.createIntBuffer(elements, READ_ONLY, USE_BUFFER); + out.println("Initializing OpenCL bitonic sorter..."); + kernels = initBitonicSort(queue); - // in case of key/value pairs -// CLBuffer<IntBuffer> valueBuffer = context.createIntBuffer(elements, READ_ONLY, USE_BUFFER); + out.println("Creating OpenCL memory objects..."); + CLBuffer<IntBuffer> keyBuffer = context.createIntBuffer(elements, READ_ONLY, USE_BUFFER); + System.out.println(keyBuffer.getCLSize()/1000000.0f); - System.out.println("Initializing data...\n"); - Random random = new Random(); - for (int i = 0; i < elements; i++) { - int rnd = random.nextInt(maxvalue); - keyBuffer.getBuffer().put(i, rnd); -// valueBuffer.getBuffer().put(i, rnd); // value can be arbitary - } - - int arrayLength = elements; - int batch = elements / arrayLength; - - System.out.printf("Test array length %d (%d arrays in the batch)...\n", arrayLength, batch); + out.println("Initializing data...\n"); + Random random = new Random(); + for (int i = 0; i < elements; i++) { + int rnd = random.nextInt(maxvalue); + keyBuffer.getBuffer().put(i, rnd); + } -// long time = System.currentTimeMillis(); + int arrayLength = elements; + int batch = elements / arrayLength; - bitonicSort(queue, keyBuffer, batch, arrayLength, sortDir); + out.printf("Test array length %d (%d arrays in the batch)...\n", arrayLength, batch); - queue.putReadBuffer(keyBuffer, true); -// queue.putReadBuffer(valueBuffer, true); -// System.out.println(System.currentTimeMillis() - time); + long time = currentTimeMillis(); - IntBuffer keys = keyBuffer.getBuffer(); - printSnapshot(keys, 10); - checkIfSorted(keys); + bitonicSort(queue, keyBuffer, keyBuffer, batch, arrayLength, sortDir); + queue.putReadBuffer(keyBuffer, true); -// IntBuffer values = valueBuffer.getBuffer(); -// printSnapshot(values, 10); -// checkIfSorted(values); + out.println(currentTimeMillis() - time+"ms"); - System.out.println(); + IntBuffer keys = keyBuffer.getBuffer(); + printSnapshot(keys, 10); + checkIfSorted(keys); - System.out.println("TEST PASSED"); + out.println("\nTEST PASSED"); - context.release(); + }finally{ + if(context!=null) { + context.release(); + } + } } - private Map<String, CLKernel> initBitonicSort(CLContext context, CLCommandQueue queue) throws IOException { + private Map<String, CLKernel> initBitonicSort(CLCommandQueue queue) throws IOException { - System.out.println(" creating bitonic sort program"); + out.println(" creating bitonic sort program"); + + CLContext context = queue.getContext(); CLProgram program = context.createProgram(getClass().getResourceAsStream("BitonicSort.cl")) .build(define("LOCAL_SIZE_LIMIT", LOCAL_SIZE_LIMIT)); - Map<String, CLKernel> kernels = program.createCLKernels(); + Map<String, CLKernel> kernelMap = program.createCLKernels(); - System.out.println(" checking minimum supported workgroup size"); + out.println(" checking minimum supported workgroup size"); //Check for work group size CLDevice device = queue.getDevice(); - long szBitonicSortLocal = kernels.get(BITONIC_SORT_LOCAL).getWorkGroupSize(device); - long szBitonicSortLocal1 = kernels.get(BITONIC_SORT_LOCAL1).getWorkGroupSize(device); - long szBitonicMergeLocal = kernels.get(BITONIC_MERGE_LOCAL).getWorkGroupSize(device); + long szBitonicSortLocal = kernelMap.get(BITONIC_SORT_LOCAL).getWorkGroupSize(device); + long szBitonicSortLocal1 = kernelMap.get(BITONIC_SORT_LOCAL1).getWorkGroupSize(device); + long szBitonicMergeLocal = kernelMap.get(BITONIC_MERGE_LOCAL).getWorkGroupSize(device); if ( (szBitonicSortLocal < (LOCAL_SIZE_LIMIT / 2)) || (szBitonicSortLocal1 < (LOCAL_SIZE_LIMIT / 2)) @@ -114,19 +114,11 @@ public class BitonicSort { +" required by this application is not supported on this device."); } - return kernels; - - } - - public void bitonicSort(CLCommandQueue queue, CLBuffer<?> keys, int batch, int arrayLength, int dir) { - this.bitonicSort(queue, keys, keys, keys, keys, batch, arrayLength, dir); - } + return kernelMap; - public void bitonicSort(CLCommandQueue queue, CLBuffer<?> keys, CLBuffer<?> values, int batch, int arrayLength, int dir) { - this.bitonicSort(queue, keys, values, keys, values, batch, arrayLength, dir); } - public void bitonicSort(CLCommandQueue queue, CLBuffer<?> dstKey, CLBuffer<?> dstVal, CLBuffer<?> srcKey, CLBuffer<?> srcVal, int batch, int arrayLength, int dir) { + public void bitonicSort(CLCommandQueue queue, CLBuffer<?> dstKey, CLBuffer<?> srcKey, int batch, int arrayLength, int dir) { if (arrayLength < 2) { throw new IllegalArgumentException("arrayLength was "+arrayLength); @@ -136,50 +128,51 @@ public class BitonicSort { dir = (dir != 0) ? 1 : 0; + CLKernel sortlocal1 = kernels.get(BITONIC_SORT_LOCAL1); + CLKernel sortlocal = kernels.get(BITONIC_SORT_LOCAL); + CLKernel mergeGlobal = kernels.get(BITONIC_MERGE_GLOBAL); + CLKernel mergeLocal = kernels.get(BITONIC_MERGE_LOCAL); + if (arrayLength <= LOCAL_SIZE_LIMIT) { // oclCheckError( (batch * arrayLength) % LOCAL_SIZE_LIMIT == 0, shrTRUE ); //Launch bitonicSortLocal - CLKernel kernel = kernels.get(BITONIC_SORT_LOCAL) - .putArgs(dstKey, dstVal, srcKey, srcVal) - .putArg(arrayLength).putArg(dir).rewind(); + sortlocal.putArgs(dstKey, srcKey) + .putArg(arrayLength).putArg(dir).rewind(); int localWorkSize = LOCAL_SIZE_LIMIT / 2; int globalWorkSize = batch * arrayLength / 2; - queue.put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize); + queue.put1DRangeKernel(sortlocal, 0, globalWorkSize, localWorkSize); } else { //Launch bitonicSortLocal1 - CLKernel kernel = kernels.get(BITONIC_SORT_LOCAL1) - .setArgs(dstKey, dstVal, srcKey, srcVal); + sortlocal1.setArgs(dstKey, srcKey); int localWorkSize = LOCAL_SIZE_LIMIT / 2; int globalWorkSize = batch * arrayLength / 2; - queue.put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize); + queue.put1DRangeKernel(sortlocal1, 0, globalWorkSize, localWorkSize); for (int size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1) { for (int stride = size / 2; stride > 0; stride >>= 1) { if (stride >= LOCAL_SIZE_LIMIT) { //Launch bitonicMergeGlobal - kernel = kernels.get("bitonicMergeGlobal") - .putArgs(dstKey, dstVal, dstKey, dstVal) - .putArg(arrayLength).putArg(size).putArg(stride).putArg(dir).rewind(); + mergeGlobal.putArgs(dstKey, dstKey) + .putArg(arrayLength).putArg(size).putArg(stride).putArg(dir).rewind(); globalWorkSize = batch * arrayLength / 2; - queue.put1DRangeKernel(kernel, 0, globalWorkSize, 0); + queue.put1DRangeKernel(mergeGlobal, 0, globalWorkSize, 0); } else { //Launch bitonicMergeLocal - kernel = kernels.get(BITONIC_MERGE_LOCAL) - .putArgs(dstKey, dstVal, dstKey, dstVal) - .putArg(arrayLength).putArg(stride).putArg(size).putArg(dir).rewind(); + mergeLocal.putArgs(dstKey, dstKey) + .putArg(arrayLength).putArg(stride).putArg(size).putArg(dir).rewind(); localWorkSize = LOCAL_SIZE_LIMIT / 2; globalWorkSize = batch * arrayLength / 2; - queue.put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize); + queue.put1DRangeKernel(mergeLocal, 0, globalWorkSize, localWorkSize); break; } } |