summaryrefslogtreecommitdiffstats
path: root/src/com/mbien/opencl
diff options
context:
space:
mode:
authorMichael Bien <[email protected]>2010-03-01 02:32:07 +0100
committerMichael Bien <[email protected]>2010-03-01 02:32:07 +0100
commit04fac379b2bb5876b21d047294bd3e029e6ebe3c (patch)
treeef78c8a535121a878b9fb81f1336d6fd3cb49c00 /src/com/mbien/opencl
parent48a46a6084878803acadd44b0e96e237cf5768c1 (diff)
cleaned up bitonic sort sample and removed sort payload for simplicity (value of the key-value pair wasn't used anyway).
Diffstat (limited to 'src/com/mbien/opencl')
-rw-r--r--src/com/mbien/opencl/demos/sort/BitonicSort.cl65
-rw-r--r--src/com/mbien/opencl/demos/sort/BitonicSort.java137
2 files changed, 78 insertions, 124 deletions
diff --git a/src/com/mbien/opencl/demos/sort/BitonicSort.cl b/src/com/mbien/opencl/demos/sort/BitonicSort.cl
index a89b06b..a8d0e1d 100644
--- a/src/com/mbien/opencl/demos/sort/BitonicSort.cl
+++ b/src/com/mbien/opencl/demos/sort/BitonicSort.cl
@@ -22,29 +22,23 @@
inline void ComparatorPrivate(
uint *keyA,
- uint *valA,
uint *keyB,
- uint *valB,
uint arrowDir
){
if( (*keyA > *keyB) == arrowDir ){
uint t;
t = *keyA; *keyA = *keyB; *keyB = t;
- t = *valA; *valA = *valB; *valB = t;
}
}
inline void ComparatorLocal(
__local uint *keyA,
- __local uint *valA,
__local uint *keyB,
- __local uint *valB,
uint arrowDir
){
if( (*keyA > *keyB) == arrowDir ){
uint t;
t = *keyA; *keyA = *keyB; *keyB = t;
- t = *valA; *valA = *valB; *valB = t;
}
}
@@ -54,24 +48,17 @@ inline void ComparatorLocal(
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
void bitonicSortLocal(
__global uint *d_DstKey,
- __global uint *d_DstVal,
__global uint *d_SrcKey,
- __global uint *d_SrcVal,
uint arrayLength,
uint sortDir
){
__local uint l_key[LOCAL_SIZE_LIMIT];
- __local uint l_val[LOCAL_SIZE_LIMIT];
//Offset to the beginning of subbatch and load data
d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
- d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
- d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
l_key[get_local_id(0) + 0] = d_SrcKey[ 0];
- l_val[get_local_id(0) + 0] = d_SrcVal[ 0];
l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
- l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)];
for(uint size = 2; size < arrayLength; size <<= 1){
//Bitonic merge
@@ -80,8 +67,8 @@ void bitonicSortLocal(
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(
- &l_key[pos + 0], &l_val[pos + 0],
- &l_key[pos + stride], &l_val[pos + stride],
+ &l_key[pos + 0],
+ &l_key[pos + stride],
dir
);
}
@@ -93,8 +80,8 @@ void bitonicSortLocal(
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(
- &l_key[pos + 0], &l_val[pos + 0],
- &l_key[pos + stride], &l_val[pos + stride],
+ &l_key[pos + 0],
+ &l_key[pos + stride],
sortDir
);
}
@@ -102,9 +89,7 @@ void bitonicSortLocal(
barrier(CLK_LOCAL_MEM_FENCE);
d_DstKey[ 0] = l_key[get_local_id(0) + 0];
- d_DstVal[ 0] = l_val[get_local_id(0) + 0];
d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
- d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
}
////////////////////////////////////////////////////////////////////////////////
@@ -117,22 +102,15 @@ void bitonicSortLocal(
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
void bitonicSortLocal1(
__global uint *d_DstKey,
- __global uint *d_DstVal,
- __global uint *d_SrcKey,
- __global uint *d_SrcVal
+ __global uint *d_SrcKey
){
__local uint l_key[LOCAL_SIZE_LIMIT];
- __local uint l_val[LOCAL_SIZE_LIMIT];
//Offset to the beginning of subarray and load data
d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
- d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
- d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
l_key[get_local_id(0) + 0] = d_SrcKey[ 0];
- l_val[get_local_id(0) + 0] = d_SrcVal[ 0];
l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
- l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)];
uint comparatorI = get_global_id(0) & ((LOCAL_SIZE_LIMIT / 2) - 1);
@@ -143,8 +121,8 @@ void bitonicSortLocal1(
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(
- &l_key[pos + 0], &l_val[pos + 0],
- &l_key[pos + stride], &l_val[pos + stride],
+ &l_key[pos + 0],
+ &l_key[pos + stride],
dir
);
}
@@ -158,8 +136,8 @@ void bitonicSortLocal1(
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(
- &l_key[pos + 0], &l_val[pos + 0],
- &l_key[pos + stride], &l_val[pos + stride],
+ &l_key[pos + 0],
+ &l_key[pos + stride],
dir
);
}
@@ -167,17 +145,13 @@ void bitonicSortLocal1(
barrier(CLK_LOCAL_MEM_FENCE);
d_DstKey[ 0] = l_key[get_local_id(0) + 0];
- d_DstVal[ 0] = l_val[get_local_id(0) + 0];
d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
- d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
}
//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
__kernel void bitonicMergeGlobal(
__global uint *d_DstKey,
- __global uint *d_DstVal,
__global uint *d_SrcKey,
- __global uint *d_SrcVal,
uint arrayLength,
uint size,
uint stride,
@@ -191,20 +165,16 @@ __kernel void bitonicMergeGlobal(
uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
uint keyA = d_SrcKey[pos + 0];
- uint valA = d_SrcVal[pos + 0];
uint keyB = d_SrcKey[pos + stride];
- uint valB = d_SrcVal[pos + stride];
ComparatorPrivate(
- &keyA, &valA,
- &keyB, &valB,
+ &keyA,
+ &keyB,
dir
);
d_DstKey[pos + 0] = keyA;
- d_DstVal[pos + 0] = valA;
d_DstKey[pos + stride] = keyB;
- d_DstVal[pos + stride] = valB;
}
//Combined bitonic merge steps for
@@ -212,25 +182,18 @@ __kernel void bitonicMergeGlobal(
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
void bitonicMergeLocal(
__global uint *d_DstKey,
- __global uint *d_DstVal,
__global uint *d_SrcKey,
- __global uint *d_SrcVal,
uint arrayLength,
uint stride,
uint size,
uint sortDir
){
__local uint l_key[LOCAL_SIZE_LIMIT];
- __local uint l_val[LOCAL_SIZE_LIMIT];
d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
- d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
- d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
l_key[get_local_id(0) + 0] = d_SrcKey[ 0];
- l_val[get_local_id(0) + 0] = d_SrcVal[ 0];
l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
- l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)];
//Bitonic merge
uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1);
@@ -239,15 +202,13 @@ void bitonicMergeLocal(
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(
- &l_key[pos + 0], &l_val[pos + 0],
- &l_key[pos + stride], &l_val[pos + stride],
+ &l_key[pos + 0],
+ &l_key[pos + stride],
dir
);
}
barrier(CLK_LOCAL_MEM_FENCE);
d_DstKey[ 0] = l_key[get_local_id(0) + 0];
- d_DstVal[ 0] = l_val[get_local_id(0) + 0];
d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
- d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
}
diff --git a/src/com/mbien/opencl/demos/sort/BitonicSort.java b/src/com/mbien/opencl/demos/sort/BitonicSort.java
index be28409..207ad1e 100644
--- a/src/com/mbien/opencl/demos/sort/BitonicSort.java
+++ b/src/com/mbien/opencl/demos/sort/BitonicSort.java
@@ -25,9 +25,10 @@ import static com.mbien.opencl.CLProgram.*;
*/
public class BitonicSort {
- private static final String BITONIC_MERGE_LOCAL = "bitonicMergeLocal";
- private static final String BITONIC_SORT_LOCAL = "bitonicSortLocal";
- private static final String BITONIC_SORT_LOCAL1 = "bitonicSortLocal1";
+ private static final String BITONIC_MERGE_GLOBAL = "bitonicMergeGlobal";
+ private static final String BITONIC_MERGE_LOCAL = "bitonicMergeLocal";
+ private static final String BITONIC_SORT_LOCAL = "bitonicSortLocal";
+ private static final String BITONIC_SORT_LOCAL1 = "bitonicSortLocal1";
private final static int LOCAL_SIZE_LIMIT = 1024;
private final Map<String, CLKernel> kernels;
@@ -35,77 +36,76 @@ public class BitonicSort {
public BitonicSort() throws IOException {
final int sortDir = 1;
- final int elements = 1024;
- final int maxvalue = 1000000000;
+ final int elements = 1048576;
+ final int maxvalue = 100000;
- System.out.println("Initializing OpenCL...");
+ out.println("Initializing OpenCL...");
//Create the context
- CLContext context = CLContext.create();
- CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue();
+ CLContext context = null;
- System.out.println("Initializing OpenCL bitonic sorter...");
- kernels = initBitonicSort(context, queue);
+ try{
+ context = CLContext.create();
+ CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue();
- System.out.println("Creating OpenCL memory objects...");
- CLBuffer<IntBuffer> keyBuffer = context.createIntBuffer(elements, READ_ONLY, USE_BUFFER);
+ out.println("Initializing OpenCL bitonic sorter...");
+ kernels = initBitonicSort(queue);
- // in case of key/value pairs
-// CLBuffer<IntBuffer> valueBuffer = context.createIntBuffer(elements, READ_ONLY, USE_BUFFER);
+ out.println("Creating OpenCL memory objects...");
+ CLBuffer<IntBuffer> keyBuffer = context.createIntBuffer(elements, READ_ONLY, USE_BUFFER);
+ System.out.println(keyBuffer.getCLSize()/1000000.0f);
- System.out.println("Initializing data...\n");
- Random random = new Random();
- for (int i = 0; i < elements; i++) {
- int rnd = random.nextInt(maxvalue);
- keyBuffer.getBuffer().put(i, rnd);
-// valueBuffer.getBuffer().put(i, rnd); // value can be arbitary
- }
-
- int arrayLength = elements;
- int batch = elements / arrayLength;
-
- System.out.printf("Test array length %d (%d arrays in the batch)...\n", arrayLength, batch);
+ out.println("Initializing data...\n");
+ Random random = new Random();
+ for (int i = 0; i < elements; i++) {
+ int rnd = random.nextInt(maxvalue);
+ keyBuffer.getBuffer().put(i, rnd);
+ }
-// long time = System.currentTimeMillis();
+ int arrayLength = elements;
+ int batch = elements / arrayLength;
- bitonicSort(queue, keyBuffer, batch, arrayLength, sortDir);
+ out.printf("Test array length %d (%d arrays in the batch)...\n", arrayLength, batch);
- queue.putReadBuffer(keyBuffer, true);
-// queue.putReadBuffer(valueBuffer, true);
-// System.out.println(System.currentTimeMillis() - time);
+ long time = currentTimeMillis();
- IntBuffer keys = keyBuffer.getBuffer();
- printSnapshot(keys, 10);
- checkIfSorted(keys);
+ bitonicSort(queue, keyBuffer, keyBuffer, batch, arrayLength, sortDir);
+ queue.putReadBuffer(keyBuffer, true);
-// IntBuffer values = valueBuffer.getBuffer();
-// printSnapshot(values, 10);
-// checkIfSorted(values);
+ out.println(currentTimeMillis() - time+"ms");
- System.out.println();
+ IntBuffer keys = keyBuffer.getBuffer();
+ printSnapshot(keys, 10);
+ checkIfSorted(keys);
- System.out.println("TEST PASSED");
+ out.println("\nTEST PASSED");
- context.release();
+ }finally{
+ if(context!=null) {
+ context.release();
+ }
+ }
}
- private Map<String, CLKernel> initBitonicSort(CLContext context, CLCommandQueue queue) throws IOException {
+ private Map<String, CLKernel> initBitonicSort(CLCommandQueue queue) throws IOException {
- System.out.println(" creating bitonic sort program");
+ out.println(" creating bitonic sort program");
+
+ CLContext context = queue.getContext();
CLProgram program = context.createProgram(getClass().getResourceAsStream("BitonicSort.cl"))
.build(define("LOCAL_SIZE_LIMIT", LOCAL_SIZE_LIMIT));
- Map<String, CLKernel> kernels = program.createCLKernels();
+ Map<String, CLKernel> kernelMap = program.createCLKernels();
- System.out.println(" checking minimum supported workgroup size");
+ out.println(" checking minimum supported workgroup size");
//Check for work group size
CLDevice device = queue.getDevice();
- long szBitonicSortLocal = kernels.get(BITONIC_SORT_LOCAL).getWorkGroupSize(device);
- long szBitonicSortLocal1 = kernels.get(BITONIC_SORT_LOCAL1).getWorkGroupSize(device);
- long szBitonicMergeLocal = kernels.get(BITONIC_MERGE_LOCAL).getWorkGroupSize(device);
+ long szBitonicSortLocal = kernelMap.get(BITONIC_SORT_LOCAL).getWorkGroupSize(device);
+ long szBitonicSortLocal1 = kernelMap.get(BITONIC_SORT_LOCAL1).getWorkGroupSize(device);
+ long szBitonicMergeLocal = kernelMap.get(BITONIC_MERGE_LOCAL).getWorkGroupSize(device);
if ( (szBitonicSortLocal < (LOCAL_SIZE_LIMIT / 2))
|| (szBitonicSortLocal1 < (LOCAL_SIZE_LIMIT / 2))
@@ -114,19 +114,11 @@ public class BitonicSort {
+" required by this application is not supported on this device.");
}
- return kernels;
-
- }
-
- public void bitonicSort(CLCommandQueue queue, CLBuffer<?> keys, int batch, int arrayLength, int dir) {
- this.bitonicSort(queue, keys, keys, keys, keys, batch, arrayLength, dir);
- }
+ return kernelMap;
- public void bitonicSort(CLCommandQueue queue, CLBuffer<?> keys, CLBuffer<?> values, int batch, int arrayLength, int dir) {
- this.bitonicSort(queue, keys, values, keys, values, batch, arrayLength, dir);
}
- public void bitonicSort(CLCommandQueue queue, CLBuffer<?> dstKey, CLBuffer<?> dstVal, CLBuffer<?> srcKey, CLBuffer<?> srcVal, int batch, int arrayLength, int dir) {
+ public void bitonicSort(CLCommandQueue queue, CLBuffer<?> dstKey, CLBuffer<?> srcKey, int batch, int arrayLength, int dir) {
if (arrayLength < 2) {
throw new IllegalArgumentException("arrayLength was "+arrayLength);
@@ -136,50 +128,51 @@ public class BitonicSort {
dir = (dir != 0) ? 1 : 0;
+ CLKernel sortlocal1 = kernels.get(BITONIC_SORT_LOCAL1);
+ CLKernel sortlocal = kernels.get(BITONIC_SORT_LOCAL);
+ CLKernel mergeGlobal = kernels.get(BITONIC_MERGE_GLOBAL);
+ CLKernel mergeLocal = kernels.get(BITONIC_MERGE_LOCAL);
+
if (arrayLength <= LOCAL_SIZE_LIMIT) {
// oclCheckError( (batch * arrayLength) % LOCAL_SIZE_LIMIT == 0, shrTRUE );
//Launch bitonicSortLocal
- CLKernel kernel = kernels.get(BITONIC_SORT_LOCAL)
- .putArgs(dstKey, dstVal, srcKey, srcVal)
- .putArg(arrayLength).putArg(dir).rewind();
+ sortlocal.putArgs(dstKey, srcKey)
+ .putArg(arrayLength).putArg(dir).rewind();
int localWorkSize = LOCAL_SIZE_LIMIT / 2;
int globalWorkSize = batch * arrayLength / 2;
- queue.put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize);
+ queue.put1DRangeKernel(sortlocal, 0, globalWorkSize, localWorkSize);
} else {
//Launch bitonicSortLocal1
- CLKernel kernel = kernels.get(BITONIC_SORT_LOCAL1)
- .setArgs(dstKey, dstVal, srcKey, srcVal);
+ sortlocal1.setArgs(dstKey, srcKey);
int localWorkSize = LOCAL_SIZE_LIMIT / 2;
int globalWorkSize = batch * arrayLength / 2;
- queue.put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize);
+ queue.put1DRangeKernel(sortlocal1, 0, globalWorkSize, localWorkSize);
for (int size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1) {
for (int stride = size / 2; stride > 0; stride >>= 1) {
if (stride >= LOCAL_SIZE_LIMIT) {
//Launch bitonicMergeGlobal
- kernel = kernels.get("bitonicMergeGlobal")
- .putArgs(dstKey, dstVal, dstKey, dstVal)
- .putArg(arrayLength).putArg(size).putArg(stride).putArg(dir).rewind();
+ mergeGlobal.putArgs(dstKey, dstKey)
+ .putArg(arrayLength).putArg(size).putArg(stride).putArg(dir).rewind();
globalWorkSize = batch * arrayLength / 2;
- queue.put1DRangeKernel(kernel, 0, globalWorkSize, 0);
+ queue.put1DRangeKernel(mergeGlobal, 0, globalWorkSize, 0);
} else {
//Launch bitonicMergeLocal
- kernel = kernels.get(BITONIC_MERGE_LOCAL)
- .putArgs(dstKey, dstVal, dstKey, dstVal)
- .putArg(arrayLength).putArg(stride).putArg(size).putArg(dir).rewind();
+ mergeLocal.putArgs(dstKey, dstKey)
+ .putArg(arrayLength).putArg(stride).putArg(size).putArg(dir).rewind();
localWorkSize = LOCAL_SIZE_LIMIT / 2;
globalWorkSize = batch * arrayLength / 2;
- queue.put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize);
+ queue.put1DRangeKernel(mergeLocal, 0, globalWorkSize, localWorkSize);
break;
}
}