summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMichael Bien <[email protected]>2011-09-21 21:29:12 +0200
committerMichael Bien <[email protected]>2011-09-21 21:29:12 +0200
commitb87b657642b6bda35bed85e8b43b3b68ac994c25 (patch)
tree86b7ffb45db16b2849184abad76bb9d1c90294b5
parentbaf07b12a2a62003334d17113e8dad1e92b80029 (diff)
initial version of parallel scan primitive. The implementation is work in progress, final version will use a different kernel/algorithm.
random float utility method for TestUtils
-rw-r--r--src/com/jogamp/opencl/util/pp/Scan.java193
-rw-r--r--src/com/jogamp/opencl/util/pp/scan.cl92
-rw-r--r--test/com/jogamp/opencl/CLProgramTest.java15
-rw-r--r--test/com/jogamp/opencl/TestUtils.java10
-rw-r--r--test/com/jogamp/opencl/util/pp/ScanTest.java132
5 files changed, 434 insertions, 8 deletions
diff --git a/src/com/jogamp/opencl/util/pp/Scan.java b/src/com/jogamp/opencl/util/pp/Scan.java
new file mode 100644
index 00000000..9d79087e
--- /dev/null
+++ b/src/com/jogamp/opencl/util/pp/Scan.java
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2011, Michael Bien
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are
+ * permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this list of
+ * conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, this list
+ * of conditions and the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * Created on Tuesday, September 20 2011 22:26
+ */
+package com.jogamp.opencl.util.pp;
+
+import com.jogamp.opencl.CLBuffer;
+import com.jogamp.opencl.CLCommandQueue;
+import com.jogamp.opencl.CLContext;
+import com.jogamp.opencl.CLProgram;
+import com.jogamp.opencl.CLResource;
+import com.jogamp.opencl.CLWork.CLWork1D;
+import com.jogamp.opencl.util.CLUtil;
+import com.jogamp.opencl.util.concurrent.CLQueueContext;
+import com.jogamp.opencl.util.concurrent.CLTask;
+import com.jogamp.opencl.util.concurrent.CLQueueContext.CLResourceQueueContext;
+import com.jogamp.opencl.util.CLProgramConfiguration;
+import java.io.IOException;
+import java.nio.Buffer;
+
+import static com.jogamp.opencl.CLMemory.Mem.*;
+
+/**
+ * Prototype, not ready for general use.
+ * @author Michael Bien
+ */
+/*public */class Scan<B extends Buffer> implements CLResource {
+
+ private static final String SOURCES;
+
+ private final Op OPERATION;
+ private final ArgType ELEMENT;
+
+ private final CLProgram program;
+ private final CLWork1D smallScan;
+
+ static{
+ try {
+ StringBuilder sb = new StringBuilder(2048);
+ CLUtil.readStream(Scan.class.getResourceAsStream("scan.cl"), sb);
+ SOURCES = sb.toString();
+ } catch (IOException ex) {
+ throw new RuntimeException("can not initialize Reduction.", ex);
+ }
+ }
+
+ private <B extends Buffer> Scan(CLContext context, Op op, Class<B> elementType) {
+
+ if(!op.equals(Op.ADD)) {
+ throw new IllegalArgumentException("only add is supported for now");
+ }
+
+ this.ELEMENT = ArgType.valueOf(elementType);
+ this.OPERATION = op;
+
+ this.program = context.createProgram(SOURCES);
+
+ CLProgramConfiguration config = program.prepare();
+ config.withDefine("OP_"+op.name())
+ .withDefine("TYPE", ELEMENT.vectorType(1));
+ if(ELEMENT.equals(ArgType.DOUBLE)) {
+ config.withDefine("DOUBLE_FP");
+ }
+ config.build();
+
+ smallScan = CLWork1D.create1D(program.createCLKernel("smallScan"));
+ }
+
+ public static <B extends Buffer> Scan<B> create(CLContext context, Op op, Class<? extends B> elementType) {
+ return new Scan<B>(context, op, elementType);
+ }
+
+ public static <B extends Buffer> Scan<B> create(CLCommandQueue queue, Op op, Class<? extends B> elementType) {
+ return create(queue.getContext(), op, elementType);
+ }
+
+ public static <B extends Buffer> CLTask<CLResourceQueueContext<Scan<B>>, B> createTask(B input, B output, Op op, Class<? extends B> elementType) {
+ return new CLScanTask<B>(input, output, op, elementType);
+ }
+
+ public B scan(CLCommandQueue queue, B input, B output) {
+
+ int length = input.capacity();
+
+ int maxSize = (int)smallScan.getKernel().getWorkGroupSize(queue.getDevice());
+ if(length > maxSize*2) {
+ throw new IllegalArgumentException("buffer was to large for the given hardware");
+ }
+
+ int workSize = (length+length%2)/2; // half, rounded up
+ int sharedBufferSize = maxSize * ELEMENT.SIZE*2;
+
+ CLContext context = queue.getContext();
+
+ CLBuffer<B> in = context.createBuffer(input, READ_ONLY);
+ CLBuffer<B> out = context.createBuffer(output, WRITE_ONLY);
+
+ smallScan.getKernel().putArg(in).putArg(out).putArgSize(sharedBufferSize).putArg(length).rewind();
+ smallScan.setWorkSize(workSize, workSize);
+
+ queue.putWriteBuffer(in, false);
+ queue.putWork(smallScan);
+ queue.putReadBuffer(out, true);
+
+ in.release();
+ out.release();
+
+ return output;
+ }
+
+ @Override
+ public void release() {
+ program.release();
+ }
+
+ @Override
+ public boolean isReleased() {
+ return program == null || program.isReleased();
+ }
+
+ @Override
+ public String toString() {
+ return getClass().getSimpleName()+"["+OPERATION+", "+ELEMENT+"]";
+ }
+
+ public enum Op {ADD, MUL, MIN, MAX}
+
+ private static class CLScanTask<B extends Buffer> extends CLTask<CLResourceQueueContext<Scan<B>>, B> {
+
+ private final static int TYPE_ID = 2;
+
+ private final B input;
+ private final B output;
+ private final Op op;
+ private final Class<? extends B> elementType;
+ private final Integer KEY;
+
+ private CLScanTask(B input, B output, Op op, Class<? extends B> elementType) {
+ this.input = input;
+ this.output = output;
+ this.op = op;
+ this.elementType = elementType;
+ this.KEY = TYPE_ID + op.ordinal()*10 + 1000*ArgType.valueOf(elementType).ordinal();
+ }
+
+ @Override
+ public CLResourceQueueContext<Scan<B>> createQueueContext(CLCommandQueue queue) {
+ Scan<B> reduction = Scan.create(queue, op, elementType);
+ return new CLQueueContext.CLResourceQueueContext<Scan<B>>(queue, reduction);
+ }
+
+ @Override
+ public B execute(CLResourceQueueContext<Scan<B>> context) {
+ return context.resource.scan(context.queue, input, output);
+ }
+
+ @Override
+ public Object getContextKey() {
+ return KEY;
+ }
+
+ @Override
+ public String toString() {
+ return getClass().getSimpleName()+"["+op+", "+elementType+", "+KEY+"]";
+ }
+ }
+
+}
diff --git a/src/com/jogamp/opencl/util/pp/scan.cl b/src/com/jogamp/opencl/util/pp/scan.cl
new file mode 100644
index 00000000..9fc08cda
--- /dev/null
+++ b/src/com/jogamp/opencl/util/pp/scan.cl
@@ -0,0 +1,92 @@
+// created on Wednesday, September 21 2011
+//@author mbien
+
+#ifdef DOUBLE_FP
+ #ifdef cl_khr_fp64
+ #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+ #elif defined(cl_amd_fp64)
+ #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+ #else
+ #error "Double precision floating point not supported."
+ #endif
+#endif
+
+#ifndef TYPE
+ typedef int TYPE;
+ #warning "TYPE was not set"
+#endif
+
+/*
+ * Work-efficient compute implementation of scan, one thread per 2 elements
+ * O(log(n)) steps and O(n) adds using shared memory
+ * Uses a balanced tree algorithm. See Belloch, 1990 "Prefix Sums and Their Applications"
+ * Implementation is based on AMD's prefix sum example.
+ * length must be smaller than workgroup size, vector types are not supported.
+ */
+kernel void smallScan(const global TYPE* input, global TYPE* output, local TYPE* block, const uint length) {
+
+ const uint id = get_global_id(0);
+ const uint id2= id*2;
+
+ int offset = 1;
+
+ /* Cache the computational window in shared memory */
+ if(id2 < length - 1) {
+ block[id2] = input[id2];
+ block[id2 + 1] = input[id2 + 1];
+ }else if(id2 == length - 1) { // odd buffer length
+ block[id2] = input[id2];
+ block[id2 + 1] = 0;
+ }else{ // no-op when out of bounds
+ block[id2] = 0;
+ block[id2 + 1] = 0;
+ }
+
+ const uint limit = (length-length%2);
+
+ /* build the sum in place up the tree */
+ for(int d = limit>>1; d > 0; d >>=1) {
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if(id < d) {
+ int ai = offset*(id2 + 1) - 1;
+ int bi = offset*(id2 + 2) - 1;
+
+ block[bi] += block[ai];
+ }
+ offset *= 2;
+ }
+
+ /* scan back down the tree */
+
+ /* clear the last element */
+ if(id == 0) {
+ block[limit - 1] = 0;
+ }
+
+ /* traverse down the tree building the scan in the place */
+ for(int d = 1; d < limit; d *= 2) {
+ offset >>=1;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if(id < d) {
+ int ai = offset*(id2 + 1) - 1;
+ int bi = offset*(id2 + 2) - 1;
+
+ TYPE t = block[ai];
+ block[ai] = block[bi];
+ block[bi] += t;
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ /*write the results back to global memory */
+ if(id2 < length - 1) {
+ output[id2] = block[id2];
+ output[id2 + 1] = block[id2 + 1];
+ }else if(id2 == length - 1) { // odd length
+ output[id2] = block[id2]+block[id2-1];
+ }
+}
+
diff --git a/test/com/jogamp/opencl/CLProgramTest.java b/test/com/jogamp/opencl/CLProgramTest.java
index cf7f45ff..ae5a0926 100644
--- a/test/com/jogamp/opencl/CLProgramTest.java
+++ b/test/com/jogamp/opencl/CLProgramTest.java
@@ -55,6 +55,7 @@ import static java.lang.System.*;
import static com.jogamp.opencl.CLProgram.CompilerOptions.*;
import static com.jogamp.opencl.util.CLPlatformFilters.*;
import static com.jogamp.opencl.CLVersion.*;
+import static com.jogamp.opencl.TestUtils.*;
/**
*
@@ -364,11 +365,11 @@ public class CLProgramTest {
Random rnd = new Random(seed);
kernel.putArg(buffer);
- kernel.putArg(rnd.nextFloat());
- kernel.putArg(rnd.nextFloat(), rnd.nextFloat());
-// kernel.putArg(rnd.nextFloat(), rnd.nextFloat(), rnd.nextFloat()); // nv does not support float3
- kernel.putArg(rnd.nextFloat(), rnd.nextFloat(), rnd.nextFloat(), rnd.nextFloat());
- kernel.putArg(TestUtils.fillBuffer(Buffers.newDirectFloatBuffer(8), seed));
+ kernel.putArg(rndFloat(rnd));
+ kernel.putArg(rndFloat(rnd), rndFloat(rnd));
+// kernel.putArg(rndFloat(rnd), rndFloat(rnd), rndFloat(rnd)); // nv does not support float3
+ kernel.putArg(rndFloat(rnd), rndFloat(rnd), rndFloat(rnd), rndFloat(rnd));
+ kernel.putArg(fillBuffer(Buffers.newDirectFloatBuffer(8), seed));
CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue();
queue.putTask(kernel).putReadBuffer(buffer, true);
@@ -377,12 +378,12 @@ public class CLProgramTest {
rnd = new Random(seed);
for(int i = 0; i < 7; i++) {
- assertEquals(rnd.nextFloat(), out.get(), 0.01f);
+ assertEquals(rndFloat(rnd), out.get(), 0.01f);
}
rnd = new Random(seed);
for(int i = 0; i < 8; i++) {
- assertEquals(rnd.nextFloat(), out.get(), 0.01f);
+ assertEquals(rndFloat(rnd), out.get(), 0.01f);
}
}finally{
diff --git a/test/com/jogamp/opencl/TestUtils.java b/test/com/jogamp/opencl/TestUtils.java
index e7e5fabe..87e7cc5f 100644
--- a/test/com/jogamp/opencl/TestUtils.java
+++ b/test/com/jogamp/opencl/TestUtils.java
@@ -62,13 +62,21 @@ public class TestUtils {
Random rnd = new Random(seed);
while(buffer.remaining() != 0)
- buffer.put(rnd.nextFloat());
+ buffer.put(rndFloat(rnd));
buffer.rewind();
return buffer;
}
+ public static float rndFloat(Random rnd) {
+ return rndFloat(rnd, 100);
+ }
+
+ public static float rndFloat(Random rnd, float range) {
+ return (rnd.nextFloat()-0.5f)*range*2;
+ }
+
public static void checkIfEqual(ByteBuffer a, ByteBuffer b, int elements) {
for(int i = 0; i < elements; i++) {
int aVal = a.getInt();
diff --git a/test/com/jogamp/opencl/util/pp/ScanTest.java b/test/com/jogamp/opencl/util/pp/ScanTest.java
new file mode 100644
index 00000000..81aa7288
--- /dev/null
+++ b/test/com/jogamp/opencl/util/pp/ScanTest.java
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2011, Michael Bien
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are
+ * permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this list of
+ * conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, this list
+ * of conditions and the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * Created on Tuesday, September 20 2011 01:26
+ */
+package com.jogamp.opencl.util.pp;
+
+import com.jogamp.opencl.CLDevice;
+import com.jogamp.opencl.util.pp.Scan.Op;
+import com.jogamp.common.nio.Buffers;
+import com.jogamp.opencl.CLCommandQueue;
+import com.jogamp.opencl.CLContext;
+import com.jogamp.opencl.CLPlatform;
+import com.jogamp.opencl.TestUtils;
+import java.nio.FloatBuffer;
+import java.nio.IntBuffer;
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+import static java.lang.System.*;
+
+
+/**
+ *
+ * @author Michael Bien
+ */
+public class ScanTest {
+
+
+ @Test
+ public void testSmallScan() {
+ CLContext context = CLContext.create(getDevice());
+
+ try{
+ CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue();
+
+ int[][] in = new int[][]{ { 4, 0, 5, 5, 0, 5, 5, 1 }, //even
+ { 4, 0, 5, 5, 0, 5, 5, 1, 3 } };//odd
+
+ int[] inclusive = new int[] { 4, 4, 9,14,14,19,24,27,28 };
+ int[] exclusive = new int[] { 0, 4, 4, 9,14,14,19,24,27 };
+
+ for (int i = 0; i < in.length; i++) {
+
+ IntBuffer input = Buffers.newDirectIntBuffer(in[i]);
+ IntBuffer output = Buffers.newDirectIntBuffer(input.capacity());
+
+ out.println((input.capacity()%2==0?"even":"odd") + " array lenght");
+
+ Scan<IntBuffer> scan = Scan.create(context, Op.ADD, input.getClass());
+ scan.scan(queue, input, output);
+
+ while(output.hasRemaining()) {
+ int value = output.get();
+// System.out.println(value);
+ assertEquals(exclusive[output.position()-1], value);
+ }
+
+ scan.release();
+
+ }
+ }finally{
+ context.release();
+ }
+ }
+
+ @Test
+ public void testSmallScanSizeLimit() {
+
+ CLContext context = CLContext.create(getDevice());
+
+ try{
+ CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue();
+
+ float[] exclusive = new float[queue.getDevice().getMaxWorkGroupSize()*2];
+
+ FloatBuffer input = Buffers.newDirectFloatBuffer(exclusive.length);
+ FloatBuffer output = Buffers.newDirectFloatBuffer(input.capacity());
+
+ TestUtils.fillBuffer(input, 42);
+ long time = nanoTime();
+ for (int i = 1; i < exclusive.length; i++) {
+ exclusive[i] = exclusive[i-1]+input.get(i-1);
+ }
+ out.println("delta "+(nanoTime()-time));
+
+ Scan<FloatBuffer> scan = Scan.create(context, Op.ADD, input.getClass());
+ time = nanoTime();
+ scan.scan(queue, input, output);
+ out.println("delta "+(nanoTime()-time));
+
+ while(output.hasRemaining()) {
+ float value = output.get();
+ assertEquals("@"+(output.position()-1),exclusive[output.position()-1], value, 0.1f);
+ }
+
+ scan.release();
+
+ }finally{
+ context.release();
+ }
+ }
+
+ private CLDevice getDevice() {
+ return CLPlatform.listCLPlatforms()[0].getMaxFlopsDevice();
+ }
+
+}