diff options
-rw-r--r-- | src/com/jogamp/opencl/util/pp/Scan.java | 193 | ||||
-rw-r--r-- | src/com/jogamp/opencl/util/pp/scan.cl | 92 | ||||
-rw-r--r-- | test/com/jogamp/opencl/CLProgramTest.java | 15 | ||||
-rw-r--r-- | test/com/jogamp/opencl/TestUtils.java | 10 | ||||
-rw-r--r-- | test/com/jogamp/opencl/util/pp/ScanTest.java | 132 |
5 files changed, 434 insertions, 8 deletions
diff --git a/src/com/jogamp/opencl/util/pp/Scan.java b/src/com/jogamp/opencl/util/pp/Scan.java new file mode 100644 index 00000000..9d79087e --- /dev/null +++ b/src/com/jogamp/opencl/util/pp/Scan.java @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2011, Michael Bien + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are + * permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, this list + * of conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * Created on Tuesday, September 20 2011 22:26 + */ +package com.jogamp.opencl.util.pp; + +import com.jogamp.opencl.CLBuffer; +import com.jogamp.opencl.CLCommandQueue; +import com.jogamp.opencl.CLContext; +import com.jogamp.opencl.CLProgram; +import com.jogamp.opencl.CLResource; +import com.jogamp.opencl.CLWork.CLWork1D; +import com.jogamp.opencl.util.CLUtil; +import com.jogamp.opencl.util.concurrent.CLQueueContext; +import com.jogamp.opencl.util.concurrent.CLTask; +import com.jogamp.opencl.util.concurrent.CLQueueContext.CLResourceQueueContext; +import com.jogamp.opencl.util.CLProgramConfiguration; +import java.io.IOException; +import java.nio.Buffer; + +import static com.jogamp.opencl.CLMemory.Mem.*; + +/** + * Prototype, not ready for general use. + * @author Michael Bien + */ +/*public */class Scan<B extends Buffer> implements CLResource { + + private static final String SOURCES; + + private final Op OPERATION; + private final ArgType ELEMENT; + + private final CLProgram program; + private final CLWork1D smallScan; + + static{ + try { + StringBuilder sb = new StringBuilder(2048); + CLUtil.readStream(Scan.class.getResourceAsStream("scan.cl"), sb); + SOURCES = sb.toString(); + } catch (IOException ex) { + throw new RuntimeException("can not initialize Reduction.", ex); + } + } + + private <B extends Buffer> Scan(CLContext context, Op op, Class<B> elementType) { + + if(!op.equals(Op.ADD)) { + throw new IllegalArgumentException("only add is supported for now"); + } + + this.ELEMENT = ArgType.valueOf(elementType); + this.OPERATION = op; + + this.program = context.createProgram(SOURCES); + + CLProgramConfiguration config = program.prepare(); + config.withDefine("OP_"+op.name()) + .withDefine("TYPE", ELEMENT.vectorType(1)); + if(ELEMENT.equals(ArgType.DOUBLE)) { + config.withDefine("DOUBLE_FP"); + } + config.build(); + + smallScan = CLWork1D.create1D(program.createCLKernel("smallScan")); + } + + public static <B extends Buffer> Scan<B> create(CLContext context, Op op, Class<? extends B> elementType) { + return new Scan<B>(context, op, elementType); + } + + public static <B extends Buffer> Scan<B> create(CLCommandQueue queue, Op op, Class<? extends B> elementType) { + return create(queue.getContext(), op, elementType); + } + + public static <B extends Buffer> CLTask<CLResourceQueueContext<Scan<B>>, B> createTask(B input, B output, Op op, Class<? extends B> elementType) { + return new CLScanTask<B>(input, output, op, elementType); + } + + public B scan(CLCommandQueue queue, B input, B output) { + + int length = input.capacity(); + + int maxSize = (int)smallScan.getKernel().getWorkGroupSize(queue.getDevice()); + if(length > maxSize*2) { + throw new IllegalArgumentException("buffer was to large for the given hardware"); + } + + int workSize = (length+length%2)/2; // half, rounded up + int sharedBufferSize = maxSize * ELEMENT.SIZE*2; + + CLContext context = queue.getContext(); + + CLBuffer<B> in = context.createBuffer(input, READ_ONLY); + CLBuffer<B> out = context.createBuffer(output, WRITE_ONLY); + + smallScan.getKernel().putArg(in).putArg(out).putArgSize(sharedBufferSize).putArg(length).rewind(); + smallScan.setWorkSize(workSize, workSize); + + queue.putWriteBuffer(in, false); + queue.putWork(smallScan); + queue.putReadBuffer(out, true); + + in.release(); + out.release(); + + return output; + } + + @Override + public void release() { + program.release(); + } + + @Override + public boolean isReleased() { + return program == null || program.isReleased(); + } + + @Override + public String toString() { + return getClass().getSimpleName()+"["+OPERATION+", "+ELEMENT+"]"; + } + + public enum Op {ADD, MUL, MIN, MAX} + + private static class CLScanTask<B extends Buffer> extends CLTask<CLResourceQueueContext<Scan<B>>, B> { + + private final static int TYPE_ID = 2; + + private final B input; + private final B output; + private final Op op; + private final Class<? extends B> elementType; + private final Integer KEY; + + private CLScanTask(B input, B output, Op op, Class<? extends B> elementType) { + this.input = input; + this.output = output; + this.op = op; + this.elementType = elementType; + this.KEY = TYPE_ID + op.ordinal()*10 + 1000*ArgType.valueOf(elementType).ordinal(); + } + + @Override + public CLResourceQueueContext<Scan<B>> createQueueContext(CLCommandQueue queue) { + Scan<B> reduction = Scan.create(queue, op, elementType); + return new CLQueueContext.CLResourceQueueContext<Scan<B>>(queue, reduction); + } + + @Override + public B execute(CLResourceQueueContext<Scan<B>> context) { + return context.resource.scan(context.queue, input, output); + } + + @Override + public Object getContextKey() { + return KEY; + } + + @Override + public String toString() { + return getClass().getSimpleName()+"["+op+", "+elementType+", "+KEY+"]"; + } + } + +} diff --git a/src/com/jogamp/opencl/util/pp/scan.cl b/src/com/jogamp/opencl/util/pp/scan.cl new file mode 100644 index 00000000..9fc08cda --- /dev/null +++ b/src/com/jogamp/opencl/util/pp/scan.cl @@ -0,0 +1,92 @@ +// created on Wednesday, September 21 2011 +//@author mbien + +#ifdef DOUBLE_FP + #ifdef cl_khr_fp64 + #pragma OPENCL EXTENSION cl_khr_fp64 : enable + #elif defined(cl_amd_fp64) + #pragma OPENCL EXTENSION cl_amd_fp64 : enable + #else + #error "Double precision floating point not supported." + #endif +#endif + +#ifndef TYPE + typedef int TYPE; + #warning "TYPE was not set" +#endif + +/* + * Work-efficient compute implementation of scan, one thread per 2 elements + * O(log(n)) steps and O(n) adds using shared memory + * Uses a balanced tree algorithm. See Belloch, 1990 "Prefix Sums and Their Applications" + * Implementation is based on AMD's prefix sum example. + * length must be smaller than workgroup size, vector types are not supported. + */ +kernel void smallScan(const global TYPE* input, global TYPE* output, local TYPE* block, const uint length) { + + const uint id = get_global_id(0); + const uint id2= id*2; + + int offset = 1; + + /* Cache the computational window in shared memory */ + if(id2 < length - 1) { + block[id2] = input[id2]; + block[id2 + 1] = input[id2 + 1]; + }else if(id2 == length - 1) { // odd buffer length + block[id2] = input[id2]; + block[id2 + 1] = 0; + }else{ // no-op when out of bounds + block[id2] = 0; + block[id2 + 1] = 0; + } + + const uint limit = (length-length%2); + + /* build the sum in place up the tree */ + for(int d = limit>>1; d > 0; d >>=1) { + barrier(CLK_LOCAL_MEM_FENCE); + + if(id < d) { + int ai = offset*(id2 + 1) - 1; + int bi = offset*(id2 + 2) - 1; + + block[bi] += block[ai]; + } + offset *= 2; + } + + /* scan back down the tree */ + + /* clear the last element */ + if(id == 0) { + block[limit - 1] = 0; + } + + /* traverse down the tree building the scan in the place */ + for(int d = 1; d < limit; d *= 2) { + offset >>=1; + barrier(CLK_LOCAL_MEM_FENCE); + + if(id < d) { + int ai = offset*(id2 + 1) - 1; + int bi = offset*(id2 + 2) - 1; + + TYPE t = block[ai]; + block[ai] = block[bi]; + block[bi] += t; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + /*write the results back to global memory */ + if(id2 < length - 1) { + output[id2] = block[id2]; + output[id2 + 1] = block[id2 + 1]; + }else if(id2 == length - 1) { // odd length + output[id2] = block[id2]+block[id2-1]; + } +} + diff --git a/test/com/jogamp/opencl/CLProgramTest.java b/test/com/jogamp/opencl/CLProgramTest.java index cf7f45ff..ae5a0926 100644 --- a/test/com/jogamp/opencl/CLProgramTest.java +++ b/test/com/jogamp/opencl/CLProgramTest.java @@ -55,6 +55,7 @@ import static java.lang.System.*; import static com.jogamp.opencl.CLProgram.CompilerOptions.*; import static com.jogamp.opencl.util.CLPlatformFilters.*; import static com.jogamp.opencl.CLVersion.*; +import static com.jogamp.opencl.TestUtils.*; /** * @@ -364,11 +365,11 @@ public class CLProgramTest { Random rnd = new Random(seed); kernel.putArg(buffer); - kernel.putArg(rnd.nextFloat()); - kernel.putArg(rnd.nextFloat(), rnd.nextFloat()); -// kernel.putArg(rnd.nextFloat(), rnd.nextFloat(), rnd.nextFloat()); // nv does not support float3 - kernel.putArg(rnd.nextFloat(), rnd.nextFloat(), rnd.nextFloat(), rnd.nextFloat()); - kernel.putArg(TestUtils.fillBuffer(Buffers.newDirectFloatBuffer(8), seed)); + kernel.putArg(rndFloat(rnd)); + kernel.putArg(rndFloat(rnd), rndFloat(rnd)); +// kernel.putArg(rndFloat(rnd), rndFloat(rnd), rndFloat(rnd)); // nv does not support float3 + kernel.putArg(rndFloat(rnd), rndFloat(rnd), rndFloat(rnd), rndFloat(rnd)); + kernel.putArg(fillBuffer(Buffers.newDirectFloatBuffer(8), seed)); CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue(); queue.putTask(kernel).putReadBuffer(buffer, true); @@ -377,12 +378,12 @@ public class CLProgramTest { rnd = new Random(seed); for(int i = 0; i < 7; i++) { - assertEquals(rnd.nextFloat(), out.get(), 0.01f); + assertEquals(rndFloat(rnd), out.get(), 0.01f); } rnd = new Random(seed); for(int i = 0; i < 8; i++) { - assertEquals(rnd.nextFloat(), out.get(), 0.01f); + assertEquals(rndFloat(rnd), out.get(), 0.01f); } }finally{ diff --git a/test/com/jogamp/opencl/TestUtils.java b/test/com/jogamp/opencl/TestUtils.java index e7e5fabe..87e7cc5f 100644 --- a/test/com/jogamp/opencl/TestUtils.java +++ b/test/com/jogamp/opencl/TestUtils.java @@ -62,13 +62,21 @@ public class TestUtils { Random rnd = new Random(seed); while(buffer.remaining() != 0) - buffer.put(rnd.nextFloat()); + buffer.put(rndFloat(rnd)); buffer.rewind(); return buffer; } + public static float rndFloat(Random rnd) { + return rndFloat(rnd, 100); + } + + public static float rndFloat(Random rnd, float range) { + return (rnd.nextFloat()-0.5f)*range*2; + } + public static void checkIfEqual(ByteBuffer a, ByteBuffer b, int elements) { for(int i = 0; i < elements; i++) { int aVal = a.getInt(); diff --git a/test/com/jogamp/opencl/util/pp/ScanTest.java b/test/com/jogamp/opencl/util/pp/ScanTest.java new file mode 100644 index 00000000..81aa7288 --- /dev/null +++ b/test/com/jogamp/opencl/util/pp/ScanTest.java @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2011, Michael Bien + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are + * permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, this list + * of conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * Created on Tuesday, September 20 2011 01:26 + */ +package com.jogamp.opencl.util.pp; + +import com.jogamp.opencl.CLDevice; +import com.jogamp.opencl.util.pp.Scan.Op; +import com.jogamp.common.nio.Buffers; +import com.jogamp.opencl.CLCommandQueue; +import com.jogamp.opencl.CLContext; +import com.jogamp.opencl.CLPlatform; +import com.jogamp.opencl.TestUtils; +import java.nio.FloatBuffer; +import java.nio.IntBuffer; +import org.junit.Test; + +import static org.junit.Assert.*; +import static java.lang.System.*; + + +/** + * + * @author Michael Bien + */ +public class ScanTest { + + + @Test + public void testSmallScan() { + CLContext context = CLContext.create(getDevice()); + + try{ + CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue(); + + int[][] in = new int[][]{ { 4, 0, 5, 5, 0, 5, 5, 1 }, //even + { 4, 0, 5, 5, 0, 5, 5, 1, 3 } };//odd + + int[] inclusive = new int[] { 4, 4, 9,14,14,19,24,27,28 }; + int[] exclusive = new int[] { 0, 4, 4, 9,14,14,19,24,27 }; + + for (int i = 0; i < in.length; i++) { + + IntBuffer input = Buffers.newDirectIntBuffer(in[i]); + IntBuffer output = Buffers.newDirectIntBuffer(input.capacity()); + + out.println((input.capacity()%2==0?"even":"odd") + " array lenght"); + + Scan<IntBuffer> scan = Scan.create(context, Op.ADD, input.getClass()); + scan.scan(queue, input, output); + + while(output.hasRemaining()) { + int value = output.get(); +// System.out.println(value); + assertEquals(exclusive[output.position()-1], value); + } + + scan.release(); + + } + }finally{ + context.release(); + } + } + + @Test + public void testSmallScanSizeLimit() { + + CLContext context = CLContext.create(getDevice()); + + try{ + CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue(); + + float[] exclusive = new float[queue.getDevice().getMaxWorkGroupSize()*2]; + + FloatBuffer input = Buffers.newDirectFloatBuffer(exclusive.length); + FloatBuffer output = Buffers.newDirectFloatBuffer(input.capacity()); + + TestUtils.fillBuffer(input, 42); + long time = nanoTime(); + for (int i = 1; i < exclusive.length; i++) { + exclusive[i] = exclusive[i-1]+input.get(i-1); + } + out.println("delta "+(nanoTime()-time)); + + Scan<FloatBuffer> scan = Scan.create(context, Op.ADD, input.getClass()); + time = nanoTime(); + scan.scan(queue, input, output); + out.println("delta "+(nanoTime()-time)); + + while(output.hasRemaining()) { + float value = output.get(); + assertEquals("@"+(output.position()-1),exclusive[output.position()-1], value, 0.1f); + } + + scan.release(); + + }finally{ + context.release(); + } + } + + private CLDevice getDevice() { + return CLPlatform.listCLPlatforms()[0].getMaxFlopsDevice(); + } + +} |