diff options
author | Michael Bien <[email protected]> | 2011-09-21 21:29:12 +0200 |
---|---|---|
committer | Michael Bien <[email protected]> | 2011-09-21 21:29:12 +0200 |
commit | b87b657642b6bda35bed85e8b43b3b68ac994c25 (patch) | |
tree | 86b7ffb45db16b2849184abad76bb9d1c90294b5 /src/com/jogamp/opencl | |
parent | baf07b12a2a62003334d17113e8dad1e92b80029 (diff) |
initial version of parallel scan primitive. The implementation is work in progress, final version will use a different kernel/algorithm.
random float utility method for TestUtils
Diffstat (limited to 'src/com/jogamp/opencl')
-rw-r--r-- | src/com/jogamp/opencl/util/pp/Scan.java | 193 | ||||
-rw-r--r-- | src/com/jogamp/opencl/util/pp/scan.cl | 92 |
2 files changed, 285 insertions, 0 deletions
diff --git a/src/com/jogamp/opencl/util/pp/Scan.java b/src/com/jogamp/opencl/util/pp/Scan.java new file mode 100644 index 00000000..9d79087e --- /dev/null +++ b/src/com/jogamp/opencl/util/pp/Scan.java @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2011, Michael Bien + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are + * permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, this list + * of conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * Created on Tuesday, September 20 2011 22:26 + */ +package com.jogamp.opencl.util.pp; + +import com.jogamp.opencl.CLBuffer; +import com.jogamp.opencl.CLCommandQueue; +import com.jogamp.opencl.CLContext; +import com.jogamp.opencl.CLProgram; +import com.jogamp.opencl.CLResource; +import com.jogamp.opencl.CLWork.CLWork1D; +import com.jogamp.opencl.util.CLUtil; +import com.jogamp.opencl.util.concurrent.CLQueueContext; +import com.jogamp.opencl.util.concurrent.CLTask; +import com.jogamp.opencl.util.concurrent.CLQueueContext.CLResourceQueueContext; +import com.jogamp.opencl.util.CLProgramConfiguration; +import java.io.IOException; +import java.nio.Buffer; + +import static com.jogamp.opencl.CLMemory.Mem.*; + +/** + * Prototype, not ready for general use. + * @author Michael Bien + */ +/*public */class Scan<B extends Buffer> implements CLResource { + + private static final String SOURCES; + + private final Op OPERATION; + private final ArgType ELEMENT; + + private final CLProgram program; + private final CLWork1D smallScan; + + static{ + try { + StringBuilder sb = new StringBuilder(2048); + CLUtil.readStream(Scan.class.getResourceAsStream("scan.cl"), sb); + SOURCES = sb.toString(); + } catch (IOException ex) { + throw new RuntimeException("can not initialize Reduction.", ex); + } + } + + private <B extends Buffer> Scan(CLContext context, Op op, Class<B> elementType) { + + if(!op.equals(Op.ADD)) { + throw new IllegalArgumentException("only add is supported for now"); + } + + this.ELEMENT = ArgType.valueOf(elementType); + this.OPERATION = op; + + this.program = context.createProgram(SOURCES); + + CLProgramConfiguration config = program.prepare(); + config.withDefine("OP_"+op.name()) + .withDefine("TYPE", ELEMENT.vectorType(1)); + if(ELEMENT.equals(ArgType.DOUBLE)) { + config.withDefine("DOUBLE_FP"); + } + config.build(); + + smallScan = CLWork1D.create1D(program.createCLKernel("smallScan")); + } + + public static <B extends Buffer> Scan<B> create(CLContext context, Op op, Class<? extends B> elementType) { + return new Scan<B>(context, op, elementType); + } + + public static <B extends Buffer> Scan<B> create(CLCommandQueue queue, Op op, Class<? extends B> elementType) { + return create(queue.getContext(), op, elementType); + } + + public static <B extends Buffer> CLTask<CLResourceQueueContext<Scan<B>>, B> createTask(B input, B output, Op op, Class<? extends B> elementType) { + return new CLScanTask<B>(input, output, op, elementType); + } + + public B scan(CLCommandQueue queue, B input, B output) { + + int length = input.capacity(); + + int maxSize = (int)smallScan.getKernel().getWorkGroupSize(queue.getDevice()); + if(length > maxSize*2) { + throw new IllegalArgumentException("buffer was to large for the given hardware"); + } + + int workSize = (length+length%2)/2; // half, rounded up + int sharedBufferSize = maxSize * ELEMENT.SIZE*2; + + CLContext context = queue.getContext(); + + CLBuffer<B> in = context.createBuffer(input, READ_ONLY); + CLBuffer<B> out = context.createBuffer(output, WRITE_ONLY); + + smallScan.getKernel().putArg(in).putArg(out).putArgSize(sharedBufferSize).putArg(length).rewind(); + smallScan.setWorkSize(workSize, workSize); + + queue.putWriteBuffer(in, false); + queue.putWork(smallScan); + queue.putReadBuffer(out, true); + + in.release(); + out.release(); + + return output; + } + + @Override + public void release() { + program.release(); + } + + @Override + public boolean isReleased() { + return program == null || program.isReleased(); + } + + @Override + public String toString() { + return getClass().getSimpleName()+"["+OPERATION+", "+ELEMENT+"]"; + } + + public enum Op {ADD, MUL, MIN, MAX} + + private static class CLScanTask<B extends Buffer> extends CLTask<CLResourceQueueContext<Scan<B>>, B> { + + private final static int TYPE_ID = 2; + + private final B input; + private final B output; + private final Op op; + private final Class<? extends B> elementType; + private final Integer KEY; + + private CLScanTask(B input, B output, Op op, Class<? extends B> elementType) { + this.input = input; + this.output = output; + this.op = op; + this.elementType = elementType; + this.KEY = TYPE_ID + op.ordinal()*10 + 1000*ArgType.valueOf(elementType).ordinal(); + } + + @Override + public CLResourceQueueContext<Scan<B>> createQueueContext(CLCommandQueue queue) { + Scan<B> reduction = Scan.create(queue, op, elementType); + return new CLQueueContext.CLResourceQueueContext<Scan<B>>(queue, reduction); + } + + @Override + public B execute(CLResourceQueueContext<Scan<B>> context) { + return context.resource.scan(context.queue, input, output); + } + + @Override + public Object getContextKey() { + return KEY; + } + + @Override + public String toString() { + return getClass().getSimpleName()+"["+op+", "+elementType+", "+KEY+"]"; + } + } + +} diff --git a/src/com/jogamp/opencl/util/pp/scan.cl b/src/com/jogamp/opencl/util/pp/scan.cl new file mode 100644 index 00000000..9fc08cda --- /dev/null +++ b/src/com/jogamp/opencl/util/pp/scan.cl @@ -0,0 +1,92 @@ +// created on Wednesday, September 21 2011 +//@author mbien + +#ifdef DOUBLE_FP + #ifdef cl_khr_fp64 + #pragma OPENCL EXTENSION cl_khr_fp64 : enable + #elif defined(cl_amd_fp64) + #pragma OPENCL EXTENSION cl_amd_fp64 : enable + #else + #error "Double precision floating point not supported." + #endif +#endif + +#ifndef TYPE + typedef int TYPE; + #warning "TYPE was not set" +#endif + +/* + * Work-efficient compute implementation of scan, one thread per 2 elements + * O(log(n)) steps and O(n) adds using shared memory + * Uses a balanced tree algorithm. See Belloch, 1990 "Prefix Sums and Their Applications" + * Implementation is based on AMD's prefix sum example. + * length must be smaller than workgroup size, vector types are not supported. + */ +kernel void smallScan(const global TYPE* input, global TYPE* output, local TYPE* block, const uint length) { + + const uint id = get_global_id(0); + const uint id2= id*2; + + int offset = 1; + + /* Cache the computational window in shared memory */ + if(id2 < length - 1) { + block[id2] = input[id2]; + block[id2 + 1] = input[id2 + 1]; + }else if(id2 == length - 1) { // odd buffer length + block[id2] = input[id2]; + block[id2 + 1] = 0; + }else{ // no-op when out of bounds + block[id2] = 0; + block[id2 + 1] = 0; + } + + const uint limit = (length-length%2); + + /* build the sum in place up the tree */ + for(int d = limit>>1; d > 0; d >>=1) { + barrier(CLK_LOCAL_MEM_FENCE); + + if(id < d) { + int ai = offset*(id2 + 1) - 1; + int bi = offset*(id2 + 2) - 1; + + block[bi] += block[ai]; + } + offset *= 2; + } + + /* scan back down the tree */ + + /* clear the last element */ + if(id == 0) { + block[limit - 1] = 0; + } + + /* traverse down the tree building the scan in the place */ + for(int d = 1; d < limit; d *= 2) { + offset >>=1; + barrier(CLK_LOCAL_MEM_FENCE); + + if(id < d) { + int ai = offset*(id2 + 1) - 1; + int bi = offset*(id2 + 2) - 1; + + TYPE t = block[ai]; + block[ai] = block[bi]; + block[bi] += t; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + /*write the results back to global memory */ + if(id2 < length - 1) { + output[id2] = block[id2]; + output[id2 + 1] = block[id2 + 1]; + }else if(id2 == length - 1) { // odd length + output[id2] = block[id2]+block[id2-1]; + } +} + |