summaryrefslogtreecommitdiffstats
path: root/src/com/jogamp/opencl
diff options
context:
space:
mode:
authorMichael Bien <[email protected]>2011-09-21 21:29:12 +0200
committerMichael Bien <[email protected]>2011-09-21 21:29:12 +0200
commitb87b657642b6bda35bed85e8b43b3b68ac994c25 (patch)
tree86b7ffb45db16b2849184abad76bb9d1c90294b5 /src/com/jogamp/opencl
parentbaf07b12a2a62003334d17113e8dad1e92b80029 (diff)
initial version of parallel scan primitive. The implementation is work in progress, final version will use a different kernel/algorithm.
random float utility method for TestUtils
Diffstat (limited to 'src/com/jogamp/opencl')
-rw-r--r--src/com/jogamp/opencl/util/pp/Scan.java193
-rw-r--r--src/com/jogamp/opencl/util/pp/scan.cl92
2 files changed, 285 insertions, 0 deletions
diff --git a/src/com/jogamp/opencl/util/pp/Scan.java b/src/com/jogamp/opencl/util/pp/Scan.java
new file mode 100644
index 00000000..9d79087e
--- /dev/null
+++ b/src/com/jogamp/opencl/util/pp/Scan.java
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2011, Michael Bien
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are
+ * permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this list of
+ * conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, this list
+ * of conditions and the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * Created on Tuesday, September 20 2011 22:26
+ */
+package com.jogamp.opencl.util.pp;
+
+import com.jogamp.opencl.CLBuffer;
+import com.jogamp.opencl.CLCommandQueue;
+import com.jogamp.opencl.CLContext;
+import com.jogamp.opencl.CLProgram;
+import com.jogamp.opencl.CLResource;
+import com.jogamp.opencl.CLWork.CLWork1D;
+import com.jogamp.opencl.util.CLUtil;
+import com.jogamp.opencl.util.concurrent.CLQueueContext;
+import com.jogamp.opencl.util.concurrent.CLTask;
+import com.jogamp.opencl.util.concurrent.CLQueueContext.CLResourceQueueContext;
+import com.jogamp.opencl.util.CLProgramConfiguration;
+import java.io.IOException;
+import java.nio.Buffer;
+
+import static com.jogamp.opencl.CLMemory.Mem.*;
+
+/**
+ * Prototype, not ready for general use.
+ * @author Michael Bien
+ */
+/*public */class Scan<B extends Buffer> implements CLResource {
+
+ private static final String SOURCES;
+
+ private final Op OPERATION;
+ private final ArgType ELEMENT;
+
+ private final CLProgram program;
+ private final CLWork1D smallScan;
+
+ static{
+ try {
+ StringBuilder sb = new StringBuilder(2048);
+ CLUtil.readStream(Scan.class.getResourceAsStream("scan.cl"), sb);
+ SOURCES = sb.toString();
+ } catch (IOException ex) {
+ throw new RuntimeException("can not initialize Reduction.", ex);
+ }
+ }
+
+ private <B extends Buffer> Scan(CLContext context, Op op, Class<B> elementType) {
+
+ if(!op.equals(Op.ADD)) {
+ throw new IllegalArgumentException("only add is supported for now");
+ }
+
+ this.ELEMENT = ArgType.valueOf(elementType);
+ this.OPERATION = op;
+
+ this.program = context.createProgram(SOURCES);
+
+ CLProgramConfiguration config = program.prepare();
+ config.withDefine("OP_"+op.name())
+ .withDefine("TYPE", ELEMENT.vectorType(1));
+ if(ELEMENT.equals(ArgType.DOUBLE)) {
+ config.withDefine("DOUBLE_FP");
+ }
+ config.build();
+
+ smallScan = CLWork1D.create1D(program.createCLKernel("smallScan"));
+ }
+
+ public static <B extends Buffer> Scan<B> create(CLContext context, Op op, Class<? extends B> elementType) {
+ return new Scan<B>(context, op, elementType);
+ }
+
+ public static <B extends Buffer> Scan<B> create(CLCommandQueue queue, Op op, Class<? extends B> elementType) {
+ return create(queue.getContext(), op, elementType);
+ }
+
+ public static <B extends Buffer> CLTask<CLResourceQueueContext<Scan<B>>, B> createTask(B input, B output, Op op, Class<? extends B> elementType) {
+ return new CLScanTask<B>(input, output, op, elementType);
+ }
+
+ public B scan(CLCommandQueue queue, B input, B output) {
+
+ int length = input.capacity();
+
+ int maxSize = (int)smallScan.getKernel().getWorkGroupSize(queue.getDevice());
+ if(length > maxSize*2) {
+ throw new IllegalArgumentException("buffer was to large for the given hardware");
+ }
+
+ int workSize = (length+length%2)/2; // half, rounded up
+ int sharedBufferSize = maxSize * ELEMENT.SIZE*2;
+
+ CLContext context = queue.getContext();
+
+ CLBuffer<B> in = context.createBuffer(input, READ_ONLY);
+ CLBuffer<B> out = context.createBuffer(output, WRITE_ONLY);
+
+ smallScan.getKernel().putArg(in).putArg(out).putArgSize(sharedBufferSize).putArg(length).rewind();
+ smallScan.setWorkSize(workSize, workSize);
+
+ queue.putWriteBuffer(in, false);
+ queue.putWork(smallScan);
+ queue.putReadBuffer(out, true);
+
+ in.release();
+ out.release();
+
+ return output;
+ }
+
+ @Override
+ public void release() {
+ program.release();
+ }
+
+ @Override
+ public boolean isReleased() {
+ return program == null || program.isReleased();
+ }
+
+ @Override
+ public String toString() {
+ return getClass().getSimpleName()+"["+OPERATION+", "+ELEMENT+"]";
+ }
+
+ public enum Op {ADD, MUL, MIN, MAX}
+
+ private static class CLScanTask<B extends Buffer> extends CLTask<CLResourceQueueContext<Scan<B>>, B> {
+
+ private final static int TYPE_ID = 2;
+
+ private final B input;
+ private final B output;
+ private final Op op;
+ private final Class<? extends B> elementType;
+ private final Integer KEY;
+
+ private CLScanTask(B input, B output, Op op, Class<? extends B> elementType) {
+ this.input = input;
+ this.output = output;
+ this.op = op;
+ this.elementType = elementType;
+ this.KEY = TYPE_ID + op.ordinal()*10 + 1000*ArgType.valueOf(elementType).ordinal();
+ }
+
+ @Override
+ public CLResourceQueueContext<Scan<B>> createQueueContext(CLCommandQueue queue) {
+ Scan<B> reduction = Scan.create(queue, op, elementType);
+ return new CLQueueContext.CLResourceQueueContext<Scan<B>>(queue, reduction);
+ }
+
+ @Override
+ public B execute(CLResourceQueueContext<Scan<B>> context) {
+ return context.resource.scan(context.queue, input, output);
+ }
+
+ @Override
+ public Object getContextKey() {
+ return KEY;
+ }
+
+ @Override
+ public String toString() {
+ return getClass().getSimpleName()+"["+op+", "+elementType+", "+KEY+"]";
+ }
+ }
+
+}
diff --git a/src/com/jogamp/opencl/util/pp/scan.cl b/src/com/jogamp/opencl/util/pp/scan.cl
new file mode 100644
index 00000000..9fc08cda
--- /dev/null
+++ b/src/com/jogamp/opencl/util/pp/scan.cl
@@ -0,0 +1,92 @@
+// created on Wednesday, September 21 2011
+//@author mbien
+
+#ifdef DOUBLE_FP
+ #ifdef cl_khr_fp64
+ #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+ #elif defined(cl_amd_fp64)
+ #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+ #else
+ #error "Double precision floating point not supported."
+ #endif
+#endif
+
+#ifndef TYPE
+ typedef int TYPE;
+ #warning "TYPE was not set"
+#endif
+
+/*
+ * Work-efficient compute implementation of scan, one thread per 2 elements
+ * O(log(n)) steps and O(n) adds using shared memory
+ * Uses a balanced tree algorithm. See Belloch, 1990 "Prefix Sums and Their Applications"
+ * Implementation is based on AMD's prefix sum example.
+ * length must be smaller than workgroup size, vector types are not supported.
+ */
+kernel void smallScan(const global TYPE* input, global TYPE* output, local TYPE* block, const uint length) {
+
+ const uint id = get_global_id(0);
+ const uint id2= id*2;
+
+ int offset = 1;
+
+ /* Cache the computational window in shared memory */
+ if(id2 < length - 1) {
+ block[id2] = input[id2];
+ block[id2 + 1] = input[id2 + 1];
+ }else if(id2 == length - 1) { // odd buffer length
+ block[id2] = input[id2];
+ block[id2 + 1] = 0;
+ }else{ // no-op when out of bounds
+ block[id2] = 0;
+ block[id2 + 1] = 0;
+ }
+
+ const uint limit = (length-length%2);
+
+ /* build the sum in place up the tree */
+ for(int d = limit>>1; d > 0; d >>=1) {
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if(id < d) {
+ int ai = offset*(id2 + 1) - 1;
+ int bi = offset*(id2 + 2) - 1;
+
+ block[bi] += block[ai];
+ }
+ offset *= 2;
+ }
+
+ /* scan back down the tree */
+
+ /* clear the last element */
+ if(id == 0) {
+ block[limit - 1] = 0;
+ }
+
+ /* traverse down the tree building the scan in the place */
+ for(int d = 1; d < limit; d *= 2) {
+ offset >>=1;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if(id < d) {
+ int ai = offset*(id2 + 1) - 1;
+ int bi = offset*(id2 + 2) - 1;
+
+ TYPE t = block[ai];
+ block[ai] = block[bi];
+ block[bi] += t;
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ /*write the results back to global memory */
+ if(id2 < length - 1) {
+ output[id2] = block[id2];
+ output[id2 + 1] = block[id2 + 1];
+ }else if(id2 == length - 1) { // odd length
+ output[id2] = block[id2]+block[id2-1];
+ }
+}
+