summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--resources/cl-common.cfg25
-rw-r--r--src/com/mbien/opencl/CLCommandQueue.java56
-rw-r--r--src/com/mbien/opencl/CLKernel.java18
-rw-r--r--src/com/mbien/opencl/QueueBarrier.java48
-rw-r--r--test/com/mbien/opencl/CLConcurrencyTest.java126
5 files changed, 229 insertions, 44 deletions
diff --git a/resources/cl-common.cfg b/resources/cl-common.cfg
index 16b2017d..470c562d 100644
--- a/resources/cl-common.cfg
+++ b/resources/cl-common.cfg
@@ -29,30 +29,27 @@ Ignore CL_ULONG_MAX
#enforce client side "good behavior" by generating direct-memory-only bindings for
#performance critical functions.
#NioDirectOnly __ALL__
+#command queue ops
NioDirectOnly clEnqueueBarrier
-NioDirectOnly clEnqueueMarker
-NioDirectOnly clEnqueueNativeKernel
-NioDirectOnly clEnqueueUnmapMemObject
-NioDirectOnly clEnqueueWaitForEvents
-NioDirectOnly clEnqueueWriteImage
-NioDirectOnly clEnqueueReadBuffer
-NioDirectOnly clEnqueueWriteBuffer
-NioDirectOnly clEnqueueReadImage
-NioDirectOnly clEnqueueWriteImage
+NioDirectOnly clEnqueueCopyBuffer
+NioDirectOnly clEnqueueCopyBufferToImage
NioDirectOnly clEnqueueCopyImage
NioDirectOnly clEnqueueCopyImageToBuffer
-NioDirectOnly clEnqueueCopyBufferToImage
NioDirectOnly clEnqueueMapBuffer
NioDirectOnly clEnqueueMapImage
-NioDirectOnly clEnqueueUnmapMemObject
+NioDirectOnly clEnqueueMarker
NioDirectOnly clEnqueueNDRangeKernel
+NioDirectOnly clEnqueueReadBuffer
+NioDirectOnly clEnqueueReadImage
NioDirectOnly clEnqueueTask
-NioDirectOnly clEnqueueNativeKernel
-NioDirectOnly clEnqueueMarker
+NioDirectOnly clEnqueueUnmapMemObject
NioDirectOnly clEnqueueWaitForEvents
-NioDirectOnly clEnqueueBarrier
+NioDirectOnly clEnqueueWriteBuffer
+NioDirectOnly clEnqueueWriteImage
NioDirectOnly clEnqueueAcquireGLObjects
NioDirectOnly clEnqueueReleaseGLObjects
+#kernel ops
+NioDirectOnly clSetKernelArg
#common rename emitted struct accessors
#struct cl_image_format
diff --git a/src/com/mbien/opencl/CLCommandQueue.java b/src/com/mbien/opencl/CLCommandQueue.java
index 38ec7274..4e88ff1d 100644
--- a/src/com/mbien/opencl/CLCommandQueue.java
+++ b/src/com/mbien/opencl/CLCommandQueue.java
@@ -48,11 +48,10 @@ public class CLCommandQueue implements CLResource {
}
public CLCommandQueue putWriteBuffer(CLBuffer<?> writeBuffer, boolean blockingRead) {
- return putWriteBuffer(writeBuffer, null, blockingRead);
+ return putWriteBuffer(writeBuffer, blockingRead, null);
}
- public CLCommandQueue putWriteBuffer(CLBuffer<?> writeBuffer, CLEventList events, boolean blockingWrite) {
- PointerBuffer pb = PointerBuffer.allocateDirect(2);
+ public CLCommandQueue putWriteBuffer(CLBuffer<?> writeBuffer, boolean blockingWrite, CLEventList events) {
int ret = cl.clEnqueueWriteBuffer(
ID, writeBuffer.ID, blockingWrite ? CL_TRUE : CL_FALSE,
@@ -70,11 +69,11 @@ public class CLCommandQueue implements CLResource {
}
public CLCommandQueue putReadBuffer(CLBuffer<?> readBuffer, boolean blockingRead) {
- putReadBuffer(readBuffer, null, blockingRead);
+ putReadBuffer(readBuffer, blockingRead, null);
return this;
}
- public CLCommandQueue putReadBuffer(CLBuffer<?> readBuffer, CLEventList events, boolean blockingRead) {
+ public CLCommandQueue putReadBuffer(CLBuffer<?> readBuffer, boolean blockingRead, CLEventList events) {
int ret = cl.clEnqueueReadBuffer(
ID, readBuffer.ID, blockingRead ? CL_TRUE : CL_FALSE,
@@ -150,11 +149,6 @@ public class CLCommandQueue implements CLResource {
return this;
}
- public CLCommandQueue putTask() {
-
- return this;
- }
-
public CLBuffer putMapBuffer() {
return null;
@@ -198,12 +192,35 @@ public class CLCommandQueue implements CLResource {
return this;
}
+ /**
+ * {@link #putTask} equivalent to calling
+ * {@link #put1DRangeKernel(CLKernel kernel, long globalWorkOffset, long globalWorkSize, long localWorkSize)}
+ * with globalWorkOffset = null, globalWorkSize set to 1, and localWorkSize set to 1.
+ */
+ public CLCommandQueue putTask(CLKernel kernel) {
+ int ret = cl.clEnqueueTask(ID, kernel.ID, 0, null, null);
+ checkForError(ret, "can not enqueue Task");
+ return this;
+ }
+
+ /**
+ * @see #putTask(com.mbien.opencl.CLKernel)
+ */
+ public CLCommandQueue putTask(CLKernel kernel, CLEventList events) {
+ int ret = cl.clEnqueueTask(ID, kernel.ID, 0, null, events==null ? null : events.IDs);
+ checkForError(ret, "can not enqueue Task");
+ if(events != null) {
+ events.createEvent(context);
+ }
+ return this;
+ }
+
public CLCommandQueue put1DRangeKernel(CLKernel kernel, long globalWorkOffset, long globalWorkSize, long localWorkSize) {
- this.put1DRangeKernel(kernel, null, globalWorkOffset, globalWorkSize, localWorkSize);
+ this.put1DRangeKernel(kernel, globalWorkOffset, globalWorkSize, localWorkSize, null);
return this;
}
- public CLCommandQueue put1DRangeKernel(CLKernel kernel, CLEventList events, long globalWorkOffset, long globalWorkSize, long localWorkSize) {
+ public CLCommandQueue put1DRangeKernel(CLKernel kernel, long globalWorkOffset, long globalWorkSize, long localWorkSize, CLEventList events) {
PointerBuffer globWO = null;
PointerBuffer globWS = null;
PointerBuffer locWS = null;
@@ -218,25 +235,24 @@ public class CLCommandQueue implements CLResource {
locWS = bufferC.put(1, localWorkSize).position(1);
}
- this.putNDRangeKernel(kernel, events, 1, globWO, globWS, locWS);
+ this.putNDRangeKernel(kernel, 1, globWO, globWS, locWS, events);
return this;
}
public CLCommandQueue put2DRangeKernel(CLKernel kernel, long globalWorkOffsetX, long globalWorkOffsetY,
long globalWorkSizeX, long globalWorkSizeY,
long localWorkSizeX, long localWorkSizeY) {
- this.put2DRangeKernel(kernel, null,
+ this.put2DRangeKernel(kernel,
globalWorkOffsetX, globalWorkOffsetY,
globalWorkSizeX, globalWorkSizeY,
- localWorkSizeX, localWorkSizeY);
+ localWorkSizeX, localWorkSizeY, null);
return this;
}
- public CLCommandQueue put2DRangeKernel(CLKernel kernel, CLEventList events,
- long globalWorkOffsetX, long globalWorkOffsetY,
+ public CLCommandQueue put2DRangeKernel(CLKernel kernel, long globalWorkOffsetX, long globalWorkOffsetY,
long globalWorkSizeX, long globalWorkSizeY,
- long localWorkSizeX, long localWorkSizeY) {
+ long localWorkSizeX, long localWorkSizeY, CLEventList events) {
PointerBuffer globalWorkOffset = null;
PointerBuffer globalWorkSize = null;
PointerBuffer localWorkSize = null;
@@ -255,11 +271,11 @@ public class CLCommandQueue implements CLResource {
}
public CLCommandQueue putNDRangeKernel(CLKernel kernel, int workDimension, PointerBuffer globalWorkOffset, PointerBuffer globalWorkSize, PointerBuffer localWorkSize) {
- this.putNDRangeKernel(kernel, null, workDimension, globalWorkOffset, globalWorkSize, localWorkSize);
+ this.putNDRangeKernel(kernel, workDimension, globalWorkOffset, globalWorkSize, localWorkSize, null);
return this;
}
- public CLCommandQueue putNDRangeKernel(CLKernel kernel, CLEventList events, int workDimension, PointerBuffer globalWorkOffset, PointerBuffer globalWorkSize, PointerBuffer localWorkSize) {
+ public CLCommandQueue putNDRangeKernel(CLKernel kernel, int workDimension, PointerBuffer globalWorkOffset, PointerBuffer globalWorkSize, PointerBuffer localWorkSize, CLEventList events) {
int ret = cl.clEnqueueNDRangeKernel(
ID, kernel.ID, workDimension,
diff --git a/src/com/mbien/opencl/CLKernel.java b/src/com/mbien/opencl/CLKernel.java
index e7ac4b4d..2115a9f8 100644
--- a/src/com/mbien/opencl/CLKernel.java
+++ b/src/com/mbien/opencl/CLKernel.java
@@ -26,12 +26,15 @@ public class CLKernel implements CLResource {
private final CLProgram program;
private final CL cl;
+ private final ByteBuffer buffer;
+
private int argIndex;
CLKernel(CLProgram program, long id) {
this.ID = id;
this.program = program;
this.cl = program.context.cl;
+ this.buffer = BufferFactory.newDirectByteBuffer(8);
long[] longArray = new long[1];
@@ -136,19 +139,19 @@ public class CLKernel implements CLResource {
}
private final Buffer wrap(float value) {
- return BufferFactory.newDirectByteBuffer(4).putFloat(value).rewind();
+ return buffer.putFloat(value).rewind();
}
private final Buffer wrap(double value) {
- return BufferFactory.newDirectByteBuffer(8).putDouble(value).rewind();
+ return buffer.putDouble(value).rewind();
}
private final Buffer wrap(int value) {
- return BufferFactory.newDirectByteBuffer(4).putInt(value).rewind();
+ return buffer.putInt(value).rewind();
}
private final Buffer wrap(long value) {
- return BufferFactory.newDirectByteBuffer(8).putLong(value).rewind();
+ return buffer.putLong(value).rewind();
}
public CLKernel rewind() {
@@ -196,5 +199,12 @@ public class CLKernel implements CLResource {
hash = 43 * hash + (this.program != null ? this.program.hashCode() : 0);
return hash;
}
+
+ CLKernel copy() {
+ int[] err = new int[1];
+ long newID = cl.clCreateKernel(program.ID, name, err, 0);
+ checkForError(err[0], "can not copy kernel");
+ return new CLKernel(program, newID);
+ }
}
diff --git a/src/com/mbien/opencl/QueueBarrier.java b/src/com/mbien/opencl/QueueBarrier.java
new file mode 100644
index 00000000..247ede7a
--- /dev/null
+++ b/src/com/mbien/opencl/QueueBarrier.java
@@ -0,0 +1,48 @@
+package com.mbien.opencl;
+
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+
+/**
+ *
+ * @author Michael Bien
+ */
+public class QueueBarrier {
+
+ private final CountDownLatch latch;
+
+ public QueueBarrier(int queueCount) {
+ this.latch = new CountDownLatch(queueCount);
+ }
+
+ /**
+ * Blocks the current Thread until the given events on the CLCommandQueue occurred.
+ * This method may be invoked concurrently without synchronization on the QueueBarrier object
+ * as long each Thread passes a distinct CLCommandQueue as parameter to this method.
+ */
+ public QueueBarrier waitFor(CLCommandQueue queue, CLEventList events) {
+ queue.putWaitForEvents(events);
+ latch.countDown();
+ return this;
+ }
+
+ /**
+ * Blocks until all Threads which called {@link #waitFor}
+ * continue excecution.
+ * This method blocks only once, all subsequent calls are ignored.
+ */
+ public QueueBarrier await() throws InterruptedException {
+ latch.await();
+ return this;
+ }
+ /**
+ * @see {@link #await()}
+ * @param timeout the maximum time to wait
+ * @param unit the time unit of the {@code timeout} argument
+ */
+ public QueueBarrier await(long timeout, TimeUnit unit) throws InterruptedException {
+ latch.await(timeout, unit);
+ return this;
+ }
+
+}
diff --git a/test/com/mbien/opencl/CLConcurrencyTest.java b/test/com/mbien/opencl/CLConcurrencyTest.java
index 057b57ee..dfaa4e49 100644
--- a/test/com/mbien/opencl/CLConcurrencyTest.java
+++ b/test/com/mbien/opencl/CLConcurrencyTest.java
@@ -1,7 +1,6 @@
package com.mbien.opencl;
import com.mbien.opencl.CLBuffer.Mem;
-import com.mbien.opencl.CLCommandQueue.Mode;
import java.io.IOException;
import java.nio.ByteBuffer;
import org.junit.Test;
@@ -45,9 +44,8 @@ public class CLConcurrencyTest {
assertEquals(0, events.size());
- // asynchronous write of data to GPU device, blocking read later to get the computed results back.
- queue.putWriteBuffer(clBufferA, events, false) // write A
- .putWriteBuffer(clBufferB, events, false); // write B
+ queue.putWriteBuffer(clBufferA, false, events) // write A
+ .putWriteBuffer(clBufferB, false, events); // write B
assertEquals(2, events.size());
queue.putWaitForEvents(events);
@@ -56,10 +54,10 @@ public class CLConcurrencyTest {
assertEquals(0, events.size());
vectorAddKernel.setArgs(clBufferA, clBufferB, clBufferC); // C = A+B
- queue.put1DRangeKernel(vectorAddKernel, events, 0, elements, 256);
+ queue.put1DRangeKernel(vectorAddKernel, 0, elements, 256, events);
vectorAddKernel.setArgs(clBufferA, clBufferB, clBufferD); // D = A+B
- queue.put1DRangeKernel(vectorAddKernel, events, 0, elements, 256);
+ queue.put1DRangeKernel(vectorAddKernel, 0, elements, 256, events);
assertEquals(2, events.size());
queue.putWaitForEvent(events, 0)
@@ -80,4 +78,120 @@ public class CLConcurrencyTest {
}
+ @Test
+ public void concurrencyTest() throws IOException, InterruptedException {
+
+ out.println(" - - - queue synchronisation test - - - ");
+
+ final int elements = ONE_MB/SIZEOF_INT * 10; // 20MB per buffer
+
+ CLContext context = CLContext.create();
+
+ CLDevice[] devices = context.getCLDevices();
+
+ if(devices.length < 2) {
+ out.println("aborting test... need at least 2 devices");
+ context.release();
+ return;
+ }
+
+ final CLBuffer<ByteBuffer> clBufferC = context.createByteBuffer(elements*SIZEOF_INT, Mem.READ_ONLY);
+ final CLBuffer<ByteBuffer> clBufferD = context.createByteBuffer(elements*SIZEOF_INT, Mem.READ_ONLY);
+
+ final CLBuffer<ByteBuffer> clBufferA1 = context.createByteBuffer(elements*SIZEOF_INT, Mem.READ_ONLY);
+ final CLBuffer<ByteBuffer> clBufferB1 = context.createByteBuffer(elements*SIZEOF_INT, Mem.READ_ONLY);
+ final CLBuffer<ByteBuffer> clBufferA2 = context.createByteBuffer(elements*SIZEOF_INT, Mem.READ_ONLY);
+ final CLBuffer<ByteBuffer> clBufferB2 = context.createByteBuffer(elements*SIZEOF_INT, Mem.READ_ONLY);
+
+ CLProgram program = context.createProgram(getClass().getResourceAsStream("testkernels.cl")).build();
+
+ final CLKernel vectorAddKernel1 = program.getCLKernel("VectorAddGM")
+ .setArg(3, elements);
+
+ //TODO introduce public api for cloning/creating kernels
+ final CLKernel vectorAddKernel2 = vectorAddKernel1.copy()
+ .setArg(3, elements);
+
+
+ int secondDevice = devices.length > 1 ? 1 : 0;
+
+ final CLCommandQueue queue1 = devices[0 ].createCommandQueue();
+ final CLCommandQueue queue2 = devices[secondDevice].createCommandQueue();
+
+ if(secondDevice > 0)
+ System.out.println("using two devices");
+
+ final QueueBarrier barrier = new QueueBarrier(2);
+
+ Thread thread1 = new Thread("C") {
+
+ @Override
+ public void run() {
+
+ fillBuffer(clBufferA1.buffer, 12345);
+ fillBuffer(clBufferB1.buffer, 67890);
+
+// System.out.println("C buffer");
+ queue1.putWriteBuffer(clBufferA1, false) // write A
+ .putWriteBuffer(clBufferB1, true); // write B
+
+// System.out.println("C args");
+ vectorAddKernel1.setArgs(clBufferA1, clBufferB1, clBufferC); // C = A+B
+
+// System.out.println("C kernels");
+ CLEventList events1 = new CLEventList(2);
+ queue1.put1DRangeKernel(vectorAddKernel1, 0, elements, 256, events1)
+ .putReadBuffer(clBufferC, false, events1);
+
+ barrier.waitFor(queue1, events1);
+
+ }
+
+ };
+
+ Thread thread2 = new Thread("D") {
+
+ @Override
+ public void run() {
+
+ fillBuffer(clBufferA2.buffer, 12345);
+ fillBuffer(clBufferB2.buffer, 67890);
+
+// System.out.println("D buffer");
+ queue2.putWriteBuffer(clBufferA2, false) // write A
+ .putWriteBuffer(clBufferB2, true); // write B
+
+// System.out.println("D args");
+ vectorAddKernel2.setArgs(clBufferA2, clBufferB2, clBufferD); // D = A+B
+
+// System.out.println("D kernels");
+ CLEventList events2 = new CLEventList(2);
+ queue2.put1DRangeKernel(vectorAddKernel2, 0, elements, 256, events2)
+ .putReadBuffer(clBufferD, false, events2);
+
+ barrier.waitFor(queue2, events2);
+
+ }
+
+ };
+
+ out.println("starting threads");
+ thread1.start();
+ thread2.start();
+ barrier.await();
+ out.println("done");
+
+ checkIfEqual(clBufferC.buffer, clBufferD.buffer, elements);
+
+ context.release();
+
+// vectorAddKernel2.release();
+
+ out.println("results are valid");
+
+ }
+
+
+
+
} \ No newline at end of file