summary refs log tree commit diff stats
path: root/src/com/jogamp/opencl/demos/hellojocl
diff options
context:
space:
mode:
Diffstat (limited to 'src/com/jogamp/opencl/demos/hellojocl')
-rw-r--r-- src/com/jogamp/opencl/demos/hellojocl/HelloJOCL.java 111
1 files changed, 64 insertions, 47 deletions
diff --git a/src/com/jogamp/opencl/demos/hellojocl/HelloJOCL.java b/src/com/jogamp/opencl/demos/hellojocl/HelloJOCL.java
index 1daf890..70900eb 100644
--- a/src/com/jogamp/opencl/demos/hellojocl/HelloJOCL.java
+++ b/src/com/jogamp/opencl/demos/hellojocl/HelloJOCL.java
@@ -3,6 +3,7 @@ package com.jogamp.opencl.demos.hellojocl;
import com.jogamp.opencl.CLBuffer;
import com.jogamp.opencl.CLCommandQueue;
import com.jogamp.opencl.CLContext;
+import com.jogamp.opencl.CLDevice;
import com.jogamp.opencl.CLKernel;
import com.jogamp.opencl.CLProgram;
import java.io.IOException;
@@ -11,6 +12,7 @@ import java.util.Random;
import static java.lang.System.*;
import static com.jogamp.opencl.CLMemory.Mem.*;
+import static java.lang.Math.*;
/**
* Hello Java OpenCL example. Adds all elements of buffer A to buffer B
@@ -23,63 +25,78 @@ public class HelloJOCL {
public static void main(String[] args) throws IOException {
- int elementCount = 11444777; // Length of arrays to process
- int localWorkSize = 256; // Local work size dimensions
- int globalWorkSize = roundUp(localWorkSize, elementCount); // rounded up to the nearest multiple of the localWorkSize
-
- // set up
+ // set up (uses default CLPlatform and creates context for all devices)
CLContext context = CLContext.create();
-
- CLProgram program = context.createProgram(HelloJOCL.class.getResourceAsStream("VectorAdd.cl")).build();
-
- CLBuffer<FloatBuffer> clBufferA = context.createFloatBuffer(globalWorkSize, READ_ONLY);
- CLBuffer<FloatBuffer> clBufferB = context.createFloatBuffer(globalWorkSize, READ_ONLY);
- CLBuffer<FloatBuffer> clBufferC = context.createFloatBuffer(globalWorkSize, WRITE_ONLY);
-
- out.println("used device memory: "
- + (clBufferA.getCLSize()+clBufferB.getCLSize()+clBufferC.getCLSize())/1000000 +"MB");
-
- // fill read buffers with random numbers (just to have test data; seed is fixed -> results will not change between runs).
- fillBuffer(clBufferA.getBuffer(), 12345);
- fillBuffer(clBufferB.getBuffer(), 67890);
-
- // get a reference to the kernel functon with the name 'VectorAdd'
- // and map the buffers to its input parameters.
- CLKernel kernel = program.createCLKernel("VectorAdd");
- kernel.putArgs(clBufferA, clBufferB, clBufferC).putArg(elementCount);
-
- // create command queue on fastest device.
- CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue();
-
- // asynchronous write of data to GPU device, blocking read later to get the computed results back.
- long time = nanoTime();
- queue.putWriteBuffer(clBufferA, false)
- .putWriteBuffer(clBufferB, false)
- .put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize)
- .putReadBuffer(clBufferC, true);
- time = nanoTime() - time;
-
- // cleanup all resources associated with this context.
- context.release();
-
- // print first few elements of the resulting buffer to the console.
- out.println("a+b=c results snapshot: ");
- for(int i = 0; i < 10; i++)
- out.print(clBufferC.getBuffer().get() + ", ");
- out.println("...; " + clBufferC.getBuffer().remaining() + " more");
-
- out.println("computation took: "+(time/1000000)+"ms");
+ out.println("created "+context);
+
+ // always make sure to release the context under all circumstances
+ // not needed for this particular sample but recommended
+ try{
+
+ // select fastest device
+ CLDevice device = context.getMaxFlopsDevice();
+ out.println("using "+device);
+
+ // create command queue on device.
+ CLCommandQueue queue = device.createCommandQueue();
+
+ int elementCount = 1444477; // Length of arrays to process
+ int localWorkSize = min(device.getMaxWorkGroupSize(), 256); // Local work size dimensions
+ int globalWorkSize = roundUp(localWorkSize, elementCount); // rounded up to the nearest multiple of the localWorkSize
+
+ // load sources, create and build program
+ CLProgram program = context.createProgram(HelloJOCL.class.getResourceAsStream("VectorAdd.cl")).build();
+
+ // A, B are input buffers, C is for the result
+ CLBuffer<FloatBuffer> clBufferA = context.createFloatBuffer(globalWorkSize, READ_ONLY);
+ CLBuffer<FloatBuffer> clBufferB = context.createFloatBuffer(globalWorkSize, READ_ONLY);
+ CLBuffer<FloatBuffer> clBufferC = context.createFloatBuffer(globalWorkSize, WRITE_ONLY);
+
+ out.println("used device memory: "
+ + (clBufferA.getCLSize()+clBufferB.getCLSize()+clBufferC.getCLSize())/1000000 +"MB");
+
+ // fill input buffers with random numbers
+ // (just to have test data; seed is fixed -> results will not change between runs).
+ fillBuffer(clBufferA.getBuffer(), 12345);
+ fillBuffer(clBufferB.getBuffer(), 67890);
+
+ // get a reference to the kernel function with the name 'VectorAdd'
+ // and map the buffers to its input parameters.
+ CLKernel kernel = program.createCLKernel("VectorAdd");
+ kernel.putArgs(clBufferA, clBufferB, clBufferC).putArg(elementCount);
+
+ // asynchronous write of data to GPU device,
+ // followed by blocking read to get the computed results back.
+ long time = nanoTime();
+ queue.putWriteBuffer(clBufferA, false)
+ .putWriteBuffer(clBufferB, false)
+ .put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize)
+ .putReadBuffer(clBufferC, true);
+ time = nanoTime() - time;
+
+ // print first few elements of the resulting buffer to the console.
+ out.println("a+b=c results snapshot: ");
+ for(int i = 0; i < 10; i++)
+ out.print(clBufferC.getBuffer().get() + ", ");
+ out.println("...; " + clBufferC.getBuffer().remaining() + " more");
+
+ out.println("computation took: "+(time/1000000)+"ms");
+
+ }finally{
+ // cleanup all resources associated with this context.
+ context.release();
+ }
}
- private static final void fillBuffer(FloatBuffer buffer, int seed) {
+ private static void fillBuffer(FloatBuffer buffer, int seed) {
Random rnd = new Random(seed);
while(buffer.remaining() != 0)
buffer.put(rnd.nextFloat()*100);
buffer.rewind();
}
- private static final int roundUp(int groupSize, int globalSize) {
+ private static int roundUp(int groupSize, int globalSize) {
int r = globalSize % groupSize;
if (r == 0) {
return globalSize;