7 files changed, 1382 insertions, 0 deletions
diff --git a/test/com/jogamp/opencl/CLBufferTest.java b/test/com/jogamp/opencl/CLBufferTest.java
new file mode 100644
index 00000000..d0c8c2f9
--- /dev/null
+++ b/test/com/jogamp/opencl/CLBufferTest.java
@@ -0,0 +1,149 @@
+package com.jogamp.opencl;
+
+import com.jogamp.opencl.CLMemory.Mem;
+import com.jogamp.opencl.CLMemory.Map;
+import com.jogamp.common.nio.Buffers;
+import java.nio.ByteBuffer;
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+import static java.lang.System.*;
+import static com.jogamp.opencl.TestUtils.*;
+import static com.jogamp.common.nio.Buffers.*;
+
+/**
+ *
+ * @author Michael Bien
+ */
+public class CLBufferTest {
+
+    @Test
+    public void writeCopyReadBufferTest() {
+        // Round trip: host -> source buffer -> copy command -> target buffer -> host, then compare.
+        out.println(" - - - highLevelTest; copy buffer test - - - ");
+
+        final int byteSize = NUM_ELEMENTS*SIZEOF_INT;
+
+        CLContext context = CLContext.create();
+
+        // no kernel runs in this test, so the memory access flag most likely does not matter
+        CLBuffer<ByteBuffer> source = context.createByteBuffer(byteSize, Mem.READ_ONLY);
+        CLBuffer<ByteBuffer> target = context.createByteBuffer(byteSize, Mem.READ_ONLY);
+
+        // only the source gets a payload; the target has to receive it through the copy command
+        fillBuffer(source.buffer, 12345);
+
+        CLCommandQueue queue = context.getDevices()[0].createCommandQueue();
+
+        // the write is enqueued non-blocking, the final read blocks until the data is back on the host
+        queue.putWriteBuffer(source, false)                             // write source
+             .putCopyBuffer(source, target, source.buffer.capacity())   // copy source -> target
+             .putReadBuffer(target, true)                               // read target
+             .finish();
+
+        context.release();
+
+        out.println("validating computed results...");
+        checkIfEqual(source.buffer, target.buffer, NUM_ELEMENTS);
+        out.println("results are valid");
+
+    }
+
+    @Test
+    public void bufferWithHostPointerTest() {
+        // Creates a CL buffer on top of an existing host buffer and copies it into a second buffer.
+        out.println(" - - - highLevelTest; host pointer test - - - ");
+
+        final int byteSize = NUM_ELEMENTS*SIZEOF_INT;
+
+        CLContext context = CLContext.create();
+
+        ByteBuffer hostData = Buffers.newDirectByteBuffer(byteSize);
+        // the host buffer carries the payload which is expected to arrive in the copy target
+        fillBuffer(hostData, 12345);
+
+        CLCommandQueue queue = context.getDevices()[0].createCommandQueue();
+
+        Mem[] hostPointerModes = {Mem.COPY_BUFFER, Mem.USE_BUFFER};
+
+        for(Mem mode : hostPointerModes) {
+
+            out.println("testing with "+mode + " config");
+
+            CLBuffer<ByteBuffer> source = context.createBuffer(hostData, Mem.READ_ONLY, mode);
+            CLBuffer<ByteBuffer> target = context.createByteBuffer(byteSize, Mem.READ_ONLY);
+
+            // no explicit upload here: the source buffer was created from the host buffer
+            queue.putCopyBuffer(source, target, source.buffer.capacity())    // copy source -> target
+                 .putReadBuffer(target, true)                                // read target
+                 .finish();
+
+            assertEquals(2, context.getMemoryObjects().size());
+            source.release();
+            assertEquals(1, context.getMemoryObjects().size());
+            target.release();
+            assertEquals(0, context.getMemoryObjects().size());
+
+            // the host pointer config worked if both NIO buffers hold the same values
+            out.println("validating computed results...");
+            checkIfEqual(source.buffer, target.buffer, NUM_ELEMENTS);
+            out.println("results are valid");
+        }
+
+        context.release();
+    }
+    
+    @Test
+    public void mapBufferTest() {
+        // Fills buffer A through a host mapping, copies A -> B and compares fresh mappings of both buffers.
+        out.println(" - - - highLevelTest; map buffer test - - - ");
+
+        final int elements = NUM_ELEMENTS;
+        final int sizeInBytes = elements*SIZEOF_INT;
+
+        CLContext context;
+        CLBuffer<?> clBufferA;
+        CLBuffer<?> clBufferB;
+
+        // We will have to allocate mappable NIO memory on non CPU contexts
+        // since we can't map e.g GPU memory.
+        if(CLPlatform.getDefault().listCLDevices(CLDevice.Type.CPU).length > 0) {
+
+            context = CLContext.create(CLDevice.Type.CPU);
+
+            clBufferA = context.createBuffer(sizeInBytes, Mem.READ_WRITE);
+            clBufferB = context.createBuffer(sizeInBytes, Mem.READ_WRITE);
+        }else{
+
+            context = CLContext.create();
+
+            clBufferA = context.createByteBuffer(sizeInBytes, Mem.READ_WRITE, Mem.USE_BUFFER);
+            clBufferB = context.createByteBuffer(sizeInBytes, Mem.READ_WRITE, Mem.USE_BUFFER);
+        }
+
+        CLCommandQueue queue = context.getDevices()[0].createCommandQueue();
+
+        // fill only first buffer -> we will copy the payload to the second later.
+        ByteBuffer mappedBufferA = queue.putMapBuffer(clBufferA, Map.READ_WRITE, true);
+        assertEquals(sizeInBytes, mappedBufferA.capacity());
+        fillBuffer(mappedBufferA, 12345);           // write to A
+
+        queue.putUnmapMemory(clBufferA)             // unmap A
+             .putCopyBuffer(clBufferA, clBufferB);  // copy A -> B
+
+        // a mapping must not be touched once it is unmapped -> map A again (and B) for read operations
+        mappedBufferA = queue.putMapBuffer(clBufferA, Map.READ, true);
+        ByteBuffer mappedBufferB = queue.putMapBuffer(clBufferB, Map.READ, true);
+        assertEquals(sizeInBytes, mappedBufferA.capacity());
+        assertEquals(sizeInBytes, mappedBufferB.capacity());
+        out.println("validating computed results...");
+        checkIfEqual(mappedBufferA, mappedBufferB, elements); // A == B ?
+        out.println("results are valid");
+
+        queue.putUnmapMemory(clBufferA)             // unmap A
+             .putUnmapMemory(clBufferB)             // unmap B
+             .finish();                             // let both unmaps complete before the context is released
+        context.release();
+    }
+    
+}
diff --git a/test/com/jogamp/opencl/CLCommandQueueTest.java b/test/com/jogamp/opencl/CLCommandQueueTest.java
new file mode 100644
index 00000000..a5d7afb1
--- /dev/null
+++ b/test/com/jogamp/opencl/CLCommandQueueTest.java
@@ -0,0 +1,266 @@
+package com.jogamp.opencl;
+
+import com.jogamp.opencl.util.MultiQueueBarrier;
+import com.jogamp.opencl.CLCommandQueue.Mode;
+import com.jogamp.opencl.CLMemory.Mem;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.EnumSet;
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+import static java.lang.System.*;
+import static com.jogamp.opencl.TestUtils.*;
+import static com.jogamp.opencl.CLEvent.*;
+import static com.jogamp.common.nio.Buffers.*;
+
+/**
+ *
+ * @author Michael Bien
+ */
+public class CLCommandQueueTest {
+
+    private final int groupSize = 256;
+
+    @Test
+    public void enumsTest() {
+        // Every enum constant has to be found again through the raw value stored in its field.
+        // CLCommandQueue enums: a combined bitfield has to decode into both modes
+        EnumSet<Mode> decodedModes = Mode.valuesOf(CL.CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL.CL_QUEUE_PROFILING_ENABLE);
+        assertTrue(decodedModes.contains(Mode.OUT_OF_ORDER_MODE));
+        assertTrue(decodedModes.contains(Mode.PROFILING_MODE));
+
+        assertNotNull(Mode.valuesOf(0));
+        assertEquals(0, Mode.valuesOf(0).size());
+        for (Mode expected : Mode.values()) {
+            assertEquals(expected, Mode.valueOf(expected.QUEUE_MODE));
+        }
+
+        // CLEvent enums
+        for (ProfilingCommand expected : ProfilingCommand.values()) {
+            assertEquals(expected, ProfilingCommand.valueOf(expected.COMMAND));
+        }
+
+        for (CommandType expected : CommandType.values()) {
+            assertEquals(expected, CommandType.valueOf(expected.TYPE));
+        }
+
+        for (ExecutionStatus expected : ExecutionStatus.values()) {
+            assertEquals(expected, ExecutionStatus.valueOf(expected.STATUS));
+        }
+
+    }
+
+    @Test
+    public void eventsTest() throws IOException {
+        // Computes A+B twice (into C and into D), uses events for synchronization and compares C with D.
+        out.println(" - - - event synchronization test - - - ");
+
+        final int elements = roundUp(groupSize, ONE_MB / SIZEOF_INT * 5); // 5MB per buffer
+
+        CLContext context = CLContext.create();
+
+        // C and D are written by the kernel; OpenCL leaves kernel writes to READ_ONLY memory undefined
+        CLBuffer<ByteBuffer> clBufferA = context.createByteBuffer(elements * SIZEOF_INT, Mem.READ_ONLY);
+        CLBuffer<ByteBuffer> clBufferB = context.createByteBuffer(elements * SIZEOF_INT, Mem.READ_ONLY);
+        CLBuffer<ByteBuffer> clBufferC = context.createByteBuffer(elements * SIZEOF_INT, Mem.READ_WRITE);
+        CLBuffer<ByteBuffer> clBufferD = context.createByteBuffer(elements * SIZEOF_INT, Mem.READ_WRITE);
+
+        fillBuffer(clBufferA.buffer, 12345);
+        fillBuffer(clBufferB.buffer, 67890);
+
+        CLProgram program = context.createProgram(getClass().getResourceAsStream("testkernels.cl")).build();
+        CLKernel vectorAddKernel = program.createCLKernel("VectorAddGM").setArg(3, elements);
+        CLCommandQueue queue = context.getDevices()[0].createCommandQueue();
+
+        final CLEventList events = new CLEventList(2);
+
+        assertEquals(0, events.size());
+
+        queue.putWriteBuffer(clBufferA, false, events) // write A
+             .putWriteBuffer(clBufferB, false, events);// write B
+
+        assertEquals(2, events.size());
+        queue.putWaitForEvents(events, true);
+
+        events.release();
+        assertEquals(0, events.size());
+
+        // the emptied list is reused to collect the events of both kernel launches
+        vectorAddKernel.setArgs(clBufferA, clBufferB, clBufferC); // C = A+B
+        queue.put1DRangeKernel(vectorAddKernel, 0, elements, groupSize, events);
+
+        vectorAddKernel.setArgs(clBufferA, clBufferB, clBufferD); // D = A+B
+        queue.put1DRangeKernel(vectorAddKernel, 0, elements, groupSize, events);
+
+        assertEquals(2, events.size());
+        queue.putWaitForEvent(events, 0, false)
+             .putWaitForEvent(events, 1, true);
+
+        queue.putReadBuffer(clBufferC, false)
+             .putReadBuffer(clBufferD, true);
+
+        events.release();
+
+        checkIfEqual(clBufferC.buffer, clBufferD.buffer, elements);
+
+        context.release();
+
+        out.println("results are valid");
+
+    }
+    @Test
+    public void profilingEventsTest() throws IOException {
+        // Runs one kernel on a queue created with PROFILING_MODE and checks that END - START of its event is positive.
+        out.println(" - - - profiling events test - - - ");
+
+        final int elements = roundUp(groupSize, ONE_MB / SIZEOF_INT * 5); // 5MB per buffer
+
+        CLContext context = CLContext.create();
+
+        CLBuffer<ByteBuffer> clBufferA = context.createByteBuffer(elements * SIZEOF_INT, Mem.READ_ONLY);
+        CLBuffer<ByteBuffer> clBufferB = context.createByteBuffer(elements * SIZEOF_INT, Mem.READ_ONLY);
+        CLBuffer<ByteBuffer> clBufferC = context.createByteBuffer(elements * SIZEOF_INT, Mem.READ_WRITE); // kernel output, must be writable
+
+        fillBuffer(clBufferA.buffer, 12345);
+        fillBuffer(clBufferB.buffer, 67890);
+
+        CLProgram program = context.createProgram(getClass().getResourceAsStream("testkernels.cl")).build();
+        CLKernel vectorAddKernel = program.createCLKernel("VectorAddGM").setArg(3, elements);
+        CLCommandQueue queue = context.getDevices()[0].createCommandQueue(Mode.PROFILING_MODE);
+
+        queue.putWriteBuffer(clBufferA, true) // write A
+             .putWriteBuffer(clBufferB, true);// write B
+
+        final CLEventList events = new CLEventList(1);
+
+        assertEquals(0, events.size());
+
+        vectorAddKernel.setArgs(clBufferA, clBufferB, clBufferC); // C = A+B
+        queue.put1DRangeKernel(vectorAddKernel, 0, elements, groupSize, events);
+
+        assertEquals(1, events.size());
+        CLEvent probe = events.getEvent(0);
+        out.println(probe);
+
+        queue.putWaitForEvents(events, true);
+        assertEquals(CLEvent.ExecutionStatus.COMPLETE, probe.getStatus());
+
+        out.println(probe);
+        long time = probe.getProfilingInfo(CLEvent.ProfilingCommand.END)
+                  - probe.getProfilingInfo(CLEvent.ProfilingCommand.START);
+        out.println("time: "+time);
+        assertTrue("kernel run time should be positive but was "+time, time > 0);
+
+        events.release();
+        context.release();
+
+    }
+
+    @Test
+    public void concurrencyTest() throws IOException, InterruptedException {
+        // Two threads drive one queue each (on different devices) and hand their events to a MultiQueueBarrier.
+        out.println(" - - - QueueBarrier test - - - ");
+
+        final int elements = ONE_MB / SIZEOF_INT * 10; // 10MB per buffer
+
+        CLContext context = CLContext.create();
+
+        CLDevice[] devices = context.getDevices();
+
+        if (devices.length < 2) {
+            out.println("aborting test... need at least 2 devices");
+            context.release();
+            return;
+        }
+
+        // C and D are written by the kernels and must not be READ_ONLY:
+        // OpenCL leaves kernel writes to READ_ONLY memory undefined
+        final CLBuffer<ByteBuffer> clBufferC = context.createByteBuffer(elements * SIZEOF_INT, Mem.READ_WRITE);
+        final CLBuffer<ByteBuffer> clBufferD = context.createByteBuffer(elements * SIZEOF_INT, Mem.READ_WRITE);
+
+        final CLBuffer<ByteBuffer> clBufferA1 = context.createByteBuffer(elements * SIZEOF_INT, Mem.READ_ONLY);
+        final CLBuffer<ByteBuffer> clBufferB1 = context.createByteBuffer(elements * SIZEOF_INT, Mem.READ_ONLY);
+        final CLBuffer<ByteBuffer> clBufferA2 = context.createByteBuffer(elements * SIZEOF_INT, Mem.READ_ONLY);
+        final CLBuffer<ByteBuffer> clBufferB2 = context.createByteBuffer(elements * SIZEOF_INT, Mem.READ_ONLY);
+
+        CLProgram program = context.createProgram(getClass().getResourceAsStream("testkernels.cl")).build();
+
+        final CLKernel vectorAddKernel1 = program.createCLKernel("VectorAddGM").setArg(3, elements);
+        final CLKernel vectorAddKernel2 = program.createCLKernel("VectorAddGM").setArg(3, elements);
+
+        // the guard above guarantees a second device, so each queue gets a device of its own
+        final CLCommandQueue queue1 = devices[0].createCommandQueue();
+        final CLCommandQueue queue2 = devices[1].createCommandQueue();
+
+        // NOTE(review): presumably pre-fills C so that C and D differ until both kernels ran - confirm
+        fillBuffer(clBufferC.buffer, 12345);
+
+        out.println("using two devices");
+
+        final MultiQueueBarrier barrier = new MultiQueueBarrier(2);
+
+        Thread thread1 = new Thread("C") {
+
+            @Override
+            public void run() {
+
+                fillBuffer(clBufferA1.buffer, 12345);
+                fillBuffer(clBufferB1.buffer, 67890);
+
+                // non-blocking upload of both inputs
+                queue1.putWriteBuffer(clBufferA1, false)  // write A
+                      .putWriteBuffer(clBufferB1, false); // write B
+
+                // bind inputs and output to the kernel
+                vectorAddKernel1.setArgs(clBufferA1, clBufferB1, clBufferC); // C = A+B
+
+                // launch the kernel and read the result back; both commands report into events1
+                CLEventList events1 = new CLEventList(2);
+                queue1.put1DRangeKernel(vectorAddKernel1, 0, elements, groupSize, events1)
+                      .putReadBuffer(clBufferC, false, events1);
+
+                barrier.waitFor(queue1, events1);
+
+            }
+        };
+
+        Thread thread2 = new Thread("D") {
+
+            @Override
+            public void run() {
+
+                fillBuffer(clBufferA2.buffer, 12345);
+                fillBuffer(clBufferB2.buffer, 67890);
+
+                // non-blocking upload of both inputs
+                queue2.putWriteBuffer(clBufferA2, false)  // write A
+                      .putWriteBuffer(clBufferB2, false); // write B
+
+                // bind inputs and output to the kernel
+                vectorAddKernel2.setArgs(clBufferA2, clBufferB2, clBufferD); // D = A+B
+
+                // launch the kernel and read the result back; both commands report into events2
+                CLEventList events2 = new CLEventList(2);
+                queue2.put1DRangeKernel(vectorAddKernel2, 0, elements, groupSize, events2)
+                      .putReadBuffer(clBufferD, false, events2);
+
+                barrier.waitFor(queue2, events2);
+
+            }
+        };
+
+        out.println("starting threads");
+        thread1.start();
+        thread2.start();
+        barrier.await();
+        out.println("done");
+
+        checkIfEqual(clBufferC.buffer, clBufferD.buffer, elements);
+
+        context.release();
+
+        out.println("results are valid");
+
+    }
+}
diff --git a/test/com/jogamp/opencl/CLProgramTest.java b/test/com/jogamp/opencl/CLProgramTest.java
new file mode 100644
index 00000000..8b5d4362
--- /dev/null
+++ b/test/com/jogamp/opencl/CLProgramTest.java
@@ -0,0 +1,224 @@
+package com.jogamp.opencl;
+
+import com.jogamp.opencl.util.CLBuildConfiguration;
+import com.jogamp.opencl.util.CLProgramConfiguration;
+import com.jogamp.opencl.CLProgram.Status;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.util.Map;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static org.junit.Assert.*;
+import static java.lang.System.*;
+import static com.jogamp.opencl.CLProgram.CompilerOptions.*;
+
+/**
+ *
+ * @author Michael Bien
+ */
+public class CLProgramTest {
+
+    @Rule
+    public TemporaryFolder tmpFolder = new TemporaryFolder();
+
+
+    @Test
+    public void enumsTest() {
+        // CLProgram enums: every build status must be found again
+        // through the raw value stored in its STATUS field
+        for (Status expected : Status.values()) {
+            assertEquals(expected, Status.valueOf(expected.STATUS));
+        }
+    }
+
+    @Test
+    public void rebuildProgramTest() throws IOException {
+        // Builds the same program twice and checks that kernels can be created after each build.
+        out.println(" - - - CLProgramTest; rebuild program test - - - ");
+
+        CLContext context = CLContext.create();
+        CLProgram program = context.createProgram(getClass().getResourceAsStream("testkernels.cl"));
+
+        // kernel creation is expected to fail as long as the program has not been built
+        try{
+            program.createCLKernels();
+            fail("expected exception but got none :(");
+        }catch(CLException ex) {
+            out.println("got expected exception:  "+ex.getCLErrorString());
+            assertEquals(CL.CL_INVALID_PROGRAM_EXECUTABLE, ex.errorcode);
+        }
+
+        out.println(program.getBuildStatus());
+        program.build();
+        out.println(program.getBuildStatus());
+
+        assertTrue(program.isExecutable());
+
+        Map<String, CLKernel> kernels = program.createCLKernels();
+        assertNotNull(kernels);
+        assertTrue("kernel map is empty", kernels.size() > 0);
+
+        // rebuild
+        // 1. release kernels (internally)
+        // 2. build program
+        program.build();
+        assertTrue(program.isExecutable());
+        out.println(program.getBuildStatus());
+
+        // try again with rebuilt program
+        kernels = program.createCLKernels();
+        assertNotNull(kernels);
+        assertTrue("kernel map is empty", kernels.size() > 0);
+
+        context.release();
+    }
+
+    @Test
+    public void programBinariesTest() throws IOException {
+        // Fetches the binaries of a built program, inspects the released program and rebuilds from the binaries.
+        out.println(" - - - CLProgramTest; down-/upload binaries test - - - ");
+
+        CLContext context = CLContext.create();
+        CLProgram program = context.createProgram(getClass().getResourceAsStream("testkernels.cl"))
+                                   .build(ENABLE_MAD, WARNINGS_ARE_ERRORS);
+
+        // obtain binaries
+        Map<CLDevice, byte[]> binaries = program.getBinaries();
+        assertFalse(binaries.isEmpty());
+
+        CLDevice[] devices = program.getCLDevices();
+        for (CLDevice device : devices) {
+            assertTrue(binaries.containsKey(device));
+        }
+
+        // 1. release program
+        // 2. re-create program with old binaries
+        program.release();
+
+        assertFalse(program.isExecutable());
+
+        assertNotNull(program.getBinaries());
+        assertEquals(0, program.getBinaries().size());
+
+        assertNotNull(program.getBuildLog());
+        assertEquals(0, program.getBuildLog().length());
+
+        assertNotNull(program.getSource());
+        assertEquals(0, program.getSource().length());
+
+        assertNotNull(program.getCLDevices());
+        assertEquals(0, program.getCLDevices().length);
+
+        {
+            Map<String, CLKernel> kernels = program.createCLKernels();
+            assertNotNull(kernels);
+            assertEquals(0, kernels.size());
+        }
+        assertNull(program.createCLKernel("foo"));
+
+        program = context.createProgram(binaries);
+
+        assertFalse(program.isExecutable());
+
+        assertNotNull(program.getCLDevices());
+        assertTrue(program.getCLDevices().length != 0);
+
+        assertNotNull(program.getBinaries());
+        assertEquals(0, program.getBinaries().size());
+
+        assertNotNull(program.getBuildLog());
+        assertTrue(program.getBuildLog().length() != 0);
+
+        assertNotNull(program.getSource());
+        assertEquals(0, program.getSource().length());
+
+        try{
+            Map<String, CLKernel> kernels = program.createCLKernels();
+            fail("expected an exception from createCLKernels but got: "+kernels);
+        }catch(CLException ex) {
+            // expected, not built yet
+        }
+
+        out.println(program.getBuildStatus());
+        program.build();
+        out.println(program.getBuildStatus());
+
+        assertNotNull(program.createCLKernel("Test"));
+
+        assertTrue(program.isExecutable());
+        context.release();   // the other tests of this class release their context as well
+    }
+
+    @Test
+    public void builderTest() throws IOException, ClassNotFoundException {
+        out.println(" - - - CLProgramTest; program builder test - - - ");
+
+        CLContext context = CLContext.create();
+        CLProgram program = context.createProgram(getClass().getResourceAsStream("testkernels.cl"));
+
+        // same as program.build()
+        program.prepare().build();
+        assertTrue(program.isExecutable());
+
+        // complex build
+        program.prepare().withOption(ENABLE_MAD)
+                         .forDevice(context.getMaxFlopsDevice())
+                         .withDefine("RADIUS", 5)
+                         .withDefine("ENABLE_FOOBAR")
+                         .build();
+        assertTrue(program.isExecutable());
+
+        // reusable builder
+        CLBuildConfiguration builder = CLProgramBuilder.createConfiguration()
+                                     .withOption(ENABLE_MAD)
+                                     .forDevices(context.getDevices())
+                                     .withDefine("RADIUS", 5)
+                                     .withDefine("ENABLE_FOOBAR");
+        out.println(builder);
+        builder.setProgram(program).build();
+        assertTrue(program.isExecutable());
+
+        // serialization test; streams are closed in finally blocks so that a failure cannot leak them
+        File file = tmpFolder.newFile("foobar.builder");
+        ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(file));
+        try{
+            builder.save(oos);
+        }finally{
+            oos.close();
+        }
+
+        // build configuration
+        CLBuildConfiguration buildConfig;
+        ObjectInputStream ois = new ObjectInputStream(new FileInputStream(file));
+        try{
+            buildConfig = CLProgramBuilder.loadConfiguration(ois);
+        }finally{
+            ois.close();
+        }
+        assertEquals(builder, buildConfig);
+        buildConfig.build(program);
+        assertTrue(program.isExecutable());
+
+        // program configuration
+        ois = new ObjectInputStream(new FileInputStream(file));
+        try{
+            CLProgramConfiguration programConfig = CLProgramBuilder.loadConfiguration(ois, context);
+            assertNotNull(programConfig.getProgram());
+            program = programConfig.build();
+        }finally{
+            ois.close();
+        }
+        assertTrue(program.isExecutable());
+        assertEquals(builder, builder.clone());   // cloning
+        context.release();
+    }
+
+
+
+}
diff --git a/test/com/jogamp/opencl/HighLevelBindingTest.java b/test/com/jogamp/opencl/HighLevelBindingTest.java
new file mode 100644
index 00000000..33cce0b5
--- /dev/null
+++ b/test/com/jogamp/opencl/HighLevelBindingTest.java
@@ -0,0 +1,305 @@
+package com.jogamp.opencl;
+
+import com.jogamp.opencl.CLMemory.Mem;
+import com.jogamp.opencl.CLMemory.GLObjectType;
+import com.jogamp.opencl.CLSampler.AddressingMode;
+import com.jogamp.opencl.CLSampler.FilteringMode;
+import com.jogamp.opencl.CLImageFormat.ChannelOrder;
+import com.jogamp.opencl.CLImageFormat.ChannelType;
+import com.jogamp.opencl.CLDevice.FPConfig;
+import com.jogamp.opencl.CLDevice.GlobalMemCacheType;
+import com.jogamp.opencl.CLDevice.LocalMemType;
+import com.jogamp.opencl.CLDevice.Type;
+import com.jogamp.opencl.CLDevice.Capabilities;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.EnumSet;
+import java.util.Map;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+import static java.lang.System.*;
+import static com.jogamp.opencl.TestUtils.*;
+import static com.jogamp.common.nio.Buffers.*;
+
+/**
+ * Test testing the high level bindings.
+ * @author Michael Bien
+ */
+public class HighLevelBindingTest {
+
+    @BeforeClass
+    public static void setUpClass() throws Exception {
+        final String[][] infos = {{"OS", "os.name"}, {"ARCH", "os.arch"}, {"VM", "java.vm.name"}, {"lib path", "java.library.path"}};
+        for (String[] info : infos) {   // printed label first, system property key second
+            out.println(info[0] + ": " + System.getProperty(info[1]));
+        }
+    }
+
+    @Test
+    public void enumsTest() {
+        // Checks that the raw value stored in each enum constant is translated back into that constant.
+        // bitfield decoding: two combined flags must both show up in the decoded set
+        final EnumSet<FPConfig> decodedFlags = FPConfig.valuesOf(CL.CL_FP_DENORM | CL.CL_FP_ROUND_TO_INF);
+        assertEquals(0, FPConfig.valuesOf(0).size());
+        assertTrue(decodedFlags.contains(FPConfig.DENORM));
+        assertTrue(decodedFlags.contains(FPConfig.ROUND_TO_INF));
+
+        // CLDevice enums
+        for (FPConfig flag : FPConfig.values()) {
+            // a single flag decodes into a set which contains at least that flag
+            assertTrue(FPConfig.valuesOf(flag.CONFIG).contains(flag));
+        }
+        for (GlobalMemCacheType expected : GlobalMemCacheType.values()) {
+            assertEquals(expected, GlobalMemCacheType.valueOf(expected.TYPE));
+        }
+        for (LocalMemType expected : LocalMemType.values()) {
+            assertEquals(expected, LocalMemType.valueOf(expected.TYPE));
+        }
+        for (Type expected : Type.values()) {
+            assertEquals(expected, Type.valueOf(expected.TYPE));
+        }
+        for (Capabilities expected : Capabilities.values()) {
+            assertEquals(expected, Capabilities.valueOf(expected.CAPS));
+        }
+
+        // CLMemory enums
+        for (Mem expected : Mem.values()) {
+            assertEquals(expected, Mem.valueOf(expected.CONFIG));
+        }
+
+        for (GLObjectType expected : GLObjectType.values()) {
+            assertEquals(expected, GLObjectType.valueOf(expected.TYPE));
+        }
+
+        // CLSampler enums
+        for (AddressingMode expected : AddressingMode.values()) {
+            assertEquals(expected, AddressingMode.valueOf(expected.MODE));
+        }
+        for (FilteringMode expected : FilteringMode.values()) {
+            assertEquals(expected, FilteringMode.valueOf(expected.MODE));
+        }
+
+        // CLImage enums
+        for (ChannelOrder expected : ChannelOrder.values()) {
+            assertEquals(expected, ChannelOrder.valueOf(expected.ORDER));
+        }
+        for (ChannelType expected : ChannelType.values()) {
+            assertEquals(expected, ChannelType.valueOf(expected.TYPE));
+        }
+
+    }
+
+
+
+    @Test
+    public void contextlessTest() {
+        // Prints platform and device properties; nothing is asserted and no CLContext is created here.
+        out.println(" - - - highLevelTest; contextless - - - ");
+
+        // platform/device info tests
+        CLPlatform[] clPlatforms = CLPlatform.listCLPlatforms();
+
+        for (CLPlatform platform : clPlatforms) {
+
+            out.println("platform info:");
+            out.println("    name: "+platform.getName());
+            out.println("    id: "+platform.ID);
+            out.println("    profile: "+platform.getProfile());
+            out.println("    version: "+platform.getVersion());
+            out.println("    vendor: "+platform.getVendor());
+            out.println("    max FLOPS device: "+platform.getMaxFlopsDevice());
+            out.println("    extensions: "+platform.getExtensions());
+
+            CLDevice[] clDevices = platform.listCLDevices();
+            for (CLDevice device : clDevices) {
+                out.println("device info:");
+                out.println("    name: "+device.getName());
+                out.println("    profile: "+device.getProfile());
+                out.println("    vendor: "+device.getVendor());
+                out.println("    vendor id: "+device.getVendorID());
+                out.println("    version: "+device.getVersion());
+                out.println("    driver version: "+device.getDriverVersion());
+                out.println("    type: "+device.getType());
+                out.println("    global mem: "+device.getGlobalMemSize()/(1024*1024)+" MB");
+                out.println("    max alloc mem: "+device.getMaxMemAllocSize()/(1024*1024)+" MB");
+                out.println("    max param size: "+device.getMaxParameterSize()+" byte");
+                out.println("    local mem: "+device.getLocalMemSize()/1024+" KB");
+                out.println("    local mem type: "+device.getLocalMemType());
+                out.println("    global mem cache size: "+device.getGlobalMemCacheSize());
+                out.println("    global mem cacheline size: "+device.getGlobalMemCachelineSize());
+                out.println("    global mem cache type: "+device.getGlobalMemCacheType());
+                out.println("    constant buffer size: "+device.getMaxConstantBufferSize());
+                out.println("    error correction support: "+device.isErrorCorrectionSupported());
+                out.println("    queue properties: "+device.getQueueProperties());
+                out.println("    clock: "+device.getMaxClockFrequency()+" MHz");
+                out.println("    timer res: "+device.getProfilingTimerResolution()+" ns");
+                out.println("    max work group size: "+device.getMaxWorkGroupSize());
+                out.println("    max compute units: "+device.getMaxComputeUnits());
+                out.println("    max work item dimensions: "+device.getMaxWorkItemDimensions());
+                out.println("    max work item sizes: "+Arrays.toString(device.getMaxWorkItemSizes()));
+                out.println("    compiler available: "+device.isCompilerAvailable());
+                out.println("    image support: "+device.isImageSupportAvailable());
+                out.println("    max read image args: "+device.getMaxReadImageArgs());
+                out.println("    max write image args: "+device.getMaxWriteImageArgs());
+                out.println("    max image2d dimensions: "+Arrays.asList(device.getMaxImage2dWidth(), device.getMaxImage2dHeight()));
+                out.println("    max image3d dimensions: "+Arrays.asList(device.getMaxImage3dWidth(), device.getMaxImage3dHeight(), device.getMaxImage3dDepth()));
+                out.println("    number of address bits: "+device.getAddressBits());
+                out.println("    half FP available: "+device.isHalfFPAvailable());
+                out.println("    double FP available: "+device.isDoubleFPAvailable());
+                out.println("    little endian: "+device.isLittleEndian());
+                out.println("    half FP config: "+device.getHalfFPConfig());
+                out.println("    single FP config: "+device.getSingleFPConfig());
+                out.println("    double FP config: "+device.getDoubleFPConfig());
+                out.println("    execution capabilities: "+device.getExecutionCapabilities());
+                out.println("    gl memory sharing: "+device.isGLMemorySharingSupported());
+                out.println("    extensions: "+device.getExtensions());
+            }
+        }
+
+    }
+
+    @Test
+    public void createContextTest() {
+        // Creates contexts through several CLContext.create(..) variants and checks their device count.
+        out.println(" - - - highLevelTest; create context - - - ");
+
+        CLPlatform platform = CLPlatform.getDefault();
+        int allDevices = platform.listCLDevices().length;
+        CLDevice firstDevice = platform.listCLDevices()[0];
+        // no arguments: expected to contain all devices of the default platform
+        CLContext context = CLContext.create();
+        assertNotNull(context);
+        assertEquals(allDevices, context.getDevices().length);
+        context.release();
+        // explicit platform
+        context = CLContext.create(platform);
+        assertNotNull(context);
+        assertEquals(allDevices, context.getDevices().length);
+        context.release();
+        // a single device
+        context = CLContext.create(firstDevice);
+        assertNotNull(context);
+        assertEquals(1, context.getDevices().length);
+        context.release();
+        // device type filter which accepts everything
+        context = CLContext.create(CLDevice.Type.ALL);
+        assertNotNull(context);
+        assertEquals(allDevices, context.getDevices().length);
+        context.release();
+        // platform plus a single device
+        context = CLContext.create(platform, firstDevice);
+        assertNotNull(context);
+        assertEquals(1, context.getDevices().length);
+        context.release();
+        // platform plus device type filter
+        context = CLContext.create(platform, CLDevice.Type.ALL);
+        assertNotNull(context);
+        assertEquals(allDevices, context.getDevices().length);
+        context.release();
+
+    }
+
+    /** Builds testkernels.cl, runs the VectorAddGM kernel once and checks the context's resource bookkeeping. */
+    @Test
+    public void vectorAddGMTest() throws IOException {
+
+        out.println(" - - - highLevelTest; global memory kernel - - - ");
+
+        CLPlatform[] clPlatforms = CLPlatform.listCLPlatforms();
+        CLContext context = CLContext.create(clPlatforms[0]);
+
+        CLDevice[] contextDevices = context.getDevices();
+
+        out.println("context devices:");
+        for (CLDevice device : contextDevices) {
+            out.println("   "+device.toString());
+        }
+
+        out.println("max FLOPS device: " + context.getMaxFlopsDevice());
+
+        CLProgram program = context.createProgram(getClass().getResourceAsStream("testkernels.cl")).build();
+
+        CLDevice[] programDevices = program.getCLDevices();
+
+        assertEquals(contextDevices.length, programDevices.length);
+
+        out.println("build log:\n"+program.getBuildLog());
+        out.println("build status:\n"+program.getBuildStatus());
+
+        String source = program.getSource();
+        assertFalse(source.trim().isEmpty());
+//        out.println("source:\n"+source);
+
+        Map<CLDevice, byte[]> binaries = program.getBinaries();
+        assertFalse(binaries.isEmpty());
+
+        int elementCount = 11444777;  // length of the int arrays to process (odd # for illustration)
+        int localWorkSize = 256;      // local work size passed to put1DRangeKernel
+        int globalWorkSize = roundUp(localWorkSize, elementCount);  // rounded up to the nearest multiple of the LocalWorkSize
+
+        out.println("allocateing buffers of size: "+globalWorkSize);
+
+        ByteBuffer srcA = newDirectByteBuffer(globalWorkSize*SIZEOF_INT);
+        ByteBuffer srcB = newDirectByteBuffer(globalWorkSize*SIZEOF_INT);
+        ByteBuffer dest = newDirectByteBuffer(globalWorkSize*SIZEOF_INT);
+
+        fillBuffer(srcA, 23456);
+        fillBuffer(srcB, 46987);
+
+        CLBuffer<ByteBuffer> clBufferA = context.createBuffer(srcA, Mem.READ_ONLY);
+        CLBuffer<ByteBuffer> clBufferB = context.createBuffer(srcB, Mem.READ_ONLY);
+        CLBuffer<ByteBuffer> clBufferC = context.createBuffer(dest, Mem.WRITE_ONLY);
+
+        Map<String, CLKernel> kernels = program.createCLKernels();
+        for (CLKernel kernel : kernels.values()) {
+            out.println("kernel: "+kernel.toString());
+        }
+
+        assertNotNull(kernels.get("VectorAddGM"));
+        assertNotNull(kernels.get("Test"));
+
+        CLKernel vectorAddKernel = kernels.get("VectorAddGM");
+
+        vectorAddKernel.setArg(0, clBufferA)
+                       .setArg(1, clBufferB)
+                       .setArg(2, clBufferC)
+                       .setArg(3, elementCount);
+
+        CLCommandQueue queue = programDevices[0].createCommandQueue();
+
+        // Asynchronous write of data to GPU device, blocking read later
+        queue.putWriteBuffer(clBufferA, false)
+             .putWriteBuffer(clBufferB, false)
+             .put1DRangeKernel(vectorAddKernel, 0, globalWorkSize, localWorkSize)
+             .putReadBuffer(clBufferC, true)
+             .finish().release();
+
+        out.println("a+b=c result snapshot: ");
+        for(int i = 0; i < 10; i++)
+            out.print(dest.getInt()+", ");
+        out.println("...; "+dest.remaining()/SIZEOF_INT + " more");
+
+        // each release is expected to drop the buffer from the context; assertEquals reports expected vs. actual
+        assertEquals(3, context.getMemoryObjects().size());
+        clBufferA.release();
+        assertEquals(2, context.getMemoryObjects().size());
+
+        clBufferB.release();
+        assertEquals(1, context.getMemoryObjects().size());
+
+        clBufferC.release();
+        assertEquals(0, context.getMemoryObjects().size());
+
+        // the same bookkeeping is expected for programs
+        assertEquals(1, context.getPrograms().size());
+        program.release();
+        assertEquals(0, context.getPrograms().size());
+
+        context.release();
+    }
+
+    
+}
diff --git a/test/com/jogamp/opencl/LowLevelBindingTest.java b/test/com/jogamp/opencl/LowLevelBindingTest.java
new file mode 100644
index 00000000..2162bca0
--- /dev/null
+++ b/test/com/jogamp/opencl/LowLevelBindingTest.java
@@ -0,0 +1,364 @@
+package com.jogamp.opencl;
+
+import com.jogamp.opencl.impl.CLImpl;
+
+import java.nio.ByteBuffer;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import static java.lang.System.*;
+import static com.jogamp.common.nio.Buffers.*;
+
+/**
+ * Test testing the low level bindings.
+ * @author Michael Bien
+ */
+public class LowLevelBindingTest {
+
+    private final static String programSource = // OpenCL C source (kernels VectorAdd and Test); only referenced by the commented-out lowLevelVectorAddTest
+              " // OpenCL Kernel Function for element by element vector addition                                  \n"
+            + "kernel void VectorAdd(global const int* a, global const int* b, global int* c, int iNumElements) { \n"
+            + "    // get index into global data array                                                            \n"
+            + "    int iGID = get_global_id(0);                                                                   \n"
+            + "    // bound check (equivalent to the limit on a 'for' loop for standard/serial C code             \n"
+            + "    if (iGID >= iNumElements)  {                                                                   \n"
+            + "        return;                                                                                    \n"
+            + "    }                                                                                              \n"
+            + "    // add the vector elements                                                                     \n"
+            + "    c[iGID] = a[iGID] + b[iGID];                                                                   \n"
+            + "}                                                                                                  \n"
+            + "kernel void Test(global const int* a, global const int* b, global int* c, int iNumElements) {      \n"
+            + "    // get index into global data array                                                            \n"
+            + "    int iGID = get_global_id(0);                                                                   \n"
+            + "    // bound check (equivalent to the limit on a 'for' loop for standard/serial C code             \n"
+            + "    if (iGID >= iNumElements)  {                                                                   \n"
+            + "        return;                                                                                    \n"
+            + "    }                                                                                              \n"
+            + "    c[iGID] = iGID;                                                                                \n"
+            + "}                                                                                                  \n";
+
+
+    @BeforeClass
+    public static void setUpClass() throws Exception {
+        out.println("OS: " + getProperty("os.name"));      // host operating system
+        out.println("VM: " + getProperty("java.vm.name")); // name of the JVM running the tests
+    }
+
+    @Test
+    public void contextlessTest() {
+        out.println("low level tests temporary disabled");
+        out.println(" - - - lowLevelTest; contextless binding - - - ");
+
+        // the lookups below go through the CLImpl implementation class, hence the cast
+        CL cl = CLPlatform.getLowLevelCLInterface();
+        final CLImpl impl = (CLImpl) cl;
+        out.println(impl.clGetExtensionFunctionAddress("clCreateFromGLBuffer").getLong());
+        out.println(impl.clGetExtensionFunctionAddress("clEnqueueAcquireGLObjects").getLong());
+/*
+        int ret = CL.CL_SUCCESS;
+
+        int[] intBuffer = new int[1];
+        // find all available OpenCL platforms
+        ret = cl.clGetPlatformIDs(0, null, 0, intBuffer, 0);
+        checkForError(ret);
+        out.println("#platforms: "+intBuffer[0]);
+
+        long[] platformId = new long[intBuffer[0]];
+        ret = cl.clGetPlatformIDs(platformId.length, platformId, 0, null, 0);
+        checkForError(ret);
+
+        // print platform info
+        long[] longBuffer = new long[1];
+        ByteBuffer bb = ByteBuffer.allocate(128);
+        bb.order(ByteOrder.nativeOrder());
+
+        for (int i = 0; i < platformId.length; i++)  {
+
+            long platform = platformId[i];
+            out.println("platform id: "+platform);
+
+            ret = cl.clGetPlatformInfo(platform, CL.CL_PLATFORM_PROFILE, bb.capacity(), bb, longBuffer, 0);
+            checkForError(ret);
+            out.println("    profile: " + clString2JavaString(bb.array(), (int)longBuffer[0]));
+
+            ret = cl.clGetPlatformInfo(platform, CL.CL_PLATFORM_VERSION, bb.capacity(), bb, longBuffer, 0);
+            checkForError(ret);
+            out.println("    version: " + clString2JavaString(bb.array(), (int)longBuffer[0]));
+
+            ret = cl.clGetPlatformInfo(platform, CL.CL_PLATFORM_NAME, bb.capacity(), bb, longBuffer, 0);
+            checkForError(ret);
+            out.println("    name: " + clString2JavaString(bb.array(), (int)longBuffer[0]));
+
+            ret = cl.clGetPlatformInfo(platform, CL.CL_PLATFORM_VENDOR, bb.capacity(), bb, longBuffer, 0);
+            checkForError(ret);
+            out.println("    vendor: " + clString2JavaString(bb.array(), (int)longBuffer[0]));
+
+            //find all devices
+            ret = cl.clGetDeviceIDs(platform, CL.CL_DEVICE_TYPE_ALL, 0, null, 0, intBuffer, 0);
+            checkForError(ret);
+            out.println("#devices: "+intBuffer[0]);
+
+            long[] devices = new long[intBuffer[0]];
+            ret = cl.clGetDeviceIDs(platform, CL.CL_DEVICE_TYPE_ALL, devices.length, devices, 0, null, 0);
+
+            //print device info
+            for (int j = 0; j < devices.length; j++) {
+                long device = devices[j];
+                ret = cl.clGetDeviceInfo(device, CL.CL_DEVICE_NAME, bb.capacity(), bb, longBuffer, 0);
+                checkForError(ret);
+                out.println("    device: " + clString2JavaString(bb.array(), (int)longBuffer[0]));
+
+                ret = cl.clGetDeviceInfo(device, CL.CL_DEVICE_TYPE, bb.capacity(), bb, longBuffer, 0);
+                checkForError(ret);
+                out.println("    type: " + CLDevice.Type.valueOf(bb.get()));
+                bb.rewind();
+
+            }
+
+        }
+*/
+    }
+/*
+    @Test
+    public void createContextTest() {
+
+        out.println(" - - - createContextTest - - - ");
+
+        CL cl = CLPlatform.getLowLevelBinding();
+
+        int[] intArray = new int[1];
+        // find all available OpenCL platforms
+        int ret = cl.clGetPlatformIDs(0, null, 0, intArray, 0);
+        checkForError(ret);
+        out.println("#platforms: "+intArray[0]);
+
+        long[] longArray = new long[intArray[0]];
+        ret = cl.clGetPlatformIDs(longArray.length, longArray, 0, null, 0);
+        checkForError(ret);
+
+        long platform = longArray[0];
+
+        //find all devices
+        ret = cl.clGetDeviceIDs(platform, CL.CL_DEVICE_TYPE_ALL, 0, null, 0, intArray, 0);
+        checkForError(ret);
+        out.println("#devices: "+intArray[0]);
+
+        long[] devices = new long[intArray[0]];
+        ret = cl.clGetDeviceIDs(platform, CL.CL_DEVICE_TYPE_ALL, devices.length, devices, 0, null, 0);
+
+        IntBuffer intBuffer = IntBuffer.allocate(1);
+        long context = cl.clCreateContext(null, devices, null, null, intBuffer);
+        checkError("on clCreateContext", intBuffer.get());
+
+        //get number of devices
+        ret = cl.clGetContextInfo(context, CL.CL_CONTEXT_DEVICES, 0, null, longArray, 0);
+        checkError("on clGetContextInfo", ret);
+
+        int sizeofLong = (CPU.is32Bit()?4:8);
+        out.println("context created with " + longArray[0]/sizeofLong + " devices");
+
+        //check if equal
+        assertEquals("context was not created on all devices specified", devices.length, longArray[0]/sizeofLong);
+
+        ret = cl.clReleaseContext(context);
+        checkError("on clReleaseContext", ret);
+    }
+
+
+    @Test
+    public void lowLevelVectorAddTest() {
+
+        out.println(" - - - lowLevelTest2; VectorAdd kernel - - - ");
+
+//        CreateContextCallback cb = new CreateContextCallback() {
+//            @Override
+//            public void createContextCallback(String errinfo, ByteBuffer private_info, long cb, Object user_data) {
+//                throw new RuntimeException("not yet implemented...");
+//            }
+//        };
+
+        long[] longArray = new long[1];
+        ByteBuffer bb = ByteBuffer.allocate(4096).order(ByteOrder.nativeOrder());
+
+        CL cl = CLPlatform.getLowLevelBinding();
+
+        int ret = CL.CL_SUCCESS;
+        int[] intArray = new int[1];
+
+        //TODO properties not allowed to be null
+        long context = cl.clCreateContextFromType(null, CL.CL_DEVICE_TYPE_ALL, null, null, null);
+        out.println("context handle: "+context);
+
+        ret = cl.clGetContextInfo(context, CL.CL_CONTEXT_DEVICES, 0, null, longArray, 0);
+        checkError("on clGetContextInfo", ret);
+
+        int sizeofLong = (CPU.is32Bit()?4:8);
+        out.println("context created with " + longArray[0]/sizeofLong + " devices");
+
+        ret = cl.clGetContextInfo(context, CL.CL_CONTEXT_DEVICES, bb.capacity(), bb, null, 0);
+        checkError("on clGetContextInfo", ret);
+
+        for (int i = 0; i < longArray[0]/sizeofLong; i++) {
+            out.println("device id: "+bb.getLong());
+        }
+
+        long firstDeviceID = bb.getLong(0);
+
+        // Create a command-queue
+        long commandQueue = cl.clCreateCommandQueue(context, firstDeviceID, 0, intArray, 0);
+        checkError("on clCreateCommandQueue", intArray[0]);
+
+        int elementCount = 11444777;	// Length of float arrays to process (odd # for illustration)
+        int localWorkSize = 256;      // set and log Global and Local work size dimensions
+        int globalWorkSize = roundUp(localWorkSize, elementCount);  // rounded up to the nearest multiple of the LocalWorkSize
+
+        out.println("allocateing buffers of size: "+globalWorkSize);
+
+        ByteBuffer srcA = newDirectByteBuffer(globalWorkSize*SIZEOF_INT);
+        ByteBuffer srcB = newDirectByteBuffer(globalWorkSize*SIZEOF_INT);
+        ByteBuffer dest = newDirectByteBuffer(globalWorkSize*SIZEOF_INT);
+
+        // Allocate the OpenCL buffer memory objects for source and result on the device GMEM
+        long devSrcA = cl.clCreateBuffer(context, CL.CL_MEM_READ_ONLY, srcA.capacity(), null, intArray, 0);
+        checkError("on clCreateBuffer", intArray[0]);
+        long devSrcB = cl.clCreateBuffer(context, CL.CL_MEM_READ_ONLY, srcB.capacity(), null, intArray, 0);
+        checkError("on clCreateBuffer", intArray[0]);
+        long devDst  = cl.clCreateBuffer(context, CL.CL_MEM_WRITE_ONLY, dest.capacity(), null, intArray, 0);
+        checkError("on clCreateBuffer", intArray[0]);
+
+
+        // Create the program
+        long program = cl.clCreateProgramWithSource(context, 1, new String[] {programSource}, new long[]{programSource.length()}, 0, intArray, 0);
+        checkError("on clCreateProgramWithSource", intArray[0]);
+
+        // Build the program
+        ret = cl.clBuildProgram(program, null, null, null, null);
+        checkError("on clBuildProgram", ret);
+
+        // Read program infos
+        bb.rewind();
+        ret = cl.clGetProgramInfo(program, CL.CL_PROGRAM_NUM_DEVICES, bb.capacity(), bb, null, 0);
+        checkError("on clGetProgramInfo1", ret);
+        out.println("program associated with "+bb.getInt(0)+" device(s)");
+
+        ret = cl.clGetProgramInfo(program, CL.CL_PROGRAM_SOURCE, 0, null, longArray, 0);
+        checkError("on clGetProgramInfo CL_PROGRAM_SOURCE", ret);
+        out.println("program source length (cl): "+longArray[0]);
+        out.println("program source length (java): "+programSource.length());
+
+        bb.rewind();
+        ret = cl.clGetProgramInfo(program, CL.CL_PROGRAM_SOURCE, bb.capacity(), bb, null, 0);
+        checkError("on clGetProgramInfo CL_PROGRAM_SOURCE", ret);
+        out.println("program source:\n" + clString2JavaString(bb.array(), (int)longArray[0]));
+
+        // Check program status
+        Arrays.fill(longArray, 42);
+        bb.rewind();
+        ret = cl.clGetProgramBuildInfo(program, firstDeviceID, CL.CL_PROGRAM_BUILD_STATUS, bb.capacity(), bb, null, 0);
+        checkError("on clGetProgramBuildInfo1", ret);
+
+        out.println("program build status: " + CLProgram.Status.valueOf(bb.getInt(0)));
+        assertEquals("build status", CL.CL_BUILD_SUCCESS, bb.getInt(0));
+
+        // Read build log
+        ret = cl.clGetProgramBuildInfo(program, firstDeviceID, CL.CL_PROGRAM_BUILD_LOG, 0, null, longArray, 0);
+        checkError("on clGetProgramBuildInfo2", ret);
+        out.println("program log length: " + longArray[0]);
+
+        bb.rewind();
+        ret = cl.clGetProgramBuildInfo(program, firstDeviceID, CL.CL_PROGRAM_BUILD_LOG, bb.capacity(), bb, null, 0);
+        checkError("on clGetProgramBuildInfo3", ret);
+        out.println("log:\n" + clString2JavaString(bb.array(), (int)longArray[0]));
+
+        // Create the kernel
+        Arrays.fill(intArray, 42);
+        long kernel = cl.clCreateKernel(program, "VectorAdd", intArray, 0);
+        checkError("on clCreateKernel", intArray[0]);
+
+//        srcA.limit(elementCount*SIZEOF_FLOAT);
+//        srcB.limit(elementCount*SIZEOF_FLOAT);
+
+        fillBuffer(srcA, 23456);
+        fillBuffer(srcB, 46987);
+
+        // Set the Argument values
+        ret = cl.clSetKernelArg(kernel, 0, CPU.is32Bit()?SIZEOF_INT:SIZEOF_LONG, wrap(devSrcA));  checkError("on clSetKernelArg0", ret);
+        ret = cl.clSetKernelArg(kernel, 1, CPU.is32Bit()?SIZEOF_INT:SIZEOF_LONG, wrap(devSrcB));  checkError("on clSetKernelArg1", ret);
+        ret = cl.clSetKernelArg(kernel, 2, CPU.is32Bit()?SIZEOF_INT:SIZEOF_LONG, wrap(devDst));   checkError("on clSetKernelArg2", ret);
+        ret = cl.clSetKernelArg(kernel, 3, SIZEOF_INT,  wrap(elementCount));  checkError("on clSetKernelArg3", ret);
+
+        out.println("used device memory: "+ (srcA.capacity()+srcB.capacity()+dest.capacity())/1000000 +"MB");
+
+        // Asynchronous write of data to GPU device
+        ret = cl.clEnqueueWriteBuffer(commandQueue, devSrcA, CL.CL_FALSE, 0, srcA.capacity(), srcA, 0, null, null);
+        checkError("on clEnqueueWriteBuffer", ret);
+        ret = cl.clEnqueueWriteBuffer(commandQueue, devSrcB, CL.CL_FALSE, 0, srcB.capacity(), srcB, 0, null, null);
+        checkError("on clEnqueueWriteBuffer", ret);
+
+        // Launch kernel
+        PointerBuffer gWS = PointerBuffer.allocateDirect(1).put(globalWorkSize).rewind();
+        PointerBuffer lWS = PointerBuffer.allocateDirect(1).put(localWorkSize).rewind();
+        ret = cl.clEnqueueNDRangeKernel(commandQueue, kernel, 1, null, gWS, lWS, 0, null, null);
+        checkError("on clEnqueueNDRangeKernel", ret);
+
+        // Synchronous/blocking read of results
+        ret = cl.clEnqueueReadBuffer(commandQueue, devDst, CL.CL_TRUE, 0, dest.capacity(), dest, 0, null, null);
+        checkError("on clEnqueueReadBuffer", ret);
+
+        out.println("a+b=c result snapshot: ");
+        for(int i = 0; i < 10; i++)
+            out.print(dest.getInt()+", ");
+        out.println("...; "+dest.remaining()/SIZEOF_INT + " more");
+
+
+        // cleanup
+        ret = cl.clReleaseCommandQueue(commandQueue);
+        checkError("on clReleaseCommandQueue", ret);
+
+        ret = cl.clReleaseMemObject(devSrcA);
+        checkError("on clReleaseMemObject", ret);
+        ret = cl.clReleaseMemObject(devSrcB);
+        checkError("on clReleaseMemObject", ret);
+        ret = cl.clReleaseMemObject(devDst);
+        checkError("on clReleaseMemObject", ret);
+
+        ret = cl.clReleaseProgram(program);
+        checkError("on clReleaseProgram", ret);
+
+        ret = cl.clReleaseKernel(kernel);
+        checkError("on clReleaseKernel", ret);
+
+        ret = cl.clUnloadCompiler();
+        checkError("on clUnloadCompiler", ret);
+
+        ret = cl.clReleaseContext(context);
+        checkError("on clReleaseContext", ret);
+
+    }
+
+    @Test
+    public void loadTest() {
+        //for memory leak detection; e.g watch out for "out of host memory" errors
+        out.println(" - - - loadTest - - - ");
+        for(int i = 0; i < 100; i++) {
+            out.println("###iteration "+i);
+            lowLevelVectorAddTest();
+        }
+    }
+*/
+    private ByteBuffer wrap(long value) {
+        return newDirectByteBuffer(8).putLong(0, value); // absolute put: position stays at 0, so no rewind or cast is needed
+    }
+
+    private final void checkForError(int ret) {
+        checkError("", ret); // same check as checkError, with an empty context message
+    }
+
+    private final void checkError(String msg, int ret) {
+        if (CL.CL_SUCCESS == ret) { return; }      // call succeeded, nothing to report
+        throw CLException.newException(ret, msg);  // any other status code becomes a CLException
+    }
+
+
+}
+\ No newline at end of file
diff --git a/test/com/jogamp/opencl/TestUtils.java b/test/com/jogamp/opencl/TestUtils.java
new file mode 100644
index 00000000..e2ef16f3
--- /dev/null
+++ b/test/com/jogamp/opencl/TestUtils.java
@@ -0,0 +1,52 @@
+package com.jogamp.opencl;
+
+import java.nio.ByteBuffer;
+import java.util.Random;
+
+import static java.lang.System.*;
+import static org.junit.Assert.*;
+
+/**
+ * @author Michael Bien
+ */
+public class TestUtils {
+
+    //decrease this value on systems with little memory.
+    final static int ONE_MB = 1048576;
+
+    final static int NUM_ELEMENTS = 10000000;
+
+    /** Fills buffer with seeded pseudo-random ints and rewinds it; a tail shorter than one int is left untouched. */
+    public static final void fillBuffer(ByteBuffer buffer, int seed) {
+        Random rnd = new Random(seed);
+        while (buffer.remaining() >= Integer.SIZE / Byte.SIZE) { // putInt() would overflow on a partial int
+            buffer.putInt(rnd.nextInt());
+        }
+        buffer.rewind();
+    }
+
+    /** Rounds globalSize up to the next multiple of groupSize. */
+    public static final int roundUp(int groupSize, int globalSize) {
+        int r = globalSize % groupSize;
+        return (r == 0) ? globalSize : globalSize + groupSize - r;
+    }
+
+    /** Fails (JUnit) on the first differing int among the next 'elements' ints of a and b; always rewinds both. */
+    public static final void checkIfEqual(ByteBuffer a, ByteBuffer b, int elements) {
+        try {
+            for (int i = 0; i < elements; i++) {
+                int aVal = a.getInt();
+                int bVal = b.getInt();
+                if (aVal != bVal) {
+                    out.println("a: "+aVal);
+                    out.println("b: "+bVal);
+                    out.println("position: "+a.position());
+                    fail("a!=b");
+                }
+            }
+        } finally { // rewind even when fail() throws, so the buffers stay reusable
+            a.rewind();
+            b.rewind();
+        }
+    }
+}
diff --git a/test/com/jogamp/opencl/testkernels.cl b/test/com/jogamp/opencl/testkernels.cl
new file mode 100644
index 00000000..ec7e8bf6
--- /dev/null
+++ b/test/com/jogamp/opencl/testkernels.cl
@@ -0,0 +1,22 @@
+
+    // OpenCL Kernel Function for element by element vector addition
+    kernel void VectorAddGM(global const int* a, global const int* b, global int* c, int iNumElements) {
+        // index of this work-item in dimension 0 of the global range
+        const int gid = get_global_id(0);
+        // bound check: work-items at or beyond iNumElements write nothing,
+        // all others store one element of the result
+        if (gid < iNumElements)  {
+            // element-wise sum of the two input vectors
+            c[gid] = a[gid] + b[gid];
+        }
+    }
+
+    kernel void Test(global const int* a, global const int* b, global int* c, int iNumElements) {
+        // index of this work-item in dimension 0 of the global range
+        const int gid = get_global_id(0);
+        // bound check: work-items at or beyond iNumElements write nothing
+        if (gid < iNumElements)  {
+            // a and b are not read; each element of c receives its own index
+            c[gid] = gid;
+        }
+    }