From abe0135b4457d4c4ff722b0f39a47cad6c178f7e Mon Sep 17 00:00:00 2001 From: Michael Bien Date: Tue, 20 Oct 2009 22:06:10 +0200 Subject: refactored JOCLTest into LowLevelBindingTest and HighLevelBindingTest. moved listCLPlatforms() and getLowLevelBinding() from CLContext to CLPlatform. added method to create CLPrograms from InputStreams and updated test. --- test/com/mbien/opencl/HighLevelBindingTest.java | 156 ++++++++ test/com/mbien/opencl/JOCLTest.java | 472 ------------------------ test/com/mbien/opencl/LowLevelBindingTest.java | 316 ++++++++++++++++ test/com/mbien/opencl/TestUtils.java | 29 ++ test/com/mbien/opencl/testkernels.cl | 22 ++ 5 files changed, 523 insertions(+), 472 deletions(-) create mode 100644 test/com/mbien/opencl/HighLevelBindingTest.java delete mode 100644 test/com/mbien/opencl/JOCLTest.java create mode 100644 test/com/mbien/opencl/LowLevelBindingTest.java create mode 100644 test/com/mbien/opencl/TestUtils.java create mode 100644 test/com/mbien/opencl/testkernels.cl (limited to 'test') diff --git a/test/com/mbien/opencl/HighLevelBindingTest.java b/test/com/mbien/opencl/HighLevelBindingTest.java new file mode 100644 index 00000000..a2bfce91 --- /dev/null +++ b/test/com/mbien/opencl/HighLevelBindingTest.java @@ -0,0 +1,156 @@ +package com.mbien.opencl; + +import com.sun.gluegen.runtime.BufferFactory; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Map; +import org.junit.BeforeClass; +import org.junit.Test; +import static org.junit.Assert.*; +import static java.lang.System.*; +import static com.mbien.opencl.TestUtils.*; + +/** + * Test testing the high level bindings. + * @author Michael Bien + */ +public class HighLevelBindingTest { + + @BeforeClass + public static void setUpClass() throws Exception { + out.println("OS: " + System.getProperty("os.name")); + out.println("VM: " + System.getProperty("java.vm.name")); + } + + @Test + public void contextlessTest() { + + out.println(" - - - highLevelTest; contextless - - - "); + + CLPlatform[] clPlatforms = CLPlatform.listCLPlatforms(); + + for (CLPlatform platform : clPlatforms) { + + out.println("platform info:"); + out.println(" name: "+platform.getName()); + out.println(" profile: "+platform.getProfile()); + out.println(" version: "+platform.getVersion()); + out.println(" vendor: "+platform.getVendor()); + + CLDevice[] clDevices = platform.listCLDevices(); + for (CLDevice device : clDevices) { + out.println("device info:"); + out.println(" name: "+device.getName()); + out.println(" profile: "+device.getProfile()); + out.println(" vendor: "+device.getVendor()); + out.println(" type: "+device.getType()); + out.println(" global mem: "+device.getGlobalMemSize()/(1024*1024)+" MB"); + out.println(" local mem: "+device.getLocalMemSize()/1024+" KB"); + out.println(" clock: "+device.getMaxClockFrequency()+" MHz"); + out.println(" max work group size: "+device.getMaxWorkGroupSize()); + out.println(" max compute units: "+device.getMaxComputeUnits()); + out.println(" extensions: "+device.getExtensions()); + } + } + + } + + @Test + public void vectorAddGMTest() throws IOException { + + out.println(" - - - highLevelTest; global memory kernel - - - "); + + CLContext context = CLContext.create(); + + CLDevice[] contextDevices = context.getCLDevices(); + + out.println("context devices:"); + for (CLDevice device : contextDevices) { + out.println(" "+device.toString()); + } + + CLProgram program = context.createProgram(getClass().getResourceAsStream("testkernels.cl")).build(); + + CLDevice[] programDevices = program.getCLDevices(); + + assertEquals(contextDevices.length, programDevices.length); + + out.println("program devices:"); + for (CLDevice device : programDevices) { + out.println(" "+device.toString()); + out.println(" build log: "+program.getBuildLog(device)); + out.println(" build status: "+program.getBuildStatus(device)); + } + + String source = program.getSource(); + assertFalse(source.trim().isEmpty()); +// out.println("source:\n"+source); + + int elementCount = 11444777; // Length of float arrays to process (odd # for illustration) + int localWorkSize = 256; // set and log Global and Local work size dimensions + int globalWorkSize = roundUp(localWorkSize, elementCount); // rounded up to the nearest multiple of the LocalWorkSize + + out.println("allocateing buffers of size: "+globalWorkSize); + + ByteBuffer srcA = BufferFactory.newDirectByteBuffer(globalWorkSize*BufferFactory.SIZEOF_INT); + ByteBuffer srcB = BufferFactory.newDirectByteBuffer(globalWorkSize*BufferFactory.SIZEOF_INT); + ByteBuffer dest = BufferFactory.newDirectByteBuffer(globalWorkSize*BufferFactory.SIZEOF_INT); + + fillBuffer(srcA, 23456); + fillBuffer(srcB, 46987); + + CLBuffer clBufferA = context.createBuffer(CL.CL_MEM_READ_ONLY, srcA); + CLBuffer clBufferB = context.createBuffer(CL.CL_MEM_READ_ONLY, srcB); + CLBuffer clBufferC = context.createBuffer(CL.CL_MEM_WRITE_ONLY, dest); + + Map kernels = program.getCLKernels(); + for (CLKernel kernel : kernels.values()) { + out.println("kernel: "+kernel.toString()); + } + + assertNotNull(kernels.get("VectorAddGM")); + assertNotNull(kernels.get("Test")); + + CLKernel vectorAddKernel = kernels.get("VectorAddGM"); + + vectorAddKernel.setArg(0, BufferFactory.SIZEOF_LONG, clBufferA) + .setArg(1, BufferFactory.SIZEOF_LONG, clBufferB) + .setArg(2, BufferFactory.SIZEOF_LONG, clBufferC) + .setArg(3, BufferFactory.SIZEOF_INT, elementCount); + + CLCommandQueue queue = programDevices[0].createCommandQueue(); + + // Asynchronous write of data to GPU device, blocking read later + queue.putWriteBuffer(clBufferA, false) + .putWriteBuffer(clBufferB, false) + .putNDRangeKernel(vectorAddKernel, 1, null, new long[]{ globalWorkSize }, new long[]{ localWorkSize }) + .putReadBuffer(clBufferC, true).release(); + + out.println("a+b=c result snapshot: "); + for(int i = 0; i < 10; i++) + out.print(dest.getInt()+", "); + out.println("...; "+dest.remaining()/BufferFactory.SIZEOF_INT + " more"); + + assertTrue(3 == context.getCLBuffers().size()); + clBufferA.release(); + assertTrue(2 == context.getCLBuffers().size()); + + assertTrue(2 == context.getCLBuffers().size()); + clBufferB.release(); + assertTrue(1 == context.getCLBuffers().size()); + + assertTrue(1 == context.getCLBuffers().size()); + clBufferC.release(); + assertTrue(0 == context.getCLBuffers().size()); + + + assertTrue(1 == context.getCLPrograms().size()); + program.release(); + assertTrue(0 == context.getCLPrograms().size()); + +// CLDevice device = ctx.getMaxFlopsDevice(); +// out.println("max FLOPS device: " + device); + context.release(); + } + +} diff --git a/test/com/mbien/opencl/JOCLTest.java b/test/com/mbien/opencl/JOCLTest.java deleted file mode 100644 index 224a6768..00000000 --- a/test/com/mbien/opencl/JOCLTest.java +++ /dev/null @@ -1,472 +0,0 @@ -package com.mbien.opencl; - -import com.sun.gluegen.runtime.BufferFactory; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.util.Arrays; -import java.util.Map; -import java.util.Random; -import org.junit.BeforeClass; -import org.junit.Test; -import static org.junit.Assert.*; -import static java.lang.System.*; - -/** - * Test for testing basic functionality. - * @author Michael Bien - */ -public class JOCLTest { - - private final static String programSource = - " // OpenCL Kernel Function for element by element vector addition \n" - + "__kernel void VectorAdd(__global const int* a, __global const int* b, __global int* c, int iNumElements) { \n" - + " // get index into global data array \n" - + " int iGID = get_global_id(0); \n" - + " // bound check (equivalent to the limit on a 'for' loop for standard/serial C code \n" - + " if (iGID >= iNumElements) { \n" - + " return; \n" - + " } \n" - + " // add the vector elements \n" - + " c[iGID] = a[iGID] + b[iGID]; \n" - + "} \n" - + "__kernel void Test(__global const int* a, __global const int* b, __global int* c, int iNumElements) { \n" - + " // get index into global data array \n" - + " int iGID = get_global_id(0); \n" - + " // bound check (equivalent to the limit on a 'for' loop for standard/serial C code \n" - + " if (iGID >= iNumElements) { \n" - + " return; \n" - + " } \n" - + " c[iGID] = iGID; \n" - + "} \n"; - - public JOCLTest() { - } - - @BeforeClass - public static void setUpClass() throws Exception { - out.println("OS: " + System.getProperty("os.name")); - out.println("VM: " + System.getProperty("java.vm.name")); - } - - @Test - public void lowLevelTest1() { - - out.println(" - - - lowLevelTest; contextless binding - - - "); - - int ret = CL.CL_SUCCESS; - - CL cl = CLContext.getLowLevelBinding(); - - int[] intBuffer = new int[1]; - // find all available OpenCL platforms - ret = cl.clGetPlatformIDs(0, null, 0, intBuffer, 0); - checkForError(ret); - out.println("#platforms: "+intBuffer[0]); - - long[] platformId = new long[intBuffer[0]]; - ret = cl.clGetPlatformIDs(platformId.length, platformId, 0, null, 0); - checkForError(ret); - - // print platform info - long[] longBuffer = new long[1]; - ByteBuffer bb = ByteBuffer.allocate(128); - bb.order(ByteOrder.nativeOrder()); - - for (int i = 0; i < platformId.length; i++) { - - long platform = platformId[i]; - out.println("platform id: "+platform); - - ret = cl.clGetPlatformInfo(platform, CL.CL_PLATFORM_PROFILE, bb.capacity(), bb, longBuffer, 0); - checkForError(ret); - out.println(" profile: "+new String(bb.array(), 0, (int)longBuffer[0])); - - ret = cl.clGetPlatformInfo(platform, CL.CL_PLATFORM_VERSION, bb.capacity(), bb, longBuffer, 0); - checkForError(ret); - out.println(" version: "+new String(bb.array(), 0, (int)longBuffer[0])); - - ret = cl.clGetPlatformInfo(platform, CL.CL_PLATFORM_NAME, bb.capacity(), bb, longBuffer, 0); - checkForError(ret); - out.println(" name: "+new String(bb.array(), 0, (int)longBuffer[0])); - - ret = cl.clGetPlatformInfo(platform, CL.CL_PLATFORM_VENDOR, bb.capacity(), bb, longBuffer, 0); - checkForError(ret); - out.println(" vendor: "+new String(bb.array(), 0, (int)longBuffer[0])); - - //find all devices - ret = cl.clGetDeviceIDs(platform, CL.CL_DEVICE_TYPE_ALL, 0, null, 0, intBuffer, 0); - checkForError(ret); - out.println("#devices: "+intBuffer[0]); - - long[] devices = new long[intBuffer[0]]; - ret = cl.clGetDeviceIDs(platform, CL.CL_DEVICE_TYPE_ALL, devices.length, devices, 0, null, 0); - - //print device info - for (int j = 0; j < devices.length; j++) { - long device = devices[j]; - ret = cl.clGetDeviceInfo(device, CL.CL_DEVICE_NAME, bb.capacity(), bb, longBuffer, 0); - checkForError(ret); - out.println(" device: "+new String(bb.array(), 0, (int)longBuffer[0])); - - ret = cl.clGetDeviceInfo(device, CL.CL_DEVICE_TYPE, bb.capacity(), bb, longBuffer, 0); - checkForError(ret); - out.println(" type: " + CLDevice.Type.valueOf(bb.get())); - bb.rewind(); - - } - - } - - } - - @Test - public void lowLevelTest2() { - - out.println(" - - - lowLevelTest2; VectorAdd kernel - - - "); - -// CreateContextCallback cb = new CreateContextCallback() { -// @Override -// public void createContextCallback(String errinfo, ByteBuffer private_info, long cb, Object user_data) { -// throw new RuntimeException("not yet implemented..."); -// } -// }; - - long[] longArray = new long[1]; - ByteBuffer bb = ByteBuffer.allocate(4096).order(ByteOrder.nativeOrder()); - - CL cl = CLContext.getLowLevelBinding(); - - int ret = CL.CL_SUCCESS; - int[] intArray = new int[1]; - - long context = cl.clCreateContextFromType(null, 0, CL.CL_DEVICE_TYPE_ALL, null, null, null, 0); - out.println("context handle: "+context); - - ret = cl.clGetContextInfo(context, CL.CL_CONTEXT_DEVICES, 0, null, longArray, 0); - checkError("on clGetContextInfo", ret); - - int sizeofLong = 8; // TODO sizeof long... - out.println("context created with " + longArray[0]/sizeofLong + " devices"); - - ret = cl.clGetContextInfo(context, CL.CL_CONTEXT_DEVICES, bb.capacity(), bb, null, 0); - checkError("on clGetContextInfo", ret); - - for (int i = 0; i < longArray[0]/sizeofLong; i++) { - out.println("device id: "+bb.getLong()); - } - - long firstDeviceID = bb.getLong(0); - - // Create a command-queue - long commandQueue = cl.clCreateCommandQueue(context, firstDeviceID, 0, intArray, 0); - checkError("on clCreateCommandQueue", intArray[0]); - - int elementCount = 11444777; // Length of float arrays to process (odd # for illustration) - int localWorkSize = 256; // set and log Global and Local work size dimensions - int globalWorkSize = roundUp(localWorkSize, elementCount); // rounded up to the nearest multiple of the LocalWorkSize - - out.println("allocateing buffers of size: "+globalWorkSize); - - ByteBuffer srcA = BufferFactory.newDirectByteBuffer(globalWorkSize*BufferFactory.SIZEOF_INT); - ByteBuffer srcB = BufferFactory.newDirectByteBuffer(globalWorkSize*BufferFactory.SIZEOF_INT); - ByteBuffer dest = BufferFactory.newDirectByteBuffer(globalWorkSize*BufferFactory.SIZEOF_INT); - - // TODO sizeof int ... - // Allocate the OpenCL buffer memory objects for source and result on the device GMEM - long devSrcA = cl.clCreateBuffer(context, CL.CL_MEM_READ_ONLY, srcA.capacity(), null, intArray, 0); - checkError("on clCreateBuffer", intArray[0]); - long devSrcB = cl.clCreateBuffer(context, CL.CL_MEM_READ_ONLY, srcB.capacity(), null, intArray, 0); - checkError("on clCreateBuffer", intArray[0]); - long devDst = cl.clCreateBuffer(context, CL.CL_MEM_WRITE_ONLY, dest.capacity(), null, intArray, 0); - checkError("on clCreateBuffer", intArray[0]); - - - // Create the program - long program = cl.clCreateProgramWithSource(context, 1, new String[] {programSource}, new long[]{programSource.length()}, 0, intArray, 0); - checkError("on clCreateProgramWithSource", intArray[0]); - - // Build the program - ret = cl.clBuildProgram(program, null, null, null, null); - checkError("on clBuildProgram", ret); - - // Read program infos - bb.rewind(); - ret = cl.clGetProgramInfo(program, CL.CL_PROGRAM_NUM_DEVICES, bb.capacity(), bb, null, 0); - checkError("on clGetProgramInfo1", ret); - out.println("program associated with "+bb.getInt(0)+" device(s)"); - - ret = cl.clGetProgramInfo(program, CL.CL_PROGRAM_SOURCE, 0, bb, longArray, 0); - checkError("on clGetProgramInfo CL_PROGRAM_SOURCE", ret); - out.println("program source length (cl): "+longArray[0]); - out.println("program source length (java): "+programSource.length()); - - bb.rewind(); - ret = cl.clGetProgramInfo(program, CL.CL_PROGRAM_SOURCE, bb.capacity(), bb, null, 0); - checkError("on clGetProgramInfo CL_PROGRAM_SOURCE", ret); - out.println("program source:\n"+new String(bb.array(), 0, (int)longArray[0])); - - // Check program status - Arrays.fill(longArray, 42); - bb.rewind(); - ret = cl.clGetProgramBuildInfo(program, firstDeviceID, CL.CL_PROGRAM_BUILD_STATUS, bb.capacity(), bb, null, 0); - checkError("on clGetProgramBuildInfo1", ret); - - out.println("program build status: " + CLProgram.Status.valueOf(bb.getInt(0))); - assertEquals("build status", CL.CL_BUILD_SUCCESS, bb.getInt(0)); - - // Read build log - ret = cl.clGetProgramBuildInfo(program, firstDeviceID, CL.CL_PROGRAM_BUILD_LOG, 0, null, longArray, 0); - checkError("on clGetProgramBuildInfo2", ret); - out.println("program log length: " + longArray[0]); - - bb.rewind(); - ret = cl.clGetProgramBuildInfo(program, firstDeviceID, CL.CL_PROGRAM_BUILD_LOG, bb.capacity(), bb, null, 0); - checkError("on clGetProgramBuildInfo3", ret); - out.println("log:\n" + new String(bb.array(), 0, (int)longArray[0])); - - // Create the kernel - Arrays.fill(intArray, 42); - long kernel = cl.clCreateKernel(program, "VectorAdd", intArray, 0); - checkError("on clCreateKernel", intArray[0]); - -// srcA.limit(elementCount*BufferFactory.SIZEOF_FLOAT); -// srcB.limit(elementCount*BufferFactory.SIZEOF_FLOAT); - - fillBuffer(srcA, 23456); - fillBuffer(srcB, 46987); - - // Set the Argument values - ret = cl.clSetKernelArg(kernel, 0, BufferFactory.SIZEOF_LONG, wrap(devSrcA)); checkError("on clSetKernelArg0", ret); - ret = cl.clSetKernelArg(kernel, 1, BufferFactory.SIZEOF_LONG, wrap(devSrcB)); checkError("on clSetKernelArg1", ret); - ret = cl.clSetKernelArg(kernel, 2, BufferFactory.SIZEOF_LONG, wrap(devDst)); checkError("on clSetKernelArg2", ret); - ret = cl.clSetKernelArg(kernel, 3, BufferFactory.SIZEOF_INT, wrap(elementCount)); checkError("on clSetKernelArg3", ret); - - out.println("used device memory: "+ (srcA.capacity()+srcB.capacity()+dest.capacity())/1000000 +"MB"); - - // Asynchronous write of data to GPU device - ret = cl.clEnqueueWriteBuffer(commandQueue, devSrcA, CL.CL_FALSE, 0, srcA.capacity(), srcA, 0, null, 0, null, 0); - checkError("on clEnqueueWriteBuffer", ret); - ret = cl.clEnqueueWriteBuffer(commandQueue, devSrcB, CL.CL_FALSE, 0, srcB.capacity(), srcB, 0, null, 0, null, 0); - checkError("on clEnqueueWriteBuffer", ret); - - // Launch kernel - ret = cl.clEnqueueNDRangeKernel(commandQueue, kernel, 1, null, 0, - new long[]{ globalWorkSize }, 0, - new long[]{ localWorkSize }, 0, 0, - null, 0, - null, 0); - checkError("on clEnqueueNDRangeKernel", ret); - - // Synchronous/blocking read of results - ret = cl.clEnqueueReadBuffer(commandQueue, devDst, CL.CL_TRUE, 0, dest.capacity(), dest, 0, null, 0, null, 0); - checkError("on clEnqueueReadBuffer", ret); - - out.println("a+b=c result snapshot: "); - for(int i = 0; i < 10; i++) - out.print(dest.getInt()+", "); - out.println("...; "+dest.remaining()/BufferFactory.SIZEOF_INT + " more"); - - - // cleanup - ret = cl.clReleaseCommandQueue(commandQueue); - checkError("on clReleaseCommandQueue", ret); - - ret = cl.clReleaseMemObject(devSrcA); - checkError("on clReleaseMemObject", ret); - ret = cl.clReleaseMemObject(devSrcB); - checkError("on clReleaseMemObject", ret); - ret = cl.clReleaseMemObject(devDst); - checkError("on clReleaseMemObject", ret); - - ret = cl.clReleaseProgram(program); - checkError("on clReleaseProgram", ret); - - ret = cl.clReleaseKernel(kernel); - checkError("on clReleaseKernel", ret); - - ret = cl.clUnloadCompiler(); - checkError("on clUnloadCompiler", ret); - - ret = cl.clReleaseContext(context); - checkError("on clReleaseContext", ret); - - } - -// @Test - public void loadTest() { - //for memory leak detection; e.g watch out for "out of host memory" errors - out.println(" - - - loadTest - - - "); - for(int i = 0; i < 100; i++) { - out.println("###iteration "+i); - lowLevelTest2(); - } - } - - private void fillBuffer(ByteBuffer buffer, int seed) { - - Random rnd = new Random(seed); - - while(buffer.remaining() != 0) - buffer.putInt(rnd.nextInt()); - - buffer.rewind(); - } - - private ByteBuffer wrap(long value) { - return (ByteBuffer) BufferFactory.newDirectByteBuffer(8).putLong(value).rewind(); - } - - @Test - public void highLevelTest1() { - - out.println(" - - - highLevelTest; contextless - - - "); - - CLPlatform[] clPlatforms = CLContext.listCLPlatforms(); - - for (CLPlatform platform : clPlatforms) { - - out.println("platform info:"); - out.println(" name: "+platform.getName()); - out.println(" profile: "+platform.getProfile()); - out.println(" version: "+platform.getVersion()); - out.println(" vendor: "+platform.getVendor()); - - CLDevice[] clDevices = platform.listCLDevices(); - for (CLDevice device : clDevices) { - out.println("device info:"); - out.println(" name: "+device.getName()); - out.println(" profile: "+device.getProfile()); - out.println(" vendor: "+device.getVendor()); - out.println(" type: "+device.getType()); - out.println(" global mem: "+device.getGlobalMemSize()/(1024*1024)+" MB"); - out.println(" local mem: "+device.getLocalMemSize()/1024+" KB"); - out.println(" clock: "+device.getMaxClockFrequency()+" MHz"); - out.println(" max work group size: "+device.getMaxWorkGroupSize()); - out.println(" max compute units: "+device.getMaxComputeUnits()); - out.println(" extensions: "+device.getExtensions()); - } - } - - - } - - - @Test - public void highLevelTest2() { - - out.println(" - - - highLevelTest - - - "); - - CLContext context = CLContext.create(); - - CLDevice[] contextDevices = context.getCLDevices(); - - out.println("context devices:"); - for (CLDevice device : contextDevices) { - out.println(" "+device.toString()); - } - - CLProgram program = context.createProgram(programSource).build(); - - CLDevice[] programDevices = program.getCLDevices(); - - assertEquals(contextDevices.length, programDevices.length); - - out.println("program devices:"); - for (CLDevice device : programDevices) { - out.println(" "+device.toString()); - out.println(" build log: "+program.getBuildLog(device)); - out.println(" build status: "+program.getBuildStatus(device)); - } - - String source = program.getSource(); - assertFalse(source.trim().isEmpty()); -// out.println("source:\n"+source); - - int elementCount = 11444777; // Length of float arrays to process (odd # for illustration) - int localWorkSize = 256; // set and log Global and Local work size dimensions - int globalWorkSize = roundUp(localWorkSize, elementCount); // rounded up to the nearest multiple of the LocalWorkSize - - out.println("allocateing buffers of size: "+globalWorkSize); - - ByteBuffer srcA = BufferFactory.newDirectByteBuffer(globalWorkSize*BufferFactory.SIZEOF_INT); - ByteBuffer srcB = BufferFactory.newDirectByteBuffer(globalWorkSize*BufferFactory.SIZEOF_INT); - ByteBuffer dest = BufferFactory.newDirectByteBuffer(globalWorkSize*BufferFactory.SIZEOF_INT); - - fillBuffer(srcA, 23456); - fillBuffer(srcB, 46987); - - CLBuffer clBufferA = context.createBuffer(CL.CL_MEM_READ_ONLY, srcA); - CLBuffer clBufferB = context.createBuffer(CL.CL_MEM_READ_ONLY, srcB); - CLBuffer clBufferC = context.createBuffer(CL.CL_MEM_WRITE_ONLY, dest); - - Map kernels = program.getCLKernels(); - for (CLKernel kernel : kernels.values()) { - out.println("kernel: "+kernel.toString()); - } - - assertNotNull(kernels.get("VectorAdd")); - assertNotNull(kernels.get("Test")); - - CLKernel vectorAddKernel = kernels.get("VectorAdd"); - - vectorAddKernel.setArg(0, BufferFactory.SIZEOF_LONG, clBufferA) - .setArg(1, BufferFactory.SIZEOF_LONG, clBufferB) - .setArg(2, BufferFactory.SIZEOF_LONG, clBufferC) - .setArg(3, BufferFactory.SIZEOF_INT, elementCount); - - CLCommandQueue queue = programDevices[0].createCommandQueue(); - - // Asynchronous write of data to GPU device, blocking read later - queue.putWriteBuffer(clBufferA, false) - .putWriteBuffer(clBufferB, false) - .putNDRangeKernel(vectorAddKernel, 1, null, new long[]{ globalWorkSize }, new long[]{ localWorkSize }) - .putReadBuffer(clBufferC, true).release(); - - out.println("a+b=c result snapshot: "); - for(int i = 0; i < 10; i++) - out.print(dest.getInt()+", "); - out.println("...; "+dest.remaining()/BufferFactory.SIZEOF_INT + " more"); - - assertTrue(3 == context.getCLBuffers().size()); - clBufferA.release(); - assertTrue(2 == context.getCLBuffers().size()); - - assertTrue(2 == context.getCLBuffers().size()); - clBufferB.release(); - assertTrue(1 == context.getCLBuffers().size()); - - assertTrue(1 == context.getCLBuffers().size()); - clBufferC.release(); - assertTrue(0 == context.getCLBuffers().size()); - - - assertTrue(1 == context.getCLPrograms().size()); - program.release(); - assertTrue(0 == context.getCLPrograms().size()); - -// CLDevice device = ctx.getMaxFlopsDevice(); -// out.println("max FLOPS device: " + device); - context.release(); - } - - - private final int roundUp(int groupSize, int globalSize) { - int r = globalSize % groupSize; - if (r == 0) { - return globalSize; - } else { - return globalSize + groupSize - r; - } - } - - private final void checkForError(int ret) { - this.checkError("", ret); - } - - private final void checkError(String msg, int ret) { - if(ret != CL.CL_SUCCESS) - throw new CLException(ret, msg); - } - - -} \ No newline at end of file diff --git a/test/com/mbien/opencl/LowLevelBindingTest.java b/test/com/mbien/opencl/LowLevelBindingTest.java new file mode 100644 index 00000000..e5175129 --- /dev/null +++ b/test/com/mbien/opencl/LowLevelBindingTest.java @@ -0,0 +1,316 @@ +package com.mbien.opencl; + +import com.sun.gluegen.runtime.BufferFactory; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Arrays; +import org.junit.BeforeClass; +import org.junit.Test; +import static org.junit.Assert.*; +import static java.lang.System.*; +import static com.mbien.opencl.TestUtils.*; + +/** + * Test testing the low level bindings. + * @author Michael Bien + */ +public class LowLevelBindingTest { + + private final static String programSource = + " // OpenCL Kernel Function for element by element vector addition \n" + + "__kernel void VectorAdd(__global const int* a, __global const int* b, __global int* c, int iNumElements) { \n" + + " // get index into global data array \n" + + " int iGID = get_global_id(0); \n" + + " // bound check (equivalent to the limit on a 'for' loop for standard/serial C code \n" + + " if (iGID >= iNumElements) { \n" + + " return; \n" + + " } \n" + + " // add the vector elements \n" + + " c[iGID] = a[iGID] + b[iGID]; \n" + + "} \n" + + "__kernel void Test(__global const int* a, __global const int* b, __global int* c, int iNumElements) { \n" + + " // get index into global data array \n" + + " int iGID = get_global_id(0); \n" + + " // bound check (equivalent to the limit on a 'for' loop for standard/serial C code \n" + + " if (iGID >= iNumElements) { \n" + + " return; \n" + + " } \n" + + " c[iGID] = iGID; \n" + + "} \n"; + + + @BeforeClass + public static void setUpClass() throws Exception { + out.println("OS: " + System.getProperty("os.name")); + out.println("VM: " + System.getProperty("java.vm.name")); + } + + @Test + public void lowLevelTest1() { + + out.println(" - - - lowLevelTest; contextless binding - - - "); + + int ret = CL.CL_SUCCESS; + + CL cl = CLPlatform.getLowLevelBinding(); + + int[] intBuffer = new int[1]; + // find all available OpenCL platforms + ret = cl.clGetPlatformIDs(0, null, 0, intBuffer, 0); + checkForError(ret); + out.println("#platforms: "+intBuffer[0]); + + long[] platformId = new long[intBuffer[0]]; + ret = cl.clGetPlatformIDs(platformId.length, platformId, 0, null, 0); + checkForError(ret); + + // print platform info + long[] longBuffer = new long[1]; + ByteBuffer bb = ByteBuffer.allocate(128); + bb.order(ByteOrder.nativeOrder()); + + for (int i = 0; i < platformId.length; i++) { + + long platform = platformId[i]; + out.println("platform id: "+platform); + + ret = cl.clGetPlatformInfo(platform, CL.CL_PLATFORM_PROFILE, bb.capacity(), bb, longBuffer, 0); + checkForError(ret); + out.println(" profile: "+new String(bb.array(), 0, (int)longBuffer[0])); + + ret = cl.clGetPlatformInfo(platform, CL.CL_PLATFORM_VERSION, bb.capacity(), bb, longBuffer, 0); + checkForError(ret); + out.println(" version: "+new String(bb.array(), 0, (int)longBuffer[0])); + + ret = cl.clGetPlatformInfo(platform, CL.CL_PLATFORM_NAME, bb.capacity(), bb, longBuffer, 0); + checkForError(ret); + out.println(" name: "+new String(bb.array(), 0, (int)longBuffer[0])); + + ret = cl.clGetPlatformInfo(platform, CL.CL_PLATFORM_VENDOR, bb.capacity(), bb, longBuffer, 0); + checkForError(ret); + out.println(" vendor: "+new String(bb.array(), 0, (int)longBuffer[0])); + + //find all devices + ret = cl.clGetDeviceIDs(platform, CL.CL_DEVICE_TYPE_ALL, 0, null, 0, intBuffer, 0); + checkForError(ret); + out.println("#devices: "+intBuffer[0]); + + long[] devices = new long[intBuffer[0]]; + ret = cl.clGetDeviceIDs(platform, CL.CL_DEVICE_TYPE_ALL, devices.length, devices, 0, null, 0); + + //print device info + for (int j = 0; j < devices.length; j++) { + long device = devices[j]; + ret = cl.clGetDeviceInfo(device, CL.CL_DEVICE_NAME, bb.capacity(), bb, longBuffer, 0); + checkForError(ret); + out.println(" device: "+new String(bb.array(), 0, (int)longBuffer[0])); + + ret = cl.clGetDeviceInfo(device, CL.CL_DEVICE_TYPE, bb.capacity(), bb, longBuffer, 0); + checkForError(ret); + out.println(" type: " + CLDevice.Type.valueOf(bb.get())); + bb.rewind(); + + } + + } + + } + + @Test + public void lowLevelTest2() { + + out.println(" - - - lowLevelTest2; VectorAdd kernel - - - "); + +// CreateContextCallback cb = new CreateContextCallback() { +// @Override +// public void createContextCallback(String errinfo, ByteBuffer private_info, long cb, Object user_data) { +// throw new RuntimeException("not yet implemented..."); +// } +// }; + + long[] longArray = new long[1]; + ByteBuffer bb = ByteBuffer.allocate(4096).order(ByteOrder.nativeOrder()); + + CL cl = CLPlatform.getLowLevelBinding(); + + int ret = CL.CL_SUCCESS; + int[] intArray = new int[1]; + + long context = cl.clCreateContextFromType(null, 0, CL.CL_DEVICE_TYPE_ALL, null, null, null, 0); + out.println("context handle: "+context); + + ret = cl.clGetContextInfo(context, CL.CL_CONTEXT_DEVICES, 0, null, longArray, 0); + checkError("on clGetContextInfo", ret); + + int sizeofLong = 8; // TODO sizeof long... + out.println("context created with " + longArray[0]/sizeofLong + " devices"); + + ret = cl.clGetContextInfo(context, CL.CL_CONTEXT_DEVICES, bb.capacity(), bb, null, 0); + checkError("on clGetContextInfo", ret); + + for (int i = 0; i < longArray[0]/sizeofLong; i++) { + out.println("device id: "+bb.getLong()); + } + + long firstDeviceID = bb.getLong(0); + + // Create a command-queue + long commandQueue = cl.clCreateCommandQueue(context, firstDeviceID, 0, intArray, 0); + checkError("on clCreateCommandQueue", intArray[0]); + + int elementCount = 11444777; // Length of float arrays to process (odd # for illustration) + int localWorkSize = 256; // set and log Global and Local work size dimensions + int globalWorkSize = roundUp(localWorkSize, elementCount); // rounded up to the nearest multiple of the LocalWorkSize + + out.println("allocateing buffers of size: "+globalWorkSize); + + ByteBuffer srcA = BufferFactory.newDirectByteBuffer(globalWorkSize*BufferFactory.SIZEOF_INT); + ByteBuffer srcB = BufferFactory.newDirectByteBuffer(globalWorkSize*BufferFactory.SIZEOF_INT); + ByteBuffer dest = BufferFactory.newDirectByteBuffer(globalWorkSize*BufferFactory.SIZEOF_INT); + + // TODO sizeof int ... + // Allocate the OpenCL buffer memory objects for source and result on the device GMEM + long devSrcA = cl.clCreateBuffer(context, CL.CL_MEM_READ_ONLY, srcA.capacity(), null, intArray, 0); + checkError("on clCreateBuffer", intArray[0]); + long devSrcB = cl.clCreateBuffer(context, CL.CL_MEM_READ_ONLY, srcB.capacity(), null, intArray, 0); + checkError("on clCreateBuffer", intArray[0]); + long devDst = cl.clCreateBuffer(context, CL.CL_MEM_WRITE_ONLY, dest.capacity(), null, intArray, 0); + checkError("on clCreateBuffer", intArray[0]); + + + // Create the program + long program = cl.clCreateProgramWithSource(context, 1, new String[] {programSource}, new long[]{programSource.length()}, 0, intArray, 0); + checkError("on clCreateProgramWithSource", intArray[0]); + + // Build the program + ret = cl.clBuildProgram(program, null, null, null, null); + checkError("on clBuildProgram", ret); + + // Read program infos + bb.rewind(); + ret = cl.clGetProgramInfo(program, CL.CL_PROGRAM_NUM_DEVICES, bb.capacity(), bb, null, 0); + checkError("on clGetProgramInfo1", ret); + out.println("program associated with "+bb.getInt(0)+" device(s)"); + + ret = cl.clGetProgramInfo(program, CL.CL_PROGRAM_SOURCE, 0, bb, longArray, 0); + checkError("on clGetProgramInfo CL_PROGRAM_SOURCE", ret); + out.println("program source length (cl): "+longArray[0]); + out.println("program source length (java): "+programSource.length()); + + bb.rewind(); + ret = cl.clGetProgramInfo(program, CL.CL_PROGRAM_SOURCE, bb.capacity(), bb, null, 0); + checkError("on clGetProgramInfo CL_PROGRAM_SOURCE", ret); + out.println("program source:\n"+new String(bb.array(), 0, (int)longArray[0])); + + // Check program status + Arrays.fill(longArray, 42); + bb.rewind(); + ret = cl.clGetProgramBuildInfo(program, firstDeviceID, CL.CL_PROGRAM_BUILD_STATUS, bb.capacity(), bb, null, 0); + checkError("on clGetProgramBuildInfo1", ret); + + out.println("program build status: " + CLProgram.Status.valueOf(bb.getInt(0))); + assertEquals("build status", CL.CL_BUILD_SUCCESS, bb.getInt(0)); + + // Read build log + ret = cl.clGetProgramBuildInfo(program, firstDeviceID, CL.CL_PROGRAM_BUILD_LOG, 0, null, longArray, 0); + checkError("on clGetProgramBuildInfo2", ret); + out.println("program log length: " + longArray[0]); + + bb.rewind(); + ret = cl.clGetProgramBuildInfo(program, firstDeviceID, CL.CL_PROGRAM_BUILD_LOG, bb.capacity(), bb, null, 0); + checkError("on clGetProgramBuildInfo3", ret); + out.println("log:\n" + new String(bb.array(), 0, (int)longArray[0])); + + // Create the kernel + Arrays.fill(intArray, 42); + long kernel = cl.clCreateKernel(program, "VectorAdd", intArray, 0); + checkError("on clCreateKernel", intArray[0]); + +// srcA.limit(elementCount*BufferFactory.SIZEOF_FLOAT); +// srcB.limit(elementCount*BufferFactory.SIZEOF_FLOAT); + + fillBuffer(srcA, 23456); + fillBuffer(srcB, 46987); + + // Set the Argument values + ret = cl.clSetKernelArg(kernel, 0, BufferFactory.SIZEOF_LONG, wrap(devSrcA)); checkError("on clSetKernelArg0", ret); + ret = cl.clSetKernelArg(kernel, 1, BufferFactory.SIZEOF_LONG, wrap(devSrcB)); checkError("on clSetKernelArg1", ret); + ret = cl.clSetKernelArg(kernel, 2, BufferFactory.SIZEOF_LONG, wrap(devDst)); checkError("on clSetKernelArg2", ret); + ret = cl.clSetKernelArg(kernel, 3, BufferFactory.SIZEOF_INT, wrap(elementCount)); checkError("on clSetKernelArg3", ret); + + out.println("used device memory: "+ (srcA.capacity()+srcB.capacity()+dest.capacity())/1000000 +"MB"); + + // Asynchronous write of data to GPU device + ret = cl.clEnqueueWriteBuffer(commandQueue, devSrcA, CL.CL_FALSE, 0, srcA.capacity(), srcA, 0, null, 0, null, 0); + checkError("on clEnqueueWriteBuffer", ret); + ret = cl.clEnqueueWriteBuffer(commandQueue, devSrcB, CL.CL_FALSE, 0, srcB.capacity(), srcB, 0, null, 0, null, 0); + checkError("on clEnqueueWriteBuffer", ret); + + // Launch kernel + ret = cl.clEnqueueNDRangeKernel(commandQueue, kernel, 1, null, 0, + new long[]{ globalWorkSize }, 0, + new long[]{ localWorkSize }, 0, 0, + null, 0, + null, 0); + checkError("on clEnqueueNDRangeKernel", ret); + + // Synchronous/blocking read of results + ret = cl.clEnqueueReadBuffer(commandQueue, devDst, CL.CL_TRUE, 0, dest.capacity(), dest, 0, null, 0, null, 0); + checkError("on clEnqueueReadBuffer", ret); + + out.println("a+b=c result snapshot: "); + for(int i = 0; i < 10; i++) + out.print(dest.getInt()+", "); + out.println("...; "+dest.remaining()/BufferFactory.SIZEOF_INT + " more"); + + + // cleanup + ret = cl.clReleaseCommandQueue(commandQueue); + checkError("on clReleaseCommandQueue", ret); + + ret = cl.clReleaseMemObject(devSrcA); + checkError("on clReleaseMemObject", ret); + ret = cl.clReleaseMemObject(devSrcB); + checkError("on clReleaseMemObject", ret); + ret = cl.clReleaseMemObject(devDst); + checkError("on clReleaseMemObject", ret); + + ret = cl.clReleaseProgram(program); + checkError("on clReleaseProgram", ret); + + ret = cl.clReleaseKernel(kernel); + checkError("on clReleaseKernel", ret); + + ret = cl.clUnloadCompiler(); + checkError("on clUnloadCompiler", ret); + + ret = cl.clReleaseContext(context); + checkError("on clReleaseContext", ret); + + } + +// @Test + public void loadTest() { + //for memory leak detection; e.g watch out for "out of host memory" errors + out.println(" - - - loadTest - - - "); + for(int i = 0; i < 100; i++) { + out.println("###iteration "+i); + lowLevelTest2(); + } + } + + private ByteBuffer wrap(long value) { + return (ByteBuffer) BufferFactory.newDirectByteBuffer(8).putLong(value).rewind(); + } + + private final void checkForError(int ret) { + this.checkError("", ret); + } + + private final void checkError(String msg, int ret) { + if(ret != CL.CL_SUCCESS) + throw new CLException(ret, msg); + } + + +} \ No newline at end of file diff --git a/test/com/mbien/opencl/TestUtils.java b/test/com/mbien/opencl/TestUtils.java new file mode 100644 index 00000000..70bade8a --- /dev/null +++ b/test/com/mbien/opencl/TestUtils.java @@ -0,0 +1,29 @@ +package com.mbien.opencl; + +import java.nio.ByteBuffer; +import java.util.Random; + +/** + * @author Michael Bien + */ +public class TestUtils { + + public static final void fillBuffer(ByteBuffer buffer, int seed) { + + Random rnd = new Random(seed); + + while(buffer.remaining() != 0) + buffer.putInt(rnd.nextInt()); + + buffer.rewind(); + } + + public static final int roundUp(int groupSize, int globalSize) { + int r = globalSize % groupSize; + if (r == 0) { + return globalSize; + } else { + return globalSize + groupSize - r; + } + } +} diff --git a/test/com/mbien/opencl/testkernels.cl b/test/com/mbien/opencl/testkernels.cl new file mode 100644 index 00000000..0790cb32 --- /dev/null +++ b/test/com/mbien/opencl/testkernels.cl @@ -0,0 +1,22 @@ + + // OpenCL Kernel Function for element by element vector addition + __kernel void VectorAddGM(__global const int* a, __global const int* b, __global int* c, int iNumElements) { + // get index into global data array + int iGID = get_global_id(0); + // bound check (equivalent to the limit on a 'for' loop for standard/serial C code + if (iGID >= iNumElements) { + return; + } + // add the vector elements + c[iGID] = a[iGID] + b[iGID]; + } + + __kernel void Test(__global const int* a, __global const int* b, __global int* c, int iNumElements) { + // get index into global data array + int iGID = get_global_id(0); + // bound check (equivalent to the limit on a 'for' loop for standard/serial C code + if (iGID >= iNumElements) { + return; + } + c[iGID] = iGID; + } -- cgit v1.2.3