package com.mbien.opencl; import com.sun.gluegen.runtime.BufferFactory; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.util.Arrays; import java.util.Map; import java.util.Random; import org.junit.BeforeClass; import org.junit.Test; import static org.junit.Assert.*; import static java.lang.System.*; /** * Test for testing basic functionality. * @author Michael Bien */ public class JOCLTest { private final static String programSource = " // OpenCL Kernel Function for element by element vector addition \n" + "__kernel void VectorAdd(__global const int* a, __global const int* b, __global int* c, int iNumElements) { \n" + " // get index into global data array \n" + " int iGID = get_global_id(0); \n" + " // bound check (equivalent to the limit on a 'for' loop for standard/serial C code \n" + " if (iGID >= iNumElements) { \n" + " return; \n" + " } \n" + " // add the vector elements \n" + " c[iGID] = a[iGID] + b[iGID]; \n" + "} \n" + "__kernel void Test(__global const int* a, __global const int* b, __global int* c, int iNumElements) { \n" + " // get index into global data array \n" + " int iGID = get_global_id(0); \n" + " // bound check (equivalent to the limit on a 'for' loop for standard/serial C code \n" + " if (iGID >= iNumElements) { \n" + " return; \n" + " } \n" + " c[iGID] = iGID; \n" + "} \n"; public JOCLTest() { } @BeforeClass public static void setUpClass() throws Exception { out.println("OS: " + System.getProperty("os.name")); out.println("VM: " + System.getProperty("java.vm.name")); } @Test public void lowLevelTest1() { out.println(" - - - lowLevelTest; contextless binding - - - "); int ret = CL.CL_SUCCESS; CL cl = CLContext.getLowLevelBinding(); int[] intBuffer = new int[1]; // find all available OpenCL platforms ret = cl.clGetPlatformIDs(0, null, 0, intBuffer, 0); checkForError(ret); out.println("#platforms: "+intBuffer[0]); long[] platformId = new long[intBuffer[0]]; ret = cl.clGetPlatformIDs(platformId.length, platformId, 0, null, 0); checkForError(ret); // print platform info long[] longBuffer = new long[1]; ByteBuffer bb = ByteBuffer.allocate(128); bb.order(ByteOrder.nativeOrder()); for (int i = 0; i < platformId.length; i++) { long platform = platformId[i]; out.println("platform id: "+platform); ret = cl.clGetPlatformInfo(platform, CL.CL_PLATFORM_PROFILE, bb.capacity(), bb, longBuffer, 0); checkForError(ret); out.println(" profile: "+new String(bb.array(), 0, (int)longBuffer[0])); ret = cl.clGetPlatformInfo(platform, CL.CL_PLATFORM_VERSION, bb.capacity(), bb, longBuffer, 0); checkForError(ret); out.println(" version: "+new String(bb.array(), 0, (int)longBuffer[0])); ret = cl.clGetPlatformInfo(platform, CL.CL_PLATFORM_NAME, bb.capacity(), bb, longBuffer, 0); checkForError(ret); out.println(" name: "+new String(bb.array(), 0, (int)longBuffer[0])); ret = cl.clGetPlatformInfo(platform, CL.CL_PLATFORM_VENDOR, bb.capacity(), bb, longBuffer, 0); checkForError(ret); out.println(" vendor: "+new String(bb.array(), 0, (int)longBuffer[0])); //find all devices ret = cl.clGetDeviceIDs(platform, CL.CL_DEVICE_TYPE_ALL, 0, null, 0, intBuffer, 0); checkForError(ret); out.println("#devices: "+intBuffer[0]); long[] devices = new long[intBuffer[0]]; ret = cl.clGetDeviceIDs(platform, CL.CL_DEVICE_TYPE_ALL, devices.length, devices, 0, null, 0); //print device info for (int j = 0; j < devices.length; j++) { long device = devices[j]; ret = cl.clGetDeviceInfo(device, CL.CL_DEVICE_NAME, bb.capacity(), bb, longBuffer, 0); checkForError(ret); out.println(" device: "+new String(bb.array(), 0, (int)longBuffer[0])); ret = cl.clGetDeviceInfo(device, CL.CL_DEVICE_TYPE, bb.capacity(), bb, longBuffer, 0); checkForError(ret); out.println(" type: " + CLDevice.Type.valueOf(bb.get())); bb.rewind(); } } } @Test public void lowLevelTest2() { out.println(" - - - lowLevelTest2; VectorAdd kernel - - - "); // CreateContextCallback cb = new CreateContextCallback() { // @Override // public void createContextCallback(String errinfo, ByteBuffer private_info, long cb, Object user_data) { // throw new RuntimeException("not yet implemented..."); // } // }; long[] longArray = new long[1]; ByteBuffer bb = ByteBuffer.allocate(4096).order(ByteOrder.nativeOrder()); CL cl = CLContext.getLowLevelBinding(); int ret = CL.CL_SUCCESS; int[] intArray = new int[1]; long context = cl.clCreateContextFromType(null, 0, CL.CL_DEVICE_TYPE_ALL, null, null, null, 0); out.println("context handle: "+context); ret = cl.clGetContextInfo(context, CL.CL_CONTEXT_DEVICES, 0, null, longArray, 0); checkError("on clGetContextInfo", ret); int sizeofLong = 8; // TODO sizeof long... out.println("context created with " + longArray[0]/sizeofLong + " devices"); ret = cl.clGetContextInfo(context, CL.CL_CONTEXT_DEVICES, bb.capacity(), bb, null, 0); checkError("on clGetContextInfo", ret); for (int i = 0; i < longArray[0]/sizeofLong; i++) { out.println("device id: "+bb.getLong()); } long firstDeviceID = bb.getLong(0); // Create a command-queue long commandQueue = cl.clCreateCommandQueue(context, firstDeviceID, 0, intArray, 0); checkError("on clCreateCommandQueue", intArray[0]); int elementCount = 11444777; // Length of float arrays to process (odd # for illustration) int localWorkSize = 256; // set and log Global and Local work size dimensions int globalWorkSize = roundUp(localWorkSize, elementCount); // rounded up to the nearest multiple of the LocalWorkSize out.println("allocateing buffers of size: "+globalWorkSize); ByteBuffer srcA = BufferFactory.newDirectByteBuffer(globalWorkSize*BufferFactory.SIZEOF_INT); ByteBuffer srcB = BufferFactory.newDirectByteBuffer(globalWorkSize*BufferFactory.SIZEOF_INT); ByteBuffer dest = BufferFactory.newDirectByteBuffer(globalWorkSize*BufferFactory.SIZEOF_INT); // TODO sizeof int ... // Allocate the OpenCL buffer memory objects for source and result on the device GMEM long devSrcA = cl.clCreateBuffer(context, CL.CL_MEM_READ_ONLY, srcA.capacity(), null, intArray, 0); checkError("on clCreateBuffer", intArray[0]); long devSrcB = cl.clCreateBuffer(context, CL.CL_MEM_READ_ONLY, srcB.capacity(), null, intArray, 0); checkError("on clCreateBuffer", intArray[0]); long devDst = cl.clCreateBuffer(context, CL.CL_MEM_WRITE_ONLY, dest.capacity(), null, intArray, 0); checkError("on clCreateBuffer", intArray[0]); // Create the program long program = cl.clCreateProgramWithSource(context, 1, new String[] {programSource}, new long[]{programSource.length()}, 0, intArray, 0); checkError("on clCreateProgramWithSource", intArray[0]); // Build the program ret = cl.clBuildProgram(program, null, null, null, null); checkError("on clBuildProgram", ret); // Read program infos bb.rewind(); ret = cl.clGetProgramInfo(program, CL.CL_PROGRAM_NUM_DEVICES, bb.capacity(), bb, null, 0); checkError("on clGetProgramInfo1", ret); out.println("program associated with "+bb.getInt(0)+" device(s)"); ret = cl.clGetProgramInfo(program, CL.CL_PROGRAM_SOURCE, 0, bb, longArray, 0); checkError("on clGetProgramInfo CL_PROGRAM_SOURCE", ret); out.println("program source length (cl): "+longArray[0]); out.println("program source length (java): "+programSource.length()); bb.rewind(); ret = cl.clGetProgramInfo(program, CL.CL_PROGRAM_SOURCE, bb.capacity(), bb, null, 0); checkError("on clGetProgramInfo CL_PROGRAM_SOURCE", ret); out.println("program source:\n"+new String(bb.array(), 0, (int)longArray[0])); // Check program status Arrays.fill(longArray, 42); bb.rewind(); ret = cl.clGetProgramBuildInfo(program, firstDeviceID, CL.CL_PROGRAM_BUILD_STATUS, bb.capacity(), bb, null, 0); checkError("on clGetProgramBuildInfo1", ret); out.println("program build status: " + CLProgram.Status.valueOf(bb.getInt(0))); assertEquals("build status", CL.CL_BUILD_SUCCESS, bb.getInt(0)); // Read build log ret = cl.clGetProgramBuildInfo(program, firstDeviceID, CL.CL_PROGRAM_BUILD_LOG, 0, null, longArray, 0); checkError("on clGetProgramBuildInfo2", ret); out.println("program log length: " + longArray[0]); bb.rewind(); ret = cl.clGetProgramBuildInfo(program, firstDeviceID, CL.CL_PROGRAM_BUILD_LOG, bb.capacity(), bb, null, 0); checkError("on clGetProgramBuildInfo3", ret); out.println("log:\n" + new String(bb.array(), 0, (int)longArray[0])); // Create the kernel Arrays.fill(intArray, 42); long kernel = cl.clCreateKernel(program, "VectorAdd", intArray, 0); checkError("on clCreateKernel", intArray[0]); // srcA.limit(elementCount*BufferFactory.SIZEOF_FLOAT); // srcB.limit(elementCount*BufferFactory.SIZEOF_FLOAT); fillBuffer(srcA, 23456); fillBuffer(srcB, 46987); // Set the Argument values ret = cl.clSetKernelArg(kernel, 0, BufferFactory.SIZEOF_LONG, wrap(devSrcA)); checkError("on clSetKernelArg0", ret); ret = cl.clSetKernelArg(kernel, 1, BufferFactory.SIZEOF_LONG, wrap(devSrcB)); checkError("on clSetKernelArg1", ret); ret = cl.clSetKernelArg(kernel, 2, BufferFactory.SIZEOF_LONG, wrap(devDst)); checkError("on clSetKernelArg2", ret); ret = cl.clSetKernelArg(kernel, 3, BufferFactory.SIZEOF_INT, wrap(elementCount)); checkError("on clSetKernelArg3", ret); out.println("used device memory: "+ (srcA.capacity()+srcB.capacity()+dest.capacity())/1000000 +"MB"); // Asynchronous write of data to GPU device ret = cl.clEnqueueWriteBuffer(commandQueue, devSrcA, CL.CL_FALSE, 0, srcA.capacity(), srcA, 0, null, 0, null, 0); checkError("on clEnqueueWriteBuffer", ret); ret = cl.clEnqueueWriteBuffer(commandQueue, devSrcB, CL.CL_FALSE, 0, srcB.capacity(), srcB, 0, null, 0, null, 0); checkError("on clEnqueueWriteBuffer", ret); // Launch kernel ret = cl.clEnqueueNDRangeKernel(commandQueue, kernel, 1, null, 0, new long[]{ globalWorkSize }, 0, new long[]{ localWorkSize }, 0, 0, null, 0, null, 0); checkError("on clEnqueueNDRangeKernel", ret); // Synchronous/blocking read of results ret = cl.clEnqueueReadBuffer(commandQueue, devDst, CL.CL_TRUE, 0, dest.capacity(), dest, 0, null, 0, null, 0); checkError("on clEnqueueReadBuffer", ret); out.println("a+b=c result snapshot: "); for(int i = 0; i < 10; i++) out.print(dest.getInt()+", "); out.println("...; "+dest.remaining()/BufferFactory.SIZEOF_INT + " more"); // cleanup ret = cl.clReleaseCommandQueue(commandQueue); checkError("on clReleaseCommandQueue", ret); ret = cl.clReleaseMemObject(devSrcA); checkError("on clReleaseMemObject", ret); ret = cl.clReleaseMemObject(devSrcB); checkError("on clReleaseMemObject", ret); ret = cl.clReleaseMemObject(devDst); checkError("on clReleaseMemObject", ret); ret = cl.clReleaseProgram(program); checkError("on clReleaseProgram", ret); ret = cl.clReleaseKernel(kernel); checkError("on clReleaseKernel", ret); ret = cl.clUnloadCompiler(); checkError("on clUnloadCompiler", ret); ret = cl.clReleaseContext(context); checkError("on clReleaseContext", ret); } // @Test public void loadTest() { //for memory leak detection; e.g watch out for "out of host memory" errors out.println(" - - - loadTest - - - "); for(int i = 0; i < 100; i++) { out.println("###iteration "+i); lowLevelTest2(); } } private void fillBuffer(ByteBuffer buffer, int seed) { Random rnd = new Random(seed); while(buffer.remaining() != 0) buffer.putInt(rnd.nextInt()); buffer.rewind(); } private ByteBuffer wrap(long value) { return (ByteBuffer) BufferFactory.newDirectByteBuffer(8).putLong(value).rewind(); } @Test public void highLevelTest1() { out.println(" - - - highLevelTest; contextless - - - "); CLPlatform[] clPlatforms = CLContext.listCLPlatforms(); for (CLPlatform platform : clPlatforms) { out.println("platform info:"); out.println(" name: "+platform.getName()); out.println(" profile: "+platform.getProfile()); out.println(" version: "+platform.getVersion()); out.println(" vendor: "+platform.getVendor()); CLDevice[] clDevices = platform.listCLDevices(); for (CLDevice device : clDevices) { out.println("device info:"); out.println(" name: "+device.getName()); out.println(" profile: "+device.getProfile()); out.println(" vendor: "+device.getVendor()); out.println(" type: "+device.getType()); out.println(" global mem: "+device.getGlobalMemSize()/(1024*1024)+" MB"); out.println(" local mem: "+device.getLocalMemSize()/1024+" KB"); out.println(" clock: "+device.getMaxClockFrequency()+" MHz"); out.println(" max work group size: "+device.getMaxWorkGroupSize()); out.println(" max compute units: "+device.getMaxComputeUnits()); out.println(" extensions: "+device.getExtensions()); } } } @Test public void highLevelTest2() { out.println(" - - - highLevelTest - - - "); CLContext context = CLContext.create(); CLDevice[] contextDevices = context.getCLDevices(); out.println("context devices:"); for (CLDevice device : contextDevices) { out.println(" "+device.toString()); } CLProgram program = context.createProgram(programSource).build(); CLDevice[] programDevices = program.getCLDevices(); assertEquals(contextDevices.length, programDevices.length); out.println("program devices:"); for (CLDevice device : programDevices) { out.println(" "+device.toString()); out.println(" build log: "+program.getBuildLog(device)); out.println(" build status: "+program.getBuildStatus(device)); } String source = program.getSource(); assertFalse(source.trim().isEmpty()); // out.println("source:\n"+source); int elementCount = 11444777; // Length of float arrays to process (odd # for illustration) int localWorkSize = 256; // set and log Global and Local work size dimensions int globalWorkSize = roundUp(localWorkSize, elementCount); // rounded up to the nearest multiple of the LocalWorkSize out.println("allocateing buffers of size: "+globalWorkSize); ByteBuffer srcA = BufferFactory.newDirectByteBuffer(globalWorkSize*BufferFactory.SIZEOF_INT); ByteBuffer srcB = BufferFactory.newDirectByteBuffer(globalWorkSize*BufferFactory.SIZEOF_INT); ByteBuffer dest = BufferFactory.newDirectByteBuffer(globalWorkSize*BufferFactory.SIZEOF_INT); fillBuffer(srcA, 23456); fillBuffer(srcB, 46987); CLBuffer clBufferA = context.createBuffer(CL.CL_MEM_READ_ONLY, srcA); CLBuffer clBufferB = context.createBuffer(CL.CL_MEM_READ_ONLY, srcB); CLBuffer clBufferC = context.createBuffer(CL.CL_MEM_WRITE_ONLY, dest); Map kernels = program.getCLKernels(); for (CLKernel kernel : kernels.values()) { out.println("kernel: "+kernel.toString()); } assertNotNull(kernels.get("VectorAdd")); assertNotNull(kernels.get("Test")); CLKernel vectorAddKernel = kernels.get("VectorAdd"); vectorAddKernel.setArg(0, BufferFactory.SIZEOF_LONG, clBufferA) .setArg(1, BufferFactory.SIZEOF_LONG, clBufferB) .setArg(2, BufferFactory.SIZEOF_LONG, clBufferC) .setArg(3, BufferFactory.SIZEOF_INT, elementCount); CLCommandQueue queue = programDevices[0].createCommandQueue(); // Asynchronous write of data to GPU device, blocking read later queue.putWriteBuffer(clBufferA, false) .putWriteBuffer(clBufferB, false) .putNDRangeKernel(vectorAddKernel, 1, new long[]{0}, new long[]{ globalWorkSize }, new long[]{ localWorkSize }) .putReadBuffer(clBufferC, true).release(); out.println("a+b=c result snapshot: "); for(int i = 0; i < 10; i++) out.print(dest.getInt()+", "); out.println("...; "+dest.remaining()/BufferFactory.SIZEOF_INT + " more"); assertTrue(3 == context.getCLBuffers().size()); clBufferA.release(); assertTrue(2 == context.getCLBuffers().size()); assertTrue(2 == context.getCLBuffers().size()); clBufferB.release(); assertTrue(1 == context.getCLBuffers().size()); assertTrue(1 == context.getCLBuffers().size()); clBufferC.release(); assertTrue(0 == context.getCLBuffers().size()); assertTrue(1 == context.getCLPrograms().size()); program.release(); assertTrue(0 == context.getCLPrograms().size()); // CLDevice device = ctx.getMaxFlopsDevice(); // out.println("max FLOPS device: " + device); context.release(); } private final int roundUp(int groupSize, int globalSize) { int r = globalSize % groupSize; if (r == 0) { return globalSize; } else { return globalSize + groupSize - r; } } private final void checkForError(int ret) { this.checkError("", ret); } private final void checkError(String msg, int ret) { if(ret != CL.CL_SUCCESS) throw new CLException(ret, msg); } }