/* * 18:42 Saturday, February 27 2010 */ package com.jogamp.opencl.demos.bitonicsort; import com.jogamp.opencl.CLBuffer; import com.jogamp.opencl.CLCommandQueue; import com.jogamp.opencl.CLContext; import com.jogamp.opencl.CLDevice; import com.jogamp.opencl.CLKernel; import com.jogamp.opencl.CLProgram; import java.io.IOException; import java.nio.IntBuffer; import java.util.Map; import java.util.Random; import static java.lang.System.*; import static com.jogamp.opencl.CLMemory.Mem.*; import static com.jogamp.opencl.CLProgram.*; /** * Bitonic sort optimized for GPUs. * Uses NVIDIA's bitonic merge sort kernel. * @author Michael Bien */ public class BitonicSort { private static final String BITONIC_MERGE_GLOBAL = "bitonicMergeGlobal"; private static final String BITONIC_MERGE_LOCAL = "bitonicMergeLocal"; private static final String BITONIC_SORT_LOCAL = "bitonicSortLocal"; private static final String BITONIC_SORT_LOCAL1 = "bitonicSortLocal1"; private final static int LOCAL_SIZE_LIMIT = 1024; private final Map kernels; public BitonicSort() throws IOException { final int sortDir = 1; final int elements = 1048576; final int maxvalue = 1000000; out.println("Initializing OpenCL..."); //Create the context CLContext context = null; try{ context = CLContext.create(); CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue(); out.println("Initializing OpenCL bitonic sorter..."); kernels = initBitonicSort(queue); out.println("Creating OpenCL memory objects..."); CLBuffer keyBuffer = context.createIntBuffer(elements, READ_ONLY, USE_BUFFER); System.out.println(keyBuffer.getCLSize()/1000000.0f); out.println("Initializing data...\n"); Random random = new Random(); for (int i = 0; i < elements; i++) { int rnd = random.nextInt(maxvalue); keyBuffer.getBuffer().put(i, rnd); } int arrayLength = elements; int batch = elements / arrayLength; out.printf("Test array length %d (%d arrays in the batch)...\n", arrayLength, batch); long time = currentTimeMillis(); bitonicSort(queue, keyBuffer, keyBuffer, batch, arrayLength, sortDir); queue.putReadBuffer(keyBuffer, true); out.println(currentTimeMillis() - time+"ms"); IntBuffer keys = keyBuffer.getBuffer(); printSnapshot(keys, 20); checkIfSorted(keys); out.println("\nTEST PASSED"); }finally{ if(context!=null) { context.release(); } } } private Map initBitonicSort(CLCommandQueue queue) throws IOException { out.println(" creating bitonic sort program"); CLContext context = queue.getContext(); CLProgram program = context.createProgram(getClass().getResourceAsStream("BitonicSort.cl")) .build(define("LOCAL_SIZE_LIMIT", LOCAL_SIZE_LIMIT)); Map kernelMap = program.createCLKernels(); out.println(" checking minimum supported workgroup size"); //Check for work group size CLDevice device = queue.getDevice(); long szBitonicSortLocal = kernelMap.get(BITONIC_SORT_LOCAL).getWorkGroupSize(device); long szBitonicSortLocal1 = kernelMap.get(BITONIC_SORT_LOCAL1).getWorkGroupSize(device); long szBitonicMergeLocal = kernelMap.get(BITONIC_MERGE_LOCAL).getWorkGroupSize(device); if ( (szBitonicSortLocal < (LOCAL_SIZE_LIMIT / 2)) || (szBitonicSortLocal1 < (LOCAL_SIZE_LIMIT / 2)) || (szBitonicMergeLocal < (LOCAL_SIZE_LIMIT / 2)) ) { throw new RuntimeException("Minimum work-group size "+LOCAL_SIZE_LIMIT/2 +" required by this application is not supported on this device."); } return kernelMap; } public void bitonicSort(CLCommandQueue queue, CLBuffer dstKey, CLBuffer srcKey, int batch, int arrayLength, int dir) { if (arrayLength < 2) { throw new IllegalArgumentException("arrayLength was "+arrayLength); } // TODO Only power-of-two array lengths are supported so far dir = (dir != 0) ? 1 : 0; CLKernel sortlocal1 = kernels.get(BITONIC_SORT_LOCAL1); CLKernel sortlocal = kernels.get(BITONIC_SORT_LOCAL); CLKernel mergeGlobal = kernels.get(BITONIC_MERGE_GLOBAL); CLKernel mergeLocal = kernels.get(BITONIC_MERGE_LOCAL); if (arrayLength <= LOCAL_SIZE_LIMIT) { // oclCheckError( (batch * arrayLength) % LOCAL_SIZE_LIMIT == 0, shrTRUE ); //Launch bitonicSortLocal sortlocal.putArgs(dstKey, srcKey) .putArg(arrayLength).putArg(dir).rewind(); int localWorkSize = LOCAL_SIZE_LIMIT / 2; int globalWorkSize = batch * arrayLength / 2; queue.put1DRangeKernel(sortlocal, 0, globalWorkSize, localWorkSize); } else { //Launch bitonicSortLocal1 sortlocal1.setArgs(dstKey, srcKey); int localWorkSize = LOCAL_SIZE_LIMIT / 2; int globalWorkSize = batch * arrayLength / 2; queue.put1DRangeKernel(sortlocal1, 0, globalWorkSize, localWorkSize); for (int size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1) { for (int stride = size / 2; stride > 0; stride >>= 1) { if (stride >= LOCAL_SIZE_LIMIT) { //Launch bitonicMergeGlobal mergeGlobal.putArgs(dstKey, dstKey) .putArg(arrayLength).putArg(size).putArg(stride).putArg(dir).rewind(); globalWorkSize = batch * arrayLength / 2; queue.put1DRangeKernel(mergeGlobal, 0, globalWorkSize, 0); } else { //Launch bitonicMergeLocal mergeLocal.putArgs(dstKey, dstKey) .putArg(arrayLength).putArg(stride).putArg(size).putArg(dir).rewind(); localWorkSize = LOCAL_SIZE_LIMIT / 2; globalWorkSize = batch * arrayLength / 2; queue.put1DRangeKernel(mergeLocal, 0, globalWorkSize, localWorkSize); break; } } } } } private void printSnapshot(IntBuffer buffer, int snapshot) { for(int i = 0; i < snapshot; i++) out.print(buffer.get() + ", "); out.println("...; " + buffer.remaining() + " more"); buffer.rewind(); } private void checkIfSorted(IntBuffer keys) { for (int i = 1; i < keys.capacity(); i++) { if (keys.get(i - 1) > keys.get(i)) { throw new RuntimeException("not sorted "+ keys.get(i - 1) +"!> "+ keys.get(i)); } } } public static void main(String[] args) throws IOException { new BitonicSort(); } }