From 2c85c416d85205ab98b33e1a0b0daab32d4d81ff Mon Sep 17 00:00:00 2001 From: Michael Bien Date: Mon, 12 Apr 2010 22:27:03 +0200 Subject: changes due to package renaming in jocl. --- nbproject/configs/BitonicSort.properties | 2 +- nbproject/configs/GLCLMandel.properties | 2 +- nbproject/configs/GLCLinterop.properties | 2 +- nbproject/configs/Julia3d.properties | 2 +- nbproject/configs/RadixSort.properties | 2 +- nbproject/project.properties | 4 +- .../jogamp/opencl/demos/bitonicsort/BitonicSort.cl | 214 +++++++++ .../opencl/demos/bitonicsort/BitonicSort.java | 201 +++++++++ src/com/jogamp/opencl/demos/fractal/Mandelbrot.cl | 51 +++ .../opencl/demos/fractal/MultiDeviceFractal.java | 485 ++++++++++++++++++++ .../demos/fractal/MultiDeviceFractal.java.orig | 484 ++++++++++++++++++++ .../jogamp/opencl/demos/hellojocl/HelloJOCL.java | 91 ++++ src/com/jogamp/opencl/demos/hellojocl/VectorAdd.cl | 15 + .../joglinterop/GLCLInteroperabilityDemo.java | 277 ++++++++++++ .../jogamp/opencl/demos/joglinterop/JoglInterop.cl | 23 + .../demos/joglinterop/UserSceneInteraction.java | 103 +++++ src/com/jogamp/opencl/demos/julia3d/Julia3d.java | 212 +++++++++ src/com/jogamp/opencl/demos/julia3d/Renderer.java | 203 +++++++++ .../opencl/demos/julia3d/UserSceneController.java | 249 +++++++++++ src/com/jogamp/opencl/demos/julia3d/config.h | 24 + .../opencl/demos/julia3d/mandelbrot_kernel.cl | 357 +++++++++++++++ .../opencl/demos/julia3d/rendering_kernel.cl | 382 ++++++++++++++++ .../opencl/demos/julia3d/structs/Camera.java | 50 +++ .../opencl/demos/julia3d/structs/Camera32.java | 37 ++ .../opencl/demos/julia3d/structs/Camera64.java | 48 ++ .../demos/julia3d/structs/RenderingConfig.java | 78 ++++ .../demos/julia3d/structs/RenderingConfig32.java | 102 +++++ .../demos/julia3d/structs/RenderingConfig64.java | 105 +++++ .../jogamp/opencl/demos/julia3d/structs/Vec.java | 53 +++ .../jogamp/opencl/demos/julia3d/structs/Vec32.java | 44 ++ .../jogamp/opencl/demos/julia3d/structs/Vec64.java | 44 ++ 
src/com/jogamp/opencl/demos/radixsort/RadixSort.cl | 358 +++++++++++++++ .../jogamp/opencl/demos/radixsort/RadixSort.java | 182 ++++++++ .../opencl/demos/radixsort/RadixSortDemo.java | 129 ++++++ src/com/jogamp/opencl/demos/radixsort/Scan.java | 131 ++++++ src/com/jogamp/opencl/demos/radixsort/Scan_b.cl | 190 ++++++++ .../mbien/opencl/demos/bitonicsort/BitonicSort.cl | 214 --------- .../opencl/demos/bitonicsort/BitonicSort.java | 201 --------- src/com/mbien/opencl/demos/fractal/Mandelbrot.cl | 51 --- .../opencl/demos/fractal/MultiDeviceFractal.java | 486 --------------------- .../demos/fractal/MultiDeviceFractal.java.orig | 484 -------------------- .../mbien/opencl/demos/hellojocl/HelloJOCL.java | 91 ---- src/com/mbien/opencl/demos/hellojocl/VectorAdd.cl | 15 - .../joglinterop/GLCLInteroperabilityDemo.java | 277 ------------ .../mbien/opencl/demos/joglinterop/JoglInterop.cl | 23 - .../demos/joglinterop/UserSceneInteraction.java | 103 ----- src/com/mbien/opencl/demos/julia3d/Julia3d.java | 212 --------- src/com/mbien/opencl/demos/julia3d/Renderer.java | 203 --------- .../opencl/demos/julia3d/UserSceneController.java | 249 ----------- src/com/mbien/opencl/demos/julia3d/config.h | 24 - .../opencl/demos/julia3d/mandelbrot_kernel.cl | 357 --------------- .../mbien/opencl/demos/julia3d/rendering_kernel.cl | 382 ---------------- .../mbien/opencl/demos/julia3d/structs/Camera.java | 52 --- .../opencl/demos/julia3d/structs/Camera32.java | 41 -- .../opencl/demos/julia3d/structs/Camera64.java | 52 --- .../demos/julia3d/structs/RenderingConfig.java | 80 ---- .../demos/julia3d/structs/RenderingConfig32.java | 106 ----- .../demos/julia3d/structs/RenderingConfig64.java | 109 ----- .../mbien/opencl/demos/julia3d/structs/Vec.java | 53 --- .../mbien/opencl/demos/julia3d/structs/Vec32.java | 48 -- .../mbien/opencl/demos/julia3d/structs/Vec64.java | 48 -- src/com/mbien/opencl/demos/radixsort/RadixSort.cl | 358 --------------- .../mbien/opencl/demos/radixsort/RadixSort.java | 183 
-------- .../opencl/demos/radixsort/RadixSortDemo.java | 129 ------ src/com/mbien/opencl/demos/radixsort/Scan.java | 131 ------ src/com/mbien/opencl/demos/radixsort/Scan_b.cl | 190 -------- 66 files changed, 4929 insertions(+), 4959 deletions(-) create mode 100644 src/com/jogamp/opencl/demos/bitonicsort/BitonicSort.cl create mode 100644 src/com/jogamp/opencl/demos/bitonicsort/BitonicSort.java create mode 100644 src/com/jogamp/opencl/demos/fractal/Mandelbrot.cl create mode 100644 src/com/jogamp/opencl/demos/fractal/MultiDeviceFractal.java create mode 100644 src/com/jogamp/opencl/demos/fractal/MultiDeviceFractal.java.orig create mode 100644 src/com/jogamp/opencl/demos/hellojocl/HelloJOCL.java create mode 100644 src/com/jogamp/opencl/demos/hellojocl/VectorAdd.cl create mode 100644 src/com/jogamp/opencl/demos/joglinterop/GLCLInteroperabilityDemo.java create mode 100644 src/com/jogamp/opencl/demos/joglinterop/JoglInterop.cl create mode 100644 src/com/jogamp/opencl/demos/joglinterop/UserSceneInteraction.java create mode 100644 src/com/jogamp/opencl/demos/julia3d/Julia3d.java create mode 100644 src/com/jogamp/opencl/demos/julia3d/Renderer.java create mode 100644 src/com/jogamp/opencl/demos/julia3d/UserSceneController.java create mode 100644 src/com/jogamp/opencl/demos/julia3d/config.h create mode 100644 src/com/jogamp/opencl/demos/julia3d/mandelbrot_kernel.cl create mode 100644 src/com/jogamp/opencl/demos/julia3d/rendering_kernel.cl create mode 100644 src/com/jogamp/opencl/demos/julia3d/structs/Camera.java create mode 100644 src/com/jogamp/opencl/demos/julia3d/structs/Camera32.java create mode 100644 src/com/jogamp/opencl/demos/julia3d/structs/Camera64.java create mode 100644 src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig.java create mode 100644 src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig32.java create mode 100644 src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig64.java create mode 100644 
src/com/jogamp/opencl/demos/julia3d/structs/Vec.java create mode 100644 src/com/jogamp/opencl/demos/julia3d/structs/Vec32.java create mode 100644 src/com/jogamp/opencl/demos/julia3d/structs/Vec64.java create mode 100644 src/com/jogamp/opencl/demos/radixsort/RadixSort.cl create mode 100644 src/com/jogamp/opencl/demos/radixsort/RadixSort.java create mode 100644 src/com/jogamp/opencl/demos/radixsort/RadixSortDemo.java create mode 100644 src/com/jogamp/opencl/demos/radixsort/Scan.java create mode 100644 src/com/jogamp/opencl/demos/radixsort/Scan_b.cl delete mode 100644 src/com/mbien/opencl/demos/bitonicsort/BitonicSort.cl delete mode 100644 src/com/mbien/opencl/demos/bitonicsort/BitonicSort.java delete mode 100644 src/com/mbien/opencl/demos/fractal/Mandelbrot.cl delete mode 100644 src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java delete mode 100644 src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java.orig delete mode 100644 src/com/mbien/opencl/demos/hellojocl/HelloJOCL.java delete mode 100644 src/com/mbien/opencl/demos/hellojocl/VectorAdd.cl delete mode 100644 src/com/mbien/opencl/demos/joglinterop/GLCLInteroperabilityDemo.java delete mode 100644 src/com/mbien/opencl/demos/joglinterop/JoglInterop.cl delete mode 100644 src/com/mbien/opencl/demos/joglinterop/UserSceneInteraction.java delete mode 100644 src/com/mbien/opencl/demos/julia3d/Julia3d.java delete mode 100644 src/com/mbien/opencl/demos/julia3d/Renderer.java delete mode 100644 src/com/mbien/opencl/demos/julia3d/UserSceneController.java delete mode 100644 src/com/mbien/opencl/demos/julia3d/config.h delete mode 100644 src/com/mbien/opencl/demos/julia3d/mandelbrot_kernel.cl delete mode 100644 src/com/mbien/opencl/demos/julia3d/rendering_kernel.cl delete mode 100644 src/com/mbien/opencl/demos/julia3d/structs/Camera.java delete mode 100644 src/com/mbien/opencl/demos/julia3d/structs/Camera32.java delete mode 100644 src/com/mbien/opencl/demos/julia3d/structs/Camera64.java delete mode 100644 
src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig.java delete mode 100644 src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig32.java delete mode 100644 src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig64.java delete mode 100644 src/com/mbien/opencl/demos/julia3d/structs/Vec.java delete mode 100644 src/com/mbien/opencl/demos/julia3d/structs/Vec32.java delete mode 100644 src/com/mbien/opencl/demos/julia3d/structs/Vec64.java delete mode 100644 src/com/mbien/opencl/demos/radixsort/RadixSort.cl delete mode 100644 src/com/mbien/opencl/demos/radixsort/RadixSort.java delete mode 100644 src/com/mbien/opencl/demos/radixsort/RadixSortDemo.java delete mode 100644 src/com/mbien/opencl/demos/radixsort/Scan.java delete mode 100644 src/com/mbien/opencl/demos/radixsort/Scan_b.cl diff --git a/nbproject/configs/BitonicSort.properties b/nbproject/configs/BitonicSort.properties index afddef4..87fc951 100644 --- a/nbproject/configs/BitonicSort.properties +++ b/nbproject/configs/BitonicSort.properties @@ -1 +1 @@ -main.class=com.mbien.opencl.demos.bitonicsort.BitonicSort +main.class=com.jogamp.opencl.demos.bitonicsort.BitonicSort diff --git a/nbproject/configs/GLCLMandel.properties b/nbproject/configs/GLCLMandel.properties index fac72c7..fab0042 100644 --- a/nbproject/configs/GLCLMandel.properties +++ b/nbproject/configs/GLCLMandel.properties @@ -1 +1 @@ -main.class=com.mbien.opencl.demos.fractal.MultiDeviceFractal +main.class=com.jogamp.opencl.demos.fractal.MultiDeviceFractal diff --git a/nbproject/configs/GLCLinterop.properties b/nbproject/configs/GLCLinterop.properties index 710c784..eff8ca4 100644 --- a/nbproject/configs/GLCLinterop.properties +++ b/nbproject/configs/GLCLinterop.properties @@ -1 +1 @@ -main.class=com.mbien.opencl.demos.joglinterop.GLCLInteroperabilityDemo +main.class=com.jogamp.opencl.demos.joglinterop.GLCLInteroperabilityDemo diff --git a/nbproject/configs/Julia3d.properties b/nbproject/configs/Julia3d.properties index 3fa79e8..e11756d 
100644 --- a/nbproject/configs/Julia3d.properties +++ b/nbproject/configs/Julia3d.properties @@ -1 +1 @@ -main.class=com.mbien.opencl.demos.julia3d.Julia3d +main.class=com.jogamp.opencl.demos.julia3d.Julia3d diff --git a/nbproject/configs/RadixSort.properties b/nbproject/configs/RadixSort.properties index 41e68f5..d3c5458 100644 --- a/nbproject/configs/RadixSort.properties +++ b/nbproject/configs/RadixSort.properties @@ -1 +1 @@ -main.class=com.mbien.opencl.demos.radixsort.RadixSortDemo +main.class=com.jogamp.opencl.demos.radixsort.RadixSortDemo diff --git a/nbproject/project.properties b/nbproject/project.properties index a21489a..c1425c6 100644 --- a/nbproject/project.properties +++ b/nbproject/project.properties @@ -1,5 +1,5 @@ application.title=JOCL Demos -application.vendor=mbien +application.vendor=jogamp build.classes.dir=${build.dir}/classes build.classes.excludes=**/*.java,**/*.form # This directory is removed when the project is cleaned: @@ -52,7 +52,7 @@ javadoc.use=true javadoc.version=false javadoc.windowtitle= jaxbwiz.endorsed.dirs="${netbeans.home}/../ide12/modules/ext/jaxb/api" -main.class=com.mbien.opencl.demos.hellojocl.HelloJOCL +main.class=com.jogamp.opencl.demos.hellojocl.HelloJOCL manifest.file=manifest.mf meta.inf.dir=${src.dir}/META-INF no.dependencies=true diff --git a/src/com/jogamp/opencl/demos/bitonicsort/BitonicSort.cl b/src/com/jogamp/opencl/demos/bitonicsort/BitonicSort.cl new file mode 100644 index 0000000..a8d0e1d --- /dev/null +++ b/src/com/jogamp/opencl/demos/bitonicsort/BitonicSort.cl @@ -0,0 +1,214 @@ +/* + * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. + * + * NVIDIA Corporation and its licensors retain all intellectual property and + * proprietary rights in and to this software and related documentation. + * Any use, reproduction, disclosure, or distribution of this software + * and related documentation without an express license agreement from + * NVIDIA Corporation is strictly prohibited. 
+ * + * Please refer to the applicable NVIDIA end user license agreement (EULA) + * associated with this source code for terms and conditions that govern + * your use of this NVIDIA software. + * + */ + + + +//Passed down by clBuildProgram +//#define LOCAL_SIZE_LIMIT 1024 + + + +inline void ComparatorPrivate( + uint *keyA, + uint *keyB, + uint arrowDir +){ + if( (*keyA > *keyB) == arrowDir ){ + uint t; + t = *keyA; *keyA = *keyB; *keyB = t; + } +} + +inline void ComparatorLocal( + __local uint *keyA, + __local uint *keyB, + uint arrowDir +){ + if( (*keyA > *keyB) == arrowDir ){ + uint t; + t = *keyA; *keyA = *keyB; *keyB = t; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Monolithic bitonic sort kernel for short arrays fitting into local memory +//////////////////////////////////////////////////////////////////////////////// +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1))) +void bitonicSortLocal( + __global uint *d_DstKey, + __global uint *d_SrcKey, + uint arrayLength, + uint sortDir +){ + __local uint l_key[LOCAL_SIZE_LIMIT]; + + //Offset to the beginning of subbatch and load data + d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; + l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; + + for(uint size = 2; size < arrayLength; size <<= 1){ + //Bitonic merge + uint dir = ( (get_local_id(0) & (size / 2)) != 0 ); + for(uint stride = size / 2; stride > 0; stride >>= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal( + &l_key[pos + 0], + &l_key[pos + stride], + dir + ); + } + } + + //dir == sortDir for the last bitonic merge step + { + for(uint stride = arrayLength / 2; stride > 0; stride >>= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - 
(get_local_id(0) & (stride - 1)); + ComparatorLocal( + &l_key[pos + 0], + &l_key[pos + stride], + sortDir + ); + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + d_DstKey[ 0] = l_key[get_local_id(0) + 0]; + d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; +} + +//////////////////////////////////////////////////////////////////////////////// +// Bitonic sort kernel for large arrays (not fitting into local memory) +//////////////////////////////////////////////////////////////////////////////// +//Bottom-level bitonic sort +//Almost the same as bitonicSortLocal with the only exception +//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being +//sorted in opposite directions +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1))) +void bitonicSortLocal1( + __global uint *d_DstKey, + __global uint *d_SrcKey +){ + __local uint l_key[LOCAL_SIZE_LIMIT]; + + //Offset to the beginning of subarray and load data + d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; + l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; + + uint comparatorI = get_global_id(0) & ((LOCAL_SIZE_LIMIT / 2) - 1); + + for(uint size = 2; size < LOCAL_SIZE_LIMIT; size <<= 1){ + //Bitonic merge + uint dir = (comparatorI & (size / 2)) != 0; + for(uint stride = size / 2; stride > 0; stride >>= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal( + &l_key[pos + 0], + &l_key[pos + stride], + dir + ); + } + } + + //Odd / even arrays of LOCAL_SIZE_LIMIT elements + //sorted in opposite directions + { + uint dir = (get_group_id(0) & 1); + for(uint stride = LOCAL_SIZE_LIMIT / 2; stride > 0; stride >>= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal( + &l_key[pos 
+ 0], + &l_key[pos + stride], + dir + ); + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + d_DstKey[ 0] = l_key[get_local_id(0) + 0]; + d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; +} + +//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT +__kernel void bitonicMergeGlobal( + __global uint *d_DstKey, + __global uint *d_SrcKey, + uint arrayLength, + uint size, + uint stride, + uint sortDir +){ + uint global_comparatorI = get_global_id(0); + uint comparatorI = global_comparatorI & (arrayLength / 2 - 1); + + //Bitonic merge + uint dir = sortDir ^ ( (comparatorI & (size / 2)) != 0 ); + uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1)); + + uint keyA = d_SrcKey[pos + 0]; + uint keyB = d_SrcKey[pos + stride]; + + ComparatorPrivate( + &keyA, + &keyB, + dir + ); + + d_DstKey[pos + 0] = keyA; + d_DstKey[pos + stride] = keyB; +} + +//Combined bitonic merge steps for +//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2] +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1))) +void bitonicMergeLocal( + __global uint *d_DstKey, + __global uint *d_SrcKey, + uint arrayLength, + uint stride, + uint size, + uint sortDir +){ + __local uint l_key[LOCAL_SIZE_LIMIT]; + + d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); + l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; + l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; + + //Bitonic merge + uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1); + uint dir = sortDir ^ ( (comparatorI & (size / 2)) != 0 ); + for(; stride > 0; stride >>= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); + ComparatorLocal( + &l_key[pos + 0], + &l_key[pos + stride], + dir + ); + } + + barrier(CLK_LOCAL_MEM_FENCE); + d_DstKey[ 0] = l_key[get_local_id(0) + 0]; + d_DstKey[(LOCAL_SIZE_LIMIT 
/ 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; +} diff --git a/src/com/jogamp/opencl/demos/bitonicsort/BitonicSort.java b/src/com/jogamp/opencl/demos/bitonicsort/BitonicSort.java new file mode 100644 index 0000000..3d954f2 --- /dev/null +++ b/src/com/jogamp/opencl/demos/bitonicsort/BitonicSort.java @@ -0,0 +1,201 @@ +/* + * 18:42 Saturday, February 27 2010 + */ +package com.jogamp.opencl.demos.bitonicsort; + +import com.jogamp.opencl.CLBuffer; +import com.jogamp.opencl.CLCommandQueue; +import com.jogamp.opencl.CLContext; +import com.jogamp.opencl.CLDevice; +import com.jogamp.opencl.CLKernel; +import com.jogamp.opencl.CLProgram; +import java.io.IOException; +import java.nio.IntBuffer; +import java.util.Map; +import java.util.Random; + +import static java.lang.System.*; +import static com.jogamp.opencl.CLMemory.Mem.*; +import static com.jogamp.opencl.CLProgram.*; + +/** + * Bitonic sort optimized for GPUs. + * Uses NVIDIA's bitonic merge sort kernel. + * @author Michael Bien + */ +public class BitonicSort { + + private static final String BITONIC_MERGE_GLOBAL = "bitonicMergeGlobal"; + private static final String BITONIC_MERGE_LOCAL = "bitonicMergeLocal"; + private static final String BITONIC_SORT_LOCAL = "bitonicSortLocal"; + private static final String BITONIC_SORT_LOCAL1 = "bitonicSortLocal1"; + + private final static int LOCAL_SIZE_LIMIT = 1024; + private final Map kernels; + + public BitonicSort() throws IOException { + + final int sortDir = 1; + final int elements = 1048576; + final int maxvalue = 1000000; + + out.println("Initializing OpenCL..."); + + //Create the context + CLContext context = null; + + try{ + + context = CLContext.create(); + CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue(); + + out.println("Initializing OpenCL bitonic sorter..."); + kernels = initBitonicSort(queue); + + out.println("Creating OpenCL memory objects..."); + CLBuffer keyBuffer = context.createIntBuffer(elements, READ_ONLY, USE_BUFFER); + 
System.out.println(keyBuffer.getCLSize()/1000000.0f); + + out.println("Initializing data...\n"); + Random random = new Random(); + for (int i = 0; i < elements; i++) { + int rnd = random.nextInt(maxvalue); + keyBuffer.getBuffer().put(i, rnd); + } + + int arrayLength = elements; + int batch = elements / arrayLength; + + out.printf("Test array length %d (%d arrays in the batch)...\n", arrayLength, batch); + + long time = currentTimeMillis(); + + bitonicSort(queue, keyBuffer, keyBuffer, batch, arrayLength, sortDir); + queue.putReadBuffer(keyBuffer, true); + + out.println(currentTimeMillis() - time+"ms"); + + IntBuffer keys = keyBuffer.getBuffer(); + printSnapshot(keys, 20); + checkIfSorted(keys); + + out.println("\nTEST PASSED"); + + }finally{ + if(context!=null) { + context.release(); + } + } + + } + + private Map initBitonicSort(CLCommandQueue queue) throws IOException { + + out.println(" creating bitonic sort program"); + + CLContext context = queue.getContext(); + + CLProgram program = context.createProgram(getClass().getResourceAsStream("BitonicSort.cl")) + .build(define("LOCAL_SIZE_LIMIT", LOCAL_SIZE_LIMIT)); + + Map kernelMap = program.createCLKernels(); + + out.println(" checking minimum supported workgroup size"); + //Check for work group size + CLDevice device = queue.getDevice(); + long szBitonicSortLocal = kernelMap.get(BITONIC_SORT_LOCAL).getWorkGroupSize(device); + long szBitonicSortLocal1 = kernelMap.get(BITONIC_SORT_LOCAL1).getWorkGroupSize(device); + long szBitonicMergeLocal = kernelMap.get(BITONIC_MERGE_LOCAL).getWorkGroupSize(device); + + if ( (szBitonicSortLocal < (LOCAL_SIZE_LIMIT / 2)) + || (szBitonicSortLocal1 < (LOCAL_SIZE_LIMIT / 2)) + || (szBitonicMergeLocal < (LOCAL_SIZE_LIMIT / 2)) ) { + throw new RuntimeException("Minimum work-group size "+LOCAL_SIZE_LIMIT/2 + +" required by this application is not supported on this device."); + } + + return kernelMap; + + } + + public void bitonicSort(CLCommandQueue queue, CLBuffer dstKey, CLBuffer 
srcKey, int batch, int arrayLength, int dir) { + + if (arrayLength < 2) { + throw new IllegalArgumentException("arrayLength was "+arrayLength); + } + + // TODO Only power-of-two array lengths are supported so far + + dir = (dir != 0) ? 1 : 0; + + CLKernel sortlocal1 = kernels.get(BITONIC_SORT_LOCAL1); + CLKernel sortlocal = kernels.get(BITONIC_SORT_LOCAL); + CLKernel mergeGlobal = kernels.get(BITONIC_MERGE_GLOBAL); + CLKernel mergeLocal = kernels.get(BITONIC_MERGE_LOCAL); + + if (arrayLength <= LOCAL_SIZE_LIMIT) { + + // oclCheckError( (batch * arrayLength) % LOCAL_SIZE_LIMIT == 0, shrTRUE ); + + //Launch bitonicSortLocal + sortlocal.putArgs(dstKey, srcKey) + .putArg(arrayLength).putArg(dir).rewind(); + + int localWorkSize = LOCAL_SIZE_LIMIT / 2; + int globalWorkSize = batch * arrayLength / 2; + queue.put1DRangeKernel(sortlocal, 0, globalWorkSize, localWorkSize); + + } else { + + //Launch bitonicSortLocal1 + sortlocal1.setArgs(dstKey, srcKey); + + int localWorkSize = LOCAL_SIZE_LIMIT / 2; + int globalWorkSize = batch * arrayLength / 2; + + queue.put1DRangeKernel(sortlocal1, 0, globalWorkSize, localWorkSize); + + for (int size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1) { + for (int stride = size / 2; stride > 0; stride >>= 1) { + if (stride >= LOCAL_SIZE_LIMIT) { + //Launch bitonicMergeGlobal + mergeGlobal.putArgs(dstKey, dstKey) + .putArg(arrayLength).putArg(size).putArg(stride).putArg(dir).rewind(); + + globalWorkSize = batch * arrayLength / 2; + queue.put1DRangeKernel(mergeGlobal, 0, globalWorkSize, 0); + } else { + //Launch bitonicMergeLocal + mergeLocal.putArgs(dstKey, dstKey) + .putArg(arrayLength).putArg(stride).putArg(size).putArg(dir).rewind(); + + localWorkSize = LOCAL_SIZE_LIMIT / 2; + globalWorkSize = batch * arrayLength / 2; + + queue.put1DRangeKernel(mergeLocal, 0, globalWorkSize, localWorkSize); + break; + } + } + } + } + } + + private void printSnapshot(IntBuffer buffer, int snapshot) { + for(int i = 0; i < snapshot; i++) + 
out.print(buffer.get() + ", "); + out.println("...; " + buffer.remaining() + " more"); + buffer.rewind(); + } + + private void checkIfSorted(IntBuffer keys) { + for (int i = 1; i < keys.capacity(); i++) { + if (keys.get(i - 1) > keys.get(i)) { + throw new RuntimeException("not sorted "+ keys.get(i - 1) +"!> "+ keys.get(i)); + } + } + } + + public static void main(String[] args) throws IOException { + new BitonicSort(); + } +} diff --git a/src/com/jogamp/opencl/demos/fractal/Mandelbrot.cl b/src/com/jogamp/opencl/demos/fractal/Mandelbrot.cl new file mode 100644 index 0000000..640c775 --- /dev/null +++ b/src/com/jogamp/opencl/demos/fractal/Mandelbrot.cl @@ -0,0 +1,51 @@ +#ifdef DOUBLE_FP + #pragma OPENCL EXTENSION cl_khr_fp64 : enable + typedef double varfloat; +#else + typedef float varfloat; +#endif + +/** + * For a description of this algorithm please refer to + * http://en.wikipedia.org/wiki/Mandelbrot_set + * @author Michael Bien + */ +kernel void mandelbrot( + const int width, const int height, + const varfloat x0, const varfloat y0, + const varfloat rangeX, const varfloat rangeY, + global uint *output, global uint *colorMap, + const int colorMapSize, const int maxIterations) { + + unsigned int ix = get_global_id(0); + unsigned int iy = get_global_id(1); + + varfloat r = x0 + ix * rangeX / width; + varfloat i = y0 + iy * rangeY / height; + + varfloat x = 0; + varfloat y = 0; + + varfloat magnitudeSquared = 0; + int iteration = 0; + + while (magnitudeSquared < 4 && iteration < maxIterations) { + varfloat x2 = x*x; + varfloat y2 = y*y; + y = 2 * x * y + i; + x = x2 - y2 + r; + magnitudeSquared = x2+y2; + iteration++; + } + + if (iteration == maxIterations) { + output[iy * width + ix] = 0; + }else { + varfloat alpha = (varfloat)iteration / maxIterations; + int colorIndex = (int)(alpha * colorMapSize); + output[iy * width + ix] = colorMap[colorIndex]; + // monochrom + // output[iy * width + ix] = 255*iteration/maxIterations; + } + +} \ No newline at end of file diff 
--git a/src/com/jogamp/opencl/demos/fractal/MultiDeviceFractal.java b/src/com/jogamp/opencl/demos/fractal/MultiDeviceFractal.java new file mode 100644 index 0000000..26770b6 --- /dev/null +++ b/src/com/jogamp/opencl/demos/fractal/MultiDeviceFractal.java @@ -0,0 +1,485 @@ +package com.jogamp.opencl.demos.fractal; + +import com.jogamp.opencl.CLBuffer; +import com.jogamp.opencl.CLCommandQueue; +import com.jogamp.opencl.CLDevice; +import com.jogamp.opencl.CLEvent; +import com.jogamp.opencl.CLEventList; +import com.jogamp.opencl.CLException; +import com.jogamp.opencl.gl.CLGLBuffer; +import com.jogamp.opencl.gl.CLGLContext; +import com.jogamp.opencl.CLKernel; +import com.jogamp.opencl.CLPlatform; +import com.jogamp.opencl.CLProgram; +import com.jogamp.opencl.CLProgram.CompilerOptions; +import com.jogamp.opengl.util.awt.TextRenderer; +import java.awt.Color; +import java.awt.Dimension; +import java.awt.Font; +import java.awt.Point; +import java.awt.event.KeyAdapter; +import java.awt.event.KeyEvent; +import java.awt.event.MouseAdapter; +import java.awt.event.MouseEvent; +import java.awt.event.MouseWheelEvent; +import java.io.IOException; +import java.nio.IntBuffer; +import java.util.logging.Level; +import java.util.logging.Logger; +import javax.media.opengl.DebugGL2; +import javax.media.opengl.GL; +import javax.media.opengl.GL2; +import javax.media.opengl.GLAutoDrawable; +import javax.media.opengl.GLCapabilities; +import javax.media.opengl.GLContext; +import javax.media.opengl.GLEventListener; +import javax.media.opengl.GLProfile; +import javax.media.opengl.awt.GLCanvas; +import javax.swing.JFrame; +import javax.swing.SwingUtilities; + +import static com.jogamp.common.nio.Buffers.*; +import static javax.media.opengl.GL2.*; +import static com.jogamp.opencl.CLMemory.Mem.*; +import static com.jogamp.opencl.CLEvent.ProfilingCommand.*; +import static com.jogamp.opencl.CLCommandQueue.Mode.*; +import static java.lang.Math.*; + +/** + * Computes the Mandelbrot set with OpenCL using 
multiple GPUs and renders the result with OpenGL. + * A shared PBO is used as storage for the fractal image.
+ * http://en.wikipedia.org/wiki/Mandelbrot_set + *

+ * controls:
+ * keys 1-9 control parallelism level
+ * space enables/disables slice separator
+ * 'd' toggles between 32/64-bit floating-point precision
+ * mouse/mousewheel to drag and zoom
+ *

+ * @author Michael Bien + */ +public class MultiDeviceFractal implements GLEventListener { + + // max number of used GPUs + private static final int MAX_PARRALLELISM_LEVEL = 8; + + // max per pixel iterations to compute the fractal + private static final int MAX_ITERATIONS = 500; + + private GLCanvas canvas; + + private CLGLContext clContext; + private CLCommandQueue[] queues; + private CLKernel[] kernels; + private CLProgram program; + private CLEventList probes; + private CLGLBuffer[] pboBuffers; + private CLBuffer[] colorMap; + + private int width = 0; + private int height = 0; + + private double minX = -2f; + private double minY = -1.2f; + private double maxX = 0.6f; + private double maxY = 1.3f; + + private int slices; + + private boolean drawSeperator; + private boolean doublePrecision; + private boolean buffersInitialized; + private boolean rebuild; + + private final TextRenderer textRenderer; + + public MultiDeviceFractal(int width, int height) { + + this.width = width; + this.height = height; + + canvas = new GLCanvas(new GLCapabilities(GLProfile.get(GLProfile.GL2))); + canvas.addGLEventListener(this); + initSceneInteraction(); + + JFrame frame = new JFrame("JOCL Multi GPU Mandelbrot Set"); + frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + canvas.setPreferredSize(new Dimension(width, height)); + frame.add(canvas); + frame.pack(); + + frame.setVisible(true); + + textRenderer = new TextRenderer(frame.getFont().deriveFont(Font.BOLD, 14), true, true, null, false); + } + + public void init(GLAutoDrawable drawable) { + + if(clContext == null) { + // enable GL error checking using the composable pipeline + drawable.setGL(new DebugGL2(drawable.getGL().getGL2())); + + drawable.getGL().glFinish(); + initCL(drawable.getContext()); + + GL2 gl = drawable.getGL().getGL2(); + + gl.setSwapInterval(0); + gl.glDisable(GL_DEPTH_TEST); + gl.glClearColor(0.0f, 0.0f, 0.0f, 1.0f); + + initView(gl, drawable.getWidth(), drawable.getHeight()); + + initPBO(gl); + 
drawable.getGL().glFinish(); + + setKernelConstants(); + } + } + + private void initCL(GLContext glCtx){ + try { + // create context managing all available GPUs +// clContext = CLGLContext.create(glCtx, GPU); + clContext = CLGLContext.create(glCtx, CLPlatform.getDefault().listCLDevices()[0]); + + + CLDevice[] devices = clContext.getDevices(); + + slices = min(devices.length, MAX_PARRALLELISM_LEVEL); + + // create command queues for every GPU, setup colormap and init kernels + queues = new CLCommandQueue[slices]; + kernels = new CLKernel[slices]; + probes = new CLEventList(slices); + colorMap = new CLBuffer[slices]; + + for (int i = 0; i < slices; i++) { + + colorMap[i] = clContext.createIntBuffer(32*2, READ_ONLY); + initColorMap(colorMap[i].getBuffer(), 32, Color.BLUE, Color.GREEN, Color.RED); + + // create command queue and upload color map buffer on each used device + queues[i] = devices[i].createCommandQueue(PROFILING_MODE).putWriteBuffer(colorMap[i], true); // blocking upload + + } + + // load and build program + program = clContext.createProgram(getClass().getResourceAsStream("Mandelbrot.cl")); + buildProgram(); + + } catch (IOException ex) { + Logger.getLogger(getClass().getName()).log(Level.SEVERE, "can not find 'Mandelbrot.cl' in classpath.", ex); + } catch (CLException ex) { + Logger.getLogger(getClass().getName()).log(Level.SEVERE, "something went wrong, hopefully no one got hurt", ex); + } + + } + + private void initColorMap(IntBuffer colorMap, int stepSize, Color... 
colors) { + + for (int n = 0; n < colors.length - 1; n++) { + + Color color = colors[n]; + int r0 = color.getRed(); + int g0 = color.getGreen(); + int b0 = color.getBlue(); + + color = colors[n + 1]; + int r1 = color.getRed(); + int g1 = color.getGreen(); + int b1 = color.getBlue(); + + int deltaR = r1 - r0; + int deltaG = g1 - g0; + int deltaB = b1 - b0; + + for (int step = 0; step < stepSize; step++) { + float alpha = (float) step / (stepSize - 1); + int r = (int) (r0 + alpha * deltaR); + int g = (int) (g0 + alpha * deltaG); + int b = (int) (b0 + alpha * deltaB); + colorMap.put((r << 16) | (g << 8) | (b << 0)); + } + } + colorMap.rewind(); + + } + + private void initView(GL2 gl, int width, int height) { + + gl.glViewport(0, 0, width, height); + + gl.glMatrixMode(GL_MODELVIEW); + gl.glLoadIdentity(); + + gl.glMatrixMode(GL_PROJECTION); + gl.glLoadIdentity(); + gl.glOrtho(0.0, width, 0.0, height, 0.0, 1.0); + } + + @SuppressWarnings("unchecked") + private void initPBO(GL gl) { + + if(pboBuffers != null) { + int[] oldPbos = new int[pboBuffers.length]; + for (int i = 0; i < pboBuffers.length; i++) { + CLGLBuffer buffer = pboBuffers[i]; + oldPbos[i] = buffer.GLID; + buffer.release(); + } + gl.glDeleteBuffers(oldPbos.length, oldPbos, 0); + } + + pboBuffers = new CLGLBuffer[slices]; + + int[] pbo = new int[slices]; + gl.glGenBuffers(slices, pbo, 0); + + // setup one empty PBO per slice + for (int i = 0; i < slices; i++) { + + gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo[i]); + gl.glBufferData(GL_PIXEL_UNPACK_BUFFER, width*height * SIZEOF_INT / slices, null, GL_STREAM_DRAW); + gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + + pboBuffers[i] = clContext.createFromGLBuffer(pbo[i], WRITE_ONLY); + + } + + buffersInitialized = true; + } + + private void buildProgram() { + + /* + * workaround: The driver keeps using the old binaries for some reason. + * to solve this we simple create a new program and release the old. 
+ * however rebuilding programs should be possible -> remove when drivers are fixed. + */ + if(program != null && rebuild) { + String source = program.getSource(); + program.release(); + program = clContext.createProgram(source); + } + + // disable 64bit floating point math if not available + if(doublePrecision) { + for (CLDevice device : program.getCLDevices()) { + if(!device.isDoubleFPAvailable()) { + doublePrecision = false; + break; + } + } + } + + if(doublePrecision) { + program.build(CompilerOptions.FAST_RELAXED_MATH, "-D DOUBLE_FP"); + }else{ + program.build(CompilerOptions.FAST_RELAXED_MATH); + } + rebuild = false; + + for (int i = 0; i < kernels.length; i++) { + // init kernel with constants + kernels[i] = program.createCLKernel("mandelbrot"); + } + + } + + // init kernels with constants + private void setKernelConstants() { + for (int i = 0; i < slices; i++) { + kernels[i].setForce32BitArgs(!doublePrecision) + .setArg(6, pboBuffers[i]) + .setArg(7, colorMap[i]) + .setArg(8, colorMap[i].getBuffer().capacity()) + .setArg(9, MAX_ITERATIONS); + } + } + + // rendering cycle + public void display(GLAutoDrawable drawable) { + GL gl = drawable.getGL(); + + // make sure GL does not use our objects before we start computeing + gl.glFinish(); + if(!buffersInitialized) { + initPBO(gl); + setKernelConstants(); + } + if(rebuild) { + buildProgram(); + setKernelConstants(); + } + compute(); + + render(gl.getGL2()); + } + + // OpenCL + private void compute() { + + int sliceWidth = width / slices; + double rangeX = (maxX - minX) / slices; + double rangeY = (maxY - minY); + + // release all old events, you can't reuse events in OpenCL + probes.release(); + + // start computation + for (int i = 0; i < slices; i++) { + + kernels[i].putArg( sliceWidth).putArg(height) + .putArg(minX + rangeX*i).putArg( minY) + .putArg( rangeX ).putArg(rangeY) + .rewind(); + + // aquire GL objects, and enqueue a kernel with a probe from the list + queues[i].putAcquireGLObject(pboBuffers[i].ID) + 
.put2DRangeKernel(kernels[i], 0, 0, sliceWidth, height, 0, 0, probes) + .putReleaseGLObject(pboBuffers[i].ID); + + } + + // block until done (important: finish before doing further gl work) + for (int i = 0; i < slices; i++) { + queues[i].finish(); + } + + } + + // OpenGL + private void render(GL2 gl) { + + gl.glClear(GL_COLOR_BUFFER_BIT); + + //draw slices + int sliceWidth = width / slices; + + for (int i = 0; i < slices; i++) { + + int seperatorOffset = drawSeperator?i:0; + + gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pboBuffers[i].GLID); + gl.glRasterPos2i(sliceWidth*i + seperatorOffset, 0); + + gl.glDrawPixels(sliceWidth, height, GL_BGRA, GL_UNSIGNED_BYTE, 0); + + } + gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + + //draw info text + textRenderer.beginRendering(width, height, false); + + textRenderer.draw("precision: "+ (doublePrecision?"64bit":"32bit"), 10, height-15); + + for (int i = 0; i < slices; i++) { + CLEvent event = probes.getEvent(i); + long start = event.getProfilingInfo(START); + long end = event.getProfilingInfo(END); + textRenderer.draw("GPU"+i +" "+(int)((end-start)/1000000.0f)+"ms", 10, height-(20+16*(slices-i))); + } + + textRenderer.endRendering(); + } + + public void reshape(GLAutoDrawable drawable, int x, int y, int width, int height) { + + if(this.width == width && this.height == height) + return; + + this.width = width; + this.height = height; + + initPBO(drawable.getGL()); + + initView(drawable.getGL().getGL2(), drawable.getWidth(), drawable.getHeight()); + } + + private void initSceneInteraction() { + + MouseAdapter mouseAdapter = new MouseAdapter() { + + Point lastpos = new Point(); + + @Override + public void mouseDragged(MouseEvent e) { + + double offsetX = (lastpos.x - e.getX()) * (maxX - minX) / width; + double offsetY = (lastpos.y - e.getY()) * (maxY - minY) / height; + + minX += offsetX; + minY -= offsetY; + + maxX += offsetX; + maxY -= offsetY; + + lastpos = e.getPoint(); + + canvas.display(); + + } + + @Override + public void 
mouseMoved(MouseEvent e) { + lastpos = e.getPoint(); + } + + @Override + public void mouseWheelMoved(MouseWheelEvent e) { + float rotation = e.getWheelRotation() / 25.0f; + + double deltaX = rotation * (maxX - minX); + double deltaY = rotation * (maxY - minY); + + // offset for "zoom to cursor" + double offsetX = (e.getX() / (float)width - 0.5f) * deltaX * 2; + double offsetY = (e.getY() / (float)height- 0.5f) * deltaY * 2; + + minX += deltaX+offsetX; + minY += deltaY-offsetY; + + maxX +=-deltaX+offsetX; + maxY +=-deltaY-offsetY; + + canvas.display(); + } + }; + + KeyAdapter keyAdapter = new KeyAdapter() { + + @Override + public void keyPressed(KeyEvent e) { + if(e.getKeyCode() == KeyEvent.VK_SPACE) { + drawSeperator = !drawSeperator; + }else if(e.getKeyChar() > '0' && e.getKeyChar() < '9') { + int number = e.getKeyChar()-'0'; + slices = min(number, min(queues.length, MAX_PARRALLELISM_LEVEL)); + buffersInitialized = false; + }else if(e.getKeyCode() == KeyEvent.VK_D) { + doublePrecision = !doublePrecision; + rebuild = true; + } + canvas.display(); + } + + }; + + canvas.addMouseMotionListener(mouseAdapter); + canvas.addMouseWheelListener(mouseAdapter); + canvas.addKeyListener(keyAdapter); + } + + public void dispose(GLAutoDrawable drawable) { + } + + public static void main(String args[]) { + SwingUtilities.invokeLater(new Runnable() { + public void run() { + new MultiDeviceFractal(512, 512); + } + }); + } + +} diff --git a/src/com/jogamp/opencl/demos/fractal/MultiDeviceFractal.java.orig b/src/com/jogamp/opencl/demos/fractal/MultiDeviceFractal.java.orig new file mode 100644 index 0000000..403aae3 --- /dev/null +++ b/src/com/jogamp/opencl/demos/fractal/MultiDeviceFractal.java.orig @@ -0,0 +1,484 @@ +package com.jogamp.opencl.demos.fractal; + +import com.jogamp.opencl.CLBuffer; +import com.jogamp.opencl.CLCommandQueue; +import com.jogamp.opencl.CLContext; +import com.jogamp.opencl.CLDevice; +import com.jogamp.opencl.CLEvent; +import com.jogamp.opencl.CLEventList; 
+import com.jogamp.opencl.CLException; +import com.jogamp.opencl.CLGLBuffer; +import com.jogamp.opencl.CLGLContext; +import com.jogamp.opencl.CLKernel; +import com.jogamp.opencl.CLProgram; +import com.jogamp.opencl.CLProgram.CompilerOptions; +import com.sun.opengl.util.awt.TextRenderer; +import java.awt.Color; +import java.awt.Dimension; +import java.awt.Font; +import java.awt.Point; +import java.awt.event.KeyAdapter; +import java.awt.event.KeyEvent; +import java.awt.event.MouseAdapter; +import java.awt.event.MouseEvent; +import java.awt.event.MouseWheelEvent; +import java.io.IOException; +import java.nio.IntBuffer; +import java.util.logging.Level; +import java.util.logging.Logger; +import javax.media.opengl.DebugGL2; +import javax.media.opengl.GL; +import javax.media.opengl.GL2; +import javax.media.opengl.GLAutoDrawable; +import javax.media.opengl.GLCapabilities; +import javax.media.opengl.GLContext; +import javax.media.opengl.GLEventListener; +import javax.media.opengl.GLProfile; +import javax.media.opengl.awt.GLCanvas; +import javax.swing.JFrame; +import javax.swing.SwingUtilities; + +import static com.sun.gluegen.runtime.BufferFactory.*; +import static javax.media.opengl.GL2.*; +import static com.jogamp.opencl.CLMemory.Mem.*; +import static com.jogamp.opencl.CLEvent.ProfilingCommand.*; +import static com.jogamp.opencl.CLCommandQueue.Mode.*; +import static com.jogamp.opencl.CLDevice.Type.*; +import static java.lang.Math.*; + +/** + * Computes the Mandelbrot set with OpenCL using multiple GPUs and renders the result with OpenGL. + * A shared PBO is used as storage for the fractal image.
+ * http://en.wikipedia.org/wiki/Mandelbrot_set + *

+ * controls:
+ * keys 1-8 control parallelism level
+ * space enables/disables slice separator
+ * 'd' toggles between 32/64bit floating-point precision
+ * mouse/mousewheel to drag and zoom
+ *

+ * @author Michael Bien + */ +public class MultiDeviceFractal implements GLEventListener { + + // max number of used GPUs + private static final int MAX_PARRALLELISM_LEVEL = 8; + + // max per pixel iterations to compute the fractal + private static final int MAX_ITERATIONS = 1000; + + private GLCanvas canvas; + + private CLContext clContext; + private CLCommandQueue[] queues; + private CLKernel[] kernels; + private CLProgram program; + private CLEventList probes; + private CLBuffer[] pboBuffers; + private CLBuffer[] colorMap; + + private int width = 0; + private int height = 0; + + private double minX = -2f; + private double minY = -1.2f; + private double maxX = 0.6f; + private double maxY = 1.3f; + + private int slices; + + private boolean drawSeperator; + private boolean doublePrecision; + private boolean buffersInitialized; + private boolean rebuild; + + private final TextRenderer textRenderer; + + public MultiDeviceFractal(int width, int height) { + + this.width = width; + this.height = height; + + canvas = new GLCanvas(new GLCapabilities(GLProfile.get(GLProfile.GL2))); + canvas.addGLEventListener(this); + initSceneInteraction(); + + JFrame frame = new JFrame("JOCL Multi GPU Mandelbrot Set"); + frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + canvas.setPreferredSize(new Dimension(width, height)); + frame.add(canvas); + frame.pack(); + + frame.setVisible(true); + + textRenderer = new TextRenderer(frame.getFont().deriveFont(Font.BOLD, 14), true, true, null, false); + } + + public void init(GLAutoDrawable drawable) { + + // enable GL error checking using the composable pipeline + drawable.setGL(new DebugGL2(drawable.getGL().getGL2())); + + initCL(drawable.getContext()); + + GL2 gl = drawable.getGL().getGL2(); + + gl.setSwapInterval(0); + gl.glDisable(GL_DEPTH_TEST); + gl.glClearColor(0.0f, 0.0f, 0.0f, 1.0f); + + initView(gl, drawable.getWidth(), drawable.getHeight()); + + initPBO(gl); + setKernelConstants(); + } + + private void initCL(GLContext glCtx){ + 
try { + // create context managing all available GPUs + clContext = CLContext.create(GPU); + + CLDevice[] devices = clContext.getCLDevices(); + + slices = min(devices.length, MAX_PARRALLELISM_LEVEL); + + // create command queues for every GPU, setup colormap and init kernels + queues = new CLCommandQueue[slices]; + kernels = new CLKernel[slices]; + probes = new CLEventList(slices); + colorMap = new CLBuffer[slices]; + + for (int i = 0; i < slices; i++) { + + colorMap[i] = clContext.createIntBuffer(32*2, READ_ONLY); + initColorMap(colorMap[i].getBuffer(), 32, Color.BLUE, Color.GREEN, Color.RED); + + // create command queue and upload color map buffer on each used device + queues[i] = devices[i].createCommandQueue(PROFILING_MODE).putWriteBuffer(colorMap[i], true); // blocking upload + + } + + // load and build program + program = clContext.createProgram(getClass().getResourceAsStream("Mandelbrot.cl")); + buildProgram(); + + } catch (IOException ex) { + Logger.getLogger(getClass().getName()).log(Level.SEVERE, "can not find 'Mandelbrot.cl' in classpath.", ex); + } catch (CLException ex) { + Logger.getLogger(getClass().getName()).log(Level.SEVERE, "something went wrong, hopefully no one got hurt", ex); + } + + } + + private void initColorMap(IntBuffer colorMap, int stepSize, Color... 
colors) { + + for (int n = 0; n < colors.length - 1; n++) { + + Color color = colors[n]; + int r0 = color.getRed(); + int g0 = color.getGreen(); + int b0 = color.getBlue(); + + color = colors[n + 1]; + int r1 = color.getRed(); + int g1 = color.getGreen(); + int b1 = color.getBlue(); + + int deltaR = r1 - r0; + int deltaG = g1 - g0; + int deltaB = b1 - b0; + + for (int step = 0; step < stepSize; step++) { + float alpha = (float) step / (stepSize - 1); + int r = (int) (r0 + alpha * deltaR); + int g = (int) (g0 + alpha * deltaG); + int b = (int) (b0 + alpha * deltaB); + colorMap.put((r << 16) | (g << 8) | (b << 0)); + } + } + colorMap.rewind(); + + } + + private void initView(GL2 gl, int width, int height) { + + gl.glViewport(0, 0, width, height); + + gl.glMatrixMode(GL_MODELVIEW); + gl.glLoadIdentity(); + + gl.glMatrixMode(GL_PROJECTION); + gl.glLoadIdentity(); + gl.glOrtho(0.0, width, 0.0, height, 0.0, 1.0); + } + + @SuppressWarnings("unchecked") + private void initPBO(GL gl) { + + if(pboBuffers != null) { + int[] oldPbos = new int[pboBuffers.length]; + for (int i = 0; i < pboBuffers.length; i++) { + CLBuffer buffer = pboBuffers[i]; +// oldPbos[i] = buffer.GLID; + buffer.release(); + } +// gl.glDeleteBuffers(oldPbos.length, oldPbos, 0); + } + + pboBuffers = new CLBuffer[slices]; + +// int[] pbo = new int[slices]; +// gl.glGenBuffers(slices, pbo, 0); + + // setup one empty PBO per slice + for (int i = 0; i < slices; i++) { + +// gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo[i]); +// gl.glBufferData(GL_PIXEL_UNPACK_BUFFER, width*height * SIZEOF_INT / slices, null, GL_STREAM_DRAW); +// gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + + pboBuffers[i] = clContext.createByteBuffer(width*height * SIZEOF_INT / slices, WRITE_ONLY); +// pboBuffers[i] = clContext.createFromGLBuffer(null, pbo[i], WRITE_ONLY); + + } + + buffersInitialized = true; + } + + private void buildProgram() { + + /* + * workaround: The driver keeps using the old binaries for some reason. 
+ * to solve this we simple create a new program and release the old. + * however rebuilding programs should be possible -> remove when drivers are fixed. + */ + if(program != null && rebuild) { + String source = program.getSource(); + program.release(); + program = clContext.createProgram(source); + } + + // disable 64bit floating point math if not available + if(doublePrecision) { + for (CLDevice device : program.getCLDevices()) { + if(!device.isDoubleFPAvailable()) { + doublePrecision = false; + break; + } + } + } + + if(doublePrecision) { + program.build(CompilerOptions.FAST_RELAXED_MATH, "-D DOUBLE_FP"); + }else{ + program.build(CompilerOptions.FAST_RELAXED_MATH); + } + rebuild = false; + + for (int i = 0; i < kernels.length; i++) { + // init kernel with constants + kernels[i] = program.createCLKernel("mandelbrot"); + } + + } + + // init kernels with constants + private void setKernelConstants() { + for (int i = 0; i < slices; i++) { + kernels[i].setForce32BitArgs(!doublePrecision) + .setArg(6, pboBuffers[i]) + .setArg(7, colorMap[i]) + .setArg(8, colorMap[i].getBuffer().capacity()) + .setArg(9, MAX_ITERATIONS); + } + } + + // rendering cycle + public void display(GLAutoDrawable drawable) { + GL gl = drawable.getGL(); + + if(!buffersInitialized) { + initPBO(gl); + setKernelConstants(); + } + if(rebuild) { + buildProgram(); + setKernelConstants(); + } + // make sure GL does not use our objects before we start computeing + gl.glFinish(); + compute(); + + render(gl.getGL2()); + } + + // OpenCL + private void compute() { + + int sliceWidth = width / slices; + double rangeX = (maxX - minX) / slices; + double rangeY = (maxY - minY); + + // release all old events, you can't reuse events in OpenCL + probes.release(); + + long time = System.currentTimeMillis(); + // start computation + for (int i = 0; i < slices; i++) { + + kernels[i].putArg( sliceWidth).putArg(height) + .putArg(minX + rangeX*i).putArg( minY) + .putArg( rangeX ).putArg(rangeY) + .rewind(); + + // 
aquire GL objects, and enqueue a kernel with a probe from the list + queues[i] +// .putAcquireGLObject(pboBuffers[i].ID) + .put2DRangeKernel(kernels[i], 0, 0, sliceWidth, height, 0, 0, probes) +// .putReleaseGLObject(pboBuffers[i].ID) + ; + + } + + // block until done + for (int i = 0; i < slices; i++) { + queues[i].finish(); + } + System.out.println((System.currentTimeMillis()-time)/1000.0f); + + } + + // OpenGL + private void render(GL2 gl) { + + gl.glClear(GL_COLOR_BUFFER_BIT); + + //draw slices + int sliceWidth = width / slices; + +// for (int i = 0; i < slices; i++) { +// +// int seperatorOffset = drawSeperator?i:0; +// +// gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pboBuffers[i].GLID); +// gl.glRasterPos2i(sliceWidth*i + seperatorOffset, 0); +// +// gl.glDrawPixels(sliceWidth, height, GL_BGRA, GL_UNSIGNED_BYTE, 0); +// +// } +// gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + + //draw info text + textRenderer.beginRendering(width, height, false); + + textRenderer.draw("precision: "+ (doublePrecision?"64bit":"32bit"), 10, height-15); + + for (int i = 0; i < slices; i++) { + CLEvent event = probes.getEvent(i); + long start = event.getProfilingInfo(START); + long end = event.getProfilingInfo(END); + textRenderer.draw("GPU"+i +" "+((end-start)/1000000000.0f)+"s", 10, height-(20+16*(slices-i))); + } + + textRenderer.endRendering(); + } + + public void reshape(GLAutoDrawable drawable, int x, int y, int width, int height) { + + if(this.width == width && this.height == height) + return; + + this.width = width; + this.height = height; + + initPBO(drawable.getGL()); + + initView(drawable.getGL().getGL2(), drawable.getWidth(), drawable.getHeight()); + } + + private void initSceneInteraction() { + + MouseAdapter mouseAdapter = new MouseAdapter() { + + Point lastpos = new Point(); + + @Override + public void mouseDragged(MouseEvent e) { + + double offsetX = (lastpos.x - e.getX()) * (maxX - minX) / width; + double offsetY = (lastpos.y - e.getY()) * (maxY - minY) / height; + + 
minX += offsetX; + minY -= offsetY; + + maxX += offsetX; + maxY -= offsetY; + + lastpos = e.getPoint(); + + canvas.display(); + + } + + @Override + public void mouseMoved(MouseEvent e) { + lastpos = e.getPoint(); + } + + @Override + public void mouseWheelMoved(MouseWheelEvent e) { + float rotation = e.getWheelRotation() / 25.0f; + + double deltaX = rotation * (maxX - minX); + double deltaY = rotation * (maxY - minY); + + // offset for "zoom to cursor" + double offsetX = (e.getX() / (float)width - 0.5f) * deltaX * 2; + double offsetY = (e.getY() / (float)height- 0.5f) * deltaY * 2; + + minX += deltaX+offsetX; + minY += deltaY-offsetY; + + maxX +=-deltaX+offsetX; + maxY +=-deltaY-offsetY; + + canvas.display(); + } + }; + + KeyAdapter keyAdapter = new KeyAdapter() { + + @Override + public void keyPressed(KeyEvent e) { + if(e.getKeyCode() == KeyEvent.VK_SPACE) { + drawSeperator = !drawSeperator; + }else if(e.getKeyChar() > '0' && e.getKeyChar() < '9') { + int number = e.getKeyChar()-'0'; + slices = min(number, min(queues.length, MAX_PARRALLELISM_LEVEL)); + buffersInitialized = false; + }else if(e.getKeyCode() == KeyEvent.VK_D) { + doublePrecision = !doublePrecision; + rebuild = true; + } + canvas.display(); + } + + }; + + canvas.addMouseMotionListener(mouseAdapter); + canvas.addMouseWheelListener(mouseAdapter); + canvas.addKeyListener(keyAdapter); + } + + public void dispose(GLAutoDrawable drawable) { + } + + public static void main(String args[]) { + SwingUtilities.invokeLater(new Runnable() { + public void run() { + new MultiDeviceFractal(512, 512); + } + }); + } + +} diff --git a/src/com/jogamp/opencl/demos/hellojocl/HelloJOCL.java b/src/com/jogamp/opencl/demos/hellojocl/HelloJOCL.java new file mode 100644 index 0000000..31fabab --- /dev/null +++ b/src/com/jogamp/opencl/demos/hellojocl/HelloJOCL.java @@ -0,0 +1,91 @@ +package com.jogamp.opencl.demos.hellojocl; + +import com.jogamp.opencl.CLBuffer; +import com.jogamp.opencl.CLCommandQueue; +import 
com.jogamp.opencl.CLContext; +import com.jogamp.opencl.CLKernel; +import com.jogamp.opencl.CLProgram; +import java.io.IOException; +import java.nio.FloatBuffer; +import java.util.Random; + +import static java.lang.System.*; +import static com.jogamp.opencl.CLMemory.Mem.*; + +/** + * Hello Java OpenCL example. Adds all elements of buffer A to buffer B + * and stores the result in buffer C.
+ * Sample was inspired by the Nvidia VectorAdd example written in C/C++ + * which is bundled in the Nvidia OpenCL SDK. + * @author Michael Bien + */ +public class HelloJOCL { + + public static void main(String[] args) throws IOException { + + int elementCount = 11444777; // Length of arrays to process + int localWorkSize = 256; // Local work size dimensions + int globalWorkSize = roundUp(localWorkSize, elementCount); // rounded up to the nearest multiple of the localWorkSize + + // set up + CLContext context = CLContext.create(); + + CLProgram program = context.createProgram(HelloJOCL.class.getResourceAsStream("VectorAdd.cl")).build(); + + CLBuffer clBufferA = context.createFloatBuffer(globalWorkSize, READ_ONLY); + CLBuffer clBufferB = context.createFloatBuffer(globalWorkSize, READ_ONLY); + CLBuffer clBufferC = context.createFloatBuffer(globalWorkSize, WRITE_ONLY); + + out.println("used device memory: " + + (clBufferA.getSize()+clBufferB.getSize()+clBufferC.getSize())/1000000 +"MB"); + + // fill read buffers with random numbers (just to have test data; seed is fixed -> results will not change between runs). + fillBuffer(clBufferA.getBuffer(), 12345); + fillBuffer(clBufferB.getBuffer(), 67890); + + // get a reference to the kernel functon with the name 'VectorAdd' + // and map the buffers to its input parameters. + CLKernel kernel = program.createCLKernel("VectorAdd"); + kernel.putArgs(clBufferA, clBufferB, clBufferC).putArg(elementCount); + + // create command queue on fastest device. + CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue(); + + // asynchronous write of data to GPU device, blocking read later to get the computed results back. + long time = nanoTime(); + queue.putWriteBuffer(clBufferA, false) + .putWriteBuffer(clBufferB, false) + .put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize) + .putReadBuffer(clBufferC, true); + time = nanoTime() - time; + + // cleanup all resources associated with this context. 
+ context.release(); + + // print first few elements of the resulting buffer to the console. + out.println("a+b=c results snapshot: "); + for(int i = 0; i < 10; i++) + out.print(clBufferC.getBuffer().get() + ", "); + out.println("...; " + clBufferC.getBuffer().remaining() + " more"); + + out.println("computation took: "+(time/1000000)+"ms"); + + } + + private static final void fillBuffer(FloatBuffer buffer, int seed) { + Random rnd = new Random(seed); + while(buffer.remaining() != 0) + buffer.put(rnd.nextFloat()*100); + buffer.rewind(); + } + + private static final int roundUp(int groupSize, int globalSize) { + int r = globalSize % groupSize; + if (r == 0) { + return globalSize; + } else { + return globalSize + groupSize - r; + } + } + +} \ No newline at end of file diff --git a/src/com/jogamp/opencl/demos/hellojocl/VectorAdd.cl b/src/com/jogamp/opencl/demos/hellojocl/VectorAdd.cl new file mode 100644 index 0000000..ac9dde2 --- /dev/null +++ b/src/com/jogamp/opencl/demos/hellojocl/VectorAdd.cl @@ -0,0 +1,15 @@ + + // OpenCL Kernel Function for element by element vector addition + kernel void VectorAdd(global const float* a, global const float* b, global float* c, int numElements) { + + // get index into global data array + int iGID = get_global_id(0); + + // bound check (equivalent to the limit on a 'for' loop for standard/serial C code + if (iGID >= numElements) { + return; + } + + // add the vector elements + c[iGID] = a[iGID] + b[iGID]; + } \ No newline at end of file diff --git a/src/com/jogamp/opencl/demos/joglinterop/GLCLInteroperabilityDemo.java b/src/com/jogamp/opencl/demos/joglinterop/GLCLInteroperabilityDemo.java new file mode 100644 index 0000000..24af1fe --- /dev/null +++ b/src/com/jogamp/opencl/demos/joglinterop/GLCLInteroperabilityDemo.java @@ -0,0 +1,277 @@ +package com.jogamp.opencl.demos.joglinterop; + +import com.jogamp.opencl.CLCommandQueue; +import com.jogamp.opencl.CLDevice; +import com.jogamp.opencl.gl.CLGLBuffer; +import 
com.jogamp.opencl.gl.CLGLContext; +import com.jogamp.opencl.CLKernel; +import com.jogamp.opencl.CLPlatform; +import com.jogamp.opencl.CLProgram; +import com.jogamp.opengl.util.Animator; +import java.io.IOException; +import javax.media.opengl.DebugGL2; +import javax.media.opengl.GL2; +import javax.media.opengl.GLAutoDrawable; +import javax.media.opengl.GLCapabilities; +import javax.media.opengl.GLEventListener; +import javax.media.opengl.GLProfile; +import javax.media.opengl.awt.GLCanvas; +import javax.media.opengl.glu.gl2.GLUgl2; +import javax.swing.JFrame; +import javax.swing.SwingUtilities; + +import static com.jogamp.common.nio.Buffers.*; + +/** + * JOCL - JOGL interoperability example. + * @author Michael Bien + */ +public class GLCLInteroperabilityDemo implements GLEventListener { + + private final GLUgl2 glu = new GLUgl2(); + + private final int MESH_SIZE = 256; + + private int width; + private int height; + +// private final FloatBuffer vb; +// private final IntBuffer ib; + + private final int[] glObjects = new int[2]; + private final int VERTICES = 0; +// private final int INDICES = 1; + + private final UserSceneInteraction usi; + + private CLGLContext clContext; + private CLKernel kernel; + private CLCommandQueue commandQueue; + private CLGLBuffer clBuffer; + + private float step = 0; + + public GLCLInteroperabilityDemo() { + + this.usi = new UserSceneInteraction(); + + // create direct memory buffers +// vb = newFloatBuffer(MESH_SIZE * MESH_SIZE * 4); +// ib = newIntBuffer((MESH_SIZE - 1) * (MESH_SIZE - 1) * 2 * 3); +// +// // build indices +// // 0---3 +// // | \ | +// // 1---2 +// for (int h = 0; h < MESH_SIZE - 1; h++) { +// for (int w = 0; w < MESH_SIZE - 1; w++) { +// +// // 0 - 3 - 2 +// ib.put(w * 6 + h * (MESH_SIZE - 1) * 6, w + (h ) * (MESH_SIZE) ); +// ib.put(w * 6 + h * (MESH_SIZE - 1) * 6 + 1, w + (h ) * (MESH_SIZE) + 1); +// ib.put(w * 6 + h * (MESH_SIZE - 1) * 6 + 2, w + (h + 1) * (MESH_SIZE) + 1); +// +// // 0 - 2 - 1 +// ib.put(w * 6 + h * 
(MESH_SIZE - 1) * 6 + 3, w + (h ) * (MESH_SIZE) ); +// ib.put(w * 6 + h * (MESH_SIZE - 1) * 6 + 4, w + (h + 1) * (MESH_SIZE) + 1); +// ib.put(w * 6 + h * (MESH_SIZE - 1) * 6 + 5, w + (h + 1) * (MESH_SIZE) ); +// +// } +// } +// ib.rewind(); + + SwingUtilities.invokeLater(new Runnable() { + public void run() { + initUI(); + } + }); + + } + + private void initUI() { + + this.width = 600; + this.height = 400; + + GLCapabilities config = new GLCapabilities(GLProfile.get(GLProfile.GL2)); + config.setSampleBuffers(true); + config.setNumSamples(4); + + GLCanvas canvas = new GLCanvas(config); + canvas.addGLEventListener(this); + usi.init(canvas); + + JFrame frame = new JFrame("JOGL-JOCL Interoperability Example"); + frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + frame.add(canvas); + frame.setSize(width, height); + + frame.setVisible(true); + + } + + + public void init(GLAutoDrawable drawable) { + + if(clContext == null) { + + // find gl compatible device + CLDevice[] devices = CLPlatform.getDefault().listCLDevices(); + CLDevice device = null; + for (CLDevice d : devices) { + if(d.isGLMemorySharingSupported()) { + device = d; + break; + } + } + // create OpenCL context before creating any OpenGL objects + // you want to share with OpenCL (AMD driver requirement) + clContext = CLGLContext.create(drawable.getContext(), device); + + // enable GL error checking using the composable pipeline + drawable.setGL(new DebugGL2(drawable.getGL().getGL2())); + + // OpenGL initialization + GL2 gl = drawable.getGL().getGL2(); + + gl.setSwapInterval(1); + + gl.glPolygonMode(GL2.GL_FRONT_AND_BACK, GL2.GL_LINE); + + gl.glGenBuffers(glObjects.length, glObjects, 0); + + // gl.glBindBuffer(GL2.GL_ELEMENT_ARRAY_BUFFER, glObjects[INDICES]); + // gl.glBufferData(GL2.GL_ELEMENT_ARRAY_BUFFER, ib.capacity() * SIZEOF_INT, ib, GL2.GL_STATIC_DRAW); + // gl.glBindBuffer(GL2.GL_ELEMENT_ARRAY_BUFFER, 0); + + gl.glEnableClientState(GL2.GL_VERTEX_ARRAY); + gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, 
glObjects[VERTICES]); + gl.glBufferData(GL2.GL_ARRAY_BUFFER, MESH_SIZE * MESH_SIZE * 4 * SIZEOF_FLOAT, null, GL2.GL_DYNAMIC_DRAW); + gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, 0); + gl.glDisableClientState(GL2.GL_VERTEX_ARRAY); + + pushPerspectiveView(gl); + gl.glFinish(); + + // init OpenCL + initCL(); + + // start rendering thread + Animator animator = new Animator(drawable); + animator.start(); + + } + } + + private void initCL() { + + CLProgram program; + try { + program = clContext.createProgram(getClass().getResourceAsStream("JoglInterop.cl")); + program.build(); + System.out.println(program.getBuildStatus()); + System.out.println(program.isExecutable()); + System.out.println(program.getBuildLog()); + } catch (IOException ex) { + throw new RuntimeException("can not handle exception", ex); + } + + commandQueue = clContext.getMaxFlopsDevice().createCommandQueue(); + + clBuffer = clContext.createFromGLBuffer(glObjects[VERTICES], CLGLBuffer.Mem.WRITE_ONLY); + + System.out.println("cl buffer type: " + clBuffer.getGLObjectType()); + System.out.println("shared with gl buffer: " + clBuffer.getGLObjectID()); + + kernel = program.createCLKernel("sineWave") + .putArg(clBuffer) + .putArg(MESH_SIZE) + .rewind(); + + System.out.println("cl initialised"); + } + + + public void display(GLAutoDrawable drawable) { + + GL2 gl = drawable.getGL().getGL2(); + + // ensure pipeline is clean before doing cl work + gl.glFinish(); + + computeHeightfield(); + + gl.glClear(GL2.GL_COLOR_BUFFER_BIT | GL2.GL_DEPTH_BUFFER_BIT); + gl.glLoadIdentity(); + + usi.interact(gl); + + gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, glObjects[VERTICES]); + gl.glVertexPointer(4, GL2.GL_FLOAT, 0, 0); + +// gl.glBindBuffer(GL2.GL_ELEMENT_ARRAY_BUFFER, glObjects[INDICES]); + + gl.glEnableClientState(GL2.GL_VERTEX_ARRAY); + gl.glDrawArrays(GL2.GL_POINTS, 0, MESH_SIZE * MESH_SIZE); +// gl.glDrawElements(GL2.GL_TRIANGLES, ib.capacity(), GL2.GL_UNSIGNED_INT, 0); + gl.glDisableClientState(GL2.GL_VERTEX_ARRAY); + +// 
gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, 0); + + } + + /* + * Computes a heightfield using a OpenCL kernel. + */ + private void computeHeightfield() { + + kernel.setArg(2, step += 0.05f); + + commandQueue.putAcquireGLObject(clBuffer.ID) + .put2DRangeKernel(kernel, 0, 0, MESH_SIZE, MESH_SIZE, 0, 0) + .putReleaseGLObject(clBuffer.ID) + .finish(); + + } + + private void pushPerspectiveView(GL2 gl) { + + gl.glMatrixMode(GL2.GL_PROJECTION); + gl.glPushMatrix(); + + gl.glLoadIdentity(); + + glu.gluPerspective(60, width / (float)height, 1, 1000); + gl.glMatrixMode(GL2.GL_MODELVIEW); + + gl.glPushMatrix(); + gl.glLoadIdentity(); + + } + + private void popView(GL2 gl) { + + gl.glMatrixMode(GL2.GL_PROJECTION); + gl.glPopMatrix(); + + gl.glMatrixMode(GL2.GL_MODELVIEW); + gl.glPopMatrix(); + + } + + + public void reshape(GLAutoDrawable drawable, int arg1, int arg2, int width, int height) { + this.width = width; + this.height = height; + GL2 gl = drawable.getGL().getGL2(); + popView(gl); + pushPerspectiveView(gl); + } + + public void dispose(GLAutoDrawable drawable) { } + + public static void main(String[] args) { + new GLCLInteroperabilityDemo(); + } + +} \ No newline at end of file diff --git a/src/com/jogamp/opencl/demos/joglinterop/JoglInterop.cl b/src/com/jogamp/opencl/demos/joglinterop/JoglInterop.cl new file mode 100644 index 0000000..0f0bcfc --- /dev/null +++ b/src/com/jogamp/opencl/demos/joglinterop/JoglInterop.cl @@ -0,0 +1,23 @@ + +/** +* animated 2D sine pattern. 
+*/ +kernel void sineWave(global float4 * vertex, int size, float time) { + + unsigned int x = get_global_id(0); + unsigned int y = get_global_id(1); + + // calculate uv coordinates + float u = x / (float) size; + float v = y / (float) size; + + u = u*2.0f - 1.0f; + v = v*2.0f - 1.0f; + + // calculate simple sine wave pattern + float freq = 4.0f; + float w = sin(u*freq + time) * cos(v*freq + time) * 0.5f; + + // write output vertex + vertex[y*size + x] = (float4)(u*10.0f, w*10.0f, v*10.0f, 1.0f); +} diff --git a/src/com/jogamp/opencl/demos/joglinterop/UserSceneInteraction.java b/src/com/jogamp/opencl/demos/joglinterop/UserSceneInteraction.java new file mode 100644 index 0000000..fc0f054 --- /dev/null +++ b/src/com/jogamp/opencl/demos/joglinterop/UserSceneInteraction.java @@ -0,0 +1,103 @@ +package com.jogamp.opencl.demos.joglinterop; + +import java.awt.Component; +import java.awt.Point; +import java.awt.event.MouseAdapter; +import java.awt.event.MouseEvent; +import java.awt.event.MouseMotionAdapter; +import java.awt.event.MouseWheelEvent; +import java.awt.event.MouseWheelListener; +import javax.media.opengl.GL2; + +/** + * Utility class for interacting with a scene. Supports rotation and zoom around origin. 
+ * @author Michael Bien + */ +public class UserSceneInteraction { + + private float z = -20; + private float rotx = 45; + private float roty = 30; + + private Point dragstart; + private enum MOUSE_MODE { DRAG_ROTATE, DRAG_ZOOM } + private MOUSE_MODE dragmode = MOUSE_MODE.DRAG_ROTATE; + + + public void init(Component component) { + initMouseListeners(component); + } + + private void initMouseListeners(Component component) { + component.addMouseMotionListener(new MouseMotionAdapter() { + + @Override + public void mouseDragged(MouseEvent e) { + + if (dragstart != null) { + switch (dragmode) { + case DRAG_ROTATE: + rotx += e.getY() - dragstart.getY(); + roty += e.getX() - dragstart.getX(); + break; + case DRAG_ZOOM: + z += (e.getY() - dragstart.getY()) / 5.0f; + break; + } + } + + dragstart = e.getPoint(); + } + }); + component.addMouseWheelListener(new MouseWheelListener() { + + public void mouseWheelMoved(MouseWheelEvent e) { + z += e.getWheelRotation()*5; + } + + }); + component.addMouseListener(new MouseAdapter() { + + @Override + public void mousePressed(MouseEvent e) { + switch (e.getButton()) { + case (MouseEvent.BUTTON1): + dragmode = MOUSE_MODE.DRAG_ROTATE; + break; + case (MouseEvent.BUTTON2): + dragmode = MOUSE_MODE.DRAG_ZOOM; + break; + case (MouseEvent.BUTTON3): + dragmode = MOUSE_MODE.DRAG_ZOOM; + break; + } + } + + @Override + public void mouseReleased(MouseEvent e) { + switch (e.getButton()) { + case (MouseEvent.BUTTON1): + dragmode = MOUSE_MODE.DRAG_ZOOM; + break; + case (MouseEvent.BUTTON2): + dragmode = MOUSE_MODE.DRAG_ROTATE; + break; + case (MouseEvent.BUTTON3): + dragmode = MOUSE_MODE.DRAG_ROTATE; + break; + } + + dragstart = null; + } + }); + } + + + public void interact(GL2 gl) { + gl.glTranslatef(0, 0, z); + gl.glRotatef(rotx, 1f, 0f, 0f); + gl.glRotatef(roty, 0f, 1.0f, 0f); + } + + +} \ No newline at end of file diff --git a/src/com/jogamp/opencl/demos/julia3d/Julia3d.java b/src/com/jogamp/opencl/demos/julia3d/Julia3d.java new file mode 
100644 index 0000000..38633c6 --- /dev/null +++ b/src/com/jogamp/opencl/demos/julia3d/Julia3d.java @@ -0,0 +1,212 @@ +package com.jogamp.opencl.demos.julia3d; + +import com.jogamp.opencl.CLBuffer; +import com.jogamp.opencl.CLCommandQueue; +import com.jogamp.opencl.CLContext; +import com.jogamp.opencl.CLDevice; +import com.jogamp.opencl.CLKernel; +import com.jogamp.opencl.CLPlatform; +import com.jogamp.opencl.CLProgram; +import com.jogamp.opencl.demos.julia3d.structs.Camera; +import com.jogamp.opencl.demos.julia3d.structs.RenderingConfig; +import com.jogamp.opencl.demos.julia3d.structs.Vec; +import java.io.IOException; +import java.nio.Buffer; +import java.nio.ByteBuffer; +import java.nio.FloatBuffer; +import javax.swing.SwingUtilities; + +import static com.jogamp.opencl.CLMemory.Mem.*; +import static com.jogamp.opencl.CLProgram.CompilerOptions.*; +import static com.jogamp.opencl.demos.julia3d.UserSceneController.*; + +/** + * This sample has been ported from David Bucciarelli's juliaGPU v1.2 written in C. 
+ * @author Michael Bien + */ +public class Julia3d { + + private final CLContext context; + private CLBuffer pixelBuffer; + private final CLBuffer configBuffer; + private final CLCommandQueue commandQueue; + private final CLProgram program; + private final CLKernel julia; + private final CLKernel multiply; + + private final int workGroupSize; + private final String kernelFileName = "rendering_kernel.cl"; + + final RenderingConfig config; + + private Julia3d(RenderingConfig renderConfig) { + this.config = renderConfig; + updateCamera(); + + //setup + CLDevice gpu = CLPlatform.getDefault().getMaxFlopsDevice(); + context = CLContext.create(gpu); + + workGroupSize = 256; + + //allocate buffers + configBuffer = context.createBuffer(config.getBuffer(), READ_ONLY); + commandQueue = gpu.createCommandQueue(); +// update(true); + + try { + program = context.createProgram(Julia3d.class.getResourceAsStream(kernelFileName)) + .build(FAST_RELAXED_MATH); + } catch (IOException ex) { + throw new RuntimeException("unable to load program from source", ex); + } + + julia = program.createCLKernel("JuliaGPU"); + multiply = program.createCLKernel("multiply"); + System.out.println(program.getBuildStatus(gpu)); + System.out.println(program.getBuildLog()); + + } + + void update(boolean reallocate) { + + updateCamera(); + + int bufferSize = config.getWidth() * config.getHeight() * 3; + if(reallocate) { + if(pixelBuffer != null) { + pixelBuffer.release(); + } + + pixelBuffer = context.createFloatBuffer(bufferSize, READ_WRITE, USE_BUFFER); + } + + commandQueue.putWriteBuffer(configBuffer, true); + + julia.putArg(pixelBuffer) + .putArg(configBuffer) + .rewind(); + + multiply.putArg(pixelBuffer) + .putArg(bufferSize) + .rewind(); + } + + + void compute(boolean fastRendering) { + + // calculate workgroup size + int globalThreads = config.getWidth() * config.getHeight(); + if(globalThreads % workGroupSize != 0) + globalThreads = (globalThreads / workGroupSize + 1) * workGroupSize; + + int 
localThreads = workGroupSize; + int superSamplingSize = config.getSuperSamplingSize(); + + if (!fastRendering && superSamplingSize > 1) { + + for (int y = 0; y < superSamplingSize; ++y) { + for (int x = 0; x < superSamplingSize; ++x) { + + float sampleX = (x + 0.5f) / superSamplingSize; + float sampleY = (y + 0.5f) / superSamplingSize; + + if (x == 0 && y == 0) { + // First pass + julia.setArg(2, 0) + .setArg(3, sampleX) + .setArg(4, sampleY); + + commandQueue.put1DRangeKernel(julia, 0, globalThreads, localThreads); + + } else if (x == (superSamplingSize - 1) && y == (superSamplingSize - 1)) { + // Last pass + julia.setArg(2, 1) + .setArg(3, sampleX) + .setArg(4, sampleY); + + // normalize the values we accumulated + multiply.setArg(2, 1.0f/(superSamplingSize*superSamplingSize)); + + commandQueue.put1DRangeKernel(julia, 0, globalThreads, localThreads) + .put1DRangeKernel(multiply, 0, globalThreads*3, localThreads); + } else { + julia.setArg(2, 1) + .setArg(3, sampleX) + .setArg(4, sampleY); + + commandQueue.put1DRangeKernel(julia, 0, globalThreads, localThreads); + + } + } + } + + }else{ + + //fast rendering + julia.setArg(2, 0) + .setArg(3, 0.0f) + .setArg(4, 0.0f); + + commandQueue.put1DRangeKernel(julia, 0, globalThreads, localThreads); + } + + commandQueue.putBarrier() + .putReadBuffer(pixelBuffer, true); + + } + + private void updateCamera() { + + Camera camera = config.getCamera(); + + Vec dir = camera.getDir(); + Vec target = camera.getTarget(); + Vec camX = camera.getX(); + Vec camY = camera.getY(); + Vec orig = camera.getOrig(); + + vsub(dir, target, orig); + vnorm(dir); + + Vec up = Vec.create().setX(0).setY(1).setZ(0); + vxcross(camX, dir, up); + vnorm(camX); + vmul(camX, config.getWidth() * .5135f / config.getHeight(), camX); + + vxcross(camY, camX, dir); + vnorm(camY); + vmul(camY, .5135f, camY); + } + + + public static void main(String[] args) { + + RenderingConfig config = RenderingConfig.create() + .setWidth(640).setHeight(480) + .setEnableShadow(1) 
+ .setSuperSamplingSize(2) + .setActvateFastRendering(1) + .setMaxIterations(9) + .setEpsilon(0.003f * 0.75f) + .setLight(new float[] {5, 10, 15}) + .setMu(new float[] {-0.2f, 0.4f, -0.4f, -0.4f}); + + config.getCamera().getOrig() .setX(1).setY(2).setZ(8); + config.getCamera().getTarget().setX(0).setY(0).setZ(0); + + final Julia3d julia3d = new Julia3d(config); + + SwingUtilities.invokeLater(new Runnable() { + public void run() { + new Renderer(julia3d); + } + }); + } + + Buffer getPixelBuffer() { + return pixelBuffer.getBuffer(); + } + + +} diff --git a/src/com/jogamp/opencl/demos/julia3d/Renderer.java b/src/com/jogamp/opencl/demos/julia3d/Renderer.java new file mode 100644 index 0000000..ce97e4a --- /dev/null +++ b/src/com/jogamp/opencl/demos/julia3d/Renderer.java @@ -0,0 +1,203 @@ +package com.jogamp.opencl.demos.julia3d; + +import com.jogamp.opencl.demos.julia3d.structs.RenderingConfig; +import com.jogamp.opengl.util.awt.TextRenderer; +import java.awt.Dimension; +import java.awt.Font; +import java.nio.FloatBuffer; +import java.util.Timer; +import java.util.TimerTask; +import javax.media.opengl.GL2; +import javax.media.opengl.GLAutoDrawable; +import javax.media.opengl.GLCapabilities; +import javax.media.opengl.GLEventListener; +import javax.media.opengl.GLProfile; +import javax.media.opengl.awt.GLCanvas; +import javax.swing.JFrame; + +import static com.jogamp.common.nio.Buffers.*; +import static javax.media.opengl.GL2.*; +import static java.lang.String.*; + +/** + * JOGL renderer for displaying the julia set. 
+ * @author Michael Bien + */ +public class Renderer implements GLEventListener { + + public final static int MU_RECT_SIZE = 80; + + private final Julia3d julia3d; + private final GLCanvas canvas; + private final RenderingConfig config; + private final FloatBuffer juliaSlice; + private final UserSceneController usi; + private final TextRenderer textRenderer; + + private TimerTask task; + private final Timer timer; + + public Renderer(Julia3d julia3d) { + this.julia3d = julia3d; + this.config = julia3d.config; + + timer = new Timer(); + + juliaSlice = newDirectFloatBuffer(MU_RECT_SIZE * MU_RECT_SIZE * 4); + + canvas = new GLCanvas(new GLCapabilities(GLProfile.get(GLProfile.GL2))); + canvas.addGLEventListener(this); + + usi = new UserSceneController(); + usi.init(this, canvas, config); + + JFrame frame = new JFrame("Java OpenCL - Julia3D GPU"); + frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + canvas.setPreferredSize(new Dimension(config.getWidth(), config.getHeight())); + frame.add(canvas); + frame.pack(); + + textRenderer = new TextRenderer(frame.getFont().deriveFont(Font.BOLD, 14), true, true, null, false); + + frame.setVisible(true); + } + + public void init(GLAutoDrawable drawable) { + drawable.getGL().getGL2().glMatrixMode(GL_PROJECTION); + } + + void update() { + julia3d.update(false); + canvas.display(); + } + + public void display(GLAutoDrawable drawable) { + + //compute + julia3d.compute(config.getActvateFastRendering() == 1); + + GL2 gl = drawable.getGL().getGL2(); + gl.glClear(GL_COLOR_BUFFER_BIT); + + // draw julia set + gl.glRasterPos2i(0, 0); + gl.glDrawPixels(config.getWidth(), config.getHeight(), GL_RGB, GL_FLOAT, julia3d.getPixelBuffer()); + + + // Draw Mu constant + int width = config.getWidth(); + int height = config.getHeight(); + float[] mu = config.getMu(); + + gl.glEnable(GL_BLEND); + gl.glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); + int baseMu1 = width - MU_RECT_SIZE - 2; + int baseMu2 = 1; + drawJuliaSlice(gl, baseMu1, baseMu2, 
mu[0], mu[1]); + int baseMu3 = width - MU_RECT_SIZE - 2; + int baseMu4 = MU_RECT_SIZE + 2; + drawJuliaSlice(gl, baseMu3, baseMu4, mu[2], mu[3]); + gl.glDisable(GL_BLEND); + + gl.glColor3f(1, 1, 1); + int mu1 = (int) (baseMu1 + MU_RECT_SIZE * (mu[0] + 1.5f) / 3.f); + int mu2 = (int) (baseMu2 + MU_RECT_SIZE * (mu[1] + 1.5f) / 3.f); + gl.glBegin(GL_LINES); + gl.glVertex2i(mu1 - 4, mu2); + gl.glVertex2i(mu1 + 4, mu2); + gl.glVertex2i(mu1, mu2 - 4); + gl.glVertex2i(mu1, mu2 + 4); + gl.glEnd(); + + int mu3 = (int) (baseMu3 + MU_RECT_SIZE * (mu[2] + 1.5f) / 3.f); + int mu4 = (int) (baseMu4 + MU_RECT_SIZE * (mu[3] + 1.5f) / 3.f); + gl.glBegin(GL_LINES); + gl.glVertex2i(mu3 - 4, mu4); + gl.glVertex2i(mu3 + 4, mu4); + gl.glVertex2i(mu3, mu4 - 4); + gl.glVertex2i(mu3, mu4 + 4); + gl.glEnd(); + + // info text + textRenderer.beginRendering(width, height); + textRenderer.draw(format("Epsilon %.5f - Max. Iter. %d", config.getEpsilon(), config.getMaxIterations()), 8, 10); + textRenderer.draw(format("Mu = (%.3f, %.3f, %.3f, %.3f)", mu[0], mu[1], mu[2], mu[3]), 8, 25); + textRenderer.draw(format("Shadow %s - SuperSampling %dx%d - Fast rendering %s", + config.getEnableShadow() == 1 ? "on" : "off", + config.getSuperSamplingSize(), config.getSuperSamplingSize(), + config.getActvateFastRendering() == 1 ? 
"on" : "off"), 8, 40); + textRenderer.endRendering(); + + // timer task scheduling, delay gpu intensive high quality rendering + if(task != null) { + task.cancel(); + } + if(config.getActvateFastRendering() == 1) { + task = new TimerTask() { + @Override + public void run() { + config.setActvateFastRendering(0); + update(); + config.setActvateFastRendering(1); + } + }; + timer.schedule(task, 2000); + } + } + + private void drawJuliaSlice(GL2 gl, int origX, int origY, float cR, float cI) { + + int index = 0; + float invSize = 3.0f / MU_RECT_SIZE; + for (int i = 0; i < MU_RECT_SIZE; ++i) { + for (int j = 0; j < MU_RECT_SIZE; ++j) { + + float x = i * invSize - 1.5f; + float y = j * invSize - 1.5f; + + int iter; + for (iter = 0; iter < 64; ++iter) { + float x2 = x * x; + float y2 = y * y; + if (x2 + y2 > 4.0f) { + break; + } + + float newx = x2 - y2 + cR; + float newy = 2.f * x * y + cI; + x = newx; + y = newy; + } + + juliaSlice.put(index++, iter / 64.0f); + juliaSlice.put(index++, 0.0f); + juliaSlice.put(index++, 0.0f); + juliaSlice.put(index++, 0.5f); + } + } + + gl.glRasterPos2i(origX, origY); + gl.glDrawPixels(MU_RECT_SIZE, MU_RECT_SIZE, GL_RGBA, GL_FLOAT, juliaSlice); + } + + + public void reshape(GLAutoDrawable drawable, int x, int y, int newWidth, int newHeight) { + + config.setWidth(newWidth); + config.setHeight(newHeight); + + GL2 gl = drawable.getGL().getGL2(); + + gl.glViewport(0, 0, newWidth, newHeight); + gl.glLoadIdentity(); + gl.glOrtho(-0.5f, newWidth - 0.5f, -0.5f, newHeight - 0.5f, -1.0f, 1.0f); + + julia3d.update(true); + + } + + public void dispose(GLAutoDrawable drawable) { + } + + +} diff --git a/src/com/jogamp/opencl/demos/julia3d/UserSceneController.java b/src/com/jogamp/opencl/demos/julia3d/UserSceneController.java new file mode 100644 index 0000000..fda54be --- /dev/null +++ b/src/com/jogamp/opencl/demos/julia3d/UserSceneController.java @@ -0,0 +1,249 @@ +package com.jogamp.opencl.demos.julia3d; + +import 
com.jogamp.opencl.demos.julia3d.structs.RenderingConfig; +import com.jogamp.opencl.demos.julia3d.structs.Vec; +import java.awt.Component; +import java.awt.Point; +import java.awt.event.KeyAdapter; +import java.awt.event.KeyEvent; +import java.awt.event.MouseAdapter; +import java.awt.event.MouseEvent; +import java.awt.event.MouseWheelEvent; + +import static java.lang.Math.*; +import static com.jogamp.opencl.demos.julia3d.Renderer.*; + +/** + * Utility class for interacting with a scene. Supports rotation and zoom around origin. + * @author Michael Bien + */ +public class UserSceneController { + + private Point dragstart; + private RenderingConfig model; + private Renderer view; + + private enum MOUSE_MODE { DRAG_ROTATE, DRAG_ZOOM } + private MOUSE_MODE dragmode = MOUSE_MODE.DRAG_ROTATE; + + + public void init(Renderer view, Component component, RenderingConfig model) { + initMouseListeners(component); + this.view = view; + this.model = model; + } + + private void initMouseListeners(Component component) { + + MouseAdapter mouseAdapter = new MouseAdapter() { + @Override + public void mouseDragged(MouseEvent e) { + + int x = e.getX(); + int y = e.getY(); + + switch (dragmode) { + case DRAG_ROTATE: + if (dragstart != null) { + int height = model.getHeight(); + int width = model.getWidth(); + + int ry = height - y - 1; + int baseMu1 = width - MU_RECT_SIZE - 2; + int baseMu2 = 1; + int baseMu3 = width - MU_RECT_SIZE - 2; + int baseMu4 = MU_RECT_SIZE + 2; + + if ((x >= baseMu1 && x <= baseMu1 + MU_RECT_SIZE) && (ry >= baseMu2 && ry <= baseMu2 + MU_RECT_SIZE)) { + float[] mu = model.getMu(); + mu[0] = 3.f * ( x - baseMu1) / (float)MU_RECT_SIZE - 1.5f; + mu[1] = 3.f * (ry - baseMu2) / (float)MU_RECT_SIZE - 1.5f; + model.setMu(mu); + } else if ((x >= baseMu3 && x <= baseMu3 + MU_RECT_SIZE) && (ry >= baseMu4 && ry <= baseMu4 + MU_RECT_SIZE)) { + float[] mu = model.getMu(); + mu[2] = 3.f * ( x - baseMu3) / (float)MU_RECT_SIZE - 1.5f; + mu[3] = 3.f * (ry - baseMu4) / 
(float)MU_RECT_SIZE - 1.5f; + model.setMu(mu); + } else { + rotateCameraYbyOrig(0.01f * (x - dragstart.getX())); + rotateCameraXbyOrig(0.01f * (y - dragstart.getY())); + } + } + dragstart = e.getPoint(); + view.update(); + break; + case DRAG_ZOOM: + if (dragstart != null) { + float zoom = (float) ((y - dragstart.getY()) / 10.0f); + zoom(zoom); + } + dragstart = e.getPoint(); + view.update(); + break; + } + + } + + @Override + public void mousePressed(MouseEvent e) { + switch (e.getButton()) { + case (MouseEvent.BUTTON1): + dragmode = MOUSE_MODE.DRAG_ROTATE; + break; + case (MouseEvent.BUTTON2): + dragmode = MOUSE_MODE.DRAG_ZOOM; + break; + case (MouseEvent.BUTTON3): + dragmode = MOUSE_MODE.DRAG_ZOOM; + break; + } + } + + @Override + public void mouseReleased(MouseEvent e) { + switch (e.getButton()) { + case (MouseEvent.BUTTON1): + dragmode = MOUSE_MODE.DRAG_ZOOM; + break; + case (MouseEvent.BUTTON2): + dragmode = MOUSE_MODE.DRAG_ROTATE; + break; + case (MouseEvent.BUTTON3): + dragmode = MOUSE_MODE.DRAG_ROTATE; + break; + } + + dragstart = null; + } + + @Override + public void mouseWheelMoved(MouseWheelEvent e) { + float zoom = e.getWheelRotation() * 0.1f; + zoom(zoom); + view.update(); + } + + }; + + KeyAdapter keyAdapter = new KeyAdapter() { + + @Override + public void keyPressed(KeyEvent e) { + + switch (e.getKeyChar()) { + case 'l': + model.setEnableShadow(model.getEnableShadow()==0 ? 
1 : 0); + break; + case '1': + model.setEpsilon(model.getEpsilon() * 0.75f); + break; + case '2': + model.setEpsilon(model.getEpsilon() * 1.f / 0.75f); + break; + case '3': + model.setMaxIterations(max(1, model.getMaxIterations() -1)); + break; + case '4': + model.setMaxIterations(min(12, model.getMaxIterations()+1)); + break; + case '5': + model.setSuperSamplingSize(max(1, model.getSuperSamplingSize() -1)); + break; + case '6': + model.setSuperSamplingSize(min(5, model.getSuperSamplingSize() +1)); + break; + default: + break; + } + view.update(); + + } + + }; + + component.addKeyListener(keyAdapter); + + component.addMouseListener(mouseAdapter); + component.addMouseMotionListener(mouseAdapter); + component.addMouseWheelListener(mouseAdapter); + + } + private void zoom(float zoom) { + Vec orig = model.getCamera().getOrig(); + orig.setX(orig.getX()+zoom) + .setY(orig.getY()+zoom) + .setZ(orig.getZ()+zoom); + } + + private void rotateLightX(float k) { + float[] light = model.getLight(); + float y = light[1]; + float z = light[2]; + light[1] = (float) ( y * cos(k) + z * sin(k)); + light[2] = (float) (-y * sin(k) + z * cos(k)); + model.setLight(light); + } + + private void rotateLightY(float k) { + float[] light = model.getLight(); + float x = light[0]; + float z = light[2]; + light[0] = (float) (x * cos(k) - z * sin(k)); + light[2] = (float) (x * sin(k) + z * cos(k)); + model.setLight(light); + } + + private void rotateCameraXbyOrig(double k) { + Vec orig = model.getCamera().getOrig(); + float y = orig.getY(); + float z = orig.getZ(); + orig.setY((float) ( y * cos(k) + z * sin(k))); + orig.setZ((float) (-y * sin(k) + z * cos(k))); + } + + private void rotateCameraYbyOrig(double k) { + Vec orig = model.getCamera().getOrig(); + float x = orig.getX(); + float z = orig.getZ(); + orig.setX((float) (x * cos(k) - z * sin(k))); + orig.setZ((float) (x * sin(k) + z * cos(k))); + } + + + public final static void vadd(Vec v, Vec a, Vec b) { + v.setX(a.getX() + b.getX()); + 
v.setY(a.getY() + b.getY()); + v.setZ(a.getZ() + b.getZ()); + } + + public final static void vsub(Vec v, Vec a, Vec b) { + v.setX(a.getX() - b.getX()); + v.setY(a.getY() - b.getY()); + v.setZ(a.getZ() - b.getZ()); + } + + public final static void vmul(Vec v, float s, Vec b) { + v.setX(s * b.getX()); + v.setY(s * b.getY()); + v.setZ(s * b.getZ()); + } + + public final static float vdot(Vec a, Vec b) { + return a.getX() * b.getX() + + a.getY() * b.getY() + + a.getZ() * b.getZ(); + } + + public final static void vnorm(Vec v) { + float s = (float) (1.0f / sqrt(vdot(v, v))); + vmul(v, s, v); + } + + public final static void vxcross(Vec v, Vec a, Vec b) { + v.setX(a.getY() * b.getZ() - a.getZ() * b.getY()); + v.setY(a.getZ() * b.getX() - a.getX() * b.getZ()); + v.setZ(a.getX() * b.getY() - a.getY() * b.getX()); + } + + +} \ No newline at end of file diff --git a/src/com/jogamp/opencl/demos/julia3d/config.h b/src/com/jogamp/opencl/demos/julia3d/config.h new file mode 100644 index 0000000..72df3ff --- /dev/null +++ b/src/com/jogamp/opencl/demos/julia3d/config.h @@ -0,0 +1,24 @@ + +typedef struct { + float x, y, z; // position, also color (r,g,b) +} Vec; + +typedef struct { + /* User defined values */ + Vec orig, target; + /* Calculated values */ + Vec dir, x, y; +} Camera; + +typedef struct { + unsigned int width, height; + int superSamplingSize; + int actvateFastRendering; + int enableShadow; + + unsigned int maxIterations; + float epsilon; + float mu[4]; + float light[3]; + Camera camera; +} RenderingConfig; diff --git a/src/com/jogamp/opencl/demos/julia3d/mandelbrot_kernel.cl b/src/com/jogamp/opencl/demos/julia3d/mandelbrot_kernel.cl new file mode 100644 index 0000000..d5acd02 --- /dev/null +++ b/src/com/jogamp/opencl/demos/julia3d/mandelbrot_kernel.cl @@ -0,0 +1,357 @@ +/* +Copyright (c) 2009 David Bucciarelli (davibu@interfree.it) + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the 
+"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#define GPU_KERNEL + + +typedef struct { + float x, y, z; // position, also color (r,g,b) +} Vec; + +typedef struct { + /* User defined values */ + Vec orig, target; + /* Calculated values */ + Vec dir, x, y; +} Camera; + +typedef struct { + unsigned int width, height; + int superSamplingSize; + int actvateFastRendering; + int enableShadow; + + unsigned int maxIterations; + float epsilon; + float mu[4]; + float light[3]; + Camera camera; +} RenderingConfig; + +#define BOUNDING_RADIUS_2 4.f + +// Scalar derivative approach by Enforcer: +// http://www.fractalforums.com/mandelbulb-implementation/realtime-renderingoptimisations/ +static float IterateIntersect(const float4 z0, const float4 c0, const uint maxIterations) { + float4 z = z0; + float4 c = c0; + + float dr = 1.0f; + float r2 = dot(z, z); + float r = sqrt(r2); + for (int n = 0; (n < maxIterations) && (r < 2.f); ++n) { + const float zo0 = asin(z.z / r); + const float zi0 = atan2(z.y, z.x); + float zr = r2 * r2 * r2 * r; + const float zo = zo0 * 7.f; + const float zi = zi0 * 7.f; + const float czo = cos(zo); + + dr = 
zr * dr * 7.f + 1.f; + zr *= r; + + z = zr * (float4)(czo * cos(zi), czo * sin(zi), sin(zo), 0.f); + z += c; + + r2 = dot(z, z); + r = sqrt(r2); + } + + return 0.5f * log(r) * r / dr; +} + +static float IntersectBulb(const float4 eyeRayOrig, const float4 eyeRayDir, + const float4 c, const uint maxIterations, const float epsilon, + const float maxDist, float4 *hitPoint, uint *steps) { + float dist; + float4 r0 = eyeRayOrig; + float distDone = 0.f; + + uint s = 0; + do { + dist = IterateIntersect(r0, c, maxIterations); + distDone += dist; + // We are inside + if (dist <= 0.f) + break; + + r0 += eyeRayDir * dist; + s++; + } while ((dist > epsilon) && (distDone < maxDist)); + + *hitPoint = r0; + *steps = s; + return dist; +} + +#define WORLD_RADIUS 1000.f +#define WORLD_CENTER ((float4)(0.f, -WORLD_RADIUS - 2.f, 0.f, 0.f)) +float IntersectFloorSphere(const float4 eyeRayOrig, const float4 eyeRayDir) { + const float4 op = WORLD_CENTER - eyeRayOrig; + const float b = dot(op, eyeRayDir); + float det = b * b - dot(op, op) + WORLD_RADIUS * WORLD_RADIUS; + + if (det < 0.f) + return -1.f; + else + det = sqrt(det); + + float t = b - det; + if (t > 0.f) + return t; + else { + // We are inside, avoid the hit + return -1.f; + } +} + +int IntersectBoundingSphere(const float4 eyeRayOrig, const float4 eyeRayDir, + float *tmin, float*tmax) { + const float4 op = -eyeRayOrig; + const float b = dot(op, eyeRayDir); + float det = b * b - dot(op, op) + BOUNDING_RADIUS_2; + + if (det < 0.f) + return 0; + else + det = sqrt(det); + + float t1 = b - det; + float t2 = b + det; + if (t1 > 0.f) { + *tmin = t1; + *tmax = t2; + return 1; + } else { + if (t2 > 0.f) { + // We are inside, start from the ray origin + *tmin = 0.f; + *tmax = t2; + + return 1; + } else + return 0; + } +} + +static float4 NormEstimate(const float4 p, const float4 c, + const float delta, const uint maxIterations) { + const float4 qP = p; + const float4 gx1 = qP - (float4)(delta, 0.f, 0.f, 0.f); + const float4 gx2 = qP + 
(float4)(delta, 0.f, 0.f, 0.f); + const float4 gy1 = qP - (float4)(0.f, delta, 0.f, 0.f); + const float4 gy2 = qP + (float4)(0.f, delta, 0.f, 0.f); + const float4 gz1 = qP - (float4)(0.f, 0.f, delta, 0.f); + const float4 gz2 = qP + (float4)(0.f, 0.f, delta, 0.f); + + const float gradX = length(IterateIntersect(gx2, c, maxIterations)) - + length(IterateIntersect(gx1, c, maxIterations)); + const float gradY = length(IterateIntersect(gy2, c, maxIterations)) - + length(IterateIntersect(gy1, c, maxIterations)); + const float gradZ = length(IterateIntersect(gz2, c, maxIterations)) - + length(IterateIntersect(gz1, c, maxIterations)); + + const float4 N = normalize((float4)(gradX, gradY, gradZ, 0.f)); + + return N; +} + +static float4 Phong(const float4 light, const float4 eye, const float4 pt, + const float4 N, const float4 diffuse) { + const float4 ambient = (float4) (0.05f, 0.05f, 0.05f, 0.f); + float4 L = normalize(light - pt); + float NdotL = dot(N, L); + if (NdotL < 0.f) + return diffuse * ambient; + + const float specularExponent = 30.f; + const float specularity = 0.65f; + + float4 E = normalize(eye - pt); + float4 H = (L + E) * (float)0.5f; + + return diffuse * NdotL + + specularity * pow(dot(N, H), specularExponent) + + diffuse * ambient; +} + +__kernel void MandelbulbGPU( + __global float *pixels, + const __global RenderingConfig *config, + const int enableAccumulation, + const float sampleX, + const float sampleY) { + const int gid = get_global_id(0); + const unsigned width = config->width; + const unsigned height = config->height; + + const unsigned int x = gid % width; + const int y = gid / width; + + // Check if we have to do something + if (y >= height) + return; + + const float epsilon = config->actvateFastRendering ? (config->epsilon * (1.5f / 0.75f)) : config->epsilon; + const uint maxIterations = config->actvateFastRendering ? 
(max(3u, config->maxIterations) - 2u) : config->maxIterations; + + const float4 mu = (float4)(config->mu[0], config->mu[1], config->mu[2], config->mu[3]); + const float4 light = (float4)(config->light[0], config->light[1], config->light[2], 0.f); + const __global Camera *camera = &config->camera; + + //-------------------------------------------------------------------------- + // Calculate eye ray + //-------------------------------------------------------------------------- + + const float invWidth = 1.f / width; + const float invHeight = 1.f / height; + const float kcx = (x + sampleX) * invWidth - .5f; + const float4 kcx4 = (float4)kcx; + const float kcy = (y + sampleY) * invHeight - .5f; + const float4 kcy4 = (float4)kcy; + + const float4 cameraX = (float4)(camera->x.x, camera->x.y, camera->x.z, 0.f); + const float4 cameraY = (float4)(camera->y.x, camera->y.y, camera->y.z, 0.f); + const float4 cameraDir = (float4)(camera->dir.x, camera->dir.y, camera->dir.z, 0.f); + const float4 cameraOrig = (float4)(camera->orig.x, camera->orig.y, camera->orig.z, 0.f); + + const float4 eyeRayDir = normalize(cameraX * kcx4 + cameraY * kcy4 + cameraDir); + const float4 eyeRayOrig = eyeRayDir * (float4)0.1f + cameraOrig; + + //-------------------------------------------------------------------------- + // Check if we hit the bounding sphere + //-------------------------------------------------------------------------- + + int useAO = 1; + float4 diffuse, n, color; + + float4 hitPoint; + float dist, tmin, tmax; + if (IntersectBoundingSphere(eyeRayOrig, eyeRayDir, &tmin, &tmax)) { + //-------------------------------------------------------------------------- + // Find the intersection with the set + //-------------------------------------------------------------------------- + + uint steps; + float4 rayOrig = eyeRayOrig + eyeRayDir * (float4)tmin; + dist = IntersectBulb(rayOrig, eyeRayDir, mu, maxIterations, + epsilon, tmax - tmin, &hitPoint, &steps); + + if (dist <= epsilon) { + 
// Set hit + diffuse = (float4) (1.f, 0.35f, 0.15f, 0.f); + n = NormEstimate(hitPoint, mu, dist, maxIterations); + } else + dist = -1.f; + } else + dist = -1.f; + + //-------------------------------------------------------------------------- + // Check if we hit the floor + //-------------------------------------------------------------------------- + + if (dist < 0.f) { + dist = IntersectFloorSphere(eyeRayOrig, eyeRayDir); + + if (dist >= 0.f) { + // Floor hit + hitPoint = eyeRayOrig + eyeRayDir * (float4)dist; + n = hitPoint - WORLD_CENTER; + n = normalize(n); + // The most important feature in a ray tracer: a checker texture ! + const int ix = (hitPoint.x > 0.f) ? hitPoint.x : (1.f - hitPoint.x); + const int iz = (hitPoint.z > 0.f) ? hitPoint.z : (1.f - hitPoint.z); + if ((ix + iz) % 2) + diffuse = (float4) (0.75f, 0.75f, 0.75f, 0.f); + else + diffuse = (float4) (0.75f, 0.f, 0.f, 0.f); + useAO = 0; + } else { + // Sky hit + color = (float4)(0.f, 0.1f, 0.3f, 0.f); + } + } else { + // Sky hit + color = (float4)(0.f, 0.1f, 0.3f, 0.f); + } + + //-------------------------------------------------------------------------- + // Select the shadow pass + //-------------------------------------------------------------------------- + + if (dist >= 0.f) { + float shadowFactor = 1.f; + if (config->enableShadow) { + float4 L = normalize(light - hitPoint); + float4 rO = hitPoint + n * 1e-2f; + float4 shadowHitPoint; + + // Check bounding sphere + if (IntersectBoundingSphere(rO, L, &tmin, &tmax)) { + float shadowDistSet = tmin; + uint steps; + + rO = rO + L * (float4)shadowDistSet; + shadowDistSet = IntersectBulb(rO, L, mu, maxIterations, epsilon, + tmax - tmin, &shadowHitPoint, &steps); + if (shadowDistSet < epsilon) { + if (useAO) { + // Use steps count to simulate ambient occlusion + shadowFactor = 0.6f - min(steps / 255.f, 0.5f); + } else + shadowFactor = 0.6f; + } + } + } + + //-------------------------------------------------------------------------- + // Direct lighting 
of hit point + //-------------------------------------------------------------------------- + + color = Phong(light, eyeRayOrig, hitPoint, n, diffuse) * shadowFactor; + } + + //-------------------------------------------------------------------------- + // Write pixel + //-------------------------------------------------------------------------- + + int offset = 3 * (x + y * width); + color = clamp(color, (float4)(0.f, 0.f ,0.f, 0.f), (float4)(1.f, 1.f ,1.f, 0.f)); + if (enableAccumulation) { + pixels[offset++] += color.s0; + pixels[offset++] += color.s1; + pixels[offset] += color.s2; + } else { + pixels[offset++] = color.s0; + pixels[offset++] = color.s1; + pixels[offset] = color.s2; + } +} + +kernel void multiply(global float *array, const int numElements, const float s) { + const int gid = get_global_id(0); + if (gid >= numElements) { + return; + } + array[gid] *= s; +} diff --git a/src/com/jogamp/opencl/demos/julia3d/rendering_kernel.cl b/src/com/jogamp/opencl/demos/julia3d/rendering_kernel.cl new file mode 100644 index 0000000..9c25c1b --- /dev/null +++ b/src/com/jogamp/opencl/demos/julia3d/rendering_kernel.cl @@ -0,0 +1,382 @@ +/* +Copyright (c) 2009 David Bucciarelli (davibu@interfree.it) + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#define GPU_KERNEL + + +#define BOUNDING_RADIUS_2 4.f +#define ESCAPE_THRESHOLD 1e1f +#define DELTA 1e-4f + +typedef struct { + float x, y, z; // position, also color (r,g,b) +} Vec; + +typedef struct { + Vec orig, target; + Vec dir, x, y; +} Camera; + +typedef struct { + unsigned int width, height; + int superSamplingSize; + int actvateFastRendering; + int enableShadow; + + unsigned int maxIterations; + float epsilon; + float mu[4]; + float light[3]; + Camera camera; +} RenderingConfig; + + +static float4 QuatMult(const float4 q1, const float4 q2) { + float4 r; + + // a1a2 - b1b2 - c1c2 - d1d2 + r.x = q1.x * q2.x - q1.y * q2.y - q1.z * q2.z - q1.w * q2.w; + // a1b2 + b1a2 + c1d2 - d1c2 + r.y = q1.x * q2.y + q1.y * q2.x + q1.z * q2.w - q1.w * q2.z; + // a1c2 - b1d2 + c1a2 + d1b2 + r.z = q1.x * q2.z - q1.y * q2.w + q1.z * q2.x + q1.w * q2.y; + // a1d2 + b1c2 - c1b2 + d1a2 + r.w = q1.x * q2.w + q1.y * q2.z - q1.z * q2.y + q1.w * q2.x; + + return r; +} + +static float4 QuatSqr(const float4 q) { + float4 r; + + r.x = q.x * q.x - q.y * q.y - q.z * q.z - q.w * q.w; + r.y = 2.f * q.x * q.y; + r.z = 2.f * q.x * q.z; + r.w = 2.f * q.x * q.w; + + return r; +} + +static void IterateIntersect(float4 *q, float4 *qp, const float4 c, const uint maxIterations) { + float4 q0 = *q; + float4 qp0 = *qp; + + for (uint i = 0; i < maxIterations; ++i) { + qp0 = 2.f * QuatMult(q0, qp0); + q0 = QuatSqr(q0) + c; + + if (dot(q0, q0) > ESCAPE_THRESHOLD) + break; + } + + *q = q0; + *qp = qp0; +} + +static float IntersectJulia(const float4 eyeRayOrig, const float4 eyeRayDir, + const float4 c, const uint maxIterations, const float epsilon, + float4 *hitPoint, uint *steps) { + float dist; + float4 r0 = eyeRayOrig; + + uint s = 0; + 
do { + float4 z = r0; + float4 zp = (float4) (1.f, 0.f, 0.f, 0.f); + + IterateIntersect(&z, &zp, c, maxIterations); + + const float normZP = length(zp); + + // We are inside + if (normZP == 0.f) + break; + + const float normZ = length(z); + dist = 0.5f * normZ * log(normZ) / normZP; + + r0 += eyeRayDir * dist; + s++; + } while ((dist > epsilon) && (dot(r0, r0) < BOUNDING_RADIUS_2)); + + *hitPoint = r0; + *steps = s; + return dist; +} + +#define WORLD_RADIUS 1000.f +#define WORLD_CENTER ((float4)(0.f, -WORLD_RADIUS - 2.f, 0.f, 0.f)) + +float IntersectFloorSphere(const float4 eyeRayOrig, const float4 eyeRayDir) { + const float4 op = WORLD_CENTER - eyeRayOrig; + const float b = dot(op, eyeRayDir); + float det = b * b - dot(op, op) + WORLD_RADIUS * WORLD_RADIUS; + + if (det < 0.f) + return -1.f; + else + det = sqrt(det); + + float t = b - det; + if (t > 0.f) + return t; + else { + // We are inside, avoid the hit + return -1.f; + } +} + +float IntersectBoundingSphere(const float4 eyeRayOrig, const float4 eyeRayDir) { + const float4 op = -eyeRayOrig; + const float b = dot(op, eyeRayDir); + float det = b * b - dot(op, op) + BOUNDING_RADIUS_2; + + if (det < 0.f) + return -1.f; + else + det = sqrt(det); + + float t = b - det; + if (t > 0.f) + return t; + else { + t = b + det; + + if (t > 0.f) { + // We are inside, start from the ray origin + return 0.0f; + } else + return -1.f; + } +} + +static float4 NormEstimate(const float4 p, const float4 c, + const float delta, const uint maxIterations) { + float4 N; + float4 qP = p; + float gradX, gradY, gradZ; + + float4 gx1 = qP - (float4) (DELTA, 0.f, 0.f, 0.f); + float4 gx2 = qP + (float4) (DELTA, 0.f, 0.f, 0.f); + float4 gy1 = qP - (float4) (0.f, DELTA, 0.f, 0.f); + float4 gy2 = qP + (float4) (0.f, DELTA, 0.f, 0.f); + float4 gz1 = qP - (float4) (0.f, 0.f, DELTA, 0.f); + float4 gz2 = qP + (float4) (0.f, 0.f, DELTA, 0.f); + + for (uint i = 0; i < maxIterations; ++i) { + gx1 = QuatSqr(gx1) + c; + gx2 = QuatSqr(gx2) + c; + gy1 = 
QuatSqr(gy1) + c; + gy2 = QuatSqr(gy2) + c; + gz1 = QuatSqr(gz1) + c; + gz2 = QuatSqr(gz2) + c; + } + + gradX = length(gx2) - length(gx1); + gradY = length(gy2) - length(gy1); + gradZ = length(gz2) - length(gz1); + + N = normalize((float4) (gradX, gradY, gradZ, 0.f)); + + return N; +} + +static float4 Phong(const float4 light, const float4 eye, const float4 pt, const float4 N, const float4 diffuse) { + + const float4 ambient = (float4) (0.05f, 0.05f, 0.05f, 0.f); + float4 L = normalize(light - pt); + float NdotL = dot(N, L); + if (NdotL < 0.f) + return diffuse * ambient; + + const float specularExponent = 30.f; + const float specularity = 0.65f; + + float4 E = normalize(eye - pt); + float4 H = (L + E) * (float) 0.5f; + + return diffuse * NdotL + + specularity * pow(dot(N, H), specularExponent) + + diffuse * ambient; +} + +kernel void JuliaGPU( global float *pixels, + const global RenderingConfig *config, + int enableAccumulation, + float sampleX, + float sampleY ) { + + const int gid = get_global_id(0); + unsigned width = config->width; + unsigned height = config->height; + + const unsigned int x = gid % width; + const int y = gid / width; + + // Check if we have to do something + if (y >= height) + return; + + const float epsilon = config->actvateFastRendering ? (config->epsilon * (1.f / 0.75f)) : config->epsilon; + const uint maxIterations = max(1u, config->actvateFastRendering ? 
(config->maxIterations - 1) : config->maxIterations); + + const float4 mu = (float4)(config->mu[0], config->mu[1], config->mu[2], config->mu[3]); + const float4 light = (float4) (config->light[0], config->light[1], config->light[2], 0.f); + const global Camera *camera = &config->camera; + + //-------------------------------------------------------------------------- + // Calculate eye ray + //-------------------------------------------------------------------------- + + const float invWidth = 1.f / width; + const float invHeight = 1.f / height; + const float kcx = (x + sampleX) * invWidth - .5f; + const float4 kcx4 = (float4) kcx; + const float kcy = (y + sampleY) * invHeight - .5f; + const float4 kcy4 = (float4) kcy; + + const float4 cameraX = (float4) (camera->x.x, camera->x.y, camera->x.z, 0.f); + const float4 cameraY = (float4) (camera->y.x, camera->y.y, camera->y.z, 0.f); + const float4 cameraDir = (float4) (camera->dir.x, camera->dir.y, camera->dir.z, 0.f); + const float4 cameraOrig = (float4) (camera->orig.x, camera->orig.y, camera->orig.z, 0.f); + + const float4 eyeRayDir = normalize(cameraX * kcx4 + cameraY * kcy4 + cameraDir); + const float4 eyeRayOrig = eyeRayDir * (float4) 0.1f + cameraOrig; + + //-------------------------------------------------------------------------- + // Check if we hit the bounding sphere + //-------------------------------------------------------------------------- + + float distSet = IntersectBoundingSphere(eyeRayOrig, eyeRayDir); + float4 hitPoint; + if (distSet >= 0.f) { + //-------------------------------------------------------------------------- + // Find the intersection with the set + //-------------------------------------------------------------------------- + + uint steps; + float4 rayOrig = eyeRayOrig + eyeRayDir * (float4) distSet; + distSet = IntersectJulia(rayOrig, eyeRayDir, mu, maxIterations, + epsilon, &hitPoint, &steps); + if (distSet > epsilon) + distSet = -1.f; + } + + 
//-------------------------------------------------------------------------- + // Check if we hit the floor + //-------------------------------------------------------------------------- + + float distFloor = IntersectFloorSphere(eyeRayOrig, eyeRayDir); + + //-------------------------------------------------------------------------- + // Select the hit point + //-------------------------------------------------------------------------- + + int doShade = 0; + int useAO = 1; + float4 diffuse, n, color; + if ((distSet < 0.f) && (distFloor < 0.f)) { + // Sky hit + color = (float4) (0.f, 0.1f, 0.3f, 0.f); + } else if ((distSet >= 0.f) && ((distFloor < 0.f) || (distSet <= distFloor))) { + // Set hit + diffuse = (float4) (1.f, 0.35f, 0.15f, 0.f); + n = NormEstimate(hitPoint, mu, distSet, maxIterations); + doShade = 1; + } else if ((distFloor >= 0.f) && ((distSet < 0.f) || (distFloor <= distSet))) { + // Floor hit + hitPoint = eyeRayOrig + eyeRayDir * (float4) distFloor; + n = hitPoint - WORLD_CENTER; + n = normalize(n); + // The most important feature in a ray tracer: a checker texture ! + const int ix = (hitPoint.x > 0.f) ? hitPoint.x : (1.f - hitPoint.x); + const int iz = (hitPoint.z > 0.f) ? 
hitPoint.z : (1.f - hitPoint.z); + if ((ix + iz) % 2) + diffuse = (float4) (0.75f, 0.75f, 0.75f, 0.f); + else + diffuse = (float4) (0.75f, 0.f, 0.f, 0.f); + doShade = 1; + useAO = 0; + } + + //-------------------------------------------------------------------------- + // Select the shadow pass + //-------------------------------------------------------------------------- + + if (doShade) { + float shadowFactor = 1.f; + if (config->enableShadow) { + float4 L = normalize(light - hitPoint); + float4 rO = hitPoint + n * 1e-2f; + float4 shadowHitPoint; + + // Check bounding sphere + float shadowDistSet = IntersectBoundingSphere(rO, L); + if (shadowDistSet >= 0.f) { + uint steps; + + rO = rO + L * (float4) shadowDistSet; + shadowDistSet = IntersectJulia(rO, L, mu, maxIterations, epsilon, + &shadowHitPoint, &steps); + if (shadowDistSet < epsilon) { + if (useAO) { + // Use steps count to simulate ambient occlusion + shadowFactor = 0.6f - min(steps / 255.f, 0.5f); + } else + shadowFactor = 0.6f; + } + } else + shadowDistSet = -1.f; + } + + //-------------------------------------------------------------------------- + // Direct lighting of hit point + //-------------------------------------------------------------------------- + + color = Phong(light, eyeRayOrig, hitPoint, n, diffuse) * shadowFactor; + } + + //-------------------------------------------------------------------------- + // Write pixel + //-------------------------------------------------------------------------- + + int offset = 3 * (x + y * width); + color = clamp(color, (float4) (0.f, 0.f, 0.f, 0.f), (float4) (1.f, 1.f, 1.f, 0.f)); + if (enableAccumulation) { + pixels[offset++] += color.s0; + pixels[offset++] += color.s1; + pixels[offset ] += color.s2; + } else { + pixels[offset++] = color.s0; + pixels[offset++] = color.s1; + pixels[offset ] = color.s2; + } +} + +kernel void multiply(global float *array, const int numElements, const float s) { + const int gid = get_global_id(0); + if (gid >= numElements) { 
+ return; + } + array[gid] *= s; +} diff --git a/src/com/jogamp/opencl/demos/julia3d/structs/Camera.java b/src/com/jogamp/opencl/demos/julia3d/structs/Camera.java new file mode 100644 index 0000000..68c567c --- /dev/null +++ b/src/com/jogamp/opencl/demos/julia3d/structs/Camera.java @@ -0,0 +1,50 @@ +/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */ + + +package com.jogamp.opencl.demos.julia3d.structs; + +import com.jogamp.common.nio.*; + + +public abstract class Camera { + + StructAccessor accessor; + + public static int size() { +// if (CPU.is32Bit()) { +// return Camera32.size(); +// } else { + return Camera64.size(); +// } + } + + public static Camera create() { + return create(Buffers.newDirectByteBuffer(size())); + } + + public static Camera create(java.nio.ByteBuffer buf) { +// if (CPU.is32Bit()) { +// return new Camera32(buf); +// } else { + return new Camera64(buf); +// } + } + + Camera(java.nio.ByteBuffer buf) { + accessor = new StructAccessor(buf); + } + + public java.nio.ByteBuffer getBuffer() { + return accessor.getBuffer(); + } + + public abstract Vec getOrig(); + + public abstract Vec getTarget(); + + public abstract Vec getDir(); + + public abstract Vec getX(); + + public abstract Vec getY(); +} diff --git a/src/com/jogamp/opencl/demos/julia3d/structs/Camera32.java b/src/com/jogamp/opencl/demos/julia3d/structs/Camera32.java new file mode 100644 index 0000000..1811583 --- /dev/null +++ b/src/com/jogamp/opencl/demos/julia3d/structs/Camera32.java @@ -0,0 +1,37 @@ +/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! 
*/ + + +package com.jogamp.opencl.demos.julia3d.structs; + + +class Camera32 extends Camera { + + public static int size() { + return 76; + } + + Camera32(java.nio.ByteBuffer buf) { + super(buf); + } + + + public Vec getOrig() { + return Vec.create(accessor.slice(0, 12)); + } + + public Vec getTarget() { + return Vec.create(accessor.slice(16, 12)); + } + + public Vec getDir() { + return Vec.create(accessor.slice(32, 12)); + } + + public Vec getX() { + return Vec.create(accessor.slice(48, 12)); + } + + public Vec getY() { + return Vec.create(accessor.slice(64, 12)); + } +} diff --git a/src/com/jogamp/opencl/demos/julia3d/structs/Camera64.java b/src/com/jogamp/opencl/demos/julia3d/structs/Camera64.java new file mode 100644 index 0000000..f82d3b3 --- /dev/null +++ b/src/com/jogamp/opencl/demos/julia3d/structs/Camera64.java @@ -0,0 +1,48 @@ +/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */ + + +package com.jogamp.opencl.demos.julia3d.structs; + + +class Camera64 extends Camera { + + private final Vec orig; + private final Vec target; + private final Vec dir; + private final Vec x; + private final Vec y; + + public static int size() { + return 60; + } + + Camera64(java.nio.ByteBuffer buf) { + super(buf); + orig = Vec.create(accessor.slice(0, 12)); + target = Vec.create(accessor.slice(12, 12)); + dir = Vec.create(accessor.slice(24, 12)); + x = Vec.create(accessor.slice(36, 12)); + y = Vec.create(accessor.slice(48, 12)); + } + + + public Vec getOrig() { + return orig; + } + + public Vec getTarget() { + return target; + } + + public Vec getDir() { + return dir; + } + + public Vec getX() { + return x; + } + + public Vec getY() { + return y; + } +} diff --git a/src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig.java b/src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig.java new file mode 100644 index 0000000..4b14f1a --- /dev/null +++ 
b/src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig.java @@ -0,0 +1,78 @@ +/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */ + + +package com.jogamp.opencl.demos.julia3d.structs; + +import com.jogamp.common.nio.*; + + +public abstract class RenderingConfig { + + StructAccessor accessor; + + public static int size() { +// if (CPU.is32Bit()) { +// return RenderingConfig32.size(); +// } else { + return RenderingConfig64.size(); +// } + } + + public static RenderingConfig create() { + return create(Buffers.newDirectByteBuffer(size())); + } + + public static RenderingConfig create(java.nio.ByteBuffer buf) { +// if (CPU.is32Bit()) { +// return new RenderingConfig32(buf); +// } else { + return new RenderingConfig64(buf); +// } + } + + RenderingConfig(java.nio.ByteBuffer buf) { + accessor = new StructAccessor(buf); + } + + public java.nio.ByteBuffer getBuffer() { + return accessor.getBuffer(); + } + + public abstract RenderingConfig setWidth(int val); + + public abstract int getWidth(); + + public abstract RenderingConfig setHeight(int val); + + public abstract int getHeight(); + + public abstract RenderingConfig setSuperSamplingSize(int val); + + public abstract int getSuperSamplingSize(); + + public abstract RenderingConfig setActvateFastRendering(int val); + + public abstract int getActvateFastRendering(); + + public abstract RenderingConfig setEnableShadow(int val); + + public abstract int getEnableShadow(); + + public abstract RenderingConfig setMaxIterations(int val); + + public abstract int getMaxIterations(); + + public abstract RenderingConfig setEpsilon(float val); + + public abstract float getEpsilon(); + + public abstract RenderingConfig setMu(float[] val); + + public abstract float[] getMu(); + + public abstract RenderingConfig setLight(float[] val); + + public abstract float[] getLight(); + + public abstract Camera getCamera(); +} diff --git 
a/src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig32.java b/src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig32.java new file mode 100644 index 0000000..27f40e6 --- /dev/null +++ b/src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig32.java @@ -0,0 +1,102 @@ +/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */ + + +package com.jogamp.opencl.demos.julia3d.structs; + + +class RenderingConfig32 extends RenderingConfig { + + public static int size() { + return 140; + } + + RenderingConfig32(java.nio.ByteBuffer buf) { + super(buf); + } + + + public RenderingConfig setWidth(int val) { + accessor.setIntAt(0, val); + return this; + } + + public int getWidth() { + return accessor.getIntAt(0); + } + + public RenderingConfig setHeight(int val) { + accessor.setIntAt(1, val); + return this; + } + + public int getHeight() { + return accessor.getIntAt(1); + } + + public RenderingConfig setSuperSamplingSize(int val) { + accessor.setIntAt(2, val); + return this; + } + + public int getSuperSamplingSize() { + return accessor.getIntAt(2); + } + + public RenderingConfig setActvateFastRendering(int val) { + accessor.setIntAt(3, val); + return this; + } + + public int getActvateFastRendering() { + return accessor.getIntAt(3); + } + + public RenderingConfig setEnableShadow(int val) { + accessor.setIntAt(4, val); + return this; + } + + public int getEnableShadow() { + return accessor.getIntAt(4); + } + + public RenderingConfig setMaxIterations(int val) { + accessor.setIntAt(5, val); + return this; + } + + public int getMaxIterations() { + return accessor.getIntAt(5); + } + + public RenderingConfig setEpsilon(float val) { + accessor.setFloatAt(6, val); + return this; + } + + public float getEpsilon() { + return accessor.getFloatAt(6); + } + + public RenderingConfig setMu(float[] val) { + accessor.setFloatsAt(8, val); + return this; + } + + public float[] getMu() { + return 
accessor.getFloatsAt(8, new float[4]); + } + + public RenderingConfig setLight(float[] val) { + accessor.setFloatsAt(12, val); + return this; + } + + public float[] getLight() { + return accessor.getFloatsAt(12, new float[3]); + } + + public Camera getCamera() { + return Camera.create(accessor.slice(64, 76)); + } +} diff --git a/src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig64.java b/src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig64.java new file mode 100644 index 0000000..e60987e --- /dev/null +++ b/src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig64.java @@ -0,0 +1,105 @@ +/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */ + + +package com.jogamp.opencl.demos.julia3d.structs; + + +class RenderingConfig64 extends RenderingConfig { + + private final Camera camera; + + public static int size() { + return 116; + } + + RenderingConfig64(java.nio.ByteBuffer buf) { + super(buf); + camera = Camera.create(accessor.slice(56, 60)); + } + + + public RenderingConfig setWidth(int val) { + accessor.setIntAt(0, val); + return this; + } + + public int getWidth() { + return accessor.getIntAt(0); + } + + public RenderingConfig setHeight(int val) { + accessor.setIntAt(1, val); + return this; + } + + public int getHeight() { + return accessor.getIntAt(1); + } + + public RenderingConfig setSuperSamplingSize(int val) { + accessor.setIntAt(2, val); + return this; + } + + public int getSuperSamplingSize() { + return accessor.getIntAt(2); + } + + public RenderingConfig setActvateFastRendering(int val) { + accessor.setIntAt(3, val); + return this; + } + + public int getActvateFastRendering() { + return accessor.getIntAt(3); + } + + public RenderingConfig setEnableShadow(int val) { + accessor.setIntAt(4, val); + return this; + } + + public int getEnableShadow() { + return accessor.getIntAt(4); + } + + public RenderingConfig setMaxIterations(int val) { + accessor.setIntAt(5, val); + 
return this; + } + + public int getMaxIterations() { + return accessor.getIntAt(5); + } + + public RenderingConfig setEpsilon(float val) { + accessor.setFloatAt(6, val); + return this; + } + + public float getEpsilon() { + return accessor.getFloatAt(6); + } + + public RenderingConfig setMu(float[] val) { + accessor.setFloatsAt(7, val); + return this; + } + + public float[] getMu() { + return accessor.getFloatsAt(7, new float[4]); + } + + public RenderingConfig setLight(float[] val) { + accessor.setFloatsAt(11, val); + return this; + } + + public float[] getLight() { + return accessor.getFloatsAt(11, new float[3]); + } + + public Camera getCamera() { + return camera; + } +} diff --git a/src/com/jogamp/opencl/demos/julia3d/structs/Vec.java b/src/com/jogamp/opencl/demos/julia3d/structs/Vec.java new file mode 100644 index 0000000..d4b2d48 --- /dev/null +++ b/src/com/jogamp/opencl/demos/julia3d/structs/Vec.java @@ -0,0 +1,53 @@ +/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! 
*/ + + +package com.jogamp.opencl.demos.julia3d.structs; + + +import com.jogamp.common.nio.*; + + +public abstract class Vec { + + StructAccessor accessor; + + public static int size() { +// if (CPU.is32Bit()) { +// return Vec32.size(); +// } else { + return Vec64.size(); +// } + } + + public static Vec create() { + return create(Buffers.newDirectByteBuffer(size())); + } + + public static Vec create(java.nio.ByteBuffer buf) { +// if (CPU.is32Bit()) { +// return new Vec32(buf); +// } else { + return new Vec64(buf); +// } + } + + Vec(java.nio.ByteBuffer buf) { + accessor = new StructAccessor(buf); + } + + public java.nio.ByteBuffer getBuffer() { + return accessor.getBuffer(); + } + + public abstract Vec setX(float val); + + public abstract float getX(); + + public abstract Vec setY(float val); + + public abstract float getY(); + + public abstract Vec setZ(float val); + + public abstract float getZ(); +} diff --git a/src/com/jogamp/opencl/demos/julia3d/structs/Vec32.java b/src/com/jogamp/opencl/demos/julia3d/structs/Vec32.java new file mode 100644 index 0000000..e7668ac --- /dev/null +++ b/src/com/jogamp/opencl/demos/julia3d/structs/Vec32.java @@ -0,0 +1,44 @@ +/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! 
*/ + + +package com.jogamp.opencl.demos.julia3d.structs; + + +class Vec32 extends Vec { + + public static int size() { + return 12; + } + + Vec32(java.nio.ByteBuffer buf) { + super(buf); + } + + + public Vec setX(float val) { + accessor.setFloatAt(0, val); + return this; + } + + public float getX() { + return accessor.getFloatAt(0); + } + + public Vec setY(float val) { + accessor.setFloatAt(1, val); + return this; + } + + public float getY() { + return accessor.getFloatAt(1); + } + + public Vec setZ(float val) { + accessor.setFloatAt(2, val); + return this; + } + + public float getZ() { + return accessor.getFloatAt(2); + } +} diff --git a/src/com/jogamp/opencl/demos/julia3d/structs/Vec64.java b/src/com/jogamp/opencl/demos/julia3d/structs/Vec64.java new file mode 100644 index 0000000..60750a4 --- /dev/null +++ b/src/com/jogamp/opencl/demos/julia3d/structs/Vec64.java @@ -0,0 +1,44 @@ +/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */ + + +package com.jogamp.opencl.demos.julia3d.structs; + + +class Vec64 extends Vec { + + public static int size() { + return 12; + } + + Vec64(java.nio.ByteBuffer buf) { + super(buf); + } + + + public Vec setX(float val) { + accessor.setFloatAt(0, val); + return this; + } + + public float getX() { + return accessor.getFloatAt(0); + } + + public Vec setY(float val) { + accessor.setFloatAt(1, val); + return this; + } + + public float getY() { + return accessor.getFloatAt(1); + } + + public Vec setZ(float val) { + accessor.setFloatAt(2, val); + return this; + } + + public float getZ() { + return accessor.getFloatAt(2); + } +} diff --git a/src/com/jogamp/opencl/demos/radixsort/RadixSort.cl b/src/com/jogamp/opencl/demos/radixsort/RadixSort.cl new file mode 100644 index 0000000..d014692 --- /dev/null +++ b/src/com/jogamp/opencl/demos/radixsort/RadixSort.cl @@ -0,0 +1,358 @@ +/* +* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. 
+* +* NVIDIA Corporation and its licensors retain all intellectual property and +* proprietary rights in and to this software and related documentation. +* Any use, reproduction, disclosure, or distribution of this software +* and related documentation without an express license agreement from +* NVIDIA Corporation is strictly prohibited. +* +* Please refer to the applicable NVIDIA end user license agreement (EULA) +* associated with this source code for terms and conditions that govern +* your use of this NVIDIA software. +* +*/ + +//---------------------------------------------------------------------------- +// Scans each warp in parallel ("warp-scan"), one element per thread. +// uses 2 numElements of shared memory per thread (64 = elements per warp) +//---------------------------------------------------------------------------- +//#define WARP_SIZE 32 +uint scanwarp(uint val, __local uint* sData, int maxlevel) +{ + // The following is the same as 2 * RadixSort::WARP_SIZE * warpId + threadInWarp = + // 64*(threadIdx.x >> 5) + (threadIdx.x & (RadixSort::WARP_SIZE - 1)) + int localId = get_local_id(0); + int idx = 2 * localId - (localId & (WARP_SIZE - 1)); + sData[idx] = 0; + idx += WARP_SIZE; + sData[idx] = val; + + if (0 <= maxlevel) { sData[idx] += sData[idx - 1]; } + if (1 <= maxlevel) { sData[idx] += sData[idx - 2]; } + if (2 <= maxlevel) { sData[idx] += sData[idx - 4]; } + if (3 <= maxlevel) { sData[idx] += sData[idx - 8]; } + if (4 <= maxlevel) { sData[idx] += sData[idx -16]; } + + return sData[idx] - val; // convert inclusive -> exclusive +} + +//---------------------------------------------------------------------------- +// scan4 scans 4*RadixSort::CTA_SIZE numElements in a block (4 per thread), using +// a warp-scan algorithm +//---------------------------------------------------------------------------- +uint4 scan4(uint4 idata, __local uint* ptr) +{ + + uint idx = get_local_id(0); + + uint4 val4 = idata; + uint sum[3]; + sum[0] = val4.x; + sum[1] = 
val4.y + sum[0]; + sum[2] = val4.z + sum[1]; + + uint val = val4.w + sum[2]; + + val = scanwarp(val, ptr, 4); + barrier(CLK_LOCAL_MEM_FENCE); + + if ((idx & (WARP_SIZE - 1)) == WARP_SIZE - 1) + { + ptr[idx >> 5] = val + val4.w + sum[2]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (idx < WARP_SIZE) + ptr[idx] = scanwarp(ptr[idx], ptr, 2); + + barrier(CLK_LOCAL_MEM_FENCE); + + val += ptr[idx >> 5]; + + val4.x = val; + val4.y = val + sum[0]; + val4.z = val + sum[1]; + val4.w = val + sum[2]; + + return val4; +} + +#ifdef MAC +__kernel uint4 rank4(uint4 preds, __local uint* sMem) +#else +uint4 rank4(uint4 preds, __local uint* sMem) +#endif +{ + int localId = get_local_id(0); + int localSize = get_local_size(0); + + uint4 address = scan4(preds, sMem); + + __local uint numtrue; + if (localId == localSize - 1) + { + numtrue = address.w + preds.w; + } + barrier(CLK_LOCAL_MEM_FENCE); + + uint4 rank; + int idx = localId*4; + rank.x = (preds.x) ? address.x : numtrue + idx - address.x; + rank.y = (preds.y) ? address.y : numtrue + idx + 1 - address.y; + rank.z = (preds.z) ? address.z : numtrue + idx + 2 - address.z; + rank.w = (preds.w) ? 
address.w : numtrue + idx + 3 - address.w; + + return rank; +} + +void radixSortBlockKeysOnly(uint4 *key, uint nbits, uint startbit, __local uint* sMem) +{ + int localId = get_local_id(0); + int localSize = get_local_size(0); + + for(uint shift = startbit; shift < (startbit + nbits); ++shift) + { + uint4 lsb; + lsb.x = !(((*key).x >> shift) & 0x1); + lsb.y = !(((*key).y >> shift) & 0x1); + lsb.z = !(((*key).z >> shift) & 0x1); + lsb.w = !(((*key).w >> shift) & 0x1); + + uint4 r; + + r = rank4(lsb, sMem); + + // This arithmetic strides the ranks across 4 CTA_SIZE regions + sMem[(r.x & 3) * localSize + (r.x >> 2)] = (*key).x; + sMem[(r.y & 3) * localSize + (r.y >> 2)] = (*key).y; + sMem[(r.z & 3) * localSize + (r.z >> 2)] = (*key).z; + sMem[(r.w & 3) * localSize + (r.w >> 2)] = (*key).w; + barrier(CLK_LOCAL_MEM_FENCE); + + // The above allows us to read without 4-way bank conflicts: + (*key).x = sMem[localId]; + (*key).y = sMem[localId + localSize]; + (*key).z = sMem[localId + 2 * localSize]; + (*key).w = sMem[localId + 3 * localSize]; + + barrier(CLK_LOCAL_MEM_FENCE); + } +} + +__kernel void radixSortBlocksKeysOnly(__global uint4* keysIn, + __global uint4* keysOut, + uint nbits, + uint startbit, + uint numElements, + uint totalBlocks, + __local uint* sMem) +{ + int globalId = get_global_id(0); + + uint4 key; + key = keysIn[globalId]; + + barrier(CLK_LOCAL_MEM_FENCE); + + radixSortBlockKeysOnly(&key, nbits, startbit, sMem); + + keysOut[globalId] = key; +} + +//---------------------------------------------------------------------------- +// Given an array with blocks sorted according to a 4-bit radix group, each +// block counts the number of keys that fall into each radix in the group, and +// finds the starting offset of each radix in the block. It then writes the radix +// counts to the counters array, and the starting offsets to the blockOffsets array. 
+// +// Template parameters are used to generate efficient code for various special cases +// For example, we have to handle arrays that are a multiple of the block size +// (fullBlocks) differently than arrays that are not. "loop" is used when persistent +// CTAs are used. +// +// By persistent CTAs we mean that we launch only as many thread blocks as can +// be resident in the GPU and no more, rather than launching as many threads as +// we have elements. Persistent CTAs loop over blocks of elements until all work +// is complete. This can be faster in some cases. In our tests it is faster +// for large sorts (and the threshold is higher on compute version 1.1 and earlier +// GPUs than it is on compute version 1.2 GPUs. +// +//---------------------------------------------------------------------------- +__kernel void findRadixOffsets(__global uint2* keys, + __global uint* counters, + __global uint* blockOffsets, + uint startbit, + uint numElements, + uint totalBlocks, + __local uint* sRadix1) +{ + __local uint sStartPointers[16]; + + uint groupId = get_group_id(0); + uint localId = get_local_id(0); + uint groupSize = get_local_size(0); + + uint2 radix2; + + radix2 = keys[get_global_id(0)]; + + + sRadix1[2 * localId] = (radix2.x >> startbit) & 0xF; + sRadix1[2 * localId + 1] = (radix2.y >> startbit) & 0xF; + + // Finds the position where the sRadix1 entries differ and stores start + // index for each radix. + if(localId < 16) + { + sStartPointers[localId] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) ) + { + sStartPointers[sRadix1[localId]] = localId; + } + if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1]) + { + sStartPointers[sRadix1[localId + groupSize]] = localId + groupSize; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(localId < 16) + { + blockOffsets[groupId*16 + localId] = sStartPointers[localId]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Compute the sizes of each block. 
+ if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) ) + { + sStartPointers[sRadix1[localId - 1]] = + localId - sStartPointers[sRadix1[localId - 1]]; + } + if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1] ) + { + sStartPointers[sRadix1[localId + groupSize - 1]] = + localId + groupSize - sStartPointers[sRadix1[localId + groupSize - 1]]; + } + + + if(localId == groupSize - 1) + { + sStartPointers[sRadix1[2 * groupSize - 1]] = + 2 * groupSize - sStartPointers[sRadix1[2 * groupSize - 1]]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(localId < 16) + { + counters[localId * totalBlocks + groupId] = sStartPointers[localId]; + } +} + +// a naive scan routine that works only for array that +// can fit into a single block, just for debugging purpose, +// not used in the sort now +__kernel void scanNaive(__global uint *g_odata, + __global uint *g_idata, + uint n, + __local uint* temp) +{ + + int localId = get_local_id(0); + + int pout = 0; + int pin = 1; + + // Cache the computational window in shared memory + temp[pout*n + localId] = (localId > 0) ? g_idata[localId-1] : 0; + + for (int offset = 1; offset < n; offset *= 2) + { + pout = 1 - pout; + pin = 1 - pout; + barrier(CLK_LOCAL_MEM_FENCE); + + temp[pout*n+localId] = temp[pin*n+localId]; + + if (localId >= offset) + temp[pout*n+localId] += temp[pin*n+localId - offset]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + g_odata[localId] = temp[pout*n+localId]; +} + +//---------------------------------------------------------------------------- +// reorderData shuffles data in the array globally after the radix offsets +// have been found. On compute version 1.1 and earlier GPUs, this code depends +// on RadixSort::CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits). +// +// On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures +// that all writes are coalesced using extra work in the kernel. 
On later +// GPUs coalescing rules have been relaxed, so this extra overhead hurts +// performance. On these GPUs we set manualCoalesce=false and directly store +// the results. +// +// Template parameters are used to generate efficient code for various special cases +// For example, we have to handle arrays that are a multiple of the block size +// (fullBlocks) differently than arrays that are not. "loop" is used when persistent +// CTAs are used. +// +// By persistent CTAs we mean that we launch only as many thread blocks as can +// be resident in the GPU and no more, rather than launching as many threads as +// we have elements. Persistent CTAs loop over blocks of elements until all work +// is complete. This can be faster in some cases. In our tests it is faster +// for large sorts (and the threshold is higher on compute version 1.1 and earlier +// GPUs than it is on compute version 1.2 GPUs. +//---------------------------------------------------------------------------- +__kernel void reorderDataKeysOnly(__global uint *outKeys, + __global uint2 *keys, + __global uint *blockOffsets, + __global uint *offsets, + __global uint *sizes, + uint startbit, + uint numElements, + uint totalBlocks, + __local uint2* sKeys2) +{ + __local uint sOffsets[16]; + __local uint sBlockOffsets[16]; + + __local uint *sKeys1 = (__local uint*)sKeys2; + + uint groupId = get_group_id(0); + + uint globalId = get_global_id(0); + uint localId = get_local_id(0); + uint groupSize = get_local_size(0); + + sKeys2[localId] = keys[globalId]; + + if(localId < 16) + { + sOffsets[localId] = offsets[localId * totalBlocks + groupId]; + sBlockOffsets[localId] = blockOffsets[groupId * 16 + localId]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + uint radix = (sKeys1[localId] >> startbit) & 0xF; + uint globalOffset = sOffsets[radix] + localId - sBlockOffsets[radix]; + + if (globalOffset < numElements) + { + outKeys[globalOffset] = sKeys1[localId]; + } + + radix = (sKeys1[localId + groupSize] >> startbit) & 0xF; 
+ globalOffset = sOffsets[radix] + localId + groupSize - sBlockOffsets[radix]; + + if (globalOffset < numElements) + { + outKeys[globalOffset] = sKeys1[localId + groupSize]; + } + + +} diff --git a/src/com/jogamp/opencl/demos/radixsort/RadixSort.java b/src/com/jogamp/opencl/demos/radixsort/RadixSort.java new file mode 100644 index 0000000..e2a7b46 --- /dev/null +++ b/src/com/jogamp/opencl/demos/radixsort/RadixSort.java @@ -0,0 +1,182 @@ +/* + * 20:38 Sunday, February 28 2010 + */ + +package com.jogamp.opencl.demos.radixsort; + +import com.jogamp.opencl.CLBuffer; +import com.jogamp.opencl.CLCommandQueue; +import com.jogamp.opencl.CLContext; +import com.jogamp.opencl.CLKernel; +import com.jogamp.opencl.CLProgram; +import com.jogamp.opencl.CLResource; +import java.io.IOException; +import java.nio.IntBuffer; + +import static com.jogamp.opencl.CLMemory.Mem.*; +import static com.jogamp.opencl.CLProgram.*; +import static com.jogamp.opencl.CLProgram.CompilerOptions.*; + +/** + * + * @author Michael Bien + */ +public class RadixSort implements CLResource { + + private static final int NUM_BANKS = 16; + private static final int WARP_SIZE = 32; + private static final int bitStep = 4; + + private final int CTA_SIZE; + + private final CLKernel ckRadixSortBlocksKeysOnly; + private final CLKernel ckFindRadixOffsets; + private final CLKernel ckScanNaive; + private final CLKernel ckReorderDataKeysOnly; + + private final CLBuffer tempKeys; + private final CLBuffer mCounters; + private final CLBuffer mCountersSum; + private final CLBuffer mBlockOffsets; + + private final CLCommandQueue queue; + private final Scan scan; + private final CLProgram program; + + public RadixSort(CLCommandQueue queue, int maxElements, int CTA_SIZE) throws IOException { + + this.CTA_SIZE = CTA_SIZE; + scan = new Scan(queue, maxElements / 2 / CTA_SIZE * 16); + + int numBlocks = ((maxElements % (CTA_SIZE * 4)) == 0) + ? 
(maxElements / (CTA_SIZE * 4)) : (maxElements / (CTA_SIZE * 4) + 1); + + this.queue = queue; + + CLContext context = queue.getContext(); + this.tempKeys = context.createBuffer(4 * maxElements, READ_WRITE); + this.mCounters = context.createBuffer(4 * WARP_SIZE * numBlocks, READ_WRITE); + this.mCountersSum = context.createBuffer(4 * WARP_SIZE * numBlocks, READ_WRITE); + this.mBlockOffsets = context.createBuffer(4 * WARP_SIZE * numBlocks, READ_WRITE); + + program = context.createProgram(getClass().getResourceAsStream("RadixSort.cl")) + .build(ENABLE_MAD, define("WARP_SIZE", WARP_SIZE)); + +// out.println(program.getBuildLog()); + + ckRadixSortBlocksKeysOnly = program.createCLKernel("radixSortBlocksKeysOnly"); + ckFindRadixOffsets = program.createCLKernel("findRadixOffsets"); + ckScanNaive = program.createCLKernel("scanNaive"); + ckReorderDataKeysOnly = program.createCLKernel("reorderDataKeysOnly"); + + } + + void sort(CLBuffer d_keys, int numElements, int keyBits) { + radixSortKeysOnly(d_keys, numElements, keyBits); + } + + //---------------------------------------------------------------------------- + // Main key-only radix sort function. Sorts in place in the keys and values + // arrays, but uses the other device arrays as temporary storage. All pointer + // parameters are device pointers. Uses cudppScan() for the prefix sum of + // radix counters. + //---------------------------------------------------------------------------- + void radixSortKeysOnly(CLBuffer keys, int numElements, int keyBits) { + int i = 0; + while (keyBits > i * bitStep) { + radixSortStepKeysOnly(keys, bitStep, i * bitStep, numElements); + i++; + } + } + + //---------------------------------------------------------------------------- + // Perform one step of the radix sort. Sorts by nbits key bits per step, + // starting at startbit. 
+ //---------------------------------------------------------------------------- + void radixSortStepKeysOnly(CLBuffer keys, int nbits, int startbit, int numElements) { + + // Four step algorithms from Satish, Harris & Garland + radixSortBlocksKeysOnlyOCL(keys, nbits, startbit, numElements); + + findRadixOffsetsOCL(startbit, numElements); + + scan.scanExclusiveLarge(mCountersSum, mCounters, 1, numElements / 2 / CTA_SIZE * 16); + + reorderDataKeysOnlyOCL(keys, startbit, numElements); + } + + //---------------------------------------------------------------------------- + // Wrapper for the kernels of the four steps + //---------------------------------------------------------------------------- + void radixSortBlocksKeysOnlyOCL(CLBuffer keys, int nbits, int startbit, int numElements) { + + int totalBlocks = numElements / 4 / CTA_SIZE; + int globalWorkSize = CTA_SIZE * totalBlocks; + int localWorkSize = CTA_SIZE; + + ckRadixSortBlocksKeysOnly.putArg(keys).putArg(tempKeys).putArg(nbits).putArg(startbit) + .putArg(numElements).putArg(totalBlocks).putNullArg(4 * CTA_SIZE * 4) + .rewind(); + + queue.put1DRangeKernel(ckRadixSortBlocksKeysOnly, 0, globalWorkSize, localWorkSize); + } + + void findRadixOffsetsOCL(int startbit, int numElements) { + + int totalBlocks = numElements / 2 / CTA_SIZE; + int globalWorkSize = CTA_SIZE * totalBlocks; + int localWorkSize = CTA_SIZE; + + ckFindRadixOffsets.putArg(tempKeys).putArg(mCounters).putArg(mBlockOffsets) + .putArg(startbit).putArg(numElements).putArg(totalBlocks).putNullArg(2 * CTA_SIZE * 4) + .rewind(); + + queue.put1DRangeKernel(ckFindRadixOffsets, 0, globalWorkSize, localWorkSize); + } + + void scanNaiveOCL(int numElements) { + + int nHist = numElements / 2 / CTA_SIZE * 16; + int globalWorkSize = nHist; + int localWorkSize = nHist; + int extra_space = nHist / NUM_BANKS; + int shared_mem_size = 4 * (nHist + extra_space); + + ckScanNaive.putArg(mCountersSum).putArg(mCounters).putArg(nHist).putNullArg(2 * 
shared_mem_size).rewind(); + + queue.put1DRangeKernel(ckScanNaive, 0, globalWorkSize, localWorkSize); + } + + void reorderDataKeysOnlyOCL(CLBuffer keys, int startbit, int numElements) { + + int totalBlocks = numElements / 2 / CTA_SIZE; + int globalWorkSize = CTA_SIZE * totalBlocks; + int localWorkSize = CTA_SIZE; + + ckReorderDataKeysOnly.putArg(keys).putArg(tempKeys).putArg(mBlockOffsets).putArg(mCountersSum).putArg(mCounters) + .putArg(startbit).putArg(numElements).putArg(totalBlocks).putNullArg(2 * CTA_SIZE * 4).rewind(); + + queue.put1DRangeKernel(ckReorderDataKeysOnly, 0, globalWorkSize, localWorkSize); + } + + public void release() { + + scan.release(); + + //program & kernels + program.release(); + + //buffers + tempKeys.release(); + mCounters.release(); + mCountersSum.release(); + mBlockOffsets.release(); + } + + public void close() { + release(); + } + + + +} diff --git a/src/com/jogamp/opencl/demos/radixsort/RadixSortDemo.java b/src/com/jogamp/opencl/demos/radixsort/RadixSortDemo.java new file mode 100644 index 0000000..2ce429a --- /dev/null +++ b/src/com/jogamp/opencl/demos/radixsort/RadixSortDemo.java @@ -0,0 +1,129 @@ +/* + * 20:48 Sunday, February 28 2010 + */ + +package com.jogamp.opencl.demos.radixsort; + +import com.jogamp.opencl.CLBuffer; +import com.jogamp.opencl.CLCommandQueue; +import com.jogamp.opencl.CLContext; +import com.jogamp.opencl.CLPlatform; +import java.io.IOException; +import java.nio.IntBuffer; +import java.util.Random; + +import static com.jogamp.opencl.CLMemory.Mem.*; +import static java.lang.System.*; +import static com.jogamp.opencl.CLDevice.Type.*; + +/** + * GPU radix sort demo. 
+ * @author Michael Bien + */ +public class RadixSortDemo { + + public RadixSortDemo() throws IOException { + + CLContext context = null; + try{ + //single GPU setup + context = CLContext.create(CLPlatform.getDefault().getMaxFlopsDevice(GPU)); + CLCommandQueue queue = context.getDevices()[0].createCommandQueue(); + + int maxValue = Integer.MAX_VALUE; + int samples = 10; + + int[] workgroupSizes = new int[] {128, 256}; + + int[] runs = new int[] { 32768, + 65536, + 131072, + 262144, + 524288, + 1048576, + 2097152, + 4194304, + 8388608 }; + + for (int i = 0; i < workgroupSizes.length; i++) { + + int workgroupSize = workgroupSizes[i]; + + out.println("\n = = = workgroup size: "+workgroupSize+" = = = "); + + for(int run = 0; run < runs.length; run++) { + + if( workgroupSize==128 && runs[run] >= 8388608 + || workgroupSize==256 && runs[run] <= 32768) { + continue; // we can only sort up to 4MB with wg size of 128 + } + + int numElements = runs[run]; + + CLBuffer array = context.createIntBuffer(numElements, READ_WRITE); + out.print("array size: " + array.getCLSize()/1000000.0f+"MB; "); + out.println("elements: " + array.getCapacity()/1000+"K"); + + fillBuffer(array, maxValue); + + RadixSort radixSort = new RadixSort(queue, numElements, workgroupSize); + for(int a = 0; a < samples; a++) { + + queue.finish(); + + long time = nanoTime(); + + queue.putWriteBuffer(array, false); + radixSort.sort(array, numElements, 32); + queue.putReadBuffer(array, true); + + out.println("time: " + (nanoTime() - time)/1000000.0f+"ms"); + } + + out.print("snapshot: "); + printSnapshot(array.getBuffer(), 20); + + out.println("validating..."); + checkIfSorted(array.getBuffer()); + out.println("values sorted"); + + array.release(); + radixSort.release(); + } + } + + }finally{ + if(context != null) { + context.release(); + } + } + + } + + private void fillBuffer(CLBuffer array, int maxValue) { + Random random = new Random(42); + for (int n = 0; n < array.getBuffer().capacity(); n++) { + int rnd = 
random.nextInt(maxValue); + array.getBuffer().put(n, rnd); + } + } + + private void printSnapshot(IntBuffer buffer, int snapshot) { + for(int i = 0; i < snapshot; i++) + out.print(buffer.get() + ", "); + out.println("...; " + buffer.remaining() + " more"); + buffer.rewind(); + } + + private void checkIfSorted(IntBuffer keys) { + for (int i = 1; i < keys.capacity(); i++) { + if (keys.get(i - 1) > keys.get(i)) { + throw new RuntimeException("not sorted "+ keys.get(i - 1) +" !> "+ keys.get(i)); + } + } + } + + public static void main(String[] args) throws IOException { + new RadixSortDemo(); + } +} diff --git a/src/com/jogamp/opencl/demos/radixsort/Scan.java b/src/com/jogamp/opencl/demos/radixsort/Scan.java new file mode 100644 index 0000000..3d364ed --- /dev/null +++ b/src/com/jogamp/opencl/demos/radixsort/Scan.java @@ -0,0 +1,131 @@ +/* + * 22:12 Sunday, February 28 2010 + */ +package com.jogamp.opencl.demos.radixsort; + +import com.jogamp.opencl.CLBuffer; +import com.jogamp.opencl.CLCommandQueue; +import com.jogamp.opencl.CLContext; +import com.jogamp.opencl.CLKernel; +import com.jogamp.opencl.CLProgram; +import com.jogamp.opencl.CLResource; +import java.io.IOException; + +import static com.jogamp.opencl.CLMemory.Mem.*; +import static com.jogamp.opencl.CLProgram.CompilerOptions.*; + +/** + * + * @author Michael Bien + */ +public class Scan implements CLResource { + + private final static int MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE = 1024; + private final static int MAX_LOCAL_GROUP_SIZE = 256; + private final static int WORKGROUP_SIZE = 256; + private final static int MAX_BATCH_ELEMENTS = 64 * 1048576; + private final static int MIN_SHORT_ARRAY_SIZE = 4; + private final static int MAX_SHORT_ARRAY_SIZE = 4 * WORKGROUP_SIZE; + private final static int MIN_LARGE_ARRAY_SIZE = 8 * WORKGROUP_SIZE; + private final static int MAX_LARGE_ARRAY_SIZE = 4 * WORKGROUP_SIZE * WORKGROUP_SIZE; + + private final CLKernel ckScanExclusiveLocal1; + private final CLKernel 
ckScanExclusiveLocal2; + private final CLKernel ckUniformUpdate; + + private final CLCommandQueue queue; + private final CLProgram program; + private CLBuffer buffer; + + public Scan(CLCommandQueue queue, int numElements) throws IOException { + + this.queue = queue; + + CLContext context = queue.getContext(); + if (numElements > MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE) { + buffer = context.createBuffer(numElements / MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE * 4, READ_WRITE); + } + program = context.createProgram(getClass().getResourceAsStream("Scan_b.cl")) + .build(ENABLE_MAD); + + ckScanExclusiveLocal1 = program.createCLKernel("scanExclusiveLocal1"); + ckScanExclusiveLocal2 = program.createCLKernel("scanExclusiveLocal2"); + ckUniformUpdate = program.createCLKernel("uniformUpdate"); + } + + // main exclusive scan routine + void scanExclusiveLarge(CLBuffer dst, CLBuffer src, int batchSize, int arrayLength) { + + //Check power-of-two factorization + if(!isPowerOf2(arrayLength)) { + throw new RuntimeException(); + } + + //Check supported size range + if (!((arrayLength >= MIN_LARGE_ARRAY_SIZE) && (arrayLength <= MAX_LARGE_ARRAY_SIZE))) { + throw new RuntimeException(); + } + + //Check total batch size limit + if (!((batchSize * arrayLength) <= MAX_BATCH_ELEMENTS)) { + throw new RuntimeException(); + } + + scanExclusiveLocal1(dst, src, (batchSize * arrayLength) / (4 * WORKGROUP_SIZE), 4 * WORKGROUP_SIZE); + scanExclusiveLocal2(buffer, dst, src, batchSize, arrayLength / (4 * WORKGROUP_SIZE)); + uniformUpdate(dst, buffer, (batchSize * arrayLength) / (4 * WORKGROUP_SIZE)); + } + + void scanExclusiveLocal1(CLBuffer dst, CLBuffer src, int n, int size) { + + ckScanExclusiveLocal1.putArg(dst).putArg(src).putNullArg(2 * WORKGROUP_SIZE * 4).putArg(size) + .rewind(); + + int localWorkSize = WORKGROUP_SIZE; + int globalWorkSize = (n * size) / 4; + + queue.put1DRangeKernel(ckScanExclusiveLocal1, 0, globalWorkSize, localWorkSize); + } + + void scanExclusiveLocal2(CLBuffer buffer, CLBuffer dst, 
CLBuffer src, int n, int size) { + + int elements = n * size; + ckScanExclusiveLocal2.putArg(buffer).putArg(dst).putArg(src).putNullArg(2 * WORKGROUP_SIZE * 4) + .putArg(elements).putArg(size).rewind(); + + int localWorkSize = WORKGROUP_SIZE; + int globalWorkSize = iSnapUp(elements, WORKGROUP_SIZE); + + queue.put1DRangeKernel(ckScanExclusiveLocal2, 0, globalWorkSize, localWorkSize); + } + + void uniformUpdate(CLBuffer dst, CLBuffer buffer, int n) { + + ckUniformUpdate.setArgs(dst, buffer); + + int localWorkSize = WORKGROUP_SIZE; + int globalWorkSize = n * WORKGROUP_SIZE; + + queue.put1DRangeKernel(ckUniformUpdate, 0, globalWorkSize, localWorkSize); + } + + private int iSnapUp(int dividend, int divisor) { + return ((dividend % divisor) == 0) ? dividend : (dividend - dividend % divisor + divisor); + } + + public static boolean isPowerOf2(int x) { + return ((x - 1) & x) == 0; + } + + public void release() { + program.release(); + + if(buffer!=null) { + buffer.release(); + } + } + + public void close() { + release(); + } +} diff --git a/src/com/jogamp/opencl/demos/radixsort/Scan_b.cl b/src/com/jogamp/opencl/demos/radixsort/Scan_b.cl new file mode 100644 index 0000000..32fd4dd --- /dev/null +++ b/src/com/jogamp/opencl/demos/radixsort/Scan_b.cl @@ -0,0 +1,190 @@ +/* + * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. + * + * NVIDIA Corporation and its licensors retain all intellectual property and + * proprietary rights in and to this software and related documentation. + * Any use, reproduction, disclosure, or distribution of this software + * and related documentation without an express license agreement from + * NVIDIA Corporation is strictly prohibited. + * + * Please refer to the applicable NVIDIA end user license agreement (EULA) + * associated with this source code for terms and conditions that govern + * your use of this NVIDIA software. 
+ * + */ + + + +//All three kernels run 512 threads per workgroup +//Must be a power of two +#define WORKGROUP_SIZE 256 + + + +//////////////////////////////////////////////////////////////////////////////// +// Scan codelets +//////////////////////////////////////////////////////////////////////////////// +#if(1) + //Naive inclusive scan: O(N * log2(N)) operations + //Allocate 2 * 'size' local memory, initialize the first half + //with 'size' zeros avoiding if(pos >= offset) condition evaluation + //and saving instructions + inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){ + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1)); + l_Data[pos] = 0; + pos += size; + l_Data[pos] = idata; + + for(uint offset = 1; offset < size; offset <<= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + uint t = l_Data[pos] + l_Data[pos - offset]; + barrier(CLK_LOCAL_MEM_FENCE); + l_Data[pos] = t; + } + + return l_Data[pos]; + } + + inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){ + return scan1Inclusive(idata, l_Data, size) - idata; + } + +#else + #define LOG2_WARP_SIZE 5U + #define WARP_SIZE (1U << LOG2_WARP_SIZE) + + //Almost the same as naiveScan1 but doesn't need barriers + //assuming size <= WARP_SIZE + inline uint warpScanInclusive(uint idata, __local uint *l_Data, uint size){ + uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1)); + l_Data[pos] = 0; + pos += size; + l_Data[pos] = idata; + + for(uint offset = 1; offset < size; offset <<= 1) + l_Data[pos] += l_Data[pos - offset]; + + return l_Data[pos]; + } + + inline uint warpScanExclusive(uint idata, __local uint *l_Data, uint size){ + return warpScanInclusive(idata, l_Data, size) - idata; + } + + inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){ + if(size > WARP_SIZE){ + //Bottom-level inclusive warp scan + uint warpResult = warpScanInclusive(idata, l_Data, WARP_SIZE); + + //Save top elements of each warp for exclusive warp scan + //sync to wait 
for warp scans to complete (because l_Data is being overwritten) + barrier(CLK_LOCAL_MEM_FENCE); + if( (get_local_id(0) & (WARP_SIZE - 1)) == (WARP_SIZE - 1) ) + l_Data[get_local_id(0) >> LOG2_WARP_SIZE] = warpResult; + + //wait for warp scans to complete + barrier(CLK_LOCAL_MEM_FENCE); + if( get_local_id(0) < (WORKGROUP_SIZE / WARP_SIZE) ){ + //grab top warp elements + uint val = l_Data[get_local_id(0)]; + //calculate exclsive scan and write back to shared memory + l_Data[get_local_id(0)] = warpScanExclusive(val, l_Data, size >> LOG2_WARP_SIZE); + } + + //return updated warp scans with exclusive scan results + barrier(CLK_LOCAL_MEM_FENCE); + return warpResult + l_Data[get_local_id(0) >> LOG2_WARP_SIZE]; + }else{ + return warpScanInclusive(idata, l_Data, size); + } + } + + inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){ + return scan1Inclusive(idata, l_Data, size) - idata; + } +#endif + + +//Vector scan: the array to be scanned is stored +//in work-item private memory as uint4 +inline uint4 scan4Inclusive(uint4 data4, __local uint *l_Data, uint size){ + //Level-0 inclusive scan + data4.y += data4.x; + data4.z += data4.y; + data4.w += data4.z; + + //Level-1 exclusive scan + uint val = scan1Inclusive(data4.w, l_Data, size / 4) - data4.w; + + return (data4 + (uint4)val); +} + +inline uint4 scan4Exclusive(uint4 data4, __local uint *l_Data, uint size){ + return scan4Inclusive(data4, l_Data, size) - data4; +} + + +//////////////////////////////////////////////////////////////////////////////// +// Scan kernels +//////////////////////////////////////////////////////////////////////////////// +__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1))) +void scanExclusiveLocal1( + __global uint4 *d_Dst, + __global uint4 *d_Src, + __local uint* l_Data, + uint size +){ + //Load data + uint4 idata4 = d_Src[get_global_id(0)]; + + //Calculate exclusive scan + uint4 odata4 = scan4Exclusive(idata4, l_Data, size); + + //Write back + 
d_Dst[get_global_id(0)] = odata4; +} + +//Exclusive scan of top elements of bottom-level scans (4 * THREADBLOCK_SIZE) +__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1))) +void scanExclusiveLocal2( + __global uint *d_Buf, + __global uint *d_Dst, + __global uint *d_Src, + __local uint* l_Data, + uint N, + uint arrayLength +){ + //Load top elements + //Convert results of bottom-level scan back to inclusive + //Skip loads and stores for inactive work-items of the work-group with highest index(pos >= N) + uint data = 0; + if(get_global_id(0) < N) + data = + d_Dst[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)] + + d_Src[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)]; + + //Compute + uint odata = scan1Exclusive(data, l_Data, arrayLength); + + //Avoid out-of-bound access + if(get_global_id(0) < N) + d_Buf[get_global_id(0)] = odata; +} + +//Final step of large-array scan: combine basic inclusive scan with exclusive scan of top elements of input arrays +__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1))) +void uniformUpdate( + __global uint4 *d_Data, + __global uint *d_Buf +){ + __local uint buf[1]; + + uint4 data4 = d_Data[get_global_id(0)]; + + if(get_local_id(0) == 0) + buf[0] = d_Buf[get_group_id(0)]; + + barrier(CLK_LOCAL_MEM_FENCE); + data4 += (uint4)buf[0]; + d_Data[get_global_id(0)] = data4; +} diff --git a/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.cl b/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.cl deleted file mode 100644 index a8d0e1d..0000000 --- a/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.cl +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. - * - * NVIDIA Corporation and its licensors retain all intellectual property and - * proprietary rights in and to this software and related documentation. 
- * Any use, reproduction, disclosure, or distribution of this software - * and related documentation without an express license agreement from - * NVIDIA Corporation is strictly prohibited. - * - * Please refer to the applicable NVIDIA end user license agreement (EULA) - * associated with this source code for terms and conditions that govern - * your use of this NVIDIA software. - * - */ - - - -//Passed down by clBuildProgram -//#define LOCAL_SIZE_LIMIT 1024 - - - -inline void ComparatorPrivate( - uint *keyA, - uint *keyB, - uint arrowDir -){ - if( (*keyA > *keyB) == arrowDir ){ - uint t; - t = *keyA; *keyA = *keyB; *keyB = t; - } -} - -inline void ComparatorLocal( - __local uint *keyA, - __local uint *keyB, - uint arrowDir -){ - if( (*keyA > *keyB) == arrowDir ){ - uint t; - t = *keyA; *keyA = *keyB; *keyB = t; - } -} - -//////////////////////////////////////////////////////////////////////////////// -// Monolithic bitonic sort kernel for short arrays fitting into local memory -//////////////////////////////////////////////////////////////////////////////// -__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1))) -void bitonicSortLocal( - __global uint *d_DstKey, - __global uint *d_SrcKey, - uint arrayLength, - uint sortDir -){ - __local uint l_key[LOCAL_SIZE_LIMIT]; - - //Offset to the beginning of subbatch and load data - d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); - d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); - l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; - l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; - - for(uint size = 2; size < arrayLength; size <<= 1){ - //Bitonic merge - uint dir = ( (get_local_id(0) & (size / 2)) != 0 ); - for(uint stride = size / 2; stride > 0; stride >>= 1){ - barrier(CLK_LOCAL_MEM_FENCE); - uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); - ComparatorLocal( - &l_key[pos + 0], - &l_key[pos + stride], - dir - ); - } - } 
- - //dir == sortDir for the last bitonic merge step - { - for(uint stride = arrayLength / 2; stride > 0; stride >>= 1){ - barrier(CLK_LOCAL_MEM_FENCE); - uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); - ComparatorLocal( - &l_key[pos + 0], - &l_key[pos + stride], - sortDir - ); - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - d_DstKey[ 0] = l_key[get_local_id(0) + 0]; - d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; -} - -//////////////////////////////////////////////////////////////////////////////// -// Bitonic sort kernel for large arrays (not fitting into local memory) -//////////////////////////////////////////////////////////////////////////////// -//Bottom-level bitonic sort -//Almost the same as bitonicSortLocal with the only exception -//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being -//sorted in opposite directions -__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1))) -void bitonicSortLocal1( - __global uint *d_DstKey, - __global uint *d_SrcKey -){ - __local uint l_key[LOCAL_SIZE_LIMIT]; - - //Offset to the beginning of subarray and load data - d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); - d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); - l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; - l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; - - uint comparatorI = get_global_id(0) & ((LOCAL_SIZE_LIMIT / 2) - 1); - - for(uint size = 2; size < LOCAL_SIZE_LIMIT; size <<= 1){ - //Bitonic merge - uint dir = (comparatorI & (size / 2)) != 0; - for(uint stride = size / 2; stride > 0; stride >>= 1){ - barrier(CLK_LOCAL_MEM_FENCE); - uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); - ComparatorLocal( - &l_key[pos + 0], - &l_key[pos + stride], - dir - ); - } - } - - //Odd / even arrays of LOCAL_SIZE_LIMIT elements - //sorted in opposite directions - { - uint dir = (get_group_id(0) & 1); - for(uint 
stride = LOCAL_SIZE_LIMIT / 2; stride > 0; stride >>= 1){ - barrier(CLK_LOCAL_MEM_FENCE); - uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); - ComparatorLocal( - &l_key[pos + 0], - &l_key[pos + stride], - dir - ); - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - d_DstKey[ 0] = l_key[get_local_id(0) + 0]; - d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; -} - -//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT -__kernel void bitonicMergeGlobal( - __global uint *d_DstKey, - __global uint *d_SrcKey, - uint arrayLength, - uint size, - uint stride, - uint sortDir -){ - uint global_comparatorI = get_global_id(0); - uint comparatorI = global_comparatorI & (arrayLength / 2 - 1); - - //Bitonic merge - uint dir = sortDir ^ ( (comparatorI & (size / 2)) != 0 ); - uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1)); - - uint keyA = d_SrcKey[pos + 0]; - uint keyB = d_SrcKey[pos + stride]; - - ComparatorPrivate( - &keyA, - &keyB, - dir - ); - - d_DstKey[pos + 0] = keyA; - d_DstKey[pos + stride] = keyB; -} - -//Combined bitonic merge steps for -//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. 
LOCAL_SIZE_LIMIT / 2] -__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1))) -void bitonicMergeLocal( - __global uint *d_DstKey, - __global uint *d_SrcKey, - uint arrayLength, - uint stride, - uint size, - uint sortDir -){ - __local uint l_key[LOCAL_SIZE_LIMIT]; - - d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); - d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); - l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; - l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; - - //Bitonic merge - uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1); - uint dir = sortDir ^ ( (comparatorI & (size / 2)) != 0 ); - for(; stride > 0; stride >>= 1){ - barrier(CLK_LOCAL_MEM_FENCE); - uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); - ComparatorLocal( - &l_key[pos + 0], - &l_key[pos + stride], - dir - ); - } - - barrier(CLK_LOCAL_MEM_FENCE); - d_DstKey[ 0] = l_key[get_local_id(0) + 0]; - d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; -} diff --git a/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.java b/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.java deleted file mode 100644 index ed5b32e..0000000 --- a/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.java +++ /dev/null @@ -1,201 +0,0 @@ -/* - * 18:42 Saturday, February 27 2010 - */ -package com.mbien.opencl.demos.bitonicsort; - -import com.mbien.opencl.CLBuffer; -import com.mbien.opencl.CLCommandQueue; -import com.mbien.opencl.CLContext; -import com.mbien.opencl.CLDevice; -import com.mbien.opencl.CLKernel; -import com.mbien.opencl.CLProgram; -import java.io.IOException; -import java.nio.IntBuffer; -import java.util.Map; -import java.util.Random; - -import static java.lang.System.*; -import static com.mbien.opencl.CLMemory.Mem.*; -import static com.mbien.opencl.CLProgram.*; - -/** - * Bitonic sort optimized for GPUs. - * Uses NVIDIA's bitonic merge sort kernel. 
- * @author Michael Bien - */ -public class BitonicSort { - - private static final String BITONIC_MERGE_GLOBAL = "bitonicMergeGlobal"; - private static final String BITONIC_MERGE_LOCAL = "bitonicMergeLocal"; - private static final String BITONIC_SORT_LOCAL = "bitonicSortLocal"; - private static final String BITONIC_SORT_LOCAL1 = "bitonicSortLocal1"; - - private final static int LOCAL_SIZE_LIMIT = 1024; - private final Map kernels; - - public BitonicSort() throws IOException { - - final int sortDir = 1; - final int elements = 1048576; - final int maxvalue = 1000000; - - out.println("Initializing OpenCL..."); - - //Create the context - CLContext context = null; - - try{ - - context = CLContext.create(); - CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue(); - - out.println("Initializing OpenCL bitonic sorter..."); - kernels = initBitonicSort(queue); - - out.println("Creating OpenCL memory objects..."); - CLBuffer keyBuffer = context.createIntBuffer(elements, READ_ONLY, USE_BUFFER); - System.out.println(keyBuffer.getCLSize()/1000000.0f); - - out.println("Initializing data...\n"); - Random random = new Random(); - for (int i = 0; i < elements; i++) { - int rnd = random.nextInt(maxvalue); - keyBuffer.getBuffer().put(i, rnd); - } - - int arrayLength = elements; - int batch = elements / arrayLength; - - out.printf("Test array length %d (%d arrays in the batch)...\n", arrayLength, batch); - - long time = currentTimeMillis(); - - bitonicSort(queue, keyBuffer, keyBuffer, batch, arrayLength, sortDir); - queue.putReadBuffer(keyBuffer, true); - - out.println(currentTimeMillis() - time+"ms"); - - IntBuffer keys = keyBuffer.getBuffer(); - printSnapshot(keys, 20); - checkIfSorted(keys); - - out.println("\nTEST PASSED"); - - }finally{ - if(context!=null) { - context.release(); - } - } - - } - - private Map initBitonicSort(CLCommandQueue queue) throws IOException { - - out.println(" creating bitonic sort program"); - - CLContext context = queue.getContext(); - - 
CLProgram program = context.createProgram(getClass().getResourceAsStream("BitonicSort.cl")) - .build(define("LOCAL_SIZE_LIMIT", LOCAL_SIZE_LIMIT)); - - Map kernelMap = program.createCLKernels(); - - out.println(" checking minimum supported workgroup size"); - //Check for work group size - CLDevice device = queue.getDevice(); - long szBitonicSortLocal = kernelMap.get(BITONIC_SORT_LOCAL).getWorkGroupSize(device); - long szBitonicSortLocal1 = kernelMap.get(BITONIC_SORT_LOCAL1).getWorkGroupSize(device); - long szBitonicMergeLocal = kernelMap.get(BITONIC_MERGE_LOCAL).getWorkGroupSize(device); - - if ( (szBitonicSortLocal < (LOCAL_SIZE_LIMIT / 2)) - || (szBitonicSortLocal1 < (LOCAL_SIZE_LIMIT / 2)) - || (szBitonicMergeLocal < (LOCAL_SIZE_LIMIT / 2)) ) { - throw new RuntimeException("Minimum work-group size "+LOCAL_SIZE_LIMIT/2 - +" required by this application is not supported on this device."); - } - - return kernelMap; - - } - - public void bitonicSort(CLCommandQueue queue, CLBuffer dstKey, CLBuffer srcKey, int batch, int arrayLength, int dir) { - - if (arrayLength < 2) { - throw new IllegalArgumentException("arrayLength was "+arrayLength); - } - - // TODO Only power-of-two array lengths are supported so far - - dir = (dir != 0) ? 
1 : 0; - - CLKernel sortlocal1 = kernels.get(BITONIC_SORT_LOCAL1); - CLKernel sortlocal = kernels.get(BITONIC_SORT_LOCAL); - CLKernel mergeGlobal = kernels.get(BITONIC_MERGE_GLOBAL); - CLKernel mergeLocal = kernels.get(BITONIC_MERGE_LOCAL); - - if (arrayLength <= LOCAL_SIZE_LIMIT) { - - // oclCheckError( (batch * arrayLength) % LOCAL_SIZE_LIMIT == 0, shrTRUE ); - - //Launch bitonicSortLocal - sortlocal.putArgs(dstKey, srcKey) - .putArg(arrayLength).putArg(dir).rewind(); - - int localWorkSize = LOCAL_SIZE_LIMIT / 2; - int globalWorkSize = batch * arrayLength / 2; - queue.put1DRangeKernel(sortlocal, 0, globalWorkSize, localWorkSize); - - } else { - - //Launch bitonicSortLocal1 - sortlocal1.setArgs(dstKey, srcKey); - - int localWorkSize = LOCAL_SIZE_LIMIT / 2; - int globalWorkSize = batch * arrayLength / 2; - - queue.put1DRangeKernel(sortlocal1, 0, globalWorkSize, localWorkSize); - - for (int size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1) { - for (int stride = size / 2; stride > 0; stride >>= 1) { - if (stride >= LOCAL_SIZE_LIMIT) { - //Launch bitonicMergeGlobal - mergeGlobal.putArgs(dstKey, dstKey) - .putArg(arrayLength).putArg(size).putArg(stride).putArg(dir).rewind(); - - globalWorkSize = batch * arrayLength / 2; - queue.put1DRangeKernel(mergeGlobal, 0, globalWorkSize, 0); - } else { - //Launch bitonicMergeLocal - mergeLocal.putArgs(dstKey, dstKey) - .putArg(arrayLength).putArg(stride).putArg(size).putArg(dir).rewind(); - - localWorkSize = LOCAL_SIZE_LIMIT / 2; - globalWorkSize = batch * arrayLength / 2; - - queue.put1DRangeKernel(mergeLocal, 0, globalWorkSize, localWorkSize); - break; - } - } - } - } - } - - private void printSnapshot(IntBuffer buffer, int snapshot) { - for(int i = 0; i < snapshot; i++) - out.print(buffer.get() + ", "); - out.println("...; " + buffer.remaining() + " more"); - buffer.rewind(); - } - - private void checkIfSorted(IntBuffer keys) { - for (int i = 1; i < keys.capacity(); i++) { - if (keys.get(i - 1) > keys.get(i)) { - 
throw new RuntimeException("not sorted "+ keys.get(i - 1) +"!> "+ keys.get(i)); - } - } - } - - public static void main(String[] args) throws IOException { - new BitonicSort(); - } -} diff --git a/src/com/mbien/opencl/demos/fractal/Mandelbrot.cl b/src/com/mbien/opencl/demos/fractal/Mandelbrot.cl deleted file mode 100644 index 640c775..0000000 --- a/src/com/mbien/opencl/demos/fractal/Mandelbrot.cl +++ /dev/null @@ -1,51 +0,0 @@ -#ifdef DOUBLE_FP - #pragma OPENCL EXTENSION cl_khr_fp64 : enable - typedef double varfloat; -#else - typedef float varfloat; -#endif - -/** - * For a description of this algorithm please refer to - * http://en.wikipedia.org/wiki/Mandelbrot_set - * @author Michael Bien - */ -kernel void mandelbrot( - const int width, const int height, - const varfloat x0, const varfloat y0, - const varfloat rangeX, const varfloat rangeY, - global uint *output, global uint *colorMap, - const int colorMapSize, const int maxIterations) { - - unsigned int ix = get_global_id(0); - unsigned int iy = get_global_id(1); - - varfloat r = x0 + ix * rangeX / width; - varfloat i = y0 + iy * rangeY / height; - - varfloat x = 0; - varfloat y = 0; - - varfloat magnitudeSquared = 0; - int iteration = 0; - - while (magnitudeSquared < 4 && iteration < maxIterations) { - varfloat x2 = x*x; - varfloat y2 = y*y; - y = 2 * x * y + i; - x = x2 - y2 + r; - magnitudeSquared = x2+y2; - iteration++; - } - - if (iteration == maxIterations) { - output[iy * width + ix] = 0; - }else { - varfloat alpha = (varfloat)iteration / maxIterations; - int colorIndex = (int)(alpha * colorMapSize); - output[iy * width + ix] = colorMap[colorIndex]; - // monochrom - // output[iy * width + ix] = 255*iteration/maxIterations; - } - -} \ No newline at end of file diff --git a/src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java b/src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java deleted file mode 100644 index 265add8..0000000 --- a/src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java +++ 
/dev/null @@ -1,486 +0,0 @@ -package com.mbien.opencl.demos.fractal; - -import com.mbien.opencl.CLBuffer; -import com.mbien.opencl.CLCommandQueue; -import com.mbien.opencl.CLDevice; -import com.mbien.opencl.CLEvent; -import com.mbien.opencl.CLEventList; -import com.mbien.opencl.CLException; -import com.mbien.opencl.gl.CLGLBuffer; -import com.mbien.opencl.gl.CLGLContext; -import com.mbien.opencl.CLKernel; -import com.mbien.opencl.CLPlatform; -import com.mbien.opencl.CLProgram; -import com.mbien.opencl.CLProgram.CompilerOptions; -import com.jogamp.opengl.util.awt.TextRenderer; -import java.awt.Color; -import java.awt.Dimension; -import java.awt.Font; -import java.awt.Point; -import java.awt.event.KeyAdapter; -import java.awt.event.KeyEvent; -import java.awt.event.MouseAdapter; -import java.awt.event.MouseEvent; -import java.awt.event.MouseWheelEvent; -import java.io.IOException; -import java.nio.IntBuffer; -import java.util.logging.Level; -import java.util.logging.Logger; -import javax.media.opengl.DebugGL2; -import javax.media.opengl.GL; -import javax.media.opengl.GL2; -import javax.media.opengl.GLAutoDrawable; -import javax.media.opengl.GLCapabilities; -import javax.media.opengl.GLContext; -import javax.media.opengl.GLEventListener; -import javax.media.opengl.GLProfile; -import javax.media.opengl.awt.GLCanvas; -import javax.swing.JFrame; -import javax.swing.SwingUtilities; - -import static com.jogamp.common.nio.Buffers.*; -import static javax.media.opengl.GL2.*; -import static com.mbien.opencl.CLMemory.Mem.*; -import static com.mbien.opencl.CLEvent.ProfilingCommand.*; -import static com.mbien.opencl.CLCommandQueue.Mode.*; -import static com.mbien.opencl.CLDevice.Type.*; -import static java.lang.Math.*; - -/** - * Computes the Mandelbrot set with OpenCL using multiple GPUs and renders the result with OpenGL. - * A shared PBO is used as storage for the fractal image.
- * http://en.wikipedia.org/wiki/Mandelbrot_set - *

- * controls:
- * keys 1-9 control parallelism level
- * space enables/disables slice seperator
- * 'd' toggles between 32/64bit floatingpoint precision
- * mouse/mousewheel to drag and zoom
- *

- * @author Michael Bien - */ -public class MultiDeviceFractal implements GLEventListener { - - // max number of used GPUs - private static final int MAX_PARRALLELISM_LEVEL = 8; - - // max per pixel iterations to compute the fractal - private static final int MAX_ITERATIONS = 500; - - private GLCanvas canvas; - - private CLGLContext clContext; - private CLCommandQueue[] queues; - private CLKernel[] kernels; - private CLProgram program; - private CLEventList probes; - private CLGLBuffer[] pboBuffers; - private CLBuffer[] colorMap; - - private int width = 0; - private int height = 0; - - private double minX = -2f; - private double minY = -1.2f; - private double maxX = 0.6f; - private double maxY = 1.3f; - - private int slices; - - private boolean drawSeperator; - private boolean doublePrecision; - private boolean buffersInitialized; - private boolean rebuild; - - private final TextRenderer textRenderer; - - public MultiDeviceFractal(int width, int height) { - - this.width = width; - this.height = height; - - canvas = new GLCanvas(new GLCapabilities(GLProfile.get(GLProfile.GL2))); - canvas.addGLEventListener(this); - initSceneInteraction(); - - JFrame frame = new JFrame("JOCL Multi GPU Mandelbrot Set"); - frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); - canvas.setPreferredSize(new Dimension(width, height)); - frame.add(canvas); - frame.pack(); - - frame.setVisible(true); - - textRenderer = new TextRenderer(frame.getFont().deriveFont(Font.BOLD, 14), true, true, null, false); - } - - public void init(GLAutoDrawable drawable) { - - if(clContext == null) { - // enable GL error checking using the composable pipeline - drawable.setGL(new DebugGL2(drawable.getGL().getGL2())); - - drawable.getGL().glFinish(); - initCL(drawable.getContext()); - - GL2 gl = drawable.getGL().getGL2(); - - gl.setSwapInterval(0); - gl.glDisable(GL_DEPTH_TEST); - gl.glClearColor(0.0f, 0.0f, 0.0f, 1.0f); - - initView(gl, drawable.getWidth(), drawable.getHeight()); - - initPBO(gl); - 
drawable.getGL().glFinish(); - - setKernelConstants(); - } - } - - private void initCL(GLContext glCtx){ - try { - // create context managing all available GPUs -// clContext = CLGLContext.create(glCtx, GPU); - clContext = CLGLContext.create(glCtx, CLPlatform.getDefault().listCLDevices()[0]); - - - CLDevice[] devices = clContext.getDevices(); - - slices = min(devices.length, MAX_PARRALLELISM_LEVEL); - - // create command queues for every GPU, setup colormap and init kernels - queues = new CLCommandQueue[slices]; - kernels = new CLKernel[slices]; - probes = new CLEventList(slices); - colorMap = new CLBuffer[slices]; - - for (int i = 0; i < slices; i++) { - - colorMap[i] = clContext.createIntBuffer(32*2, READ_ONLY); - initColorMap(colorMap[i].getBuffer(), 32, Color.BLUE, Color.GREEN, Color.RED); - - // create command queue and upload color map buffer on each used device - queues[i] = devices[i].createCommandQueue(PROFILING_MODE).putWriteBuffer(colorMap[i], true); // blocking upload - - } - - // load and build program - program = clContext.createProgram(getClass().getResourceAsStream("Mandelbrot.cl")); - buildProgram(); - - } catch (IOException ex) { - Logger.getLogger(getClass().getName()).log(Level.SEVERE, "can not find 'Mandelbrot.cl' in classpath.", ex); - } catch (CLException ex) { - Logger.getLogger(getClass().getName()).log(Level.SEVERE, "something went wrong, hopefully no one got hurt", ex); - } - - } - - private void initColorMap(IntBuffer colorMap, int stepSize, Color... 
colors) { - - for (int n = 0; n < colors.length - 1; n++) { - - Color color = colors[n]; - int r0 = color.getRed(); - int g0 = color.getGreen(); - int b0 = color.getBlue(); - - color = colors[n + 1]; - int r1 = color.getRed(); - int g1 = color.getGreen(); - int b1 = color.getBlue(); - - int deltaR = r1 - r0; - int deltaG = g1 - g0; - int deltaB = b1 - b0; - - for (int step = 0; step < stepSize; step++) { - float alpha = (float) step / (stepSize - 1); - int r = (int) (r0 + alpha * deltaR); - int g = (int) (g0 + alpha * deltaG); - int b = (int) (b0 + alpha * deltaB); - colorMap.put((r << 16) | (g << 8) | (b << 0)); - } - } - colorMap.rewind(); - - } - - private void initView(GL2 gl, int width, int height) { - - gl.glViewport(0, 0, width, height); - - gl.glMatrixMode(GL_MODELVIEW); - gl.glLoadIdentity(); - - gl.glMatrixMode(GL_PROJECTION); - gl.glLoadIdentity(); - gl.glOrtho(0.0, width, 0.0, height, 0.0, 1.0); - } - - @SuppressWarnings("unchecked") - private void initPBO(GL gl) { - - if(pboBuffers != null) { - int[] oldPbos = new int[pboBuffers.length]; - for (int i = 0; i < pboBuffers.length; i++) { - CLGLBuffer buffer = pboBuffers[i]; - oldPbos[i] = buffer.GLID; - buffer.release(); - } - gl.glDeleteBuffers(oldPbos.length, oldPbos, 0); - } - - pboBuffers = new CLGLBuffer[slices]; - - int[] pbo = new int[slices]; - gl.glGenBuffers(slices, pbo, 0); - - // setup one empty PBO per slice - for (int i = 0; i < slices; i++) { - - gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo[i]); - gl.glBufferData(GL_PIXEL_UNPACK_BUFFER, width*height * SIZEOF_INT / slices, null, GL_STREAM_DRAW); - gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - pboBuffers[i] = clContext.createFromGLBuffer(pbo[i], WRITE_ONLY); - - } - - buffersInitialized = true; - } - - private void buildProgram() { - - /* - * workaround: The driver keeps using the old binaries for some reason. - * to solve this we simple create a new program and release the old. 
- * however rebuilding programs should be possible -> remove when drivers are fixed. - */ - if(program != null && rebuild) { - String source = program.getSource(); - program.release(); - program = clContext.createProgram(source); - } - - // disable 64bit floating point math if not available - if(doublePrecision) { - for (CLDevice device : program.getCLDevices()) { - if(!device.isDoubleFPAvailable()) { - doublePrecision = false; - break; - } - } - } - - if(doublePrecision) { - program.build(CompilerOptions.FAST_RELAXED_MATH, "-D DOUBLE_FP"); - }else{ - program.build(CompilerOptions.FAST_RELAXED_MATH); - } - rebuild = false; - - for (int i = 0; i < kernels.length; i++) { - // init kernel with constants - kernels[i] = program.createCLKernel("mandelbrot"); - } - - } - - // init kernels with constants - private void setKernelConstants() { - for (int i = 0; i < slices; i++) { - kernels[i].setForce32BitArgs(!doublePrecision) - .setArg(6, pboBuffers[i]) - .setArg(7, colorMap[i]) - .setArg(8, colorMap[i].getBuffer().capacity()) - .setArg(9, MAX_ITERATIONS); - } - } - - // rendering cycle - public void display(GLAutoDrawable drawable) { - GL gl = drawable.getGL(); - - // make sure GL does not use our objects before we start computeing - gl.glFinish(); - if(!buffersInitialized) { - initPBO(gl); - setKernelConstants(); - } - if(rebuild) { - buildProgram(); - setKernelConstants(); - } - compute(); - - render(gl.getGL2()); - } - - // OpenCL - private void compute() { - - int sliceWidth = width / slices; - double rangeX = (maxX - minX) / slices; - double rangeY = (maxY - minY); - - // release all old events, you can't reuse events in OpenCL - probes.release(); - - // start computation - for (int i = 0; i < slices; i++) { - - kernels[i].putArg( sliceWidth).putArg(height) - .putArg(minX + rangeX*i).putArg( minY) - .putArg( rangeX ).putArg(rangeY) - .rewind(); - - // aquire GL objects, and enqueue a kernel with a probe from the list - queues[i].putAcquireGLObject(pboBuffers[i].ID) - 
.put2DRangeKernel(kernels[i], 0, 0, sliceWidth, height, 0, 0, probes) - .putReleaseGLObject(pboBuffers[i].ID); - - } - - // block until done (important: finish before doing further gl work) - for (int i = 0; i < slices; i++) { - queues[i].finish(); - } - - } - - // OpenGL - private void render(GL2 gl) { - - gl.glClear(GL_COLOR_BUFFER_BIT); - - //draw slices - int sliceWidth = width / slices; - - for (int i = 0; i < slices; i++) { - - int seperatorOffset = drawSeperator?i:0; - - gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pboBuffers[i].GLID); - gl.glRasterPos2i(sliceWidth*i + seperatorOffset, 0); - - gl.glDrawPixels(sliceWidth, height, GL_BGRA, GL_UNSIGNED_BYTE, 0); - - } - gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - //draw info text - textRenderer.beginRendering(width, height, false); - - textRenderer.draw("precision: "+ (doublePrecision?"64bit":"32bit"), 10, height-15); - - for (int i = 0; i < slices; i++) { - CLEvent event = probes.getEvent(i); - long start = event.getProfilingInfo(START); - long end = event.getProfilingInfo(END); - textRenderer.draw("GPU"+i +" "+(int)((end-start)/1000000.0f)+"ms", 10, height-(20+16*(slices-i))); - } - - textRenderer.endRendering(); - } - - public void reshape(GLAutoDrawable drawable, int x, int y, int width, int height) { - - if(this.width == width && this.height == height) - return; - - this.width = width; - this.height = height; - - initPBO(drawable.getGL()); - - initView(drawable.getGL().getGL2(), drawable.getWidth(), drawable.getHeight()); - } - - private void initSceneInteraction() { - - MouseAdapter mouseAdapter = new MouseAdapter() { - - Point lastpos = new Point(); - - @Override - public void mouseDragged(MouseEvent e) { - - double offsetX = (lastpos.x - e.getX()) * (maxX - minX) / width; - double offsetY = (lastpos.y - e.getY()) * (maxY - minY) / height; - - minX += offsetX; - minY -= offsetY; - - maxX += offsetX; - maxY -= offsetY; - - lastpos = e.getPoint(); - - canvas.display(); - - } - - @Override - public void 
mouseMoved(MouseEvent e) { - lastpos = e.getPoint(); - } - - @Override - public void mouseWheelMoved(MouseWheelEvent e) { - float rotation = e.getWheelRotation() / 25.0f; - - double deltaX = rotation * (maxX - minX); - double deltaY = rotation * (maxY - minY); - - // offset for "zoom to cursor" - double offsetX = (e.getX() / (float)width - 0.5f) * deltaX * 2; - double offsetY = (e.getY() / (float)height- 0.5f) * deltaY * 2; - - minX += deltaX+offsetX; - minY += deltaY-offsetY; - - maxX +=-deltaX+offsetX; - maxY +=-deltaY-offsetY; - - canvas.display(); - } - }; - - KeyAdapter keyAdapter = new KeyAdapter() { - - @Override - public void keyPressed(KeyEvent e) { - if(e.getKeyCode() == KeyEvent.VK_SPACE) { - drawSeperator = !drawSeperator; - }else if(e.getKeyChar() > '0' && e.getKeyChar() < '9') { - int number = e.getKeyChar()-'0'; - slices = min(number, min(queues.length, MAX_PARRALLELISM_LEVEL)); - buffersInitialized = false; - }else if(e.getKeyCode() == KeyEvent.VK_D) { - doublePrecision = !doublePrecision; - rebuild = true; - } - canvas.display(); - } - - }; - - canvas.addMouseMotionListener(mouseAdapter); - canvas.addMouseWheelListener(mouseAdapter); - canvas.addKeyListener(keyAdapter); - } - - public void dispose(GLAutoDrawable drawable) { - } - - public static void main(String args[]) { - SwingUtilities.invokeLater(new Runnable() { - public void run() { - new MultiDeviceFractal(512, 512); - } - }); - } - -} diff --git a/src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java.orig b/src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java.orig deleted file mode 100644 index fc09d19..0000000 --- a/src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java.orig +++ /dev/null @@ -1,484 +0,0 @@ -package com.mbien.opencl.demos.fractal; - -import com.mbien.opencl.CLBuffer; -import com.mbien.opencl.CLCommandQueue; -import com.mbien.opencl.CLContext; -import com.mbien.opencl.CLDevice; -import com.mbien.opencl.CLEvent; -import com.mbien.opencl.CLEventList; -import 
com.mbien.opencl.CLException; -import com.mbien.opencl.CLGLBuffer; -import com.mbien.opencl.CLGLContext; -import com.mbien.opencl.CLKernel; -import com.mbien.opencl.CLProgram; -import com.mbien.opencl.CLProgram.CompilerOptions; -import com.sun.opengl.util.awt.TextRenderer; -import java.awt.Color; -import java.awt.Dimension; -import java.awt.Font; -import java.awt.Point; -import java.awt.event.KeyAdapter; -import java.awt.event.KeyEvent; -import java.awt.event.MouseAdapter; -import java.awt.event.MouseEvent; -import java.awt.event.MouseWheelEvent; -import java.io.IOException; -import java.nio.IntBuffer; -import java.util.logging.Level; -import java.util.logging.Logger; -import javax.media.opengl.DebugGL2; -import javax.media.opengl.GL; -import javax.media.opengl.GL2; -import javax.media.opengl.GLAutoDrawable; -import javax.media.opengl.GLCapabilities; -import javax.media.opengl.GLContext; -import javax.media.opengl.GLEventListener; -import javax.media.opengl.GLProfile; -import javax.media.opengl.awt.GLCanvas; -import javax.swing.JFrame; -import javax.swing.SwingUtilities; - -import static com.sun.gluegen.runtime.BufferFactory.*; -import static javax.media.opengl.GL2.*; -import static com.mbien.opencl.CLMemory.Mem.*; -import static com.mbien.opencl.CLEvent.ProfilingCommand.*; -import static com.mbien.opencl.CLCommandQueue.Mode.*; -import static com.mbien.opencl.CLDevice.Type.*; -import static java.lang.Math.*; - -/** - * Computes the Mandelbrot set with OpenCL using multiple GPUs and renders the result with OpenGL. - * A shared PBO is used as storage for the fractal image.
- * http://en.wikipedia.org/wiki/Mandelbrot_set - *

- * controls:
- * keys 1-9 control parallelism level
- * space enables/disables slice seperator
- * 'd' toggles between 32/64bit floatingpoint precision
- * mouse/mousewheel to drag and zoom
- *

- * @author Michael Bien - */ -public class MultiDeviceFractal implements GLEventListener { - - // max number of used GPUs - private static final int MAX_PARRALLELISM_LEVEL = 8; - - // max per pixel iterations to compute the fractal - private static final int MAX_ITERATIONS = 1000; - - private GLCanvas canvas; - - private CLContext clContext; - private CLCommandQueue[] queues; - private CLKernel[] kernels; - private CLProgram program; - private CLEventList probes; - private CLBuffer[] pboBuffers; - private CLBuffer[] colorMap; - - private int width = 0; - private int height = 0; - - private double minX = -2f; - private double minY = -1.2f; - private double maxX = 0.6f; - private double maxY = 1.3f; - - private int slices; - - private boolean drawSeperator; - private boolean doublePrecision; - private boolean buffersInitialized; - private boolean rebuild; - - private final TextRenderer textRenderer; - - public MultiDeviceFractal(int width, int height) { - - this.width = width; - this.height = height; - - canvas = new GLCanvas(new GLCapabilities(GLProfile.get(GLProfile.GL2))); - canvas.addGLEventListener(this); - initSceneInteraction(); - - JFrame frame = new JFrame("JOCL Multi GPU Mandelbrot Set"); - frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); - canvas.setPreferredSize(new Dimension(width, height)); - frame.add(canvas); - frame.pack(); - - frame.setVisible(true); - - textRenderer = new TextRenderer(frame.getFont().deriveFont(Font.BOLD, 14), true, true, null, false); - } - - public void init(GLAutoDrawable drawable) { - - // enable GL error checking using the composable pipeline - drawable.setGL(new DebugGL2(drawable.getGL().getGL2())); - - initCL(drawable.getContext()); - - GL2 gl = drawable.getGL().getGL2(); - - gl.setSwapInterval(0); - gl.glDisable(GL_DEPTH_TEST); - gl.glClearColor(0.0f, 0.0f, 0.0f, 1.0f); - - initView(gl, drawable.getWidth(), drawable.getHeight()); - - initPBO(gl); - setKernelConstants(); - } - - private void initCL(GLContext glCtx){ - 
try { - // create context managing all available GPUs - clContext = CLContext.create(GPU); - - CLDevice[] devices = clContext.getCLDevices(); - - slices = min(devices.length, MAX_PARRALLELISM_LEVEL); - - // create command queues for every GPU, setup colormap and init kernels - queues = new CLCommandQueue[slices]; - kernels = new CLKernel[slices]; - probes = new CLEventList(slices); - colorMap = new CLBuffer[slices]; - - for (int i = 0; i < slices; i++) { - - colorMap[i] = clContext.createIntBuffer(32*2, READ_ONLY); - initColorMap(colorMap[i].getBuffer(), 32, Color.BLUE, Color.GREEN, Color.RED); - - // create command queue and upload color map buffer on each used device - queues[i] = devices[i].createCommandQueue(PROFILING_MODE).putWriteBuffer(colorMap[i], true); // blocking upload - - } - - // load and build program - program = clContext.createProgram(getClass().getResourceAsStream("Mandelbrot.cl")); - buildProgram(); - - } catch (IOException ex) { - Logger.getLogger(getClass().getName()).log(Level.SEVERE, "can not find 'Mandelbrot.cl' in classpath.", ex); - } catch (CLException ex) { - Logger.getLogger(getClass().getName()).log(Level.SEVERE, "something went wrong, hopefully no one got hurt", ex); - } - - } - - private void initColorMap(IntBuffer colorMap, int stepSize, Color... 
colors) { - - for (int n = 0; n < colors.length - 1; n++) { - - Color color = colors[n]; - int r0 = color.getRed(); - int g0 = color.getGreen(); - int b0 = color.getBlue(); - - color = colors[n + 1]; - int r1 = color.getRed(); - int g1 = color.getGreen(); - int b1 = color.getBlue(); - - int deltaR = r1 - r0; - int deltaG = g1 - g0; - int deltaB = b1 - b0; - - for (int step = 0; step < stepSize; step++) { - float alpha = (float) step / (stepSize - 1); - int r = (int) (r0 + alpha * deltaR); - int g = (int) (g0 + alpha * deltaG); - int b = (int) (b0 + alpha * deltaB); - colorMap.put((r << 16) | (g << 8) | (b << 0)); - } - } - colorMap.rewind(); - - } - - private void initView(GL2 gl, int width, int height) { - - gl.glViewport(0, 0, width, height); - - gl.glMatrixMode(GL_MODELVIEW); - gl.glLoadIdentity(); - - gl.glMatrixMode(GL_PROJECTION); - gl.glLoadIdentity(); - gl.glOrtho(0.0, width, 0.0, height, 0.0, 1.0); - } - - @SuppressWarnings("unchecked") - private void initPBO(GL gl) { - - if(pboBuffers != null) { - int[] oldPbos = new int[pboBuffers.length]; - for (int i = 0; i < pboBuffers.length; i++) { - CLBuffer buffer = pboBuffers[i]; -// oldPbos[i] = buffer.GLID; - buffer.release(); - } -// gl.glDeleteBuffers(oldPbos.length, oldPbos, 0); - } - - pboBuffers = new CLBuffer[slices]; - -// int[] pbo = new int[slices]; -// gl.glGenBuffers(slices, pbo, 0); - - // setup one empty PBO per slice - for (int i = 0; i < slices; i++) { - -// gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo[i]); -// gl.glBufferData(GL_PIXEL_UNPACK_BUFFER, width*height * SIZEOF_INT / slices, null, GL_STREAM_DRAW); -// gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - pboBuffers[i] = clContext.createByteBuffer(width*height * SIZEOF_INT / slices, WRITE_ONLY); -// pboBuffers[i] = clContext.createFromGLBuffer(null, pbo[i], WRITE_ONLY); - - } - - buffersInitialized = true; - } - - private void buildProgram() { - - /* - * workaround: The driver keeps using the old binaries for some reason. 
- * to solve this we simple create a new program and release the old. - * however rebuilding programs should be possible -> remove when drivers are fixed. - */ - if(program != null && rebuild) { - String source = program.getSource(); - program.release(); - program = clContext.createProgram(source); - } - - // disable 64bit floating point math if not available - if(doublePrecision) { - for (CLDevice device : program.getCLDevices()) { - if(!device.isDoubleFPAvailable()) { - doublePrecision = false; - break; - } - } - } - - if(doublePrecision) { - program.build(CompilerOptions.FAST_RELAXED_MATH, "-D DOUBLE_FP"); - }else{ - program.build(CompilerOptions.FAST_RELAXED_MATH); - } - rebuild = false; - - for (int i = 0; i < kernels.length; i++) { - // init kernel with constants - kernels[i] = program.createCLKernel("mandelbrot"); - } - - } - - // init kernels with constants - private void setKernelConstants() { - for (int i = 0; i < slices; i++) { - kernels[i].setForce32BitArgs(!doublePrecision) - .setArg(6, pboBuffers[i]) - .setArg(7, colorMap[i]) - .setArg(8, colorMap[i].getBuffer().capacity()) - .setArg(9, MAX_ITERATIONS); - } - } - - // rendering cycle - public void display(GLAutoDrawable drawable) { - GL gl = drawable.getGL(); - - if(!buffersInitialized) { - initPBO(gl); - setKernelConstants(); - } - if(rebuild) { - buildProgram(); - setKernelConstants(); - } - // make sure GL does not use our objects before we start computeing - gl.glFinish(); - compute(); - - render(gl.getGL2()); - } - - // OpenCL - private void compute() { - - int sliceWidth = width / slices; - double rangeX = (maxX - minX) / slices; - double rangeY = (maxY - minY); - - // release all old events, you can't reuse events in OpenCL - probes.release(); - - long time = System.currentTimeMillis(); - // start computation - for (int i = 0; i < slices; i++) { - - kernels[i].putArg( sliceWidth).putArg(height) - .putArg(minX + rangeX*i).putArg( minY) - .putArg( rangeX ).putArg(rangeY) - .rewind(); - - // 
aquire GL objects, and enqueue a kernel with a probe from the list - queues[i] -// .putAcquireGLObject(pboBuffers[i].ID) - .put2DRangeKernel(kernels[i], 0, 0, sliceWidth, height, 0, 0, probes) -// .putReleaseGLObject(pboBuffers[i].ID) - ; - - } - - // block until done - for (int i = 0; i < slices; i++) { - queues[i].finish(); - } - System.out.println((System.currentTimeMillis()-time)/1000.0f); - - } - - // OpenGL - private void render(GL2 gl) { - - gl.glClear(GL_COLOR_BUFFER_BIT); - - //draw slices - int sliceWidth = width / slices; - -// for (int i = 0; i < slices; i++) { -// -// int seperatorOffset = drawSeperator?i:0; -// -// gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pboBuffers[i].GLID); -// gl.glRasterPos2i(sliceWidth*i + seperatorOffset, 0); -// -// gl.glDrawPixels(sliceWidth, height, GL_BGRA, GL_UNSIGNED_BYTE, 0); -// -// } -// gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - //draw info text - textRenderer.beginRendering(width, height, false); - - textRenderer.draw("precision: "+ (doublePrecision?"64bit":"32bit"), 10, height-15); - - for (int i = 0; i < slices; i++) { - CLEvent event = probes.getEvent(i); - long start = event.getProfilingInfo(START); - long end = event.getProfilingInfo(END); - textRenderer.draw("GPU"+i +" "+((end-start)/1000000000.0f)+"s", 10, height-(20+16*(slices-i))); - } - - textRenderer.endRendering(); - } - - public void reshape(GLAutoDrawable drawable, int x, int y, int width, int height) { - - if(this.width == width && this.height == height) - return; - - this.width = width; - this.height = height; - - initPBO(drawable.getGL()); - - initView(drawable.getGL().getGL2(), drawable.getWidth(), drawable.getHeight()); - } - - private void initSceneInteraction() { - - MouseAdapter mouseAdapter = new MouseAdapter() { - - Point lastpos = new Point(); - - @Override - public void mouseDragged(MouseEvent e) { - - double offsetX = (lastpos.x - e.getX()) * (maxX - minX) / width; - double offsetY = (lastpos.y - e.getY()) * (maxY - minY) / height; - - 
minX += offsetX; - minY -= offsetY; - - maxX += offsetX; - maxY -= offsetY; - - lastpos = e.getPoint(); - - canvas.display(); - - } - - @Override - public void mouseMoved(MouseEvent e) { - lastpos = e.getPoint(); - } - - @Override - public void mouseWheelMoved(MouseWheelEvent e) { - float rotation = e.getWheelRotation() / 25.0f; - - double deltaX = rotation * (maxX - minX); - double deltaY = rotation * (maxY - minY); - - // offset for "zoom to cursor" - double offsetX = (e.getX() / (float)width - 0.5f) * deltaX * 2; - double offsetY = (e.getY() / (float)height- 0.5f) * deltaY * 2; - - minX += deltaX+offsetX; - minY += deltaY-offsetY; - - maxX +=-deltaX+offsetX; - maxY +=-deltaY-offsetY; - - canvas.display(); - } - }; - - KeyAdapter keyAdapter = new KeyAdapter() { - - @Override - public void keyPressed(KeyEvent e) { - if(e.getKeyCode() == KeyEvent.VK_SPACE) { - drawSeperator = !drawSeperator; - }else if(e.getKeyChar() > '0' && e.getKeyChar() < '9') { - int number = e.getKeyChar()-'0'; - slices = min(number, min(queues.length, MAX_PARRALLELISM_LEVEL)); - buffersInitialized = false; - }else if(e.getKeyCode() == KeyEvent.VK_D) { - doublePrecision = !doublePrecision; - rebuild = true; - } - canvas.display(); - } - - }; - - canvas.addMouseMotionListener(mouseAdapter); - canvas.addMouseWheelListener(mouseAdapter); - canvas.addKeyListener(keyAdapter); - } - - public void dispose(GLAutoDrawable drawable) { - } - - public static void main(String args[]) { - SwingUtilities.invokeLater(new Runnable() { - public void run() { - new MultiDeviceFractal(512, 512); - } - }); - } - -} diff --git a/src/com/mbien/opencl/demos/hellojocl/HelloJOCL.java b/src/com/mbien/opencl/demos/hellojocl/HelloJOCL.java deleted file mode 100644 index acbd84c..0000000 --- a/src/com/mbien/opencl/demos/hellojocl/HelloJOCL.java +++ /dev/null @@ -1,91 +0,0 @@ -package com.mbien.opencl.demos.hellojocl; - -import com.mbien.opencl.CLBuffer; -import com.mbien.opencl.CLCommandQueue; -import 
com.mbien.opencl.CLContext; -import com.mbien.opencl.CLKernel; -import com.mbien.opencl.CLProgram; -import java.io.IOException; -import java.nio.FloatBuffer; -import java.util.Random; - -import static java.lang.System.*; -import static com.mbien.opencl.CLMemory.Mem.*; - -/** - * Hello Java OpenCL example. Adds all elements of buffer A to buffer B - * and stores the result in buffer C.
- * Sample was inspired by the Nvidia VectorAdd example written in C/C++ - * which is bundled in the Nvidia OpenCL SDK. - * @author Michael Bien - */ -public class HelloJOCL { - - public static void main(String[] args) throws IOException { - - int elementCount = 11444777; // Length of arrays to process - int localWorkSize = 256; // Local work size dimensions - int globalWorkSize = roundUp(localWorkSize, elementCount); // rounded up to the nearest multiple of the localWorkSize - - // set up - CLContext context = CLContext.create(); - - CLProgram program = context.createProgram(HelloJOCL.class.getResourceAsStream("VectorAdd.cl")).build(); - - CLBuffer clBufferA = context.createFloatBuffer(globalWorkSize, READ_ONLY); - CLBuffer clBufferB = context.createFloatBuffer(globalWorkSize, READ_ONLY); - CLBuffer clBufferC = context.createFloatBuffer(globalWorkSize, WRITE_ONLY); - - out.println("used device memory: " - + (clBufferA.getSize()+clBufferB.getSize()+clBufferC.getSize())/1000000 +"MB"); - - // fill read buffers with random numbers (just to have test data; seed is fixed -> results will not change between runs). - fillBuffer(clBufferA.getBuffer(), 12345); - fillBuffer(clBufferB.getBuffer(), 67890); - - // get a reference to the kernel functon with the name 'VectorAdd' - // and map the buffers to its input parameters. - CLKernel kernel = program.createCLKernel("VectorAdd"); - kernel.putArgs(clBufferA, clBufferB, clBufferC).putArg(elementCount); - - // create command queue on fastest device. - CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue(); - - // asynchronous write of data to GPU device, blocking read later to get the computed results back. - long time = nanoTime(); - queue.putWriteBuffer(clBufferA, false) - .putWriteBuffer(clBufferB, false) - .put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize) - .putReadBuffer(clBufferC, true); - time = nanoTime() - time; - - // cleanup all resources associated with this context. 
- context.release(); - - // print first few elements of the resulting buffer to the console. - out.println("a+b=c results snapshot: "); - for(int i = 0; i < 10; i++) - out.print(clBufferC.getBuffer().get() + ", "); - out.println("...; " + clBufferC.getBuffer().remaining() + " more"); - - out.println("computation took: "+(time/1000000)+"ms"); - - } - - private static final void fillBuffer(FloatBuffer buffer, int seed) { - Random rnd = new Random(seed); - while(buffer.remaining() != 0) - buffer.put(rnd.nextFloat()*100); - buffer.rewind(); - } - - private static final int roundUp(int groupSize, int globalSize) { - int r = globalSize % groupSize; - if (r == 0) { - return globalSize; - } else { - return globalSize + groupSize - r; - } - } - -} \ No newline at end of file diff --git a/src/com/mbien/opencl/demos/hellojocl/VectorAdd.cl b/src/com/mbien/opencl/demos/hellojocl/VectorAdd.cl deleted file mode 100644 index ac9dde2..0000000 --- a/src/com/mbien/opencl/demos/hellojocl/VectorAdd.cl +++ /dev/null @@ -1,15 +0,0 @@ - - // OpenCL Kernel Function for element by element vector addition - kernel void VectorAdd(global const float* a, global const float* b, global float* c, int numElements) { - - // get index into global data array - int iGID = get_global_id(0); - - // bound check (equivalent to the limit on a 'for' loop for standard/serial C code - if (iGID >= numElements) { - return; - } - - // add the vector elements - c[iGID] = a[iGID] + b[iGID]; - } \ No newline at end of file diff --git a/src/com/mbien/opencl/demos/joglinterop/GLCLInteroperabilityDemo.java b/src/com/mbien/opencl/demos/joglinterop/GLCLInteroperabilityDemo.java deleted file mode 100644 index 74783f3..0000000 --- a/src/com/mbien/opencl/demos/joglinterop/GLCLInteroperabilityDemo.java +++ /dev/null @@ -1,277 +0,0 @@ -package com.mbien.opencl.demos.joglinterop; - -import com.mbien.opencl.CLCommandQueue; -import com.mbien.opencl.CLDevice; -import com.mbien.opencl.gl.CLGLBuffer; -import 
com.mbien.opencl.gl.CLGLContext; -import com.mbien.opencl.CLKernel; -import com.mbien.opencl.CLPlatform; -import com.mbien.opencl.CLProgram; -import com.jogamp.opengl.util.Animator; -import java.io.IOException; -import javax.media.opengl.DebugGL2; -import javax.media.opengl.GL2; -import javax.media.opengl.GLAutoDrawable; -import javax.media.opengl.GLCapabilities; -import javax.media.opengl.GLEventListener; -import javax.media.opengl.GLProfile; -import javax.media.opengl.awt.GLCanvas; -import javax.media.opengl.glu.gl2.GLUgl2; -import javax.swing.JFrame; -import javax.swing.SwingUtilities; - -import static com.jogamp.common.nio.Buffers.*; - -/** - * JOCL - JOGL interoperability example. - * @author Michael Bien - */ -public class GLCLInteroperabilityDemo implements GLEventListener { - - private final GLUgl2 glu = new GLUgl2(); - - private final int MESH_SIZE = 256; - - private int width; - private int height; - -// private final FloatBuffer vb; -// private final IntBuffer ib; - - private final int[] glObjects = new int[2]; - private final int VERTICES = 0; -// private final int INDICES = 1; - - private final UserSceneInteraction usi; - - private CLGLContext clContext; - private CLKernel kernel; - private CLCommandQueue commandQueue; - private CLGLBuffer clBuffer; - - private float step = 0; - - public GLCLInteroperabilityDemo() { - - this.usi = new UserSceneInteraction(); - - // create direct memory buffers -// vb = newFloatBuffer(MESH_SIZE * MESH_SIZE * 4); -// ib = newIntBuffer((MESH_SIZE - 1) * (MESH_SIZE - 1) * 2 * 3); -// -// // build indices -// // 0---3 -// // | \ | -// // 1---2 -// for (int h = 0; h < MESH_SIZE - 1; h++) { -// for (int w = 0; w < MESH_SIZE - 1; w++) { -// -// // 0 - 3 - 2 -// ib.put(w * 6 + h * (MESH_SIZE - 1) * 6, w + (h ) * (MESH_SIZE) ); -// ib.put(w * 6 + h * (MESH_SIZE - 1) * 6 + 1, w + (h ) * (MESH_SIZE) + 1); -// ib.put(w * 6 + h * (MESH_SIZE - 1) * 6 + 2, w + (h + 1) * (MESH_SIZE) + 1); -// -// // 0 - 2 - 1 -// ib.put(w * 6 + h * 
(MESH_SIZE - 1) * 6 + 3, w + (h ) * (MESH_SIZE) ); -// ib.put(w * 6 + h * (MESH_SIZE - 1) * 6 + 4, w + (h + 1) * (MESH_SIZE) + 1); -// ib.put(w * 6 + h * (MESH_SIZE - 1) * 6 + 5, w + (h + 1) * (MESH_SIZE) ); -// -// } -// } -// ib.rewind(); - - SwingUtilities.invokeLater(new Runnable() { - public void run() { - initUI(); - } - }); - - } - - private void initUI() { - - this.width = 600; - this.height = 400; - - GLCapabilities config = new GLCapabilities(GLProfile.get(GLProfile.GL2)); - config.setSampleBuffers(true); - config.setNumSamples(4); - - GLCanvas canvas = new GLCanvas(config); - canvas.addGLEventListener(this); - usi.init(canvas); - - JFrame frame = new JFrame("JOGL-JOCL Interoperability Example"); - frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); - frame.add(canvas); - frame.setSize(width, height); - - frame.setVisible(true); - - } - - - public void init(GLAutoDrawable drawable) { - - if(clContext == null) { - - // find gl compatible device - CLDevice[] devices = CLPlatform.getDefault().listCLDevices(); - CLDevice device = null; - for (CLDevice d : devices) { - if(d.isGLMemorySharingSupported()) { - device = d; - break; - } - } - // create OpenCL context before creating any OpenGL objects - // you want to share with OpenCL (AMD driver requirement) - clContext = CLGLContext.create(drawable.getContext(), device); - - // enable GL error checking using the composable pipeline - drawable.setGL(new DebugGL2(drawable.getGL().getGL2())); - - // OpenGL initialization - GL2 gl = drawable.getGL().getGL2(); - - gl.setSwapInterval(1); - - gl.glPolygonMode(GL2.GL_FRONT_AND_BACK, GL2.GL_LINE); - - gl.glGenBuffers(glObjects.length, glObjects, 0); - - // gl.glBindBuffer(GL2.GL_ELEMENT_ARRAY_BUFFER, glObjects[INDICES]); - // gl.glBufferData(GL2.GL_ELEMENT_ARRAY_BUFFER, ib.capacity() * SIZEOF_INT, ib, GL2.GL_STATIC_DRAW); - // gl.glBindBuffer(GL2.GL_ELEMENT_ARRAY_BUFFER, 0); - - gl.glEnableClientState(GL2.GL_VERTEX_ARRAY); - gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, 
glObjects[VERTICES]); - gl.glBufferData(GL2.GL_ARRAY_BUFFER, MESH_SIZE * MESH_SIZE * 4 * SIZEOF_FLOAT, null, GL2.GL_DYNAMIC_DRAW); - gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, 0); - gl.glDisableClientState(GL2.GL_VERTEX_ARRAY); - - pushPerspectiveView(gl); - gl.glFinish(); - - // init OpenCL - initCL(); - - // start rendering thread - Animator animator = new Animator(drawable); - animator.start(); - - } - } - - private void initCL() { - - CLProgram program; - try { - program = clContext.createProgram(getClass().getResourceAsStream("JoglInterop.cl")); - program.build(); - System.out.println(program.getBuildStatus()); - System.out.println(program.isExecutable()); - System.out.println(program.getBuildLog()); - } catch (IOException ex) { - throw new RuntimeException("can not handle exception", ex); - } - - commandQueue = clContext.getMaxFlopsDevice().createCommandQueue(); - - clBuffer = clContext.createFromGLBuffer(glObjects[VERTICES], CLGLBuffer.Mem.WRITE_ONLY); - - System.out.println("cl buffer type: " + clBuffer.getGLObjectType()); - System.out.println("shared with gl buffer: " + clBuffer.getGLObjectID()); - - kernel = program.createCLKernel("sineWave") - .putArg(clBuffer) - .putArg(MESH_SIZE) - .rewind(); - - System.out.println("cl initialised"); - } - - - public void display(GLAutoDrawable drawable) { - - GL2 gl = drawable.getGL().getGL2(); - - // ensure pipeline is clean before doing cl work - gl.glFinish(); - - computeHeightfield(); - - gl.glClear(GL2.GL_COLOR_BUFFER_BIT | GL2.GL_DEPTH_BUFFER_BIT); - gl.glLoadIdentity(); - - usi.interact(gl); - - gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, glObjects[VERTICES]); - gl.glVertexPointer(4, GL2.GL_FLOAT, 0, 0); - -// gl.glBindBuffer(GL2.GL_ELEMENT_ARRAY_BUFFER, glObjects[INDICES]); - - gl.glEnableClientState(GL2.GL_VERTEX_ARRAY); - gl.glDrawArrays(GL2.GL_POINTS, 0, MESH_SIZE * MESH_SIZE); -// gl.glDrawElements(GL2.GL_TRIANGLES, ib.capacity(), GL2.GL_UNSIGNED_INT, 0); - gl.glDisableClientState(GL2.GL_VERTEX_ARRAY); - -// 
gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, 0); - - } - - /* - * Computes a heightfield using a OpenCL kernel. - */ - private void computeHeightfield() { - - kernel.setArg(2, step += 0.05f); - - commandQueue.putAcquireGLObject(clBuffer.ID) - .put2DRangeKernel(kernel, 0, 0, MESH_SIZE, MESH_SIZE, 0, 0) - .putReleaseGLObject(clBuffer.ID) - .finish(); - - } - - private void pushPerspectiveView(GL2 gl) { - - gl.glMatrixMode(GL2.GL_PROJECTION); - gl.glPushMatrix(); - - gl.glLoadIdentity(); - - glu.gluPerspective(60, width / (float)height, 1, 1000); - gl.glMatrixMode(GL2.GL_MODELVIEW); - - gl.glPushMatrix(); - gl.glLoadIdentity(); - - } - - private void popView(GL2 gl) { - - gl.glMatrixMode(GL2.GL_PROJECTION); - gl.glPopMatrix(); - - gl.glMatrixMode(GL2.GL_MODELVIEW); - gl.glPopMatrix(); - - } - - - public void reshape(GLAutoDrawable drawable, int arg1, int arg2, int width, int height) { - this.width = width; - this.height = height; - GL2 gl = drawable.getGL().getGL2(); - popView(gl); - pushPerspectiveView(gl); - } - - public void dispose(GLAutoDrawable drawable) { } - - public static void main(String[] args) { - new GLCLInteroperabilityDemo(); - } - -} \ No newline at end of file diff --git a/src/com/mbien/opencl/demos/joglinterop/JoglInterop.cl b/src/com/mbien/opencl/demos/joglinterop/JoglInterop.cl deleted file mode 100644 index 0f0bcfc..0000000 --- a/src/com/mbien/opencl/demos/joglinterop/JoglInterop.cl +++ /dev/null @@ -1,23 +0,0 @@ - -/** -* animated 2D sine pattern. 
-*/ -kernel void sineWave(global float4 * vertex, int size, float time) { - - unsigned int x = get_global_id(0); - unsigned int y = get_global_id(1); - - // calculate uv coordinates - float u = x / (float) size; - float v = y / (float) size; - - u = u*2.0f - 1.0f; - v = v*2.0f - 1.0f; - - // calculate simple sine wave pattern - float freq = 4.0f; - float w = sin(u*freq + time) * cos(v*freq + time) * 0.5f; - - // write output vertex - vertex[y*size + x] = (float4)(u*10.0f, w*10.0f, v*10.0f, 1.0f); -} diff --git a/src/com/mbien/opencl/demos/joglinterop/UserSceneInteraction.java b/src/com/mbien/opencl/demos/joglinterop/UserSceneInteraction.java deleted file mode 100644 index bec2d28..0000000 --- a/src/com/mbien/opencl/demos/joglinterop/UserSceneInteraction.java +++ /dev/null @@ -1,103 +0,0 @@ -package com.mbien.opencl.demos.joglinterop; - -import java.awt.Component; -import java.awt.Point; -import java.awt.event.MouseAdapter; -import java.awt.event.MouseEvent; -import java.awt.event.MouseMotionAdapter; -import java.awt.event.MouseWheelEvent; -import java.awt.event.MouseWheelListener; -import javax.media.opengl.GL2; - -/** - * Utility class for interacting with a scene. Supports rotation and zoom around origin. 
- * @author Michael Bien - */ -public class UserSceneInteraction { - - private float z = -20; - private float rotx = 45; - private float roty = 30; - - private Point dragstart; - private enum MOUSE_MODE { DRAG_ROTATE, DRAG_ZOOM } - private MOUSE_MODE dragmode = MOUSE_MODE.DRAG_ROTATE; - - - public void init(Component component) { - initMouseListeners(component); - } - - private void initMouseListeners(Component component) { - component.addMouseMotionListener(new MouseMotionAdapter() { - - @Override - public void mouseDragged(MouseEvent e) { - - if (dragstart != null) { - switch (dragmode) { - case DRAG_ROTATE: - rotx += e.getY() - dragstart.getY(); - roty += e.getX() - dragstart.getX(); - break; - case DRAG_ZOOM: - z += (e.getY() - dragstart.getY()) / 5.0f; - break; - } - } - - dragstart = e.getPoint(); - } - }); - component.addMouseWheelListener(new MouseWheelListener() { - - public void mouseWheelMoved(MouseWheelEvent e) { - z += e.getWheelRotation()*5; - } - - }); - component.addMouseListener(new MouseAdapter() { - - @Override - public void mousePressed(MouseEvent e) { - switch (e.getButton()) { - case (MouseEvent.BUTTON1): - dragmode = MOUSE_MODE.DRAG_ROTATE; - break; - case (MouseEvent.BUTTON2): - dragmode = MOUSE_MODE.DRAG_ZOOM; - break; - case (MouseEvent.BUTTON3): - dragmode = MOUSE_MODE.DRAG_ZOOM; - break; - } - } - - @Override - public void mouseReleased(MouseEvent e) { - switch (e.getButton()) { - case (MouseEvent.BUTTON1): - dragmode = MOUSE_MODE.DRAG_ZOOM; - break; - case (MouseEvent.BUTTON2): - dragmode = MOUSE_MODE.DRAG_ROTATE; - break; - case (MouseEvent.BUTTON3): - dragmode = MOUSE_MODE.DRAG_ROTATE; - break; - } - - dragstart = null; - } - }); - } - - - public void interact(GL2 gl) { - gl.glTranslatef(0, 0, z); - gl.glRotatef(rotx, 1f, 0f, 0f); - gl.glRotatef(roty, 0f, 1.0f, 0f); - } - - -} \ No newline at end of file diff --git a/src/com/mbien/opencl/demos/julia3d/Julia3d.java b/src/com/mbien/opencl/demos/julia3d/Julia3d.java deleted file mode 
100644 index 5d5378f..0000000 --- a/src/com/mbien/opencl/demos/julia3d/Julia3d.java +++ /dev/null @@ -1,212 +0,0 @@ -package com.mbien.opencl.demos.julia3d; - -import com.mbien.opencl.CLBuffer; -import com.mbien.opencl.CLCommandQueue; -import com.mbien.opencl.CLContext; -import com.mbien.opencl.CLDevice; -import com.mbien.opencl.CLKernel; -import com.mbien.opencl.CLPlatform; -import com.mbien.opencl.CLProgram; -import com.mbien.opencl.demos.julia3d.structs.Camera; -import com.mbien.opencl.demos.julia3d.structs.RenderingConfig; -import com.mbien.opencl.demos.julia3d.structs.Vec; -import java.io.IOException; -import java.nio.Buffer; -import java.nio.ByteBuffer; -import java.nio.FloatBuffer; -import javax.swing.SwingUtilities; - -import static com.mbien.opencl.CLMemory.Mem.*; -import static com.mbien.opencl.CLProgram.CompilerOptions.*; -import static com.mbien.opencl.demos.julia3d.UserSceneController.*; - -/** - * This sample has been ported from David Buciarelli's juliaGPU v1.2 written in C. 
- * @author Michael Bien - */ -public class Julia3d { - - private final CLContext context; - private CLBuffer pixelBuffer; - private final CLBuffer configBuffer; - private final CLCommandQueue commandQueue; - private final CLProgram program; - private final CLKernel julia; - private final CLKernel multiply; - - private final int workGroupSize; - private final String kernelFileName = "rendering_kernel.cl"; - - final RenderingConfig config; - - private Julia3d(RenderingConfig renderConfig) { - this.config = renderConfig; - updateCamera(); - - //setup - CLDevice gpu = CLPlatform.getDefault().getMaxFlopsDevice(); - context = CLContext.create(gpu); - - workGroupSize = 256; - - //allocate buffers - configBuffer = context.createBuffer(config.getBuffer(), READ_ONLY); - commandQueue = gpu.createCommandQueue(); -// update(true); - - try { - program = context.createProgram(Julia3d.class.getResourceAsStream(kernelFileName)) - .build(FAST_RELAXED_MATH); - } catch (IOException ex) { - throw new RuntimeException("unable to load program from source", ex); - } - - julia = program.createCLKernel("JuliaGPU"); - multiply = program.createCLKernel("multiply"); - System.out.println(program.getBuildStatus(gpu)); - System.out.println(program.getBuildLog()); - - } - - void update(boolean reallocate) { - - updateCamera(); - - int bufferSize = config.getWidth() * config.getHeight() * 3; - if(reallocate) { - if(pixelBuffer != null) { - pixelBuffer.release(); - } - - pixelBuffer = context.createFloatBuffer(bufferSize, READ_WRITE, USE_BUFFER); - } - - commandQueue.putWriteBuffer(configBuffer, true); - - julia.putArg(pixelBuffer) - .putArg(configBuffer) - .rewind(); - - multiply.putArg(pixelBuffer) - .putArg(bufferSize) - .rewind(); - } - - - void compute(boolean fastRendering) { - - // calculate workgroup size - int globalThreads = config.getWidth() * config.getHeight(); - if(globalThreads % workGroupSize != 0) - globalThreads = (globalThreads / workGroupSize + 1) * workGroupSize; - - int 
localThreads = workGroupSize; - int superSamplingSize = config.getSuperSamplingSize(); - - if (!fastRendering && superSamplingSize > 1) { - - for (int y = 0; y < superSamplingSize; ++y) { - for (int x = 0; x < superSamplingSize; ++x) { - - float sampleX = (x + 0.5f) / superSamplingSize; - float sampleY = (y + 0.5f) / superSamplingSize; - - if (x == 0 && y == 0) { - // First pass - julia.setArg(2, 0) - .setArg(3, sampleX) - .setArg(4, sampleY); - - commandQueue.put1DRangeKernel(julia, 0, globalThreads, localThreads); - - } else if (x == (superSamplingSize - 1) && y == (superSamplingSize - 1)) { - // Last pass - julia.setArg(2, 1) - .setArg(3, sampleX) - .setArg(4, sampleY); - - // normalize the values we accumulated - multiply.setArg(2, 1.0f/(superSamplingSize*superSamplingSize)); - - commandQueue.put1DRangeKernel(julia, 0, globalThreads, localThreads) - .put1DRangeKernel(multiply, 0, globalThreads*3, localThreads); - } else { - julia.setArg(2, 1) - .setArg(3, sampleX) - .setArg(4, sampleY); - - commandQueue.put1DRangeKernel(julia, 0, globalThreads, localThreads); - - } - } - } - - }else{ - - //fast rendering - julia.setArg(2, 0) - .setArg(3, 0.0f) - .setArg(4, 0.0f); - - commandQueue.put1DRangeKernel(julia, 0, globalThreads, localThreads); - } - - commandQueue.putBarrier() - .putReadBuffer(pixelBuffer, true); - - } - - private void updateCamera() { - - Camera camera = config.getCamera(); - - Vec dir = camera.getDir(); - Vec target = camera.getTarget(); - Vec camX = camera.getX(); - Vec camY = camera.getY(); - Vec orig = camera.getOrig(); - - vsub(dir, target, orig); - vnorm(dir); - - Vec up = Vec.create().setX(0).setY(1).setZ(0); - vxcross(camX, dir, up); - vnorm(camX); - vmul(camX, config.getWidth() * .5135f / config.getHeight(), camX); - - vxcross(camY, camX, dir); - vnorm(camY); - vmul(camY, .5135f, camY); - } - - - public static void main(String[] args) { - - RenderingConfig config = RenderingConfig.create() - .setWidth(640).setHeight(480) - .setEnableShadow(1) 
- .setSuperSamplingSize(2) - .setActvateFastRendering(1) - .setMaxIterations(9) - .setEpsilon(0.003f * 0.75f) - .setLight(new float[] {5, 10, 15}) - .setMu(new float[] {-0.2f, 0.4f, -0.4f, -0.4f}); - - config.getCamera().getOrig() .setX(1).setY(2).setZ(8); - config.getCamera().getTarget().setX(0).setY(0).setZ(0); - - final Julia3d julia3d = new Julia3d(config); - - SwingUtilities.invokeLater(new Runnable() { - public void run() { - new Renderer(julia3d); - } - }); - } - - Buffer getPixelBuffer() { - return pixelBuffer.getBuffer(); - } - - -} diff --git a/src/com/mbien/opencl/demos/julia3d/Renderer.java b/src/com/mbien/opencl/demos/julia3d/Renderer.java deleted file mode 100644 index 9266aba..0000000 --- a/src/com/mbien/opencl/demos/julia3d/Renderer.java +++ /dev/null @@ -1,203 +0,0 @@ -package com.mbien.opencl.demos.julia3d; - -import com.mbien.opencl.demos.julia3d.structs.RenderingConfig; -import com.jogamp.opengl.util.awt.TextRenderer; -import java.awt.Dimension; -import java.awt.Font; -import java.nio.FloatBuffer; -import java.util.Timer; -import java.util.TimerTask; -import javax.media.opengl.GL2; -import javax.media.opengl.GLAutoDrawable; -import javax.media.opengl.GLCapabilities; -import javax.media.opengl.GLEventListener; -import javax.media.opengl.GLProfile; -import javax.media.opengl.awt.GLCanvas; -import javax.swing.JFrame; - -import static com.jogamp.common.nio.Buffers.*; -import static javax.media.opengl.GL2.*; -import static java.lang.String.*; - -/** - * JOGL renderer for displaying the julia set. 
- * @author Michael Bien - */ -public class Renderer implements GLEventListener { - - public final static int MU_RECT_SIZE = 80; - - private final Julia3d julia3d; - private final GLCanvas canvas; - private final RenderingConfig config; - private final FloatBuffer juliaSlice; - private final UserSceneController usi; - private final TextRenderer textRenderer; - - private TimerTask task; - private final Timer timer; - - public Renderer(Julia3d julia3d) { - this.julia3d = julia3d; - this.config = julia3d.config; - - timer = new Timer(); - - juliaSlice = newDirectFloatBuffer(MU_RECT_SIZE * MU_RECT_SIZE * 4); - - canvas = new GLCanvas(new GLCapabilities(GLProfile.get(GLProfile.GL2))); - canvas.addGLEventListener(this); - - usi = new UserSceneController(); - usi.init(this, canvas, config); - - JFrame frame = new JFrame("Java OpenCL - Julia3D GPU"); - frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); - canvas.setPreferredSize(new Dimension(config.getWidth(), config.getHeight())); - frame.add(canvas); - frame.pack(); - - textRenderer = new TextRenderer(frame.getFont().deriveFont(Font.BOLD, 14), true, true, null, false); - - frame.setVisible(true); - } - - public void init(GLAutoDrawable drawable) { - drawable.getGL().getGL2().glMatrixMode(GL_PROJECTION); - } - - void update() { - julia3d.update(false); - canvas.display(); - } - - public void display(GLAutoDrawable drawable) { - - //compute - julia3d.compute(config.getActvateFastRendering() == 1); - - GL2 gl = drawable.getGL().getGL2(); - gl.glClear(GL_COLOR_BUFFER_BIT); - - // draw julia set - gl.glRasterPos2i(0, 0); - gl.glDrawPixels(config.getWidth(), config.getHeight(), GL_RGB, GL_FLOAT, julia3d.getPixelBuffer()); - - - // Draw Mu constant - int width = config.getWidth(); - int height = config.getHeight(); - float[] mu = config.getMu(); - - gl.glEnable(GL_BLEND); - gl.glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); - int baseMu1 = width - MU_RECT_SIZE - 2; - int baseMu2 = 1; - drawJuliaSlice(gl, baseMu1, baseMu2, 
mu[0], mu[1]); - int baseMu3 = width - MU_RECT_SIZE - 2; - int baseMu4 = MU_RECT_SIZE + 2; - drawJuliaSlice(gl, baseMu3, baseMu4, mu[2], mu[3]); - gl.glDisable(GL_BLEND); - - gl.glColor3f(1, 1, 1); - int mu1 = (int) (baseMu1 + MU_RECT_SIZE * (mu[0] + 1.5f) / 3.f); - int mu2 = (int) (baseMu2 + MU_RECT_SIZE * (mu[1] + 1.5f) / 3.f); - gl.glBegin(GL_LINES); - gl.glVertex2i(mu1 - 4, mu2); - gl.glVertex2i(mu1 + 4, mu2); - gl.glVertex2i(mu1, mu2 - 4); - gl.glVertex2i(mu1, mu2 + 4); - gl.glEnd(); - - int mu3 = (int) (baseMu3 + MU_RECT_SIZE * (mu[2] + 1.5f) / 3.f); - int mu4 = (int) (baseMu4 + MU_RECT_SIZE * (mu[3] + 1.5f) / 3.f); - gl.glBegin(GL_LINES); - gl.glVertex2i(mu3 - 4, mu4); - gl.glVertex2i(mu3 + 4, mu4); - gl.glVertex2i(mu3, mu4 - 4); - gl.glVertex2i(mu3, mu4 + 4); - gl.glEnd(); - - // info text - textRenderer.beginRendering(width, height); - textRenderer.draw(format("Epsilon %.5f - Max. Iter. %d", config.getEpsilon(), config.getMaxIterations()), 8, 10); - textRenderer.draw(format("Mu = (%.3f, %.3f, %.3f, %.3f)", mu[0], mu[1], mu[2], mu[3]), 8, 25); - textRenderer.draw(format("Shadow %s - SuperSampling %dx%d - Fast rendering %s", - config.getEnableShadow() == 1 ? "on" : "off", - config.getSuperSamplingSize(), config.getSuperSamplingSize(), - config.getActvateFastRendering() == 1 ? 
"on" : "off"), 8, 40); - textRenderer.endRendering(); - - // timer task scheduling, delay gpu intensive high quality rendering - if(task != null) { - task.cancel(); - } - if(config.getActvateFastRendering() == 1) { - task = new TimerTask() { - @Override - public void run() { - config.setActvateFastRendering(0); - update(); - config.setActvateFastRendering(1); - } - }; - timer.schedule(task, 2000); - } - } - - private void drawJuliaSlice(GL2 gl, int origX, int origY, float cR, float cI) { - - int index = 0; - float invSize = 3.0f / MU_RECT_SIZE; - for (int i = 0; i < MU_RECT_SIZE; ++i) { - for (int j = 0; j < MU_RECT_SIZE; ++j) { - - float x = i * invSize - 1.5f; - float y = j * invSize - 1.5f; - - int iter; - for (iter = 0; iter < 64; ++iter) { - float x2 = x * x; - float y2 = y * y; - if (x2 + y2 > 4.0f) { - break; - } - - float newx = x2 - y2 + cR; - float newy = 2.f * x * y + cI; - x = newx; - y = newy; - } - - juliaSlice.put(index++, iter / 64.0f); - juliaSlice.put(index++, 0.0f); - juliaSlice.put(index++, 0.0f); - juliaSlice.put(index++, 0.5f); - } - } - - gl.glRasterPos2i(origX, origY); - gl.glDrawPixels(MU_RECT_SIZE, MU_RECT_SIZE, GL_RGBA, GL_FLOAT, juliaSlice); - } - - - public void reshape(GLAutoDrawable drawable, int x, int y, int newWidth, int newHeight) { - - config.setWidth(newWidth); - config.setHeight(newHeight); - - GL2 gl = drawable.getGL().getGL2(); - - gl.glViewport(0, 0, newWidth, newHeight); - gl.glLoadIdentity(); - gl.glOrtho(-0.5f, newWidth - 0.5f, -0.5f, newHeight - 0.5f, -1.0f, 1.0f); - - julia3d.update(true); - - } - - public void dispose(GLAutoDrawable drawable) { - } - - -} diff --git a/src/com/mbien/opencl/demos/julia3d/UserSceneController.java b/src/com/mbien/opencl/demos/julia3d/UserSceneController.java deleted file mode 100644 index 849de3a..0000000 --- a/src/com/mbien/opencl/demos/julia3d/UserSceneController.java +++ /dev/null @@ -1,249 +0,0 @@ -package com.mbien.opencl.demos.julia3d; - -import 
com.mbien.opencl.demos.julia3d.structs.RenderingConfig; -import com.mbien.opencl.demos.julia3d.structs.Vec; -import java.awt.Component; -import java.awt.Point; -import java.awt.event.KeyAdapter; -import java.awt.event.KeyEvent; -import java.awt.event.MouseAdapter; -import java.awt.event.MouseEvent; -import java.awt.event.MouseWheelEvent; - -import static java.lang.Math.*; -import static com.mbien.opencl.demos.julia3d.Renderer.*; - -/** - * Utility class for interacting with a scene. Supports rotation and zoom around origin. - * @author Michael Bien - */ -public class UserSceneController { - - private Point dragstart; - private RenderingConfig model; - private Renderer view; - - private enum MOUSE_MODE { DRAG_ROTATE, DRAG_ZOOM } - private MOUSE_MODE dragmode = MOUSE_MODE.DRAG_ROTATE; - - - public void init(Renderer view, Component component, RenderingConfig model) { - initMouseListeners(component); - this.view = view; - this.model = model; - } - - private void initMouseListeners(Component component) { - - MouseAdapter mouseAdapter = new MouseAdapter() { - @Override - public void mouseDragged(MouseEvent e) { - - int x = e.getX(); - int y = e.getY(); - - switch (dragmode) { - case DRAG_ROTATE: - if (dragstart != null) { - int height = model.getHeight(); - int width = model.getWidth(); - - int ry = height - y - 1; - int baseMu1 = width - MU_RECT_SIZE - 2; - int baseMu2 = 1; - int baseMu3 = width - MU_RECT_SIZE - 2; - int baseMu4 = MU_RECT_SIZE + 2; - - if ((x >= baseMu1 && x <= baseMu1 + MU_RECT_SIZE) && (ry >= baseMu2 && ry <= baseMu2 + MU_RECT_SIZE)) { - float[] mu = model.getMu(); - mu[0] = 3.f * ( x - baseMu1) / (float)MU_RECT_SIZE - 1.5f; - mu[1] = 3.f * (ry - baseMu2) / (float)MU_RECT_SIZE - 1.5f; - model.setMu(mu); - } else if ((x >= baseMu3 && x <= baseMu3 + MU_RECT_SIZE) && (ry >= baseMu4 && ry <= baseMu4 + MU_RECT_SIZE)) { - float[] mu = model.getMu(); - mu[2] = 3.f * ( x - baseMu3) / (float)MU_RECT_SIZE - 1.5f; - mu[3] = 3.f * (ry - baseMu4) / 
(float)MU_RECT_SIZE - 1.5f; - model.setMu(mu); - } else { - rotateCameraYbyOrig(0.01f * (x - dragstart.getX())); - rotateCameraXbyOrig(0.01f * (y - dragstart.getY())); - } - } - dragstart = e.getPoint(); - view.update(); - break; - case DRAG_ZOOM: - if (dragstart != null) { - float zoom = (float) ((y - dragstart.getY()) / 10.0f); - zoom(zoom); - } - dragstart = e.getPoint(); - view.update(); - break; - } - - } - - @Override - public void mousePressed(MouseEvent e) { - switch (e.getButton()) { - case (MouseEvent.BUTTON1): - dragmode = MOUSE_MODE.DRAG_ROTATE; - break; - case (MouseEvent.BUTTON2): - dragmode = MOUSE_MODE.DRAG_ZOOM; - break; - case (MouseEvent.BUTTON3): - dragmode = MOUSE_MODE.DRAG_ZOOM; - break; - } - } - - @Override - public void mouseReleased(MouseEvent e) { - switch (e.getButton()) { - case (MouseEvent.BUTTON1): - dragmode = MOUSE_MODE.DRAG_ZOOM; - break; - case (MouseEvent.BUTTON2): - dragmode = MOUSE_MODE.DRAG_ROTATE; - break; - case (MouseEvent.BUTTON3): - dragmode = MOUSE_MODE.DRAG_ROTATE; - break; - } - - dragstart = null; - } - - @Override - public void mouseWheelMoved(MouseWheelEvent e) { - float zoom = e.getWheelRotation() * 0.1f; - zoom(zoom); - view.update(); - } - - }; - - KeyAdapter keyAdapter = new KeyAdapter() { - - @Override - public void keyPressed(KeyEvent e) { - - switch (e.getKeyChar()) { - case 'l': - model.setEnableShadow(model.getEnableShadow()==0 ? 
1 : 0); - break; - case '1': - model.setEpsilon(model.getEpsilon() * 0.75f); - break; - case '2': - model.setEpsilon(model.getEpsilon() * 1.f / 0.75f); - break; - case '3': - model.setMaxIterations(max(1, model.getMaxIterations() -1)); - break; - case '4': - model.setMaxIterations(min(12, model.getMaxIterations()+1)); - break; - case '5': - model.setSuperSamplingSize(max(1, model.getSuperSamplingSize() -1)); - break; - case '6': - model.setSuperSamplingSize(min(5, model.getSuperSamplingSize() +1)); - break; - default: - break; - } - view.update(); - - } - - }; - - component.addKeyListener(keyAdapter); - - component.addMouseListener(mouseAdapter); - component.addMouseMotionListener(mouseAdapter); - component.addMouseWheelListener(mouseAdapter); - - } - private void zoom(float zoom) { - Vec orig = model.getCamera().getOrig(); - orig.setX(orig.getX()+zoom) - .setY(orig.getY()+zoom) - .setZ(orig.getZ()+zoom); - } - - private void rotateLightX(float k) { - float[] light = model.getLight(); - float y = light[1]; - float z = light[2]; - light[1] = (float) ( y * cos(k) + z * sin(k)); - light[2] = (float) (-y * sin(k) + z * cos(k)); - model.setLight(light); - } - - private void rotateLightY(float k) { - float[] light = model.getLight(); - float x = light[0]; - float z = light[2]; - light[0] = (float) (x * cos(k) - z * sin(k)); - light[2] = (float) (x * sin(k) + z * cos(k)); - model.setLight(light); - } - - private void rotateCameraXbyOrig(double k) { - Vec orig = model.getCamera().getOrig(); - float y = orig.getY(); - float z = orig.getZ(); - orig.setY((float) ( y * cos(k) + z * sin(k))); - orig.setZ((float) (-y * sin(k) + z * cos(k))); - } - - private void rotateCameraYbyOrig(double k) { - Vec orig = model.getCamera().getOrig(); - float x = orig.getX(); - float z = orig.getZ(); - orig.setX((float) (x * cos(k) - z * sin(k))); - orig.setZ((float) (x * sin(k) + z * cos(k))); - } - - - public final static void vadd(Vec v, Vec a, Vec b) { - v.setX(a.getX() + b.getX()); - 
v.setY(a.getY() + b.getY()); - v.setZ(a.getZ() + b.getZ()); - } - - public final static void vsub(Vec v, Vec a, Vec b) { - v.setX(a.getX() - b.getX()); - v.setY(a.getY() - b.getY()); - v.setZ(a.getZ() - b.getZ()); - } - - public final static void vmul(Vec v, float s, Vec b) { - v.setX(s * b.getX()); - v.setY(s * b.getY()); - v.setZ(s * b.getZ()); - } - - public final static float vdot(Vec a, Vec b) { - return a.getX() * b.getX() - + a.getY() * b.getY() - + a.getZ() * b.getZ(); - } - - public final static void vnorm(Vec v) { - float s = (float) (1.0f / sqrt(vdot(v, v))); - vmul(v, s, v); - } - - public final static void vxcross(Vec v, Vec a, Vec b) { - v.setX(a.getY() * b.getZ() - a.getZ() * b.getY()); - v.setY(a.getZ() * b.getX() - a.getX() * b.getZ()); - v.setZ(a.getX() * b.getY() - a.getY() * b.getX()); - } - - -} \ No newline at end of file diff --git a/src/com/mbien/opencl/demos/julia3d/config.h b/src/com/mbien/opencl/demos/julia3d/config.h deleted file mode 100644 index 72df3ff..0000000 --- a/src/com/mbien/opencl/demos/julia3d/config.h +++ /dev/null @@ -1,24 +0,0 @@ - -typedef struct { - float x, y, z; // position, also color (r,g,b) -} Vec; - -typedef struct { - /* User defined values */ - Vec orig, target; - /* Calculated values */ - Vec dir, x, y; -} Camera; - -typedef struct { - unsigned int width, height; - int superSamplingSize; - int actvateFastRendering; - int enableShadow; - - unsigned int maxIterations; - float epsilon; - float mu[4]; - float light[3]; - Camera camera; -} RenderingConfig; diff --git a/src/com/mbien/opencl/demos/julia3d/mandelbrot_kernel.cl b/src/com/mbien/opencl/demos/julia3d/mandelbrot_kernel.cl deleted file mode 100644 index d5acd02..0000000 --- a/src/com/mbien/opencl/demos/julia3d/mandelbrot_kernel.cl +++ /dev/null @@ -1,357 +0,0 @@ -/* -Copyright (c) 2009 David Bucciarelli (davibu@interfree.it) - -Permission is hereby granted, free of charge, to any person obtaining -a copy of this software and associated documentation files (the 
-"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to -the following conditions: - -The above copyright notice and this permission notice shall be included -in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -#define GPU_KERNEL - - -typedef struct { - float x, y, z; // position, also color (r,g,b) -} Vec; - -typedef struct { - /* User defined values */ - Vec orig, target; - /* Calculated values */ - Vec dir, x, y; -} Camera; - -typedef struct { - unsigned int width, height; - int superSamplingSize; - int actvateFastRendering; - int enableShadow; - - unsigned int maxIterations; - float epsilon; - float mu[4]; - float light[3]; - Camera camera; -} RenderingConfig; - -#define BOUNDING_RADIUS_2 4.f - -// Scalar derivative approach by Enforcer: -// http://www.fractalforums.com/mandelbulb-implementation/realtime-renderingoptimisations/ -static float IterateIntersect(const float4 z0, const float4 c0, const uint maxIterations) { - float4 z = z0; - float4 c = c0; - - float dr = 1.0f; - float r2 = dot(z, z); - float r = sqrt(r2); - for (int n = 0; (n < maxIterations) && (r < 2.f); ++n) { - const float zo0 = asin(z.z / r); - const float zi0 = atan2(z.y, z.x); - float zr = r2 * r2 * r2 * r; - const float zo = zo0 * 7.f; - const float zi = zi0 * 7.f; - const float czo = cos(zo); - - dr = 
zr * dr * 7.f + 1.f; - zr *= r; - - z = zr * (float4)(czo * cos(zi), czo * sin(zi), sin(zo), 0.f); - z += c; - - r2 = dot(z, z); - r = sqrt(r2); - } - - return 0.5f * log(r) * r / dr; -} - -static float IntersectBulb(const float4 eyeRayOrig, const float4 eyeRayDir, - const float4 c, const uint maxIterations, const float epsilon, - const float maxDist, float4 *hitPoint, uint *steps) { - float dist; - float4 r0 = eyeRayOrig; - float distDone = 0.f; - - uint s = 0; - do { - dist = IterateIntersect(r0, c, maxIterations); - distDone += dist; - // We are inside - if (dist <= 0.f) - break; - - r0 += eyeRayDir * dist; - s++; - } while ((dist > epsilon) && (distDone < maxDist)); - - *hitPoint = r0; - *steps = s; - return dist; -} - -#define WORLD_RADIUS 1000.f -#define WORLD_CENTER ((float4)(0.f, -WORLD_RADIUS - 2.f, 0.f, 0.f)) -float IntersectFloorSphere(const float4 eyeRayOrig, const float4 eyeRayDir) { - const float4 op = WORLD_CENTER - eyeRayOrig; - const float b = dot(op, eyeRayDir); - float det = b * b - dot(op, op) + WORLD_RADIUS * WORLD_RADIUS; - - if (det < 0.f) - return -1.f; - else - det = sqrt(det); - - float t = b - det; - if (t > 0.f) - return t; - else { - // We are inside, avoid the hit - return -1.f; - } -} - -int IntersectBoundingSphere(const float4 eyeRayOrig, const float4 eyeRayDir, - float *tmin, float*tmax) { - const float4 op = -eyeRayOrig; - const float b = dot(op, eyeRayDir); - float det = b * b - dot(op, op) + BOUNDING_RADIUS_2; - - if (det < 0.f) - return 0; - else - det = sqrt(det); - - float t1 = b - det; - float t2 = b + det; - if (t1 > 0.f) { - *tmin = t1; - *tmax = t2; - return 1; - } else { - if (t2 > 0.f) { - // We are inside, start from the ray origin - *tmin = 0.f; - *tmax = t2; - - return 1; - } else - return 0; - } -} - -static float4 NormEstimate(const float4 p, const float4 c, - const float delta, const uint maxIterations) { - const float4 qP = p; - const float4 gx1 = qP - (float4)(delta, 0.f, 0.f, 0.f); - const float4 gx2 = qP + 
(float4)(delta, 0.f, 0.f, 0.f); - const float4 gy1 = qP - (float4)(0.f, delta, 0.f, 0.f); - const float4 gy2 = qP + (float4)(0.f, delta, 0.f, 0.f); - const float4 gz1 = qP - (float4)(0.f, 0.f, delta, 0.f); - const float4 gz2 = qP + (float4)(0.f, 0.f, delta, 0.f); - - const float gradX = length(IterateIntersect(gx2, c, maxIterations)) - - length(IterateIntersect(gx1, c, maxIterations)); - const float gradY = length(IterateIntersect(gy2, c, maxIterations)) - - length(IterateIntersect(gy1, c, maxIterations)); - const float gradZ = length(IterateIntersect(gz2, c, maxIterations)) - - length(IterateIntersect(gz1, c, maxIterations)); - - const float4 N = normalize((float4)(gradX, gradY, gradZ, 0.f)); - - return N; -} - -static float4 Phong(const float4 light, const float4 eye, const float4 pt, - const float4 N, const float4 diffuse) { - const float4 ambient = (float4) (0.05f, 0.05f, 0.05f, 0.f); - float4 L = normalize(light - pt); - float NdotL = dot(N, L); - if (NdotL < 0.f) - return diffuse * ambient; - - const float specularExponent = 30.f; - const float specularity = 0.65f; - - float4 E = normalize(eye - pt); - float4 H = (L + E) * (float)0.5f; - - return diffuse * NdotL + - specularity * pow(dot(N, H), specularExponent) + - diffuse * ambient; -} - -__kernel void MandelbulbGPU( - __global float *pixels, - const __global RenderingConfig *config, - const int enableAccumulation, - const float sampleX, - const float sampleY) { - const int gid = get_global_id(0); - const unsigned width = config->width; - const unsigned height = config->height; - - const unsigned int x = gid % width; - const int y = gid / width; - - // Check if we have to do something - if (y >= height) - return; - - const float epsilon = config->actvateFastRendering ? (config->epsilon * (1.5f / 0.75f)) : config->epsilon; - const uint maxIterations = config->actvateFastRendering ? 
(max(3u, config->maxIterations) - 2u) : config->maxIterations; - - const float4 mu = (float4)(config->mu[0], config->mu[1], config->mu[2], config->mu[3]); - const float4 light = (float4)(config->light[0], config->light[1], config->light[2], 0.f); - const __global Camera *camera = &config->camera; - - //-------------------------------------------------------------------------- - // Calculate eye ray - //-------------------------------------------------------------------------- - - const float invWidth = 1.f / width; - const float invHeight = 1.f / height; - const float kcx = (x + sampleX) * invWidth - .5f; - const float4 kcx4 = (float4)kcx; - const float kcy = (y + sampleY) * invHeight - .5f; - const float4 kcy4 = (float4)kcy; - - const float4 cameraX = (float4)(camera->x.x, camera->x.y, camera->x.z, 0.f); - const float4 cameraY = (float4)(camera->y.x, camera->y.y, camera->y.z, 0.f); - const float4 cameraDir = (float4)(camera->dir.x, camera->dir.y, camera->dir.z, 0.f); - const float4 cameraOrig = (float4)(camera->orig.x, camera->orig.y, camera->orig.z, 0.f); - - const float4 eyeRayDir = normalize(cameraX * kcx4 + cameraY * kcy4 + cameraDir); - const float4 eyeRayOrig = eyeRayDir * (float4)0.1f + cameraOrig; - - //-------------------------------------------------------------------------- - // Check if we hit the bounding sphere - //-------------------------------------------------------------------------- - - int useAO = 1; - float4 diffuse, n, color; - - float4 hitPoint; - float dist, tmin, tmax; - if (IntersectBoundingSphere(eyeRayOrig, eyeRayDir, &tmin, &tmax)) { - //-------------------------------------------------------------------------- - // Find the intersection with the set - //-------------------------------------------------------------------------- - - uint steps; - float4 rayOrig = eyeRayOrig + eyeRayDir * (float4)tmin; - dist = IntersectBulb(rayOrig, eyeRayDir, mu, maxIterations, - epsilon, tmax - tmin, &hitPoint, &steps); - - if (dist <= epsilon) { - 
// Set hit - diffuse = (float4) (1.f, 0.35f, 0.15f, 0.f); - n = NormEstimate(hitPoint, mu, dist, maxIterations); - } else - dist = -1.f; - } else - dist = -1.f; - - //-------------------------------------------------------------------------- - // Check if we hit the floor - //-------------------------------------------------------------------------- - - if (dist < 0.f) { - dist = IntersectFloorSphere(eyeRayOrig, eyeRayDir); - - if (dist >= 0.f) { - // Floor hit - hitPoint = eyeRayOrig + eyeRayDir * (float4)dist; - n = hitPoint - WORLD_CENTER; - n = normalize(n); - // The most important feature in a ray tracer: a checker texture ! - const int ix = (hitPoint.x > 0.f) ? hitPoint.x : (1.f - hitPoint.x); - const int iz = (hitPoint.z > 0.f) ? hitPoint.z : (1.f - hitPoint.z); - if ((ix + iz) % 2) - diffuse = (float4) (0.75f, 0.75f, 0.75f, 0.f); - else - diffuse = (float4) (0.75f, 0.f, 0.f, 0.f); - useAO = 0; - } else { - // Sky hit - color = (float4)(0.f, 0.1f, 0.3f, 0.f); - } - } else { - // Sky hit - color = (float4)(0.f, 0.1f, 0.3f, 0.f); - } - - //-------------------------------------------------------------------------- - // Select the shadow pass - //-------------------------------------------------------------------------- - - if (dist >= 0.f) { - float shadowFactor = 1.f; - if (config->enableShadow) { - float4 L = normalize(light - hitPoint); - float4 rO = hitPoint + n * 1e-2f; - float4 shadowHitPoint; - - // Check bounding sphere - if (IntersectBoundingSphere(rO, L, &tmin, &tmax)) { - float shadowDistSet = tmin; - uint steps; - - rO = rO + L * (float4)shadowDistSet; - shadowDistSet = IntersectBulb(rO, L, mu, maxIterations, epsilon, - tmax - tmin, &shadowHitPoint, &steps); - if (shadowDistSet < epsilon) { - if (useAO) { - // Use steps count to simulate ambient occlusion - shadowFactor = 0.6f - min(steps / 255.f, 0.5f); - } else - shadowFactor = 0.6f; - } - } - } - - //-------------------------------------------------------------------------- - // Direct lighting 
of hit point - //-------------------------------------------------------------------------- - - color = Phong(light, eyeRayOrig, hitPoint, n, diffuse) * shadowFactor; - } - - //-------------------------------------------------------------------------- - // Write pixel - //-------------------------------------------------------------------------- - - int offset = 3 * (x + y * width); - color = clamp(color, (float4)(0.f, 0.f ,0.f, 0.f), (float4)(1.f, 1.f ,1.f, 0.f)); - if (enableAccumulation) { - pixels[offset++] += color.s0; - pixels[offset++] += color.s1; - pixels[offset] += color.s2; - } else { - pixels[offset++] = color.s0; - pixels[offset++] = color.s1; - pixels[offset] = color.s2; - } -} - -kernel void multiply(global float *array, const int numElements, const float s) { - const int gid = get_global_id(0); - if (gid >= numElements) { - return; - } - array[gid] *= s; -} diff --git a/src/com/mbien/opencl/demos/julia3d/rendering_kernel.cl b/src/com/mbien/opencl/demos/julia3d/rendering_kernel.cl deleted file mode 100644 index 9c25c1b..0000000 --- a/src/com/mbien/opencl/demos/julia3d/rendering_kernel.cl +++ /dev/null @@ -1,382 +0,0 @@ -/* -Copyright (c) 2009 David Bucciarelli (davibu@interfree.it) - -Permission is hereby granted, free of charge, to any person obtaining -a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to -the following conditions: - -The above copyright notice and this permission notice shall be included -in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
-IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -#define GPU_KERNEL - - -#define BOUNDING_RADIUS_2 4.f -#define ESCAPE_THRESHOLD 1e1f -#define DELTA 1e-4f - -typedef struct { - float x, y, z; // position, also color (r,g,b) -} Vec; - -typedef struct { - Vec orig, target; - Vec dir, x, y; -} Camera; - -typedef struct { - unsigned int width, height; - int superSamplingSize; - int actvateFastRendering; - int enableShadow; - - unsigned int maxIterations; - float epsilon; - float mu[4]; - float light[3]; - Camera camera; -} RenderingConfig; - - -static float4 QuatMult(const float4 q1, const float4 q2) { - float4 r; - - // a1a2 - b1b2 - c1c2 - d1d2 - r.x = q1.x * q2.x - q1.y * q2.y - q1.z * q2.z - q1.w * q2.w; - // a1b2 + b1a2 + c1d2 - d1c2 - r.y = q1.x * q2.y + q1.y * q2.x + q1.z * q2.w - q1.w * q2.z; - // a1c2 - b1d2 + c1a2 + d1b2 - r.z = q1.x * q2.z - q1.y * q2.w + q1.z * q2.x + q1.w * q2.y; - // a1d2 + b1c2 - c1b2 + d1a2 - r.w = q1.x * q2.w + q1.y * q2.z - q1.z * q2.y + q1.w * q2.x; - - return r; -} - -static float4 QuatSqr(const float4 q) { - float4 r; - - r.x = q.x * q.x - q.y * q.y - q.z * q.z - q.w * q.w; - r.y = 2.f * q.x * q.y; - r.z = 2.f * q.x * q.z; - r.w = 2.f * q.x * q.w; - - return r; -} - -static void IterateIntersect(float4 *q, float4 *qp, const float4 c, const uint maxIterations) { - float4 q0 = *q; - float4 qp0 = *qp; - - for (uint i = 0; i < maxIterations; ++i) { - qp0 = 2.f * QuatMult(q0, qp0); - q0 = QuatSqr(q0) + c; - - if (dot(q0, q0) > ESCAPE_THRESHOLD) - break; - } - - *q = q0; - *qp = qp0; -} - -static float IntersectJulia(const float4 eyeRayOrig, const float4 eyeRayDir, - const float4 c, const uint maxIterations, const float epsilon, - float4 *hitPoint, uint *steps) { - float dist; - float4 r0 = eyeRayOrig; - - uint s = 0; - 
do { - float4 z = r0; - float4 zp = (float4) (1.f, 0.f, 0.f, 0.f); - - IterateIntersect(&z, &zp, c, maxIterations); - - const float normZP = length(zp); - - // We are inside - if (normZP == 0.f) - break; - - const float normZ = length(z); - dist = 0.5f * normZ * log(normZ) / normZP; - - r0 += eyeRayDir * dist; - s++; - } while ((dist > epsilon) && (dot(r0, r0) < BOUNDING_RADIUS_2)); - - *hitPoint = r0; - *steps = s; - return dist; -} - -#define WORLD_RADIUS 1000.f -#define WORLD_CENTER ((float4)(0.f, -WORLD_RADIUS - 2.f, 0.f, 0.f)) - -float IntersectFloorSphere(const float4 eyeRayOrig, const float4 eyeRayDir) { - const float4 op = WORLD_CENTER - eyeRayOrig; - const float b = dot(op, eyeRayDir); - float det = b * b - dot(op, op) + WORLD_RADIUS * WORLD_RADIUS; - - if (det < 0.f) - return -1.f; - else - det = sqrt(det); - - float t = b - det; - if (t > 0.f) - return t; - else { - // We are inside, avoid the hit - return -1.f; - } -} - -float IntersectBoundingSphere(const float4 eyeRayOrig, const float4 eyeRayDir) { - const float4 op = -eyeRayOrig; - const float b = dot(op, eyeRayDir); - float det = b * b - dot(op, op) + BOUNDING_RADIUS_2; - - if (det < 0.f) - return -1.f; - else - det = sqrt(det); - - float t = b - det; - if (t > 0.f) - return t; - else { - t = b + det; - - if (t > 0.f) { - // We are inside, start from the ray origin - return 0.0f; - } else - return -1.f; - } -} - -static float4 NormEstimate(const float4 p, const float4 c, - const float delta, const uint maxIterations) { - float4 N; - float4 qP = p; - float gradX, gradY, gradZ; - - float4 gx1 = qP - (float4) (DELTA, 0.f, 0.f, 0.f); - float4 gx2 = qP + (float4) (DELTA, 0.f, 0.f, 0.f); - float4 gy1 = qP - (float4) (0.f, DELTA, 0.f, 0.f); - float4 gy2 = qP + (float4) (0.f, DELTA, 0.f, 0.f); - float4 gz1 = qP - (float4) (0.f, 0.f, DELTA, 0.f); - float4 gz2 = qP + (float4) (0.f, 0.f, DELTA, 0.f); - - for (uint i = 0; i < maxIterations; ++i) { - gx1 = QuatSqr(gx1) + c; - gx2 = QuatSqr(gx2) + c; - gy1 = 
QuatSqr(gy1) + c; - gy2 = QuatSqr(gy2) + c; - gz1 = QuatSqr(gz1) + c; - gz2 = QuatSqr(gz2) + c; - } - - gradX = length(gx2) - length(gx1); - gradY = length(gy2) - length(gy1); - gradZ = length(gz2) - length(gz1); - - N = normalize((float4) (gradX, gradY, gradZ, 0.f)); - - return N; -} - -static float4 Phong(const float4 light, const float4 eye, const float4 pt, const float4 N, const float4 diffuse) { - - const float4 ambient = (float4) (0.05f, 0.05f, 0.05f, 0.f); - float4 L = normalize(light - pt); - float NdotL = dot(N, L); - if (NdotL < 0.f) - return diffuse * ambient; - - const float specularExponent = 30.f; - const float specularity = 0.65f; - - float4 E = normalize(eye - pt); - float4 H = (L + E) * (float) 0.5f; - - return diffuse * NdotL + - specularity * pow(dot(N, H), specularExponent) + - diffuse * ambient; -} - -kernel void JuliaGPU( global float *pixels, - const global RenderingConfig *config, - int enableAccumulation, - float sampleX, - float sampleY ) { - - const int gid = get_global_id(0); - unsigned width = config->width; - unsigned height = config->height; - - const unsigned int x = gid % width; - const int y = gid / width; - - // Check if we have to do something - if (y >= height) - return; - - const float epsilon = config->actvateFastRendering ? (config->epsilon * (1.f / 0.75f)) : config->epsilon; - const uint maxIterations = max(1u, config->actvateFastRendering ? 
(config->maxIterations - 1) : config->maxIterations); - - const float4 mu = (float4)(config->mu[0], config->mu[1], config->mu[2], config->mu[3]); - const float4 light = (float4) (config->light[0], config->light[1], config->light[2], 0.f); - const global Camera *camera = &config->camera; - - //-------------------------------------------------------------------------- - // Calculate eye ray - //-------------------------------------------------------------------------- - - const float invWidth = 1.f / width; - const float invHeight = 1.f / height; - const float kcx = (x + sampleX) * invWidth - .5f; - const float4 kcx4 = (float4) kcx; - const float kcy = (y + sampleY) * invHeight - .5f; - const float4 kcy4 = (float4) kcy; - - const float4 cameraX = (float4) (camera->x.x, camera->x.y, camera->x.z, 0.f); - const float4 cameraY = (float4) (camera->y.x, camera->y.y, camera->y.z, 0.f); - const float4 cameraDir = (float4) (camera->dir.x, camera->dir.y, camera->dir.z, 0.f); - const float4 cameraOrig = (float4) (camera->orig.x, camera->orig.y, camera->orig.z, 0.f); - - const float4 eyeRayDir = normalize(cameraX * kcx4 + cameraY * kcy4 + cameraDir); - const float4 eyeRayOrig = eyeRayDir * (float4) 0.1f + cameraOrig; - - //-------------------------------------------------------------------------- - // Check if we hit the bounding sphere - //-------------------------------------------------------------------------- - - float distSet = IntersectBoundingSphere(eyeRayOrig, eyeRayDir); - float4 hitPoint; - if (distSet >= 0.f) { - //-------------------------------------------------------------------------- - // Find the intersection with the set - //-------------------------------------------------------------------------- - - uint steps; - float4 rayOrig = eyeRayOrig + eyeRayDir * (float4) distSet; - distSet = IntersectJulia(rayOrig, eyeRayDir, mu, maxIterations, - epsilon, &hitPoint, &steps); - if (distSet > epsilon) - distSet = -1.f; - } - - 
//-------------------------------------------------------------------------- - // Check if we hit the floor - //-------------------------------------------------------------------------- - - float distFloor = IntersectFloorSphere(eyeRayOrig, eyeRayDir); - - //-------------------------------------------------------------------------- - // Select the hit point - //-------------------------------------------------------------------------- - - int doShade = 0; - int useAO = 1; - float4 diffuse, n, color; - if ((distSet < 0.f) && (distFloor < 0.f)) { - // Sky hit - color = (float4) (0.f, 0.1f, 0.3f, 0.f); - } else if ((distSet >= 0.f) && ((distFloor < 0.f) || (distSet <= distFloor))) { - // Set hit - diffuse = (float4) (1.f, 0.35f, 0.15f, 0.f); - n = NormEstimate(hitPoint, mu, distSet, maxIterations); - doShade = 1; - } else if ((distFloor >= 0.f) && ((distSet < 0.f) || (distFloor <= distSet))) { - // Floor hit - hitPoint = eyeRayOrig + eyeRayDir * (float4) distFloor; - n = hitPoint - WORLD_CENTER; - n = normalize(n); - // The most important feature in a ray tracer: a checker texture ! - const int ix = (hitPoint.x > 0.f) ? hitPoint.x : (1.f - hitPoint.x); - const int iz = (hitPoint.z > 0.f) ? 
hitPoint.z : (1.f - hitPoint.z); - if ((ix + iz) % 2) - diffuse = (float4) (0.75f, 0.75f, 0.75f, 0.f); - else - diffuse = (float4) (0.75f, 0.f, 0.f, 0.f); - doShade = 1; - useAO = 0; - } - - //-------------------------------------------------------------------------- - // Select the shadow pass - //-------------------------------------------------------------------------- - - if (doShade) { - float shadowFactor = 1.f; - if (config->enableShadow) { - float4 L = normalize(light - hitPoint); - float4 rO = hitPoint + n * 1e-2f; - float4 shadowHitPoint; - - // Check bounding sphere - float shadowDistSet = IntersectBoundingSphere(rO, L); - if (shadowDistSet >= 0.f) { - uint steps; - - rO = rO + L * (float4) shadowDistSet; - shadowDistSet = IntersectJulia(rO, L, mu, maxIterations, epsilon, - &shadowHitPoint, &steps); - if (shadowDistSet < epsilon) { - if (useAO) { - // Use steps count to simulate ambient occlusion - shadowFactor = 0.6f - min(steps / 255.f, 0.5f); - } else - shadowFactor = 0.6f; - } - } else - shadowDistSet = -1.f; - } - - //-------------------------------------------------------------------------- - // Direct lighting of hit point - //-------------------------------------------------------------------------- - - color = Phong(light, eyeRayOrig, hitPoint, n, diffuse) * shadowFactor; - } - - //-------------------------------------------------------------------------- - // Write pixel - //-------------------------------------------------------------------------- - - int offset = 3 * (x + y * width); - color = clamp(color, (float4) (0.f, 0.f, 0.f, 0.f), (float4) (1.f, 1.f, 1.f, 0.f)); - if (enableAccumulation) { - pixels[offset++] += color.s0; - pixels[offset++] += color.s1; - pixels[offset ] += color.s2; - } else { - pixels[offset++] = color.s0; - pixels[offset++] = color.s1; - pixels[offset ] = color.s2; - } -} - -kernel void multiply(global float *array, const int numElements, const float s) { - const int gid = get_global_id(0); - if (gid >= numElements) { 
- return; - } - array[gid] *= s; -} diff --git a/src/com/mbien/opencl/demos/julia3d/structs/Camera.java b/src/com/mbien/opencl/demos/julia3d/structs/Camera.java deleted file mode 100644 index 50d5868..0000000 --- a/src/com/mbien/opencl/demos/julia3d/structs/Camera.java +++ /dev/null @@ -1,52 +0,0 @@ -/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */ - - -package com.mbien.opencl.demos.julia3d.structs; - -import java.nio.*; - -import com.jogamp.common.nio.*; - - -public abstract class Camera { - - StructAccessor accessor; - - public static int size() { -// if (CPU.is32Bit()) { -// return Camera32.size(); -// } else { - return Camera64.size(); -// } - } - - public static Camera create() { - return create(Buffers.newDirectByteBuffer(size())); - } - - public static Camera create(java.nio.ByteBuffer buf) { -// if (CPU.is32Bit()) { -// return new Camera32(buf); -// } else { - return new Camera64(buf); -// } - } - - Camera(java.nio.ByteBuffer buf) { - accessor = new StructAccessor(buf); - } - - public java.nio.ByteBuffer getBuffer() { - return accessor.getBuffer(); - } - - public abstract Vec getOrig(); - - public abstract Vec getTarget(); - - public abstract Vec getDir(); - - public abstract Vec getX(); - - public abstract Vec getY(); -} diff --git a/src/com/mbien/opencl/demos/julia3d/structs/Camera32.java b/src/com/mbien/opencl/demos/julia3d/structs/Camera32.java deleted file mode 100644 index 85dea51..0000000 --- a/src/com/mbien/opencl/demos/julia3d/structs/Camera32.java +++ /dev/null @@ -1,41 +0,0 @@ -/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! 
*/ - - -package com.mbien.opencl.demos.julia3d.structs; - -import java.nio.*; - -import com.jogamp.gluegen.runtime.*; - - -class Camera32 extends Camera { - - public static int size() { - return 76; - } - - Camera32(java.nio.ByteBuffer buf) { - super(buf); - } - - - public Vec getOrig() { - return Vec.create(accessor.slice(0, 12)); - } - - public Vec getTarget() { - return Vec.create(accessor.slice(16, 12)); - } - - public Vec getDir() { - return Vec.create(accessor.slice(32, 12)); - } - - public Vec getX() { - return Vec.create(accessor.slice(48, 12)); - } - - public Vec getY() { - return Vec.create(accessor.slice(64, 12)); - } -} diff --git a/src/com/mbien/opencl/demos/julia3d/structs/Camera64.java b/src/com/mbien/opencl/demos/julia3d/structs/Camera64.java deleted file mode 100644 index afe4acd..0000000 --- a/src/com/mbien/opencl/demos/julia3d/structs/Camera64.java +++ /dev/null @@ -1,52 +0,0 @@ -/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! 
*/ - - -package com.mbien.opencl.demos.julia3d.structs; - -import java.nio.*; - -import com.jogamp.gluegen.runtime.*; - - -class Camera64 extends Camera { - - private final Vec orig; - private final Vec target; - private final Vec dir; - private final Vec x; - private final Vec y; - - public static int size() { - return 60; - } - - Camera64(java.nio.ByteBuffer buf) { - super(buf); - orig = Vec.create(accessor.slice(0, 12)); - target = Vec.create(accessor.slice(12, 12)); - dir = Vec.create(accessor.slice(24, 12)); - x = Vec.create(accessor.slice(36, 12)); - y = Vec.create(accessor.slice(48, 12)); - } - - - public Vec getOrig() { - return orig; - } - - public Vec getTarget() { - return target; - } - - public Vec getDir() { - return dir; - } - - public Vec getX() { - return x; - } - - public Vec getY() { - return y; - } -} diff --git a/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig.java b/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig.java deleted file mode 100644 index 5bc65b5..0000000 --- a/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig.java +++ /dev/null @@ -1,80 +0,0 @@ -/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! 
*/ - - -package com.mbien.opencl.demos.julia3d.structs; - -import java.nio.*; - -import com.jogamp.common.nio.*; - - -public abstract class RenderingConfig { - - StructAccessor accessor; - - public static int size() { -// if (CPU.is32Bit()) { -// return RenderingConfig32.size(); -// } else { - return RenderingConfig64.size(); -// } - } - - public static RenderingConfig create() { - return create(Buffers.newDirectByteBuffer(size())); - } - - public static RenderingConfig create(java.nio.ByteBuffer buf) { -// if (CPU.is32Bit()) { -// return new RenderingConfig32(buf); -// } else { - return new RenderingConfig64(buf); -// } - } - - RenderingConfig(java.nio.ByteBuffer buf) { - accessor = new StructAccessor(buf); - } - - public java.nio.ByteBuffer getBuffer() { - return accessor.getBuffer(); - } - - public abstract RenderingConfig setWidth(int val); - - public abstract int getWidth(); - - public abstract RenderingConfig setHeight(int val); - - public abstract int getHeight(); - - public abstract RenderingConfig setSuperSamplingSize(int val); - - public abstract int getSuperSamplingSize(); - - public abstract RenderingConfig setActvateFastRendering(int val); - - public abstract int getActvateFastRendering(); - - public abstract RenderingConfig setEnableShadow(int val); - - public abstract int getEnableShadow(); - - public abstract RenderingConfig setMaxIterations(int val); - - public abstract int getMaxIterations(); - - public abstract RenderingConfig setEpsilon(float val); - - public abstract float getEpsilon(); - - public abstract RenderingConfig setMu(float[] val); - - public abstract float[] getMu(); - - public abstract RenderingConfig setLight(float[] val); - - public abstract float[] getLight(); - - public abstract Camera getCamera(); -} diff --git a/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig32.java b/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig32.java deleted file mode 100644 index 3898d60..0000000 --- 
a/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig32.java +++ /dev/null @@ -1,106 +0,0 @@ -/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */ - - -package com.mbien.opencl.demos.julia3d.structs; - -import java.nio.*; - -import com.jogamp.gluegen.runtime.*; - - -class RenderingConfig32 extends RenderingConfig { - - public static int size() { - return 140; - } - - RenderingConfig32(java.nio.ByteBuffer buf) { - super(buf); - } - - - public RenderingConfig setWidth(int val) { - accessor.setIntAt(0, val); - return this; - } - - public int getWidth() { - return accessor.getIntAt(0); - } - - public RenderingConfig setHeight(int val) { - accessor.setIntAt(1, val); - return this; - } - - public int getHeight() { - return accessor.getIntAt(1); - } - - public RenderingConfig setSuperSamplingSize(int val) { - accessor.setIntAt(2, val); - return this; - } - - public int getSuperSamplingSize() { - return accessor.getIntAt(2); - } - - public RenderingConfig setActvateFastRendering(int val) { - accessor.setIntAt(3, val); - return this; - } - - public int getActvateFastRendering() { - return accessor.getIntAt(3); - } - - public RenderingConfig setEnableShadow(int val) { - accessor.setIntAt(4, val); - return this; - } - - public int getEnableShadow() { - return accessor.getIntAt(4); - } - - public RenderingConfig setMaxIterations(int val) { - accessor.setIntAt(5, val); - return this; - } - - public int getMaxIterations() { - return accessor.getIntAt(5); - } - - public RenderingConfig setEpsilon(float val) { - accessor.setFloatAt(6, val); - return this; - } - - public float getEpsilon() { - return accessor.getFloatAt(6); - } - - public RenderingConfig setMu(float[] val) { - accessor.setFloatsAt(8, val); - return this; - } - - public float[] getMu() { - return accessor.getFloatsAt(8, new float[4]); - } - - public RenderingConfig setLight(float[] val) { - accessor.setFloatsAt(12, val); - return this; - } 
- - public float[] getLight() { - return accessor.getFloatsAt(12, new float[3]); - } - - public Camera getCamera() { - return Camera.create(accessor.slice(64, 76)); - } -} diff --git a/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig64.java b/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig64.java deleted file mode 100644 index e3a0267..0000000 --- a/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig64.java +++ /dev/null @@ -1,109 +0,0 @@ -/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */ - - -package com.mbien.opencl.demos.julia3d.structs; - -import java.nio.*; - -import com.jogamp.gluegen.runtime.*; - - -class RenderingConfig64 extends RenderingConfig { - - private final Camera camera; - - public static int size() { - return 116; - } - - RenderingConfig64(java.nio.ByteBuffer buf) { - super(buf); - camera = Camera.create(accessor.slice(56, 60)); - } - - - public RenderingConfig setWidth(int val) { - accessor.setIntAt(0, val); - return this; - } - - public int getWidth() { - return accessor.getIntAt(0); - } - - public RenderingConfig setHeight(int val) { - accessor.setIntAt(1, val); - return this; - } - - public int getHeight() { - return accessor.getIntAt(1); - } - - public RenderingConfig setSuperSamplingSize(int val) { - accessor.setIntAt(2, val); - return this; - } - - public int getSuperSamplingSize() { - return accessor.getIntAt(2); - } - - public RenderingConfig setActvateFastRendering(int val) { - accessor.setIntAt(3, val); - return this; - } - - public int getActvateFastRendering() { - return accessor.getIntAt(3); - } - - public RenderingConfig setEnableShadow(int val) { - accessor.setIntAt(4, val); - return this; - } - - public int getEnableShadow() { - return accessor.getIntAt(4); - } - - public RenderingConfig setMaxIterations(int val) { - accessor.setIntAt(5, val); - return this; - } - - public int getMaxIterations() { - return accessor.getIntAt(5); - } 
- - public RenderingConfig setEpsilon(float val) { - accessor.setFloatAt(6, val); - return this; - } - - public float getEpsilon() { - return accessor.getFloatAt(6); - } - - public RenderingConfig setMu(float[] val) { - accessor.setFloatsAt(7, val); - return this; - } - - public float[] getMu() { - return accessor.getFloatsAt(7, new float[4]); - } - - public RenderingConfig setLight(float[] val) { - accessor.setFloatsAt(11, val); - return this; - } - - public float[] getLight() { - return accessor.getFloatsAt(11, new float[3]); - } - - public Camera getCamera() { - return camera; - } -} diff --git a/src/com/mbien/opencl/demos/julia3d/structs/Vec.java b/src/com/mbien/opencl/demos/julia3d/structs/Vec.java deleted file mode 100644 index 3bd6b18..0000000 --- a/src/com/mbien/opencl/demos/julia3d/structs/Vec.java +++ /dev/null @@ -1,53 +0,0 @@ -/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */ - - -package com.mbien.opencl.demos.julia3d.structs; - - -import com.jogamp.common.nio.*; - - -public abstract class Vec { - - StructAccessor accessor; - - public static int size() { -// if (CPU.is32Bit()) { -// return Vec32.size(); -// } else { - return Vec64.size(); -// } - } - - public static Vec create() { - return create(Buffers.newDirectByteBuffer(size())); - } - - public static Vec create(java.nio.ByteBuffer buf) { -// if (CPU.is32Bit()) { -// return new Vec32(buf); -// } else { - return new Vec64(buf); -// } - } - - Vec(java.nio.ByteBuffer buf) { - accessor = new StructAccessor(buf); - } - - public java.nio.ByteBuffer getBuffer() { - return accessor.getBuffer(); - } - - public abstract Vec setX(float val); - - public abstract float getX(); - - public abstract Vec setY(float val); - - public abstract float getY(); - - public abstract Vec setZ(float val); - - public abstract float getZ(); -} diff --git a/src/com/mbien/opencl/demos/julia3d/structs/Vec32.java 
b/src/com/mbien/opencl/demos/julia3d/structs/Vec32.java deleted file mode 100644 index 3c0ee5d..0000000 --- a/src/com/mbien/opencl/demos/julia3d/structs/Vec32.java +++ /dev/null @@ -1,48 +0,0 @@ -/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */ - - -package com.mbien.opencl.demos.julia3d.structs; - -import java.nio.*; - -import com.jogamp.gluegen.runtime.*; - - -class Vec32 extends Vec { - - public static int size() { - return 12; - } - - Vec32(java.nio.ByteBuffer buf) { - super(buf); - } - - - public Vec setX(float val) { - accessor.setFloatAt(0, val); - return this; - } - - public float getX() { - return accessor.getFloatAt(0); - } - - public Vec setY(float val) { - accessor.setFloatAt(1, val); - return this; - } - - public float getY() { - return accessor.getFloatAt(1); - } - - public Vec setZ(float val) { - accessor.setFloatAt(2, val); - return this; - } - - public float getZ() { - return accessor.getFloatAt(2); - } -} diff --git a/src/com/mbien/opencl/demos/julia3d/structs/Vec64.java b/src/com/mbien/opencl/demos/julia3d/structs/Vec64.java deleted file mode 100644 index 5952021..0000000 --- a/src/com/mbien/opencl/demos/julia3d/structs/Vec64.java +++ /dev/null @@ -1,48 +0,0 @@ -/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! 
*/ - - -package com.mbien.opencl.demos.julia3d.structs; - -import java.nio.*; - -import com.jogamp.gluegen.runtime.*; - - -class Vec64 extends Vec { - - public static int size() { - return 12; - } - - Vec64(java.nio.ByteBuffer buf) { - super(buf); - } - - - public Vec setX(float val) { - accessor.setFloatAt(0, val); - return this; - } - - public float getX() { - return accessor.getFloatAt(0); - } - - public Vec setY(float val) { - accessor.setFloatAt(1, val); - return this; - } - - public float getY() { - return accessor.getFloatAt(1); - } - - public Vec setZ(float val) { - accessor.setFloatAt(2, val); - return this; - } - - public float getZ() { - return accessor.getFloatAt(2); - } -} diff --git a/src/com/mbien/opencl/demos/radixsort/RadixSort.cl b/src/com/mbien/opencl/demos/radixsort/RadixSort.cl deleted file mode 100644 index d014692..0000000 --- a/src/com/mbien/opencl/demos/radixsort/RadixSort.cl +++ /dev/null @@ -1,358 +0,0 @@ -/* -* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. -* -* NVIDIA Corporation and its licensors retain all intellectual property and -* proprietary rights in and to this software and related documentation. -* Any use, reproduction, disclosure, or distribution of this software -* and related documentation without an express license agreement from -* NVIDIA Corporation is strictly prohibited. -* -* Please refer to the applicable NVIDIA end user license agreement (EULA) -* associated with this source code for terms and conditions that govern -* your use of this NVIDIA software. -* -*/ - -//---------------------------------------------------------------------------- -// Scans each warp in parallel ("warp-scan"), one element per thread. 
-// uses 2 numElements of shared memory per thread (64 = elements per warp) -//---------------------------------------------------------------------------- -//#define WARP_SIZE 32 -uint scanwarp(uint val, __local uint* sData, int maxlevel) -{ - // The following is the same as 2 * RadixSort::WARP_SIZE * warpId + threadInWarp = - // 64*(threadIdx.x >> 5) + (threadIdx.x & (RadixSort::WARP_SIZE - 1)) - int localId = get_local_id(0); - int idx = 2 * localId - (localId & (WARP_SIZE - 1)); - sData[idx] = 0; - idx += WARP_SIZE; - sData[idx] = val; - - if (0 <= maxlevel) { sData[idx] += sData[idx - 1]; } - if (1 <= maxlevel) { sData[idx] += sData[idx - 2]; } - if (2 <= maxlevel) { sData[idx] += sData[idx - 4]; } - if (3 <= maxlevel) { sData[idx] += sData[idx - 8]; } - if (4 <= maxlevel) { sData[idx] += sData[idx -16]; } - - return sData[idx] - val; // convert inclusive -> exclusive -} - -//---------------------------------------------------------------------------- -// scan4 scans 4*RadixSort::CTA_SIZE numElements in a block (4 per thread), using -// a warp-scan algorithm -//---------------------------------------------------------------------------- -uint4 scan4(uint4 idata, __local uint* ptr) -{ - - uint idx = get_local_id(0); - - uint4 val4 = idata; - uint sum[3]; - sum[0] = val4.x; - sum[1] = val4.y + sum[0]; - sum[2] = val4.z + sum[1]; - - uint val = val4.w + sum[2]; - - val = scanwarp(val, ptr, 4); - barrier(CLK_LOCAL_MEM_FENCE); - - if ((idx & (WARP_SIZE - 1)) == WARP_SIZE - 1) - { - ptr[idx >> 5] = val + val4.w + sum[2]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if (idx < WARP_SIZE) - ptr[idx] = scanwarp(ptr[idx], ptr, 2); - - barrier(CLK_LOCAL_MEM_FENCE); - - val += ptr[idx >> 5]; - - val4.x = val; - val4.y = val + sum[0]; - val4.z = val + sum[1]; - val4.w = val + sum[2]; - - return val4; -} - -#ifdef MAC -__kernel uint4 rank4(uint4 preds, __local uint* sMem) -#else -uint4 rank4(uint4 preds, __local uint* sMem) -#endif -{ - int localId = get_local_id(0); - int 
localSize = get_local_size(0); - - uint4 address = scan4(preds, sMem); - - __local uint numtrue; - if (localId == localSize - 1) - { - numtrue = address.w + preds.w; - } - barrier(CLK_LOCAL_MEM_FENCE); - - uint4 rank; - int idx = localId*4; - rank.x = (preds.x) ? address.x : numtrue + idx - address.x; - rank.y = (preds.y) ? address.y : numtrue + idx + 1 - address.y; - rank.z = (preds.z) ? address.z : numtrue + idx + 2 - address.z; - rank.w = (preds.w) ? address.w : numtrue + idx + 3 - address.w; - - return rank; -} - -void radixSortBlockKeysOnly(uint4 *key, uint nbits, uint startbit, __local uint* sMem) -{ - int localId = get_local_id(0); - int localSize = get_local_size(0); - - for(uint shift = startbit; shift < (startbit + nbits); ++shift) - { - uint4 lsb; - lsb.x = !(((*key).x >> shift) & 0x1); - lsb.y = !(((*key).y >> shift) & 0x1); - lsb.z = !(((*key).z >> shift) & 0x1); - lsb.w = !(((*key).w >> shift) & 0x1); - - uint4 r; - - r = rank4(lsb, sMem); - - // This arithmetic strides the ranks across 4 CTA_SIZE regions - sMem[(r.x & 3) * localSize + (r.x >> 2)] = (*key).x; - sMem[(r.y & 3) * localSize + (r.y >> 2)] = (*key).y; - sMem[(r.z & 3) * localSize + (r.z >> 2)] = (*key).z; - sMem[(r.w & 3) * localSize + (r.w >> 2)] = (*key).w; - barrier(CLK_LOCAL_MEM_FENCE); - - // The above allows us to read without 4-way bank conflicts: - (*key).x = sMem[localId]; - (*key).y = sMem[localId + localSize]; - (*key).z = sMem[localId + 2 * localSize]; - (*key).w = sMem[localId + 3 * localSize]; - - barrier(CLK_LOCAL_MEM_FENCE); - } -} - -__kernel void radixSortBlocksKeysOnly(__global uint4* keysIn, - __global uint4* keysOut, - uint nbits, - uint startbit, - uint numElements, - uint totalBlocks, - __local uint* sMem) -{ - int globalId = get_global_id(0); - - uint4 key; - key = keysIn[globalId]; - - barrier(CLK_LOCAL_MEM_FENCE); - - radixSortBlockKeysOnly(&key, nbits, startbit, sMem); - - keysOut[globalId] = key; -} - 
-//---------------------------------------------------------------------------- -// Given an array with blocks sorted according to a 4-bit radix group, each -// block counts the number of keys that fall into each radix in the group, and -// finds the starting offset of each radix in the block. It then writes the radix -// counts to the counters array, and the starting offsets to the blockOffsets array. -// -// Template parameters are used to generate efficient code for various special cases -// For example, we have to handle arrays that are a multiple of the block size -// (fullBlocks) differently than arrays that are not. "loop" is used when persistent -// CTAs are used. -// -// By persistent CTAs we mean that we launch only as many thread blocks as can -// be resident in the GPU and no more, rather than launching as many threads as -// we have elements. Persistent CTAs loop over blocks of elements until all work -// is complete. This can be faster in some cases. In our tests it is faster -// for large sorts (and the threshold is higher on compute version 1.1 and earlier -// GPUs than it is on compute version 1.2 GPUs. -// -//---------------------------------------------------------------------------- -__kernel void findRadixOffsets(__global uint2* keys, - __global uint* counters, - __global uint* blockOffsets, - uint startbit, - uint numElements, - uint totalBlocks, - __local uint* sRadix1) -{ - __local uint sStartPointers[16]; - - uint groupId = get_group_id(0); - uint localId = get_local_id(0); - uint groupSize = get_local_size(0); - - uint2 radix2; - - radix2 = keys[get_global_id(0)]; - - - sRadix1[2 * localId] = (radix2.x >> startbit) & 0xF; - sRadix1[2 * localId + 1] = (radix2.y >> startbit) & 0xF; - - // Finds the position where the sRadix1 entries differ and stores start - // index for each radix. 
- if(localId < 16) - { - sStartPointers[localId] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) ) - { - sStartPointers[sRadix1[localId]] = localId; - } - if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1]) - { - sStartPointers[sRadix1[localId + groupSize]] = localId + groupSize; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if(localId < 16) - { - blockOffsets[groupId*16 + localId] = sStartPointers[localId]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - // Compute the sizes of each block. - if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) ) - { - sStartPointers[sRadix1[localId - 1]] = - localId - sStartPointers[sRadix1[localId - 1]]; - } - if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1] ) - { - sStartPointers[sRadix1[localId + groupSize - 1]] = - localId + groupSize - sStartPointers[sRadix1[localId + groupSize - 1]]; - } - - - if(localId == groupSize - 1) - { - sStartPointers[sRadix1[2 * groupSize - 1]] = - 2 * groupSize - sStartPointers[sRadix1[2 * groupSize - 1]]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if(localId < 16) - { - counters[localId * totalBlocks + groupId] = sStartPointers[localId]; - } -} - -// a naive scan routine that works only for array that -// can fit into a single block, just for debugging purpose, -// not used in the sort now -__kernel void scanNaive(__global uint *g_odata, - __global uint *g_idata, - uint n, - __local uint* temp) -{ - - int localId = get_local_id(0); - - int pout = 0; - int pin = 1; - - // Cache the computational window in shared memory - temp[pout*n + localId] = (localId > 0) ? 
g_idata[localId-1] : 0; - - for (int offset = 1; offset < n; offset *= 2) - { - pout = 1 - pout; - pin = 1 - pout; - barrier(CLK_LOCAL_MEM_FENCE); - - temp[pout*n+localId] = temp[pin*n+localId]; - - if (localId >= offset) - temp[pout*n+localId] += temp[pin*n+localId - offset]; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - g_odata[localId] = temp[pout*n+localId]; -} - -//---------------------------------------------------------------------------- -// reorderData shuffles data in the array globally after the radix offsets -// have been found. On compute version 1.1 and earlier GPUs, this code depends -// on RadixSort::CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits). -// -// On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures -// that all writes are coalesced using extra work in the kernel. On later -// GPUs coalescing rules have been relaxed, so this extra overhead hurts -// performance. On these GPUs we set manualCoalesce=false and directly store -// the results. -// -// Template parameters are used to generate efficient code for various special cases -// For example, we have to handle arrays that are a multiple of the block size -// (fullBlocks) differently than arrays that are not. "loop" is used when persistent -// CTAs are used. -// -// By persistent CTAs we mean that we launch only as many thread blocks as can -// be resident in the GPU and no more, rather than launching as many threads as -// we have elements. Persistent CTAs loop over blocks of elements until all work -// is complete. This can be faster in some cases. In our tests it is faster -// for large sorts (and the threshold is higher on compute version 1.1 and earlier -// GPUs than it is on compute version 1.2 GPUs. 
-//---------------------------------------------------------------------------- -__kernel void reorderDataKeysOnly(__global uint *outKeys, - __global uint2 *keys, - __global uint *blockOffsets, - __global uint *offsets, - __global uint *sizes, - uint startbit, - uint numElements, - uint totalBlocks, - __local uint2* sKeys2) -{ - __local uint sOffsets[16]; - __local uint sBlockOffsets[16]; - - __local uint *sKeys1 = (__local uint*)sKeys2; - - uint groupId = get_group_id(0); - - uint globalId = get_global_id(0); - uint localId = get_local_id(0); - uint groupSize = get_local_size(0); - - sKeys2[localId] = keys[globalId]; - - if(localId < 16) - { - sOffsets[localId] = offsets[localId * totalBlocks + groupId]; - sBlockOffsets[localId] = blockOffsets[groupId * 16 + localId]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - uint radix = (sKeys1[localId] >> startbit) & 0xF; - uint globalOffset = sOffsets[radix] + localId - sBlockOffsets[radix]; - - if (globalOffset < numElements) - { - outKeys[globalOffset] = sKeys1[localId]; - } - - radix = (sKeys1[localId + groupSize] >> startbit) & 0xF; - globalOffset = sOffsets[radix] + localId + groupSize - sBlockOffsets[radix]; - - if (globalOffset < numElements) - { - outKeys[globalOffset] = sKeys1[localId + groupSize]; - } - - -} diff --git a/src/com/mbien/opencl/demos/radixsort/RadixSort.java b/src/com/mbien/opencl/demos/radixsort/RadixSort.java deleted file mode 100644 index 783c094..0000000 --- a/src/com/mbien/opencl/demos/radixsort/RadixSort.java +++ /dev/null @@ -1,183 +0,0 @@ -/* - * 20:38 Sunday, February 28 2010 - */ - -package com.mbien.opencl.demos.radixsort; - -import com.mbien.opencl.CLBuffer; -import com.mbien.opencl.CLCommandQueue; -import com.mbien.opencl.CLContext; -import com.mbien.opencl.CLKernel; -import com.mbien.opencl.CLProgram; -import com.mbien.opencl.CLResource; -import java.io.IOException; -import java.nio.IntBuffer; - -import static com.mbien.opencl.CLMemory.Mem.*; -import static com.mbien.opencl.CLProgram.*; 
-import static com.mbien.opencl.CLProgram.CompilerOptions.*; -import static java.lang.System.*; - -/** - * - * @author Michael Bien - */ -public class RadixSort implements CLResource { - - private static final int NUM_BANKS = 16; - private static final int WARP_SIZE = 32; - private static final int bitStep = 4; - - private final int CTA_SIZE; - - private final CLKernel ckRadixSortBlocksKeysOnly; - private final CLKernel ckFindRadixOffsets; - private final CLKernel ckScanNaive; - private final CLKernel ckReorderDataKeysOnly; - - private final CLBuffer tempKeys; - private final CLBuffer mCounters; - private final CLBuffer mCountersSum; - private final CLBuffer mBlockOffsets; - - private final CLCommandQueue queue; - private final Scan scan; - private final CLProgram program; - - public RadixSort(CLCommandQueue queue, int maxElements, int CTA_SIZE) throws IOException { - - this.CTA_SIZE = CTA_SIZE; - scan = new Scan(queue, maxElements / 2 / CTA_SIZE * 16); - - int numBlocks = ((maxElements % (CTA_SIZE * 4)) == 0) - ? 
(maxElements / (CTA_SIZE * 4)) : (maxElements / (CTA_SIZE * 4) + 1); - - this.queue = queue; - - CLContext context = queue.getContext(); - this.tempKeys = context.createBuffer(4 * maxElements, READ_WRITE); - this.mCounters = context.createBuffer(4 * WARP_SIZE * numBlocks, READ_WRITE); - this.mCountersSum = context.createBuffer(4 * WARP_SIZE * numBlocks, READ_WRITE); - this.mBlockOffsets = context.createBuffer(4 * WARP_SIZE * numBlocks, READ_WRITE); - - program = context.createProgram(getClass().getResourceAsStream("RadixSort.cl")) - .build(ENABLE_MAD, define("WARP_SIZE", WARP_SIZE)); - -// out.println(program.getBuildLog()); - - ckRadixSortBlocksKeysOnly = program.createCLKernel("radixSortBlocksKeysOnly"); - ckFindRadixOffsets = program.createCLKernel("findRadixOffsets"); - ckScanNaive = program.createCLKernel("scanNaive"); - ckReorderDataKeysOnly = program.createCLKernel("reorderDataKeysOnly"); - - } - - void sort(CLBuffer d_keys, int numElements, int keyBits) { - radixSortKeysOnly(d_keys, numElements, keyBits); - } - - //---------------------------------------------------------------------------- - // Main key-only radix sort function. Sorts in place in the keys and values - // arrays, but uses the other device arrays as temporary storage. All pointer - // parameters are device pointers. Uses cudppScan() for the prefix sum of - // radix counters. - //---------------------------------------------------------------------------- - void radixSortKeysOnly(CLBuffer keys, int numElements, int keyBits) { - int i = 0; - while (keyBits > i * bitStep) { - radixSortStepKeysOnly(keys, bitStep, i * bitStep, numElements); - i++; - } - } - - //---------------------------------------------------------------------------- - // Perform one step of the radix sort. Sorts by nbits key bits per step, - // starting at startbit. 
- //---------------------------------------------------------------------------- - void radixSortStepKeysOnly(CLBuffer keys, int nbits, int startbit, int numElements) { - - // Four step algorithms from Satish, Harris & Garland - radixSortBlocksKeysOnlyOCL(keys, nbits, startbit, numElements); - - findRadixOffsetsOCL(startbit, numElements); - - scan.scanExclusiveLarge(mCountersSum, mCounters, 1, numElements / 2 / CTA_SIZE * 16); - - reorderDataKeysOnlyOCL(keys, startbit, numElements); - } - - //---------------------------------------------------------------------------- - // Wrapper for the kernels of the four steps - //---------------------------------------------------------------------------- - void radixSortBlocksKeysOnlyOCL(CLBuffer keys, int nbits, int startbit, int numElements) { - - int totalBlocks = numElements / 4 / CTA_SIZE; - int globalWorkSize = CTA_SIZE * totalBlocks; - int localWorkSize = CTA_SIZE; - - ckRadixSortBlocksKeysOnly.putArg(keys).putArg(tempKeys).putArg(nbits).putArg(startbit) - .putArg(numElements).putArg(totalBlocks).putNullArg(4 * CTA_SIZE * 4) - .rewind(); - - queue.put1DRangeKernel(ckRadixSortBlocksKeysOnly, 0, globalWorkSize, localWorkSize); - } - - void findRadixOffsetsOCL(int startbit, int numElements) { - - int totalBlocks = numElements / 2 / CTA_SIZE; - int globalWorkSize = CTA_SIZE * totalBlocks; - int localWorkSize = CTA_SIZE; - - ckFindRadixOffsets.putArg(tempKeys).putArg(mCounters).putArg(mBlockOffsets) - .putArg(startbit).putArg(numElements).putArg(totalBlocks).putNullArg(2 * CTA_SIZE * 4) - .rewind(); - - queue.put1DRangeKernel(ckFindRadixOffsets, 0, globalWorkSize, localWorkSize); - } - - void scanNaiveOCL(int numElements) { - - int nHist = numElements / 2 / CTA_SIZE * 16; - int globalWorkSize = nHist; - int localWorkSize = nHist; - int extra_space = nHist / NUM_BANKS; - int shared_mem_size = 4 * (nHist + extra_space); - - ckScanNaive.putArg(mCountersSum).putArg(mCounters).putArg(nHist).putNullArg(2 * 
shared_mem_size).rewind(); - - queue.put1DRangeKernel(ckScanNaive, 0, globalWorkSize, localWorkSize); - } - - void reorderDataKeysOnlyOCL(CLBuffer keys, int startbit, int numElements) { - - int totalBlocks = numElements / 2 / CTA_SIZE; - int globalWorkSize = CTA_SIZE * totalBlocks; - int localWorkSize = CTA_SIZE; - - ckReorderDataKeysOnly.putArg(keys).putArg(tempKeys).putArg(mBlockOffsets).putArg(mCountersSum).putArg(mCounters) - .putArg(startbit).putArg(numElements).putArg(totalBlocks).putNullArg(2 * CTA_SIZE * 4).rewind(); - - queue.put1DRangeKernel(ckReorderDataKeysOnly, 0, globalWorkSize, localWorkSize); - } - - public void release() { - - scan.release(); - - //program & kernels - program.release(); - - //buffers - tempKeys.release(); - mCounters.release(); - mCountersSum.release(); - mBlockOffsets.release(); - } - - public void close() { - release(); - } - - - -} diff --git a/src/com/mbien/opencl/demos/radixsort/RadixSortDemo.java b/src/com/mbien/opencl/demos/radixsort/RadixSortDemo.java deleted file mode 100644 index 93bc071..0000000 --- a/src/com/mbien/opencl/demos/radixsort/RadixSortDemo.java +++ /dev/null @@ -1,129 +0,0 @@ -/* - * 20:48 Sunday, February 28 2010 - */ - -package com.mbien.opencl.demos.radixsort; - -import com.mbien.opencl.CLBuffer; -import com.mbien.opencl.CLCommandQueue; -import com.mbien.opencl.CLContext; -import com.mbien.opencl.CLPlatform; -import java.io.IOException; -import java.nio.IntBuffer; -import java.util.Random; - -import static com.mbien.opencl.CLMemory.Mem.*; -import static java.lang.System.*; -import static com.mbien.opencl.CLDevice.Type.*; - -/** - * GPU radix sort demo. 
- * @author Michael Bien - */ -public class RadixSortDemo { - - public RadixSortDemo() throws IOException { - - CLContext context = null; - try{ - //single GPU setup - context = CLContext.create(CLPlatform.getDefault().getMaxFlopsDevice(GPU)); - CLCommandQueue queue = context.getDevices()[0].createCommandQueue(); - - int maxValue = Integer.MAX_VALUE; - int samples = 10; - - int[] workgroupSizes = new int[] {128, 256}; - - int[] runs = new int[] { 32768, - 65536, - 131072, - 262144, - 524288, - 1048576, - 2097152, - 4194304, - 8388608 }; - - for (int i = 0; i < workgroupSizes.length; i++) { - - int workgroupSize = workgroupSizes[i]; - - out.println("\n = = = workgroup size: "+workgroupSize+" = = = "); - - for(int run = 0; run < runs.length; run++) { - - if( workgroupSize==128 && runs[run] >= 8388608 - || workgroupSize==256 && runs[run] <= 32768) { - continue; // we can only sort up to 4MB with wg size of 128 - } - - int numElements = runs[run]; - - CLBuffer array = context.createIntBuffer(numElements, READ_WRITE); - out.print("array size: " + array.getCLSize()/1000000.0f+"MB; "); - out.println("elements: " + array.getCapacity()/1000+"K"); - - fillBuffer(array, maxValue); - - RadixSort radixSort = new RadixSort(queue, numElements, workgroupSize); - for(int a = 0; a < samples; a++) { - - queue.finish(); - - long time = nanoTime(); - - queue.putWriteBuffer(array, false); - radixSort.sort(array, numElements, 32); - queue.putReadBuffer(array, true); - - out.println("time: " + (nanoTime() - time)/1000000.0f+"ms"); - } - - out.print("snapshot: "); - printSnapshot(array.getBuffer(), 20); - - out.println("validating..."); - checkIfSorted(array.getBuffer()); - out.println("values sorted"); - - array.release(); - radixSort.release(); - } - } - - }finally{ - if(context != null) { - context.release(); - } - } - - } - - private void fillBuffer(CLBuffer array, int maxValue) { - Random random = new Random(42); - for (int n = 0; n < array.getBuffer().capacity(); n++) { - int rnd = 
random.nextInt(maxValue); - array.getBuffer().put(n, rnd); - } - } - - private void printSnapshot(IntBuffer buffer, int snapshot) { - for(int i = 0; i < snapshot; i++) - out.print(buffer.get() + ", "); - out.println("...; " + buffer.remaining() + " more"); - buffer.rewind(); - } - - private void checkIfSorted(IntBuffer keys) { - for (int i = 1; i < keys.capacity(); i++) { - if (keys.get(i - 1) > keys.get(i)) { - throw new RuntimeException("not sorted "+ keys.get(i - 1) +" !> "+ keys.get(i)); - } - } - } - - public static void main(String[] args) throws IOException { - new RadixSortDemo(); - } -} diff --git a/src/com/mbien/opencl/demos/radixsort/Scan.java b/src/com/mbien/opencl/demos/radixsort/Scan.java deleted file mode 100644 index 1426e9b..0000000 --- a/src/com/mbien/opencl/demos/radixsort/Scan.java +++ /dev/null @@ -1,131 +0,0 @@ -/* - * 22:12 Sunday, February 28 2010 - */ -package com.mbien.opencl.demos.radixsort; - -import com.mbien.opencl.CLBuffer; -import com.mbien.opencl.CLCommandQueue; -import com.mbien.opencl.CLContext; -import com.mbien.opencl.CLKernel; -import com.mbien.opencl.CLProgram; -import com.mbien.opencl.CLResource; -import java.io.IOException; - -import static com.mbien.opencl.CLMemory.Mem.*; -import static com.mbien.opencl.CLProgram.CompilerOptions.*; - -/** - * - * @author Michael Bien - */ -public class Scan implements CLResource { - - private final static int MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE = 1024; - private final static int MAX_LOCAL_GROUP_SIZE = 256; - private final static int WORKGROUP_SIZE = 256; - private final static int MAX_BATCH_ELEMENTS = 64 * 1048576; - private final static int MIN_SHORT_ARRAY_SIZE = 4; - private final static int MAX_SHORT_ARRAY_SIZE = 4 * WORKGROUP_SIZE; - private final static int MIN_LARGE_ARRAY_SIZE = 8 * WORKGROUP_SIZE; - private final static int MAX_LARGE_ARRAY_SIZE = 4 * WORKGROUP_SIZE * WORKGROUP_SIZE; - - private final CLKernel ckScanExclusiveLocal1; - private final CLKernel ckScanExclusiveLocal2; - 
private final CLKernel ckUniformUpdate; - - private final CLCommandQueue queue; - private final CLProgram program; - private CLBuffer buffer; - - public Scan(CLCommandQueue queue, int numElements) throws IOException { - - this.queue = queue; - - CLContext context = queue.getContext(); - if (numElements > MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE) { - buffer = context.createBuffer(numElements / MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE * 4, READ_WRITE); - } - program = context.createProgram(getClass().getResourceAsStream("Scan_b.cl")) - .build(ENABLE_MAD); - - ckScanExclusiveLocal1 = program.createCLKernel("scanExclusiveLocal1"); - ckScanExclusiveLocal2 = program.createCLKernel("scanExclusiveLocal2"); - ckUniformUpdate = program.createCLKernel("uniformUpdate"); - } - - // main exclusive scan routine - void scanExclusiveLarge(CLBuffer dst, CLBuffer src, int batchSize, int arrayLength) { - - //Check power-of-two factorization - if(!isPowerOf2(arrayLength)) { - throw new RuntimeException(); - } - - //Check supported size range - if (!((arrayLength >= MIN_LARGE_ARRAY_SIZE) && (arrayLength <= MAX_LARGE_ARRAY_SIZE))) { - throw new RuntimeException(); - } - - //Check total batch size limit - if (!((batchSize * arrayLength) <= MAX_BATCH_ELEMENTS)) { - throw new RuntimeException(); - } - - scanExclusiveLocal1(dst, src, (batchSize * arrayLength) / (4 * WORKGROUP_SIZE), 4 * WORKGROUP_SIZE); - scanExclusiveLocal2(buffer, dst, src, batchSize, arrayLength / (4 * WORKGROUP_SIZE)); - uniformUpdate(dst, buffer, (batchSize * arrayLength) / (4 * WORKGROUP_SIZE)); - } - - void scanExclusiveLocal1(CLBuffer dst, CLBuffer src, int n, int size) { - - ckScanExclusiveLocal1.putArg(dst).putArg(src).putNullArg(2 * WORKGROUP_SIZE * 4).putArg(size) - .rewind(); - - int localWorkSize = WORKGROUP_SIZE; - int globalWorkSize = (n * size) / 4; - - queue.put1DRangeKernel(ckScanExclusiveLocal1, 0, globalWorkSize, localWorkSize); - } - - void scanExclusiveLocal2(CLBuffer buffer, CLBuffer dst, CLBuffer src, int n, int 
size) { - - int elements = n * size; - ckScanExclusiveLocal2.putArg(buffer).putArg(dst).putArg(src).putNullArg(2 * WORKGROUP_SIZE * 4) - .putArg(elements).putArg(size).rewind(); - - int localWorkSize = WORKGROUP_SIZE; - int globalWorkSize = iSnapUp(elements, WORKGROUP_SIZE); - - queue.put1DRangeKernel(ckScanExclusiveLocal2, 0, globalWorkSize, localWorkSize); - } - - void uniformUpdate(CLBuffer dst, CLBuffer buffer, int n) { - - ckUniformUpdate.setArgs(dst, buffer); - - int localWorkSize = WORKGROUP_SIZE; - int globalWorkSize = n * WORKGROUP_SIZE; - - queue.put1DRangeKernel(ckUniformUpdate, 0, globalWorkSize, localWorkSize); - } - - private int iSnapUp(int dividend, int divisor) { - return ((dividend % divisor) == 0) ? dividend : (dividend - dividend % divisor + divisor); - } - - public static boolean isPowerOf2(int x) { - return ((x - 1) & x) == 0; - } - - public void release() { - program.release(); - - if(buffer!=null) { - buffer.release(); - } - } - - public void close() { - release(); - } -} diff --git a/src/com/mbien/opencl/demos/radixsort/Scan_b.cl b/src/com/mbien/opencl/demos/radixsort/Scan_b.cl deleted file mode 100644 index 32fd4dd..0000000 --- a/src/com/mbien/opencl/demos/radixsort/Scan_b.cl +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. - * - * NVIDIA Corporation and its licensors retain all intellectual property and - * proprietary rights in and to this software and related documentation. - * Any use, reproduction, disclosure, or distribution of this software - * and related documentation without an express license agreement from - * NVIDIA Corporation is strictly prohibited. - * - * Please refer to the applicable NVIDIA end user license agreement (EULA) - * associated with this source code for terms and conditions that govern - * your use of this NVIDIA software. 
- * - */ - - - -//All three kernels run 512 threads per workgroup -//Must be a power of two -#define WORKGROUP_SIZE 256 - - - -//////////////////////////////////////////////////////////////////////////////// -// Scan codelets -//////////////////////////////////////////////////////////////////////////////// -#if(1) - //Naive inclusive scan: O(N * log2(N)) operations - //Allocate 2 * 'size' local memory, initialize the first half - //with 'size' zeros avoiding if(pos >= offset) condition evaluation - //and saving instructions - inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){ - uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1)); - l_Data[pos] = 0; - pos += size; - l_Data[pos] = idata; - - for(uint offset = 1; offset < size; offset <<= 1){ - barrier(CLK_LOCAL_MEM_FENCE); - uint t = l_Data[pos] + l_Data[pos - offset]; - barrier(CLK_LOCAL_MEM_FENCE); - l_Data[pos] = t; - } - - return l_Data[pos]; - } - - inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){ - return scan1Inclusive(idata, l_Data, size) - idata; - } - -#else - #define LOG2_WARP_SIZE 5U - #define WARP_SIZE (1U << LOG2_WARP_SIZE) - - //Almost the same as naiveScan1 but doesn't need barriers - //assuming size <= WARP_SIZE - inline uint warpScanInclusive(uint idata, __local uint *l_Data, uint size){ - uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1)); - l_Data[pos] = 0; - pos += size; - l_Data[pos] = idata; - - for(uint offset = 1; offset < size; offset <<= 1) - l_Data[pos] += l_Data[pos - offset]; - - return l_Data[pos]; - } - - inline uint warpScanExclusive(uint idata, __local uint *l_Data, uint size){ - return warpScanInclusive(idata, l_Data, size) - idata; - } - - inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){ - if(size > WARP_SIZE){ - //Bottom-level inclusive warp scan - uint warpResult = warpScanInclusive(idata, l_Data, WARP_SIZE); - - //Save top elements of each warp for exclusive warp scan - //sync to wait 
for warp scans to complete (because l_Data is being overwritten) - barrier(CLK_LOCAL_MEM_FENCE); - if( (get_local_id(0) & (WARP_SIZE - 1)) == (WARP_SIZE - 1) ) - l_Data[get_local_id(0) >> LOG2_WARP_SIZE] = warpResult; - - //wait for warp scans to complete - barrier(CLK_LOCAL_MEM_FENCE); - if( get_local_id(0) < (WORKGROUP_SIZE / WARP_SIZE) ){ - //grab top warp elements - uint val = l_Data[get_local_id(0)]; - //calculate exclsive scan and write back to shared memory - l_Data[get_local_id(0)] = warpScanExclusive(val, l_Data, size >> LOG2_WARP_SIZE); - } - - //return updated warp scans with exclusive scan results - barrier(CLK_LOCAL_MEM_FENCE); - return warpResult + l_Data[get_local_id(0) >> LOG2_WARP_SIZE]; - }else{ - return warpScanInclusive(idata, l_Data, size); - } - } - - inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){ - return scan1Inclusive(idata, l_Data, size) - idata; - } -#endif - - -//Vector scan: the array to be scanned is stored -//in work-item private memory as uint4 -inline uint4 scan4Inclusive(uint4 data4, __local uint *l_Data, uint size){ - //Level-0 inclusive scan - data4.y += data4.x; - data4.z += data4.y; - data4.w += data4.z; - - //Level-1 exclusive scan - uint val = scan1Inclusive(data4.w, l_Data, size / 4) - data4.w; - - return (data4 + (uint4)val); -} - -inline uint4 scan4Exclusive(uint4 data4, __local uint *l_Data, uint size){ - return scan4Inclusive(data4, l_Data, size) - data4; -} - - -//////////////////////////////////////////////////////////////////////////////// -// Scan kernels -//////////////////////////////////////////////////////////////////////////////// -__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1))) -void scanExclusiveLocal1( - __global uint4 *d_Dst, - __global uint4 *d_Src, - __local uint* l_Data, - uint size -){ - //Load data - uint4 idata4 = d_Src[get_global_id(0)]; - - //Calculate exclusive scan - uint4 odata4 = scan4Exclusive(idata4, l_Data, size); - - //Write back - 
d_Dst[get_global_id(0)] = odata4; -} - -//Exclusive scan of top elements of bottom-level scans (4 * THREADBLOCK_SIZE) -__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1))) -void scanExclusiveLocal2( - __global uint *d_Buf, - __global uint *d_Dst, - __global uint *d_Src, - __local uint* l_Data, - uint N, - uint arrayLength -){ - //Load top elements - //Convert results of bottom-level scan back to inclusive - //Skip loads and stores for inactive work-items of the work-group with highest index(pos >= N) - uint data = 0; - if(get_global_id(0) < N) - data = - d_Dst[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)] + - d_Src[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)]; - - //Compute - uint odata = scan1Exclusive(data, l_Data, arrayLength); - - //Avoid out-of-bound access - if(get_global_id(0) < N) - d_Buf[get_global_id(0)] = odata; -} - -//Final step of large-array scan: combine basic inclusive scan with exclusive scan of top elements of input arrays -__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1))) -void uniformUpdate( - __global uint4 *d_Data, - __global uint *d_Buf -){ - __local uint buf[1]; - - uint4 data4 = d_Data[get_global_id(0)]; - - if(get_local_id(0) == 0) - buf[0] = d_Buf[get_group_id(0)]; - - barrier(CLK_LOCAL_MEM_FENCE); - data4 += (uint4)buf[0]; - d_Data[get_global_id(0)] = data4; -} -- cgit v1.2.3