summaryrefslogtreecommitdiffstats
path: root/src/com/mbien/opencl
diff options
context:
space:
mode:
authorMichael Bien <[email protected]>2010-04-12 22:27:03 +0200
committerMichael Bien <[email protected]>2010-04-12 22:27:03 +0200
commit2c85c416d85205ab98b33e1a0b0daab32d4d81ff (patch)
tree4187ba06dec81da46495a300bb4f914e931f0c58 /src/com/mbien/opencl
parentb51f2e1c254cdd74c9e43904c62694f64e6ae7e6 (diff)
changes due to package renaming in jocl.
Diffstat (limited to 'src/com/mbien/opencl')
-rw-r--r--src/com/mbien/opencl/demos/bitonicsort/BitonicSort.cl214
-rw-r--r--src/com/mbien/opencl/demos/bitonicsort/BitonicSort.java201
-rw-r--r--src/com/mbien/opencl/demos/fractal/Mandelbrot.cl51
-rw-r--r--src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java486
-rw-r--r--src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java.orig484
-rw-r--r--src/com/mbien/opencl/demos/hellojocl/HelloJOCL.java91
-rw-r--r--src/com/mbien/opencl/demos/hellojocl/VectorAdd.cl15
-rw-r--r--src/com/mbien/opencl/demos/joglinterop/GLCLInteroperabilityDemo.java277
-rw-r--r--src/com/mbien/opencl/demos/joglinterop/JoglInterop.cl23
-rw-r--r--src/com/mbien/opencl/demos/joglinterop/UserSceneInteraction.java103
-rw-r--r--src/com/mbien/opencl/demos/julia3d/Julia3d.java212
-rw-r--r--src/com/mbien/opencl/demos/julia3d/Renderer.java203
-rw-r--r--src/com/mbien/opencl/demos/julia3d/UserSceneController.java249
-rw-r--r--src/com/mbien/opencl/demos/julia3d/config.h24
-rw-r--r--src/com/mbien/opencl/demos/julia3d/mandelbrot_kernel.cl357
-rw-r--r--src/com/mbien/opencl/demos/julia3d/rendering_kernel.cl382
-rw-r--r--src/com/mbien/opencl/demos/julia3d/structs/Camera.java52
-rw-r--r--src/com/mbien/opencl/demos/julia3d/structs/Camera32.java41
-rw-r--r--src/com/mbien/opencl/demos/julia3d/structs/Camera64.java52
-rw-r--r--src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig.java80
-rw-r--r--src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig32.java106
-rw-r--r--src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig64.java109
-rw-r--r--src/com/mbien/opencl/demos/julia3d/structs/Vec.java53
-rw-r--r--src/com/mbien/opencl/demos/julia3d/structs/Vec32.java48
-rw-r--r--src/com/mbien/opencl/demos/julia3d/structs/Vec64.java48
-rw-r--r--src/com/mbien/opencl/demos/radixsort/RadixSort.cl358
-rw-r--r--src/com/mbien/opencl/demos/radixsort/RadixSort.java183
-rw-r--r--src/com/mbien/opencl/demos/radixsort/RadixSortDemo.java129
-rw-r--r--src/com/mbien/opencl/demos/radixsort/Scan.java131
-rw-r--r--src/com/mbien/opencl/demos/radixsort/Scan_b.cl190
30 files changed, 0 insertions, 4952 deletions
diff --git a/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.cl b/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.cl
deleted file mode 100644
index a8d0e1d..0000000
--- a/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.cl
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
- *
- * NVIDIA Corporation and its licensors retain all intellectual property and
- * proprietary rights in and to this software and related documentation.
- * Any use, reproduction, disclosure, or distribution of this software
- * and related documentation without an express license agreement from
- * NVIDIA Corporation is strictly prohibited.
- *
- * Please refer to the applicable NVIDIA end user license agreement (EULA)
- * associated with this source code for terms and conditions that govern
- * your use of this NVIDIA software.
- *
- */
-
-
-
-//Passed down by clBuildProgram
-//#define LOCAL_SIZE_LIMIT 1024
-
-
-
-inline void ComparatorPrivate(
- uint *keyA,
- uint *keyB,
- uint arrowDir
-){
- if( (*keyA > *keyB) == arrowDir ){
- uint t;
- t = *keyA; *keyA = *keyB; *keyB = t;
- }
-}
-
-inline void ComparatorLocal(
- __local uint *keyA,
- __local uint *keyB,
- uint arrowDir
-){
- if( (*keyA > *keyB) == arrowDir ){
- uint t;
- t = *keyA; *keyA = *keyB; *keyB = t;
- }
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Monolithic bitonic sort kernel for short arrays fitting into local memory
-////////////////////////////////////////////////////////////////////////////////
-__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
-void bitonicSortLocal(
- __global uint *d_DstKey,
- __global uint *d_SrcKey,
- uint arrayLength,
- uint sortDir
-){
- __local uint l_key[LOCAL_SIZE_LIMIT];
-
- //Offset to the beginning of subbatch and load data
- d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
- d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
- l_key[get_local_id(0) + 0] = d_SrcKey[ 0];
- l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
-
- for(uint size = 2; size < arrayLength; size <<= 1){
- //Bitonic merge
- uint dir = ( (get_local_id(0) & (size / 2)) != 0 );
- for(uint stride = size / 2; stride > 0; stride >>= 1){
- barrier(CLK_LOCAL_MEM_FENCE);
- uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
- ComparatorLocal(
- &l_key[pos + 0],
- &l_key[pos + stride],
- dir
- );
- }
- }
-
- //dir == sortDir for the last bitonic merge step
- {
- for(uint stride = arrayLength / 2; stride > 0; stride >>= 1){
- barrier(CLK_LOCAL_MEM_FENCE);
- uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
- ComparatorLocal(
- &l_key[pos + 0],
- &l_key[pos + stride],
- sortDir
- );
- }
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
- d_DstKey[ 0] = l_key[get_local_id(0) + 0];
- d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Bitonic sort kernel for large arrays (not fitting into local memory)
-////////////////////////////////////////////////////////////////////////////////
-//Bottom-level bitonic sort
-//Almost the same as bitonicSortLocal with the only exception
-//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being
-//sorted in opposite directions
-__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
-void bitonicSortLocal1(
- __global uint *d_DstKey,
- __global uint *d_SrcKey
-){
- __local uint l_key[LOCAL_SIZE_LIMIT];
-
- //Offset to the beginning of subarray and load data
- d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
- d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
- l_key[get_local_id(0) + 0] = d_SrcKey[ 0];
- l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
-
- uint comparatorI = get_global_id(0) & ((LOCAL_SIZE_LIMIT / 2) - 1);
-
- for(uint size = 2; size < LOCAL_SIZE_LIMIT; size <<= 1){
- //Bitonic merge
- uint dir = (comparatorI & (size / 2)) != 0;
- for(uint stride = size / 2; stride > 0; stride >>= 1){
- barrier(CLK_LOCAL_MEM_FENCE);
- uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
- ComparatorLocal(
- &l_key[pos + 0],
- &l_key[pos + stride],
- dir
- );
- }
- }
-
- //Odd / even arrays of LOCAL_SIZE_LIMIT elements
- //sorted in opposite directions
- {
- uint dir = (get_group_id(0) & 1);
- for(uint stride = LOCAL_SIZE_LIMIT / 2; stride > 0; stride >>= 1){
- barrier(CLK_LOCAL_MEM_FENCE);
- uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
- ComparatorLocal(
- &l_key[pos + 0],
- &l_key[pos + stride],
- dir
- );
- }
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
- d_DstKey[ 0] = l_key[get_local_id(0) + 0];
- d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
-}
-
-//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
-__kernel void bitonicMergeGlobal(
- __global uint *d_DstKey,
- __global uint *d_SrcKey,
- uint arrayLength,
- uint size,
- uint stride,
- uint sortDir
-){
- uint global_comparatorI = get_global_id(0);
- uint comparatorI = global_comparatorI & (arrayLength / 2 - 1);
-
- //Bitonic merge
- uint dir = sortDir ^ ( (comparatorI & (size / 2)) != 0 );
- uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
-
- uint keyA = d_SrcKey[pos + 0];
- uint keyB = d_SrcKey[pos + stride];
-
- ComparatorPrivate(
- &keyA,
- &keyB,
- dir
- );
-
- d_DstKey[pos + 0] = keyA;
- d_DstKey[pos + stride] = keyB;
-}
-
-//Combined bitonic merge steps for
-//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2]
-__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
-void bitonicMergeLocal(
- __global uint *d_DstKey,
- __global uint *d_SrcKey,
- uint arrayLength,
- uint stride,
- uint size,
- uint sortDir
-){
- __local uint l_key[LOCAL_SIZE_LIMIT];
-
- d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
- d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
- l_key[get_local_id(0) + 0] = d_SrcKey[ 0];
- l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
-
- //Bitonic merge
- uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1);
- uint dir = sortDir ^ ( (comparatorI & (size / 2)) != 0 );
- for(; stride > 0; stride >>= 1){
- barrier(CLK_LOCAL_MEM_FENCE);
- uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
- ComparatorLocal(
- &l_key[pos + 0],
- &l_key[pos + stride],
- dir
- );
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
- d_DstKey[ 0] = l_key[get_local_id(0) + 0];
- d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
-}
diff --git a/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.java b/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.java
deleted file mode 100644
index ed5b32e..0000000
--- a/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.java
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * 18:42 Saturday, February 27 2010
- */
-package com.mbien.opencl.demos.bitonicsort;
-
-import com.mbien.opencl.CLBuffer;
-import com.mbien.opencl.CLCommandQueue;
-import com.mbien.opencl.CLContext;
-import com.mbien.opencl.CLDevice;
-import com.mbien.opencl.CLKernel;
-import com.mbien.opencl.CLProgram;
-import java.io.IOException;
-import java.nio.IntBuffer;
-import java.util.Map;
-import java.util.Random;
-
-import static java.lang.System.*;
-import static com.mbien.opencl.CLMemory.Mem.*;
-import static com.mbien.opencl.CLProgram.*;
-
-/**
- * Bitonic sort optimized for GPUs.
- * Uses NVIDIA's bitonic merge sort kernel.
- * @author Michael Bien
- */
-public class BitonicSort {
-
- private static final String BITONIC_MERGE_GLOBAL = "bitonicMergeGlobal";
- private static final String BITONIC_MERGE_LOCAL = "bitonicMergeLocal";
- private static final String BITONIC_SORT_LOCAL = "bitonicSortLocal";
- private static final String BITONIC_SORT_LOCAL1 = "bitonicSortLocal1";
-
- private final static int LOCAL_SIZE_LIMIT = 1024;
- private final Map<String, CLKernel> kernels;
-
- public BitonicSort() throws IOException {
-
- final int sortDir = 1;
- final int elements = 1048576;
- final int maxvalue = 1000000;
-
- out.println("Initializing OpenCL...");
-
- //Create the context
- CLContext context = null;
-
- try{
-
- context = CLContext.create();
- CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue();
-
- out.println("Initializing OpenCL bitonic sorter...");
- kernels = initBitonicSort(queue);
-
- out.println("Creating OpenCL memory objects...");
- CLBuffer<IntBuffer> keyBuffer = context.createIntBuffer(elements, READ_ONLY, USE_BUFFER);
- System.out.println(keyBuffer.getCLSize()/1000000.0f);
-
- out.println("Initializing data...\n");
- Random random = new Random();
- for (int i = 0; i < elements; i++) {
- int rnd = random.nextInt(maxvalue);
- keyBuffer.getBuffer().put(i, rnd);
- }
-
- int arrayLength = elements;
- int batch = elements / arrayLength;
-
- out.printf("Test array length %d (%d arrays in the batch)...\n", arrayLength, batch);
-
- long time = currentTimeMillis();
-
- bitonicSort(queue, keyBuffer, keyBuffer, batch, arrayLength, sortDir);
- queue.putReadBuffer(keyBuffer, true);
-
- out.println(currentTimeMillis() - time+"ms");
-
- IntBuffer keys = keyBuffer.getBuffer();
- printSnapshot(keys, 20);
- checkIfSorted(keys);
-
- out.println("\nTEST PASSED");
-
- }finally{
- if(context!=null) {
- context.release();
- }
- }
-
- }
-
- private Map<String, CLKernel> initBitonicSort(CLCommandQueue queue) throws IOException {
-
- out.println(" creating bitonic sort program");
-
- CLContext context = queue.getContext();
-
- CLProgram program = context.createProgram(getClass().getResourceAsStream("BitonicSort.cl"))
- .build(define("LOCAL_SIZE_LIMIT", LOCAL_SIZE_LIMIT));
-
- Map<String, CLKernel> kernelMap = program.createCLKernels();
-
- out.println(" checking minimum supported workgroup size");
- //Check for work group size
- CLDevice device = queue.getDevice();
- long szBitonicSortLocal = kernelMap.get(BITONIC_SORT_LOCAL).getWorkGroupSize(device);
- long szBitonicSortLocal1 = kernelMap.get(BITONIC_SORT_LOCAL1).getWorkGroupSize(device);
- long szBitonicMergeLocal = kernelMap.get(BITONIC_MERGE_LOCAL).getWorkGroupSize(device);
-
- if ( (szBitonicSortLocal < (LOCAL_SIZE_LIMIT / 2))
- || (szBitonicSortLocal1 < (LOCAL_SIZE_LIMIT / 2))
- || (szBitonicMergeLocal < (LOCAL_SIZE_LIMIT / 2)) ) {
- throw new RuntimeException("Minimum work-group size "+LOCAL_SIZE_LIMIT/2
- +" required by this application is not supported on this device.");
- }
-
- return kernelMap;
-
- }
-
- public void bitonicSort(CLCommandQueue queue, CLBuffer<?> dstKey, CLBuffer<?> srcKey, int batch, int arrayLength, int dir) {
-
- if (arrayLength < 2) {
- throw new IllegalArgumentException("arrayLength was "+arrayLength);
- }
-
- // TODO Only power-of-two array lengths are supported so far
-
- dir = (dir != 0) ? 1 : 0;
-
- CLKernel sortlocal1 = kernels.get(BITONIC_SORT_LOCAL1);
- CLKernel sortlocal = kernels.get(BITONIC_SORT_LOCAL);
- CLKernel mergeGlobal = kernels.get(BITONIC_MERGE_GLOBAL);
- CLKernel mergeLocal = kernels.get(BITONIC_MERGE_LOCAL);
-
- if (arrayLength <= LOCAL_SIZE_LIMIT) {
-
- // oclCheckError( (batch * arrayLength) % LOCAL_SIZE_LIMIT == 0, shrTRUE );
-
- //Launch bitonicSortLocal
- sortlocal.putArgs(dstKey, srcKey)
- .putArg(arrayLength).putArg(dir).rewind();
-
- int localWorkSize = LOCAL_SIZE_LIMIT / 2;
- int globalWorkSize = batch * arrayLength / 2;
- queue.put1DRangeKernel(sortlocal, 0, globalWorkSize, localWorkSize);
-
- } else {
-
- //Launch bitonicSortLocal1
- sortlocal1.setArgs(dstKey, srcKey);
-
- int localWorkSize = LOCAL_SIZE_LIMIT / 2;
- int globalWorkSize = batch * arrayLength / 2;
-
- queue.put1DRangeKernel(sortlocal1, 0, globalWorkSize, localWorkSize);
-
- for (int size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1) {
- for (int stride = size / 2; stride > 0; stride >>= 1) {
- if (stride >= LOCAL_SIZE_LIMIT) {
- //Launch bitonicMergeGlobal
- mergeGlobal.putArgs(dstKey, dstKey)
- .putArg(arrayLength).putArg(size).putArg(stride).putArg(dir).rewind();
-
- globalWorkSize = batch * arrayLength / 2;
- queue.put1DRangeKernel(mergeGlobal, 0, globalWorkSize, 0);
- } else {
- //Launch bitonicMergeLocal
- mergeLocal.putArgs(dstKey, dstKey)
- .putArg(arrayLength).putArg(stride).putArg(size).putArg(dir).rewind();
-
- localWorkSize = LOCAL_SIZE_LIMIT / 2;
- globalWorkSize = batch * arrayLength / 2;
-
- queue.put1DRangeKernel(mergeLocal, 0, globalWorkSize, localWorkSize);
- break;
- }
- }
- }
- }
- }
-
- private void printSnapshot(IntBuffer buffer, int snapshot) {
- for(int i = 0; i < snapshot; i++)
- out.print(buffer.get() + ", ");
- out.println("...; " + buffer.remaining() + " more");
- buffer.rewind();
- }
-
- private void checkIfSorted(IntBuffer keys) {
- for (int i = 1; i < keys.capacity(); i++) {
- if (keys.get(i - 1) > keys.get(i)) {
- throw new RuntimeException("not sorted "+ keys.get(i - 1) +"!> "+ keys.get(i));
- }
- }
- }
-
- public static void main(String[] args) throws IOException {
- new BitonicSort();
- }
-}
diff --git a/src/com/mbien/opencl/demos/fractal/Mandelbrot.cl b/src/com/mbien/opencl/demos/fractal/Mandelbrot.cl
deleted file mode 100644
index 640c775..0000000
--- a/src/com/mbien/opencl/demos/fractal/Mandelbrot.cl
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifdef DOUBLE_FP
- #pragma OPENCL EXTENSION cl_khr_fp64 : enable
- typedef double varfloat;
-#else
- typedef float varfloat;
-#endif
-
-/**
- * For a description of this algorithm please refer to
- * http://en.wikipedia.org/wiki/Mandelbrot_set
- * @author Michael Bien
- */
-kernel void mandelbrot(
- const int width, const int height,
- const varfloat x0, const varfloat y0,
- const varfloat rangeX, const varfloat rangeY,
- global uint *output, global uint *colorMap,
- const int colorMapSize, const int maxIterations) {
-
- unsigned int ix = get_global_id(0);
- unsigned int iy = get_global_id(1);
-
- varfloat r = x0 + ix * rangeX / width;
- varfloat i = y0 + iy * rangeY / height;
-
- varfloat x = 0;
- varfloat y = 0;
-
- varfloat magnitudeSquared = 0;
- int iteration = 0;
-
- while (magnitudeSquared < 4 && iteration < maxIterations) {
- varfloat x2 = x*x;
- varfloat y2 = y*y;
- y = 2 * x * y + i;
- x = x2 - y2 + r;
- magnitudeSquared = x2+y2;
- iteration++;
- }
-
- if (iteration == maxIterations) {
- output[iy * width + ix] = 0;
- }else {
- varfloat alpha = (varfloat)iteration / maxIterations;
- int colorIndex = (int)(alpha * colorMapSize);
- output[iy * width + ix] = colorMap[colorIndex];
- // monochrom
- // output[iy * width + ix] = 255*iteration/maxIterations;
- }
-
-} \ No newline at end of file
diff --git a/src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java b/src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java
deleted file mode 100644
index 265add8..0000000
--- a/src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java
+++ /dev/null
@@ -1,486 +0,0 @@
-package com.mbien.opencl.demos.fractal;
-
-import com.mbien.opencl.CLBuffer;
-import com.mbien.opencl.CLCommandQueue;
-import com.mbien.opencl.CLDevice;
-import com.mbien.opencl.CLEvent;
-import com.mbien.opencl.CLEventList;
-import com.mbien.opencl.CLException;
-import com.mbien.opencl.gl.CLGLBuffer;
-import com.mbien.opencl.gl.CLGLContext;
-import com.mbien.opencl.CLKernel;
-import com.mbien.opencl.CLPlatform;
-import com.mbien.opencl.CLProgram;
-import com.mbien.opencl.CLProgram.CompilerOptions;
-import com.jogamp.opengl.util.awt.TextRenderer;
-import java.awt.Color;
-import java.awt.Dimension;
-import java.awt.Font;
-import java.awt.Point;
-import java.awt.event.KeyAdapter;
-import java.awt.event.KeyEvent;
-import java.awt.event.MouseAdapter;
-import java.awt.event.MouseEvent;
-import java.awt.event.MouseWheelEvent;
-import java.io.IOException;
-import java.nio.IntBuffer;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import javax.media.opengl.DebugGL2;
-import javax.media.opengl.GL;
-import javax.media.opengl.GL2;
-import javax.media.opengl.GLAutoDrawable;
-import javax.media.opengl.GLCapabilities;
-import javax.media.opengl.GLContext;
-import javax.media.opengl.GLEventListener;
-import javax.media.opengl.GLProfile;
-import javax.media.opengl.awt.GLCanvas;
-import javax.swing.JFrame;
-import javax.swing.SwingUtilities;
-
-import static com.jogamp.common.nio.Buffers.*;
-import static javax.media.opengl.GL2.*;
-import static com.mbien.opencl.CLMemory.Mem.*;
-import static com.mbien.opencl.CLEvent.ProfilingCommand.*;
-import static com.mbien.opencl.CLCommandQueue.Mode.*;
-import static com.mbien.opencl.CLDevice.Type.*;
-import static java.lang.Math.*;
-
-/**
- * Computes the Mandelbrot set with OpenCL using multiple GPUs and renders the result with OpenGL.
- * A shared PBO is used as storage for the fractal image.<br/>
- * http://en.wikipedia.org/wiki/Mandelbrot_set
- * <p>
- * controls:<br/>
- * keys 1-9 control parallelism level<br/>
- * space enables/disables slice seperator<br/>
- * 'd' toggles between 32/64bit floatingpoint precision<br/>
- * mouse/mousewheel to drag and zoom<br/>
- * </p>
- * @author Michael Bien
- */
-public class MultiDeviceFractal implements GLEventListener {
-
- // max number of used GPUs
- private static final int MAX_PARRALLELISM_LEVEL = 8;
-
- // max per pixel iterations to compute the fractal
- private static final int MAX_ITERATIONS = 500;
-
- private GLCanvas canvas;
-
- private CLGLContext clContext;
- private CLCommandQueue[] queues;
- private CLKernel[] kernels;
- private CLProgram program;
- private CLEventList probes;
- private CLGLBuffer<?>[] pboBuffers;
- private CLBuffer<IntBuffer>[] colorMap;
-
- private int width = 0;
- private int height = 0;
-
- private double minX = -2f;
- private double minY = -1.2f;
- private double maxX = 0.6f;
- private double maxY = 1.3f;
-
- private int slices;
-
- private boolean drawSeperator;
- private boolean doublePrecision;
- private boolean buffersInitialized;
- private boolean rebuild;
-
- private final TextRenderer textRenderer;
-
- public MultiDeviceFractal(int width, int height) {
-
- this.width = width;
- this.height = height;
-
- canvas = new GLCanvas(new GLCapabilities(GLProfile.get(GLProfile.GL2)));
- canvas.addGLEventListener(this);
- initSceneInteraction();
-
- JFrame frame = new JFrame("JOCL Multi GPU Mandelbrot Set");
- frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
- canvas.setPreferredSize(new Dimension(width, height));
- frame.add(canvas);
- frame.pack();
-
- frame.setVisible(true);
-
- textRenderer = new TextRenderer(frame.getFont().deriveFont(Font.BOLD, 14), true, true, null, false);
- }
-
- public void init(GLAutoDrawable drawable) {
-
- if(clContext == null) {
- // enable GL error checking using the composable pipeline
- drawable.setGL(new DebugGL2(drawable.getGL().getGL2()));
-
- drawable.getGL().glFinish();
- initCL(drawable.getContext());
-
- GL2 gl = drawable.getGL().getGL2();
-
- gl.setSwapInterval(0);
- gl.glDisable(GL_DEPTH_TEST);
- gl.glClearColor(0.0f, 0.0f, 0.0f, 1.0f);
-
- initView(gl, drawable.getWidth(), drawable.getHeight());
-
- initPBO(gl);
- drawable.getGL().glFinish();
-
- setKernelConstants();
- }
- }
-
- private void initCL(GLContext glCtx){
- try {
- // create context managing all available GPUs
-// clContext = CLGLContext.create(glCtx, GPU);
- clContext = CLGLContext.create(glCtx, CLPlatform.getDefault().listCLDevices()[0]);
-
-
- CLDevice[] devices = clContext.getDevices();
-
- slices = min(devices.length, MAX_PARRALLELISM_LEVEL);
-
- // create command queues for every GPU, setup colormap and init kernels
- queues = new CLCommandQueue[slices];
- kernels = new CLKernel[slices];
- probes = new CLEventList(slices);
- colorMap = new CLBuffer[slices];
-
- for (int i = 0; i < slices; i++) {
-
- colorMap[i] = clContext.createIntBuffer(32*2, READ_ONLY);
- initColorMap(colorMap[i].getBuffer(), 32, Color.BLUE, Color.GREEN, Color.RED);
-
- // create command queue and upload color map buffer on each used device
- queues[i] = devices[i].createCommandQueue(PROFILING_MODE).putWriteBuffer(colorMap[i], true); // blocking upload
-
- }
-
- // load and build program
- program = clContext.createProgram(getClass().getResourceAsStream("Mandelbrot.cl"));
- buildProgram();
-
- } catch (IOException ex) {
- Logger.getLogger(getClass().getName()).log(Level.SEVERE, "can not find 'Mandelbrot.cl' in classpath.", ex);
- } catch (CLException ex) {
- Logger.getLogger(getClass().getName()).log(Level.SEVERE, "something went wrong, hopefully no one got hurt", ex);
- }
-
- }
-
- private void initColorMap(IntBuffer colorMap, int stepSize, Color... colors) {
-
- for (int n = 0; n < colors.length - 1; n++) {
-
- Color color = colors[n];
- int r0 = color.getRed();
- int g0 = color.getGreen();
- int b0 = color.getBlue();
-
- color = colors[n + 1];
- int r1 = color.getRed();
- int g1 = color.getGreen();
- int b1 = color.getBlue();
-
- int deltaR = r1 - r0;
- int deltaG = g1 - g0;
- int deltaB = b1 - b0;
-
- for (int step = 0; step < stepSize; step++) {
- float alpha = (float) step / (stepSize - 1);
- int r = (int) (r0 + alpha * deltaR);
- int g = (int) (g0 + alpha * deltaG);
- int b = (int) (b0 + alpha * deltaB);
- colorMap.put((r << 16) | (g << 8) | (b << 0));
- }
- }
- colorMap.rewind();
-
- }
-
- private void initView(GL2 gl, int width, int height) {
-
- gl.glViewport(0, 0, width, height);
-
- gl.glMatrixMode(GL_MODELVIEW);
- gl.glLoadIdentity();
-
- gl.glMatrixMode(GL_PROJECTION);
- gl.glLoadIdentity();
- gl.glOrtho(0.0, width, 0.0, height, 0.0, 1.0);
- }
-
- @SuppressWarnings("unchecked")
- private void initPBO(GL gl) {
-
- if(pboBuffers != null) {
- int[] oldPbos = new int[pboBuffers.length];
- for (int i = 0; i < pboBuffers.length; i++) {
- CLGLBuffer<?> buffer = pboBuffers[i];
- oldPbos[i] = buffer.GLID;
- buffer.release();
- }
- gl.glDeleteBuffers(oldPbos.length, oldPbos, 0);
- }
-
- pboBuffers = new CLGLBuffer[slices];
-
- int[] pbo = new int[slices];
- gl.glGenBuffers(slices, pbo, 0);
-
- // setup one empty PBO per slice
- for (int i = 0; i < slices; i++) {
-
- gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo[i]);
- gl.glBufferData(GL_PIXEL_UNPACK_BUFFER, width*height * SIZEOF_INT / slices, null, GL_STREAM_DRAW);
- gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
- pboBuffers[i] = clContext.createFromGLBuffer(pbo[i], WRITE_ONLY);
-
- }
-
- buffersInitialized = true;
- }
-
- private void buildProgram() {
-
- /*
- * workaround: The driver keeps using the old binaries for some reason.
- * to solve this we simple create a new program and release the old.
- * however rebuilding programs should be possible -> remove when drivers are fixed.
- */
- if(program != null && rebuild) {
- String source = program.getSource();
- program.release();
- program = clContext.createProgram(source);
- }
-
- // disable 64bit floating point math if not available
- if(doublePrecision) {
- for (CLDevice device : program.getCLDevices()) {
- if(!device.isDoubleFPAvailable()) {
- doublePrecision = false;
- break;
- }
- }
- }
-
- if(doublePrecision) {
- program.build(CompilerOptions.FAST_RELAXED_MATH, "-D DOUBLE_FP");
- }else{
- program.build(CompilerOptions.FAST_RELAXED_MATH);
- }
- rebuild = false;
-
- for (int i = 0; i < kernels.length; i++) {
- // init kernel with constants
- kernels[i] = program.createCLKernel("mandelbrot");
- }
-
- }
-
- // init kernels with constants
- private void setKernelConstants() {
- for (int i = 0; i < slices; i++) {
- kernels[i].setForce32BitArgs(!doublePrecision)
- .setArg(6, pboBuffers[i])
- .setArg(7, colorMap[i])
- .setArg(8, colorMap[i].getBuffer().capacity())
- .setArg(9, MAX_ITERATIONS);
- }
- }
-
- // rendering cycle
- public void display(GLAutoDrawable drawable) {
- GL gl = drawable.getGL();
-
- // make sure GL does not use our objects before we start computeing
- gl.glFinish();
- if(!buffersInitialized) {
- initPBO(gl);
- setKernelConstants();
- }
- if(rebuild) {
- buildProgram();
- setKernelConstants();
- }
- compute();
-
- render(gl.getGL2());
- }
-
- // OpenCL
- private void compute() {
-
- int sliceWidth = width / slices;
- double rangeX = (maxX - minX) / slices;
- double rangeY = (maxY - minY);
-
- // release all old events, you can't reuse events in OpenCL
- probes.release();
-
- // start computation
- for (int i = 0; i < slices; i++) {
-
- kernels[i].putArg( sliceWidth).putArg(height)
- .putArg(minX + rangeX*i).putArg( minY)
- .putArg( rangeX ).putArg(rangeY)
- .rewind();
-
- // aquire GL objects, and enqueue a kernel with a probe from the list
- queues[i].putAcquireGLObject(pboBuffers[i].ID)
- .put2DRangeKernel(kernels[i], 0, 0, sliceWidth, height, 0, 0, probes)
- .putReleaseGLObject(pboBuffers[i].ID);
-
- }
-
- // block until done (important: finish before doing further gl work)
- for (int i = 0; i < slices; i++) {
- queues[i].finish();
- }
-
- }
-
- // OpenGL
- private void render(GL2 gl) {
-
- gl.glClear(GL_COLOR_BUFFER_BIT);
-
- //draw slices
- int sliceWidth = width / slices;
-
- for (int i = 0; i < slices; i++) {
-
- int seperatorOffset = drawSeperator?i:0;
-
- gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pboBuffers[i].GLID);
- gl.glRasterPos2i(sliceWidth*i + seperatorOffset, 0);
-
- gl.glDrawPixels(sliceWidth, height, GL_BGRA, GL_UNSIGNED_BYTE, 0);
-
- }
- gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
- //draw info text
- textRenderer.beginRendering(width, height, false);
-
- textRenderer.draw("precision: "+ (doublePrecision?"64bit":"32bit"), 10, height-15);
-
- for (int i = 0; i < slices; i++) {
- CLEvent event = probes.getEvent(i);
- long start = event.getProfilingInfo(START);
- long end = event.getProfilingInfo(END);
- textRenderer.draw("GPU"+i +" "+(int)((end-start)/1000000.0f)+"ms", 10, height-(20+16*(slices-i)));
- }
-
- textRenderer.endRendering();
- }
-
- public void reshape(GLAutoDrawable drawable, int x, int y, int width, int height) {
-
- if(this.width == width && this.height == height)
- return;
-
- this.width = width;
- this.height = height;
-
- initPBO(drawable.getGL());
-
- initView(drawable.getGL().getGL2(), drawable.getWidth(), drawable.getHeight());
- }
-
- private void initSceneInteraction() {
-
- MouseAdapter mouseAdapter = new MouseAdapter() {
-
- Point lastpos = new Point();
-
- @Override
- public void mouseDragged(MouseEvent e) {
-
- double offsetX = (lastpos.x - e.getX()) * (maxX - minX) / width;
- double offsetY = (lastpos.y - e.getY()) * (maxY - minY) / height;
-
- minX += offsetX;
- minY -= offsetY;
-
- maxX += offsetX;
- maxY -= offsetY;
-
- lastpos = e.getPoint();
-
- canvas.display();
-
- }
-
- @Override
- public void mouseMoved(MouseEvent e) {
- lastpos = e.getPoint();
- }
-
- @Override
- public void mouseWheelMoved(MouseWheelEvent e) {
- float rotation = e.getWheelRotation() / 25.0f;
-
- double deltaX = rotation * (maxX - minX);
- double deltaY = rotation * (maxY - minY);
-
- // offset for "zoom to cursor"
- double offsetX = (e.getX() / (float)width - 0.5f) * deltaX * 2;
- double offsetY = (e.getY() / (float)height- 0.5f) * deltaY * 2;
-
- minX += deltaX+offsetX;
- minY += deltaY-offsetY;
-
- maxX +=-deltaX+offsetX;
- maxY +=-deltaY-offsetY;
-
- canvas.display();
- }
- };
-
- KeyAdapter keyAdapter = new KeyAdapter() {
-
- @Override
- public void keyPressed(KeyEvent e) {
- if(e.getKeyCode() == KeyEvent.VK_SPACE) {
- drawSeperator = !drawSeperator;
- }else if(e.getKeyChar() > '0' && e.getKeyChar() < '9') {
- int number = e.getKeyChar()-'0';
- slices = min(number, min(queues.length, MAX_PARRALLELISM_LEVEL));
- buffersInitialized = false;
- }else if(e.getKeyCode() == KeyEvent.VK_D) {
- doublePrecision = !doublePrecision;
- rebuild = true;
- }
- canvas.display();
- }
-
- };
-
- canvas.addMouseMotionListener(mouseAdapter);
- canvas.addMouseWheelListener(mouseAdapter);
- canvas.addKeyListener(keyAdapter);
- }
-
- public void dispose(GLAutoDrawable drawable) {
- }
-
- public static void main(String args[]) {
- SwingUtilities.invokeLater(new Runnable() {
- public void run() {
- new MultiDeviceFractal(512, 512);
- }
- });
- }
-
-}
diff --git a/src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java.orig b/src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java.orig
deleted file mode 100644
index fc09d19..0000000
--- a/src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java.orig
+++ /dev/null
@@ -1,484 +0,0 @@
-package com.mbien.opencl.demos.fractal;
-
-import com.mbien.opencl.CLBuffer;
-import com.mbien.opencl.CLCommandQueue;
-import com.mbien.opencl.CLContext;
-import com.mbien.opencl.CLDevice;
-import com.mbien.opencl.CLEvent;
-import com.mbien.opencl.CLEventList;
-import com.mbien.opencl.CLException;
-import com.mbien.opencl.CLGLBuffer;
-import com.mbien.opencl.CLGLContext;
-import com.mbien.opencl.CLKernel;
-import com.mbien.opencl.CLProgram;
-import com.mbien.opencl.CLProgram.CompilerOptions;
-import com.sun.opengl.util.awt.TextRenderer;
-import java.awt.Color;
-import java.awt.Dimension;
-import java.awt.Font;
-import java.awt.Point;
-import java.awt.event.KeyAdapter;
-import java.awt.event.KeyEvent;
-import java.awt.event.MouseAdapter;
-import java.awt.event.MouseEvent;
-import java.awt.event.MouseWheelEvent;
-import java.io.IOException;
-import java.nio.IntBuffer;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import javax.media.opengl.DebugGL2;
-import javax.media.opengl.GL;
-import javax.media.opengl.GL2;
-import javax.media.opengl.GLAutoDrawable;
-import javax.media.opengl.GLCapabilities;
-import javax.media.opengl.GLContext;
-import javax.media.opengl.GLEventListener;
-import javax.media.opengl.GLProfile;
-import javax.media.opengl.awt.GLCanvas;
-import javax.swing.JFrame;
-import javax.swing.SwingUtilities;
-
-import static com.sun.gluegen.runtime.BufferFactory.*;
-import static javax.media.opengl.GL2.*;
-import static com.mbien.opencl.CLMemory.Mem.*;
-import static com.mbien.opencl.CLEvent.ProfilingCommand.*;
-import static com.mbien.opencl.CLCommandQueue.Mode.*;
-import static com.mbien.opencl.CLDevice.Type.*;
-import static java.lang.Math.*;
-
-/**
- * Computes the Mandelbrot set with OpenCL using multiple GPUs and renders the result with OpenGL.
- * A shared PBO is used as storage for the fractal image.<br/>
- * http://en.wikipedia.org/wiki/Mandelbrot_set
- * <p>
- * controls:<br/>
- * keys 1-9 control parallelism level<br/>
- * space enables/disables slice seperator<br/>
- * 'd' toggles between 32/64bit floatingpoint precision<br/>
- * mouse/mousewheel to drag and zoom<br/>
- * </p>
- * @author Michael Bien
- */
-public class MultiDeviceFractal implements GLEventListener {
-
- // max number of used GPUs
- private static final int MAX_PARRALLELISM_LEVEL = 8;
-
- // max per pixel iterations to compute the fractal
- private static final int MAX_ITERATIONS = 1000;
-
- private GLCanvas canvas;
-
- private CLContext clContext;
- private CLCommandQueue[] queues;
- private CLKernel[] kernels;
- private CLProgram program;
- private CLEventList probes;
- private CLBuffer<?>[] pboBuffers;
- private CLBuffer<IntBuffer>[] colorMap;
-
- private int width = 0;
- private int height = 0;
-
- private double minX = -2f;
- private double minY = -1.2f;
- private double maxX = 0.6f;
- private double maxY = 1.3f;
-
- private int slices;
-
- private boolean drawSeperator;
- private boolean doublePrecision;
- private boolean buffersInitialized;
- private boolean rebuild;
-
- private final TextRenderer textRenderer;
-
- public MultiDeviceFractal(int width, int height) {
-
- this.width = width;
- this.height = height;
-
- canvas = new GLCanvas(new GLCapabilities(GLProfile.get(GLProfile.GL2)));
- canvas.addGLEventListener(this);
- initSceneInteraction();
-
- JFrame frame = new JFrame("JOCL Multi GPU Mandelbrot Set");
- frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
- canvas.setPreferredSize(new Dimension(width, height));
- frame.add(canvas);
- frame.pack();
-
- frame.setVisible(true);
-
- textRenderer = new TextRenderer(frame.getFont().deriveFont(Font.BOLD, 14), true, true, null, false);
- }
-
- public void init(GLAutoDrawable drawable) {
-
- // enable GL error checking using the composable pipeline
- drawable.setGL(new DebugGL2(drawable.getGL().getGL2()));
-
- initCL(drawable.getContext());
-
- GL2 gl = drawable.getGL().getGL2();
-
- gl.setSwapInterval(0);
- gl.glDisable(GL_DEPTH_TEST);
- gl.glClearColor(0.0f, 0.0f, 0.0f, 1.0f);
-
- initView(gl, drawable.getWidth(), drawable.getHeight());
-
- initPBO(gl);
- setKernelConstants();
- }
-
- private void initCL(GLContext glCtx){
- try {
- // create context managing all available GPUs
- clContext = CLContext.create(GPU);
-
- CLDevice[] devices = clContext.getCLDevices();
-
- slices = min(devices.length, MAX_PARRALLELISM_LEVEL);
-
- // create command queues for every GPU, setup colormap and init kernels
- queues = new CLCommandQueue[slices];
- kernels = new CLKernel[slices];
- probes = new CLEventList(slices);
- colorMap = new CLBuffer[slices];
-
- for (int i = 0; i < slices; i++) {
-
- colorMap[i] = clContext.createIntBuffer(32*2, READ_ONLY);
- initColorMap(colorMap[i].getBuffer(), 32, Color.BLUE, Color.GREEN, Color.RED);
-
- // create command queue and upload color map buffer on each used device
- queues[i] = devices[i].createCommandQueue(PROFILING_MODE).putWriteBuffer(colorMap[i], true); // blocking upload
-
- }
-
- // load and build program
- program = clContext.createProgram(getClass().getResourceAsStream("Mandelbrot.cl"));
- buildProgram();
-
- } catch (IOException ex) {
- Logger.getLogger(getClass().getName()).log(Level.SEVERE, "can not find 'Mandelbrot.cl' in classpath.", ex);
- } catch (CLException ex) {
- Logger.getLogger(getClass().getName()).log(Level.SEVERE, "something went wrong, hopefully no one got hurt", ex);
- }
-
- }
-
- private void initColorMap(IntBuffer colorMap, int stepSize, Color... colors) {
-
- for (int n = 0; n < colors.length - 1; n++) {
-
- Color color = colors[n];
- int r0 = color.getRed();
- int g0 = color.getGreen();
- int b0 = color.getBlue();
-
- color = colors[n + 1];
- int r1 = color.getRed();
- int g1 = color.getGreen();
- int b1 = color.getBlue();
-
- int deltaR = r1 - r0;
- int deltaG = g1 - g0;
- int deltaB = b1 - b0;
-
- for (int step = 0; step < stepSize; step++) {
- float alpha = (float) step / (stepSize - 1);
- int r = (int) (r0 + alpha * deltaR);
- int g = (int) (g0 + alpha * deltaG);
- int b = (int) (b0 + alpha * deltaB);
- colorMap.put((r << 16) | (g << 8) | (b << 0));
- }
- }
- colorMap.rewind();
-
- }
-
- private void initView(GL2 gl, int width, int height) {
-
- gl.glViewport(0, 0, width, height);
-
- gl.glMatrixMode(GL_MODELVIEW);
- gl.glLoadIdentity();
-
- gl.glMatrixMode(GL_PROJECTION);
- gl.glLoadIdentity();
- gl.glOrtho(0.0, width, 0.0, height, 0.0, 1.0);
- }
-
- @SuppressWarnings("unchecked")
- private void initPBO(GL gl) {
-
- if(pboBuffers != null) {
- int[] oldPbos = new int[pboBuffers.length];
- for (int i = 0; i < pboBuffers.length; i++) {
- CLBuffer<?> buffer = pboBuffers[i];
-// oldPbos[i] = buffer.GLID;
- buffer.release();
- }
-// gl.glDeleteBuffers(oldPbos.length, oldPbos, 0);
- }
-
- pboBuffers = new CLBuffer[slices];
-
-// int[] pbo = new int[slices];
-// gl.glGenBuffers(slices, pbo, 0);
-
- // setup one empty PBO per slice
- for (int i = 0; i < slices; i++) {
-
-// gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo[i]);
-// gl.glBufferData(GL_PIXEL_UNPACK_BUFFER, width*height * SIZEOF_INT / slices, null, GL_STREAM_DRAW);
-// gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
- pboBuffers[i] = clContext.createByteBuffer(width*height * SIZEOF_INT / slices, WRITE_ONLY);
-// pboBuffers[i] = clContext.createFromGLBuffer(null, pbo[i], WRITE_ONLY);
-
- }
-
- buffersInitialized = true;
- }
-
- private void buildProgram() {
-
- /*
- * workaround: The driver keeps using the old binaries for some reason.
- * to solve this we simple create a new program and release the old.
- * however rebuilding programs should be possible -> remove when drivers are fixed.
- */
- if(program != null && rebuild) {
- String source = program.getSource();
- program.release();
- program = clContext.createProgram(source);
- }
-
- // disable 64bit floating point math if not available
- if(doublePrecision) {
- for (CLDevice device : program.getCLDevices()) {
- if(!device.isDoubleFPAvailable()) {
- doublePrecision = false;
- break;
- }
- }
- }
-
- if(doublePrecision) {
- program.build(CompilerOptions.FAST_RELAXED_MATH, "-D DOUBLE_FP");
- }else{
- program.build(CompilerOptions.FAST_RELAXED_MATH);
- }
- rebuild = false;
-
- for (int i = 0; i < kernels.length; i++) {
- // init kernel with constants
- kernels[i] = program.createCLKernel("mandelbrot");
- }
-
- }
-
- // init kernels with constants
- private void setKernelConstants() {
- for (int i = 0; i < slices; i++) {
- kernels[i].setForce32BitArgs(!doublePrecision)
- .setArg(6, pboBuffers[i])
- .setArg(7, colorMap[i])
- .setArg(8, colorMap[i].getBuffer().capacity())
- .setArg(9, MAX_ITERATIONS);
- }
- }
-
- // rendering cycle
- public void display(GLAutoDrawable drawable) {
- GL gl = drawable.getGL();
-
- if(!buffersInitialized) {
- initPBO(gl);
- setKernelConstants();
- }
- if(rebuild) {
- buildProgram();
- setKernelConstants();
- }
- // make sure GL does not use our objects before we start computeing
- gl.glFinish();
- compute();
-
- render(gl.getGL2());
- }
-
- // OpenCL
- private void compute() {
-
- int sliceWidth = width / slices;
- double rangeX = (maxX - minX) / slices;
- double rangeY = (maxY - minY);
-
- // release all old events, you can't reuse events in OpenCL
- probes.release();
-
- long time = System.currentTimeMillis();
- // start computation
- for (int i = 0; i < slices; i++) {
-
- kernels[i].putArg( sliceWidth).putArg(height)
- .putArg(minX + rangeX*i).putArg( minY)
- .putArg( rangeX ).putArg(rangeY)
- .rewind();
-
- // aquire GL objects, and enqueue a kernel with a probe from the list
- queues[i]
-// .putAcquireGLObject(pboBuffers[i].ID)
- .put2DRangeKernel(kernels[i], 0, 0, sliceWidth, height, 0, 0, probes)
-// .putReleaseGLObject(pboBuffers[i].ID)
- ;
-
- }
-
- // block until done
- for (int i = 0; i < slices; i++) {
- queues[i].finish();
- }
- System.out.println((System.currentTimeMillis()-time)/1000.0f);
-
- }
-
- // OpenGL
- private void render(GL2 gl) {
-
- gl.glClear(GL_COLOR_BUFFER_BIT);
-
- //draw slices
- int sliceWidth = width / slices;
-
-// for (int i = 0; i < slices; i++) {
-//
-// int seperatorOffset = drawSeperator?i:0;
-//
-// gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pboBuffers[i].GLID);
-// gl.glRasterPos2i(sliceWidth*i + seperatorOffset, 0);
-//
-// gl.glDrawPixels(sliceWidth, height, GL_BGRA, GL_UNSIGNED_BYTE, 0);
-//
-// }
-// gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
- //draw info text
- textRenderer.beginRendering(width, height, false);
-
- textRenderer.draw("precision: "+ (doublePrecision?"64bit":"32bit"), 10, height-15);
-
- for (int i = 0; i < slices; i++) {
- CLEvent event = probes.getEvent(i);
- long start = event.getProfilingInfo(START);
- long end = event.getProfilingInfo(END);
- textRenderer.draw("GPU"+i +" "+((end-start)/1000000000.0f)+"s", 10, height-(20+16*(slices-i)));
- }
-
- textRenderer.endRendering();
- }
-
- public void reshape(GLAutoDrawable drawable, int x, int y, int width, int height) {
-
- if(this.width == width && this.height == height)
- return;
-
- this.width = width;
- this.height = height;
-
- initPBO(drawable.getGL());
-
- initView(drawable.getGL().getGL2(), drawable.getWidth(), drawable.getHeight());
- }
-
- private void initSceneInteraction() {
-
- MouseAdapter mouseAdapter = new MouseAdapter() {
-
- Point lastpos = new Point();
-
- @Override
- public void mouseDragged(MouseEvent e) {
-
- double offsetX = (lastpos.x - e.getX()) * (maxX - minX) / width;
- double offsetY = (lastpos.y - e.getY()) * (maxY - minY) / height;
-
- minX += offsetX;
- minY -= offsetY;
-
- maxX += offsetX;
- maxY -= offsetY;
-
- lastpos = e.getPoint();
-
- canvas.display();
-
- }
-
- @Override
- public void mouseMoved(MouseEvent e) {
- lastpos = e.getPoint();
- }
-
- @Override
- public void mouseWheelMoved(MouseWheelEvent e) {
- float rotation = e.getWheelRotation() / 25.0f;
-
- double deltaX = rotation * (maxX - minX);
- double deltaY = rotation * (maxY - minY);
-
- // offset for "zoom to cursor"
- double offsetX = (e.getX() / (float)width - 0.5f) * deltaX * 2;
- double offsetY = (e.getY() / (float)height- 0.5f) * deltaY * 2;
-
- minX += deltaX+offsetX;
- minY += deltaY-offsetY;
-
- maxX +=-deltaX+offsetX;
- maxY +=-deltaY-offsetY;
-
- canvas.display();
- }
- };
-
- KeyAdapter keyAdapter = new KeyAdapter() {
-
- @Override
- public void keyPressed(KeyEvent e) {
- if(e.getKeyCode() == KeyEvent.VK_SPACE) {
- drawSeperator = !drawSeperator;
- }else if(e.getKeyChar() > '0' && e.getKeyChar() < '9') {
- int number = e.getKeyChar()-'0';
- slices = min(number, min(queues.length, MAX_PARRALLELISM_LEVEL));
- buffersInitialized = false;
- }else if(e.getKeyCode() == KeyEvent.VK_D) {
- doublePrecision = !doublePrecision;
- rebuild = true;
- }
- canvas.display();
- }
-
- };
-
- canvas.addMouseMotionListener(mouseAdapter);
- canvas.addMouseWheelListener(mouseAdapter);
- canvas.addKeyListener(keyAdapter);
- }
-
- public void dispose(GLAutoDrawable drawable) {
- }
-
- public static void main(String args[]) {
- SwingUtilities.invokeLater(new Runnable() {
- public void run() {
- new MultiDeviceFractal(512, 512);
- }
- });
- }
-
-}
diff --git a/src/com/mbien/opencl/demos/hellojocl/HelloJOCL.java b/src/com/mbien/opencl/demos/hellojocl/HelloJOCL.java
deleted file mode 100644
index acbd84c..0000000
--- a/src/com/mbien/opencl/demos/hellojocl/HelloJOCL.java
+++ /dev/null
@@ -1,91 +0,0 @@
-package com.mbien.opencl.demos.hellojocl;
-
-import com.mbien.opencl.CLBuffer;
-import com.mbien.opencl.CLCommandQueue;
-import com.mbien.opencl.CLContext;
-import com.mbien.opencl.CLKernel;
-import com.mbien.opencl.CLProgram;
-import java.io.IOException;
-import java.nio.FloatBuffer;
-import java.util.Random;
-
-import static java.lang.System.*;
-import static com.mbien.opencl.CLMemory.Mem.*;
-
-/**
- * Hello Java OpenCL example. Adds all elements of buffer A to buffer B
- * and stores the result in buffer C.<br/>
- * Sample was inspired by the Nvidia VectorAdd example written in C/C++
- * which is bundled in the Nvidia OpenCL SDK.
- * @author Michael Bien
- */
-public class HelloJOCL {
-
- public static void main(String[] args) throws IOException {
-
- int elementCount = 11444777; // Length of arrays to process
- int localWorkSize = 256; // Local work size dimensions
- int globalWorkSize = roundUp(localWorkSize, elementCount); // rounded up to the nearest multiple of the localWorkSize
-
- // set up
- CLContext context = CLContext.create();
-
- CLProgram program = context.createProgram(HelloJOCL.class.getResourceAsStream("VectorAdd.cl")).build();
-
- CLBuffer<FloatBuffer> clBufferA = context.createFloatBuffer(globalWorkSize, READ_ONLY);
- CLBuffer<FloatBuffer> clBufferB = context.createFloatBuffer(globalWorkSize, READ_ONLY);
- CLBuffer<FloatBuffer> clBufferC = context.createFloatBuffer(globalWorkSize, WRITE_ONLY);
-
- out.println("used device memory: "
- + (clBufferA.getSize()+clBufferB.getSize()+clBufferC.getSize())/1000000 +"MB");
-
- // fill read buffers with random numbers (just to have test data; seed is fixed -> results will not change between runs).
- fillBuffer(clBufferA.getBuffer(), 12345);
- fillBuffer(clBufferB.getBuffer(), 67890);
-
- // get a reference to the kernel functon with the name 'VectorAdd'
- // and map the buffers to its input parameters.
- CLKernel kernel = program.createCLKernel("VectorAdd");
- kernel.putArgs(clBufferA, clBufferB, clBufferC).putArg(elementCount);
-
- // create command queue on fastest device.
- CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue();
-
- // asynchronous write of data to GPU device, blocking read later to get the computed results back.
- long time = nanoTime();
- queue.putWriteBuffer(clBufferA, false)
- .putWriteBuffer(clBufferB, false)
- .put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize)
- .putReadBuffer(clBufferC, true);
- time = nanoTime() - time;
-
- // cleanup all resources associated with this context.
- context.release();
-
- // print first few elements of the resulting buffer to the console.
- out.println("a+b=c results snapshot: ");
- for(int i = 0; i < 10; i++)
- out.print(clBufferC.getBuffer().get() + ", ");
- out.println("...; " + clBufferC.getBuffer().remaining() + " more");
-
- out.println("computation took: "+(time/1000000)+"ms");
-
- }
-
- private static final void fillBuffer(FloatBuffer buffer, int seed) {
- Random rnd = new Random(seed);
- while(buffer.remaining() != 0)
- buffer.put(rnd.nextFloat()*100);
- buffer.rewind();
- }
-
- private static final int roundUp(int groupSize, int globalSize) {
- int r = globalSize % groupSize;
- if (r == 0) {
- return globalSize;
- } else {
- return globalSize + groupSize - r;
- }
- }
-
-} \ No newline at end of file
diff --git a/src/com/mbien/opencl/demos/hellojocl/VectorAdd.cl b/src/com/mbien/opencl/demos/hellojocl/VectorAdd.cl
deleted file mode 100644
index ac9dde2..0000000
--- a/src/com/mbien/opencl/demos/hellojocl/VectorAdd.cl
+++ /dev/null
@@ -1,15 +0,0 @@
-
- // OpenCL Kernel Function for element by element vector addition
- kernel void VectorAdd(global const float* a, global const float* b, global float* c, int numElements) {
-
- // get index into global data array
- int iGID = get_global_id(0);
-
- // bound check (equivalent to the limit on a 'for' loop for standard/serial C code
- if (iGID >= numElements) {
- return;
- }
-
- // add the vector elements
- c[iGID] = a[iGID] + b[iGID];
- } \ No newline at end of file
diff --git a/src/com/mbien/opencl/demos/joglinterop/GLCLInteroperabilityDemo.java b/src/com/mbien/opencl/demos/joglinterop/GLCLInteroperabilityDemo.java
deleted file mode 100644
index 74783f3..0000000
--- a/src/com/mbien/opencl/demos/joglinterop/GLCLInteroperabilityDemo.java
+++ /dev/null
@@ -1,277 +0,0 @@
-package com.mbien.opencl.demos.joglinterop;
-
-import com.mbien.opencl.CLCommandQueue;
-import com.mbien.opencl.CLDevice;
-import com.mbien.opencl.gl.CLGLBuffer;
-import com.mbien.opencl.gl.CLGLContext;
-import com.mbien.opencl.CLKernel;
-import com.mbien.opencl.CLPlatform;
-import com.mbien.opencl.CLProgram;
-import com.jogamp.opengl.util.Animator;
-import java.io.IOException;
-import javax.media.opengl.DebugGL2;
-import javax.media.opengl.GL2;
-import javax.media.opengl.GLAutoDrawable;
-import javax.media.opengl.GLCapabilities;
-import javax.media.opengl.GLEventListener;
-import javax.media.opengl.GLProfile;
-import javax.media.opengl.awt.GLCanvas;
-import javax.media.opengl.glu.gl2.GLUgl2;
-import javax.swing.JFrame;
-import javax.swing.SwingUtilities;
-
-import static com.jogamp.common.nio.Buffers.*;
-
-/**
- * JOCL - JOGL interoperability example.
- * @author Michael Bien
- */
-public class GLCLInteroperabilityDemo implements GLEventListener {
-
- private final GLUgl2 glu = new GLUgl2();
-
- private final int MESH_SIZE = 256;
-
- private int width;
- private int height;
-
-// private final FloatBuffer vb;
-// private final IntBuffer ib;
-
- private final int[] glObjects = new int[2];
- private final int VERTICES = 0;
-// private final int INDICES = 1;
-
- private final UserSceneInteraction usi;
-
- private CLGLContext clContext;
- private CLKernel kernel;
- private CLCommandQueue commandQueue;
- private CLGLBuffer<?> clBuffer;
-
- private float step = 0;
-
- public GLCLInteroperabilityDemo() {
-
- this.usi = new UserSceneInteraction();
-
- // create direct memory buffers
-// vb = newFloatBuffer(MESH_SIZE * MESH_SIZE * 4);
-// ib = newIntBuffer((MESH_SIZE - 1) * (MESH_SIZE - 1) * 2 * 3);
-//
-// // build indices
-// // 0---3
-// // | \ |
-// // 1---2
-// for (int h = 0; h < MESH_SIZE - 1; h++) {
-// for (int w = 0; w < MESH_SIZE - 1; w++) {
-//
-// // 0 - 3 - 2
-// ib.put(w * 6 + h * (MESH_SIZE - 1) * 6, w + (h ) * (MESH_SIZE) );
-// ib.put(w * 6 + h * (MESH_SIZE - 1) * 6 + 1, w + (h ) * (MESH_SIZE) + 1);
-// ib.put(w * 6 + h * (MESH_SIZE - 1) * 6 + 2, w + (h + 1) * (MESH_SIZE) + 1);
-//
-// // 0 - 2 - 1
-// ib.put(w * 6 + h * (MESH_SIZE - 1) * 6 + 3, w + (h ) * (MESH_SIZE) );
-// ib.put(w * 6 + h * (MESH_SIZE - 1) * 6 + 4, w + (h + 1) * (MESH_SIZE) + 1);
-// ib.put(w * 6 + h * (MESH_SIZE - 1) * 6 + 5, w + (h + 1) * (MESH_SIZE) );
-//
-// }
-// }
-// ib.rewind();
-
- SwingUtilities.invokeLater(new Runnable() {
- public void run() {
- initUI();
- }
- });
-
- }
-
- private void initUI() {
-
- this.width = 600;
- this.height = 400;
-
- GLCapabilities config = new GLCapabilities(GLProfile.get(GLProfile.GL2));
- config.setSampleBuffers(true);
- config.setNumSamples(4);
-
- GLCanvas canvas = new GLCanvas(config);
- canvas.addGLEventListener(this);
- usi.init(canvas);
-
- JFrame frame = new JFrame("JOGL-JOCL Interoperability Example");
- frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
- frame.add(canvas);
- frame.setSize(width, height);
-
- frame.setVisible(true);
-
- }
-
-
- public void init(GLAutoDrawable drawable) {
-
- if(clContext == null) {
-
- // find gl compatible device
- CLDevice[] devices = CLPlatform.getDefault().listCLDevices();
- CLDevice device = null;
- for (CLDevice d : devices) {
- if(d.isGLMemorySharingSupported()) {
- device = d;
- break;
- }
- }
- // create OpenCL context before creating any OpenGL objects
- // you want to share with OpenCL (AMD driver requirement)
- clContext = CLGLContext.create(drawable.getContext(), device);
-
- // enable GL error checking using the composable pipeline
- drawable.setGL(new DebugGL2(drawable.getGL().getGL2()));
-
- // OpenGL initialization
- GL2 gl = drawable.getGL().getGL2();
-
- gl.setSwapInterval(1);
-
- gl.glPolygonMode(GL2.GL_FRONT_AND_BACK, GL2.GL_LINE);
-
- gl.glGenBuffers(glObjects.length, glObjects, 0);
-
- // gl.glBindBuffer(GL2.GL_ELEMENT_ARRAY_BUFFER, glObjects[INDICES]);
- // gl.glBufferData(GL2.GL_ELEMENT_ARRAY_BUFFER, ib.capacity() * SIZEOF_INT, ib, GL2.GL_STATIC_DRAW);
- // gl.glBindBuffer(GL2.GL_ELEMENT_ARRAY_BUFFER, 0);
-
- gl.glEnableClientState(GL2.GL_VERTEX_ARRAY);
- gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, glObjects[VERTICES]);
- gl.glBufferData(GL2.GL_ARRAY_BUFFER, MESH_SIZE * MESH_SIZE * 4 * SIZEOF_FLOAT, null, GL2.GL_DYNAMIC_DRAW);
- gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, 0);
- gl.glDisableClientState(GL2.GL_VERTEX_ARRAY);
-
- pushPerspectiveView(gl);
- gl.glFinish();
-
- // init OpenCL
- initCL();
-
- // start rendering thread
- Animator animator = new Animator(drawable);
- animator.start();
-
- }
- }
-
- private void initCL() {
-
- CLProgram program;
- try {
- program = clContext.createProgram(getClass().getResourceAsStream("JoglInterop.cl"));
- program.build();
- System.out.println(program.getBuildStatus());
- System.out.println(program.isExecutable());
- System.out.println(program.getBuildLog());
- } catch (IOException ex) {
- throw new RuntimeException("can not handle exception", ex);
- }
-
- commandQueue = clContext.getMaxFlopsDevice().createCommandQueue();
-
- clBuffer = clContext.createFromGLBuffer(glObjects[VERTICES], CLGLBuffer.Mem.WRITE_ONLY);
-
- System.out.println("cl buffer type: " + clBuffer.getGLObjectType());
- System.out.println("shared with gl buffer: " + clBuffer.getGLObjectID());
-
- kernel = program.createCLKernel("sineWave")
- .putArg(clBuffer)
- .putArg(MESH_SIZE)
- .rewind();
-
- System.out.println("cl initialised");
- }
-
-
- public void display(GLAutoDrawable drawable) {
-
- GL2 gl = drawable.getGL().getGL2();
-
- // ensure pipeline is clean before doing cl work
- gl.glFinish();
-
- computeHeightfield();
-
- gl.glClear(GL2.GL_COLOR_BUFFER_BIT | GL2.GL_DEPTH_BUFFER_BIT);
- gl.glLoadIdentity();
-
- usi.interact(gl);
-
- gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, glObjects[VERTICES]);
- gl.glVertexPointer(4, GL2.GL_FLOAT, 0, 0);
-
-// gl.glBindBuffer(GL2.GL_ELEMENT_ARRAY_BUFFER, glObjects[INDICES]);
-
- gl.glEnableClientState(GL2.GL_VERTEX_ARRAY);
- gl.glDrawArrays(GL2.GL_POINTS, 0, MESH_SIZE * MESH_SIZE);
-// gl.glDrawElements(GL2.GL_TRIANGLES, ib.capacity(), GL2.GL_UNSIGNED_INT, 0);
- gl.glDisableClientState(GL2.GL_VERTEX_ARRAY);
-
-// gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, 0);
-
- }
-
- /*
- * Computes a heightfield using a OpenCL kernel.
- */
- private void computeHeightfield() {
-
- kernel.setArg(2, step += 0.05f);
-
- commandQueue.putAcquireGLObject(clBuffer.ID)
- .put2DRangeKernel(kernel, 0, 0, MESH_SIZE, MESH_SIZE, 0, 0)
- .putReleaseGLObject(clBuffer.ID)
- .finish();
-
- }
-
- private void pushPerspectiveView(GL2 gl) {
-
- gl.glMatrixMode(GL2.GL_PROJECTION);
- gl.glPushMatrix();
-
- gl.glLoadIdentity();
-
- glu.gluPerspective(60, width / (float)height, 1, 1000);
- gl.glMatrixMode(GL2.GL_MODELVIEW);
-
- gl.glPushMatrix();
- gl.glLoadIdentity();
-
- }
-
- private void popView(GL2 gl) {
-
- gl.glMatrixMode(GL2.GL_PROJECTION);
- gl.glPopMatrix();
-
- gl.glMatrixMode(GL2.GL_MODELVIEW);
- gl.glPopMatrix();
-
- }
-
-
- public void reshape(GLAutoDrawable drawable, int arg1, int arg2, int width, int height) {
- this.width = width;
- this.height = height;
- GL2 gl = drawable.getGL().getGL2();
- popView(gl);
- pushPerspectiveView(gl);
- }
-
- public void dispose(GLAutoDrawable drawable) { }
-
- public static void main(String[] args) {
- new GLCLInteroperabilityDemo();
- }
-
-} \ No newline at end of file
diff --git a/src/com/mbien/opencl/demos/joglinterop/JoglInterop.cl b/src/com/mbien/opencl/demos/joglinterop/JoglInterop.cl
deleted file mode 100644
index 0f0bcfc..0000000
--- a/src/com/mbien/opencl/demos/joglinterop/JoglInterop.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-
-/**
-* animated 2D sine pattern.
-*/
-kernel void sineWave(global float4 * vertex, int size, float time) {
-
- unsigned int x = get_global_id(0);
- unsigned int y = get_global_id(1);
-
- // calculate uv coordinates
- float u = x / (float) size;
- float v = y / (float) size;
-
- u = u*2.0f - 1.0f;
- v = v*2.0f - 1.0f;
-
- // calculate simple sine wave pattern
- float freq = 4.0f;
- float w = sin(u*freq + time) * cos(v*freq + time) * 0.5f;
-
- // write output vertex
- vertex[y*size + x] = (float4)(u*10.0f, w*10.0f, v*10.0f, 1.0f);
-}
diff --git a/src/com/mbien/opencl/demos/joglinterop/UserSceneInteraction.java b/src/com/mbien/opencl/demos/joglinterop/UserSceneInteraction.java
deleted file mode 100644
index bec2d28..0000000
--- a/src/com/mbien/opencl/demos/joglinterop/UserSceneInteraction.java
+++ /dev/null
@@ -1,103 +0,0 @@
-package com.mbien.opencl.demos.joglinterop;
-
-import java.awt.Component;
-import java.awt.Point;
-import java.awt.event.MouseAdapter;
-import java.awt.event.MouseEvent;
-import java.awt.event.MouseMotionAdapter;
-import java.awt.event.MouseWheelEvent;
-import java.awt.event.MouseWheelListener;
-import javax.media.opengl.GL2;
-
-/**
- * Utility class for interacting with a scene. Supports rotation and zoom around origin.
- * @author Michael Bien
- */
-public class UserSceneInteraction {
-
- private float z = -20;
- private float rotx = 45;
- private float roty = 30;
-
- private Point dragstart;
- private enum MOUSE_MODE { DRAG_ROTATE, DRAG_ZOOM }
- private MOUSE_MODE dragmode = MOUSE_MODE.DRAG_ROTATE;
-
-
- public void init(Component component) {
- initMouseListeners(component);
- }
-
- private void initMouseListeners(Component component) {
- component.addMouseMotionListener(new MouseMotionAdapter() {
-
- @Override
- public void mouseDragged(MouseEvent e) {
-
- if (dragstart != null) {
- switch (dragmode) {
- case DRAG_ROTATE:
- rotx += e.getY() - dragstart.getY();
- roty += e.getX() - dragstart.getX();
- break;
- case DRAG_ZOOM:
- z += (e.getY() - dragstart.getY()) / 5.0f;
- break;
- }
- }
-
- dragstart = e.getPoint();
- }
- });
- component.addMouseWheelListener(new MouseWheelListener() {
-
- public void mouseWheelMoved(MouseWheelEvent e) {
- z += e.getWheelRotation()*5;
- }
-
- });
- component.addMouseListener(new MouseAdapter() {
-
- @Override
- public void mousePressed(MouseEvent e) {
- switch (e.getButton()) {
- case (MouseEvent.BUTTON1):
- dragmode = MOUSE_MODE.DRAG_ROTATE;
- break;
- case (MouseEvent.BUTTON2):
- dragmode = MOUSE_MODE.DRAG_ZOOM;
- break;
- case (MouseEvent.BUTTON3):
- dragmode = MOUSE_MODE.DRAG_ZOOM;
- break;
- }
- }
-
- @Override
- public void mouseReleased(MouseEvent e) {
- switch (e.getButton()) {
- case (MouseEvent.BUTTON1):
- dragmode = MOUSE_MODE.DRAG_ZOOM;
- break;
- case (MouseEvent.BUTTON2):
- dragmode = MOUSE_MODE.DRAG_ROTATE;
- break;
- case (MouseEvent.BUTTON3):
- dragmode = MOUSE_MODE.DRAG_ROTATE;
- break;
- }
-
- dragstart = null;
- }
- });
- }
-
-
- public void interact(GL2 gl) {
- gl.glTranslatef(0, 0, z);
- gl.glRotatef(rotx, 1f, 0f, 0f);
- gl.glRotatef(roty, 0f, 1.0f, 0f);
- }
-
-
-} \ No newline at end of file
diff --git a/src/com/mbien/opencl/demos/julia3d/Julia3d.java b/src/com/mbien/opencl/demos/julia3d/Julia3d.java
deleted file mode 100644
index 5d5378f..0000000
--- a/src/com/mbien/opencl/demos/julia3d/Julia3d.java
+++ /dev/null
@@ -1,212 +0,0 @@
-package com.mbien.opencl.demos.julia3d;
-
-import com.mbien.opencl.CLBuffer;
-import com.mbien.opencl.CLCommandQueue;
-import com.mbien.opencl.CLContext;
-import com.mbien.opencl.CLDevice;
-import com.mbien.opencl.CLKernel;
-import com.mbien.opencl.CLPlatform;
-import com.mbien.opencl.CLProgram;
-import com.mbien.opencl.demos.julia3d.structs.Camera;
-import com.mbien.opencl.demos.julia3d.structs.RenderingConfig;
-import com.mbien.opencl.demos.julia3d.structs.Vec;
-import java.io.IOException;
-import java.nio.Buffer;
-import java.nio.ByteBuffer;
-import java.nio.FloatBuffer;
-import javax.swing.SwingUtilities;
-
-import static com.mbien.opencl.CLMemory.Mem.*;
-import static com.mbien.opencl.CLProgram.CompilerOptions.*;
-import static com.mbien.opencl.demos.julia3d.UserSceneController.*;
-
-/**
- * This sample has been ported from David Buciarelli's juliaGPU v1.2 written in C.
- * @author Michael Bien
- */
-public class Julia3d {
-
- private final CLContext context;
- private CLBuffer<FloatBuffer> pixelBuffer;
- private final CLBuffer<ByteBuffer> configBuffer;
- private final CLCommandQueue commandQueue;
- private final CLProgram program;
- private final CLKernel julia;
- private final CLKernel multiply;
-
- private final int workGroupSize;
- private final String kernelFileName = "rendering_kernel.cl";
-
- final RenderingConfig config;
-
- private Julia3d(RenderingConfig renderConfig) {
- this.config = renderConfig;
- updateCamera();
-
- //setup
- CLDevice gpu = CLPlatform.getDefault().getMaxFlopsDevice();
- context = CLContext.create(gpu);
-
- workGroupSize = 256;
-
- //allocate buffers
- configBuffer = context.createBuffer(config.getBuffer(), READ_ONLY);
- commandQueue = gpu.createCommandQueue();
-// update(true);
-
- try {
- program = context.createProgram(Julia3d.class.getResourceAsStream(kernelFileName))
- .build(FAST_RELAXED_MATH);
- } catch (IOException ex) {
- throw new RuntimeException("unable to load program from source", ex);
- }
-
- julia = program.createCLKernel("JuliaGPU");
- multiply = program.createCLKernel("multiply");
- System.out.println(program.getBuildStatus(gpu));
- System.out.println(program.getBuildLog());
-
- }
-
- void update(boolean reallocate) {
-
- updateCamera();
-
- int bufferSize = config.getWidth() * config.getHeight() * 3;
- if(reallocate) {
- if(pixelBuffer != null) {
- pixelBuffer.release();
- }
-
- pixelBuffer = context.createFloatBuffer(bufferSize, READ_WRITE, USE_BUFFER);
- }
-
- commandQueue.putWriteBuffer(configBuffer, true);
-
- julia.putArg(pixelBuffer)
- .putArg(configBuffer)
- .rewind();
-
- multiply.putArg(pixelBuffer)
- .putArg(bufferSize)
- .rewind();
- }
-
-
- void compute(boolean fastRendering) {
-
- // calculate workgroup size
- int globalThreads = config.getWidth() * config.getHeight();
- if(globalThreads % workGroupSize != 0)
- globalThreads = (globalThreads / workGroupSize + 1) * workGroupSize;
-
- int localThreads = workGroupSize;
- int superSamplingSize = config.getSuperSamplingSize();
-
- if (!fastRendering && superSamplingSize > 1) {
-
- for (int y = 0; y < superSamplingSize; ++y) {
- for (int x = 0; x < superSamplingSize; ++x) {
-
- float sampleX = (x + 0.5f) / superSamplingSize;
- float sampleY = (y + 0.5f) / superSamplingSize;
-
- if (x == 0 && y == 0) {
- // First pass
- julia.setArg(2, 0)
- .setArg(3, sampleX)
- .setArg(4, sampleY);
-
- commandQueue.put1DRangeKernel(julia, 0, globalThreads, localThreads);
-
- } else if (x == (superSamplingSize - 1) && y == (superSamplingSize - 1)) {
- // Last pass
- julia.setArg(2, 1)
- .setArg(3, sampleX)
- .setArg(4, sampleY);
-
- // normalize the values we accumulated
- multiply.setArg(2, 1.0f/(superSamplingSize*superSamplingSize));
-
- commandQueue.put1DRangeKernel(julia, 0, globalThreads, localThreads)
- .put1DRangeKernel(multiply, 0, globalThreads*3, localThreads);
- } else {
- julia.setArg(2, 1)
- .setArg(3, sampleX)
- .setArg(4, sampleY);
-
- commandQueue.put1DRangeKernel(julia, 0, globalThreads, localThreads);
-
- }
- }
- }
-
- }else{
-
- //fast rendering
- julia.setArg(2, 0)
- .setArg(3, 0.0f)
- .setArg(4, 0.0f);
-
- commandQueue.put1DRangeKernel(julia, 0, globalThreads, localThreads);
- }
-
- commandQueue.putBarrier()
- .putReadBuffer(pixelBuffer, true);
-
- }
-
- private void updateCamera() {
-
- Camera camera = config.getCamera();
-
- Vec dir = camera.getDir();
- Vec target = camera.getTarget();
- Vec camX = camera.getX();
- Vec camY = camera.getY();
- Vec orig = camera.getOrig();
-
- vsub(dir, target, orig);
- vnorm(dir);
-
- Vec up = Vec.create().setX(0).setY(1).setZ(0);
- vxcross(camX, dir, up);
- vnorm(camX);
- vmul(camX, config.getWidth() * .5135f / config.getHeight(), camX);
-
- vxcross(camY, camX, dir);
- vnorm(camY);
- vmul(camY, .5135f, camY);
- }
-
-
- public static void main(String[] args) {
-
- RenderingConfig config = RenderingConfig.create()
- .setWidth(640).setHeight(480)
- .setEnableShadow(1)
- .setSuperSamplingSize(2)
- .setActvateFastRendering(1)
- .setMaxIterations(9)
- .setEpsilon(0.003f * 0.75f)
- .setLight(new float[] {5, 10, 15})
- .setMu(new float[] {-0.2f, 0.4f, -0.4f, -0.4f});
-
- config.getCamera().getOrig() .setX(1).setY(2).setZ(8);
- config.getCamera().getTarget().setX(0).setY(0).setZ(0);
-
- final Julia3d julia3d = new Julia3d(config);
-
- SwingUtilities.invokeLater(new Runnable() {
- public void run() {
- new Renderer(julia3d);
- }
- });
- }
-
- Buffer getPixelBuffer() {
- return pixelBuffer.getBuffer();
- }
-
-
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/Renderer.java b/src/com/mbien/opencl/demos/julia3d/Renderer.java
deleted file mode 100644
index 9266aba..0000000
--- a/src/com/mbien/opencl/demos/julia3d/Renderer.java
+++ /dev/null
@@ -1,203 +0,0 @@
-package com.mbien.opencl.demos.julia3d;
-
-import com.mbien.opencl.demos.julia3d.structs.RenderingConfig;
-import com.jogamp.opengl.util.awt.TextRenderer;
-import java.awt.Dimension;
-import java.awt.Font;
-import java.nio.FloatBuffer;
-import java.util.Timer;
-import java.util.TimerTask;
-import javax.media.opengl.GL2;
-import javax.media.opengl.GLAutoDrawable;
-import javax.media.opengl.GLCapabilities;
-import javax.media.opengl.GLEventListener;
-import javax.media.opengl.GLProfile;
-import javax.media.opengl.awt.GLCanvas;
-import javax.swing.JFrame;
-
-import static com.jogamp.common.nio.Buffers.*;
-import static javax.media.opengl.GL2.*;
-import static java.lang.String.*;
-
-/**
- * JOGL renderer for displaying the julia set.
- * @author Michael Bien
- */
-public class Renderer implements GLEventListener {
-
- public final static int MU_RECT_SIZE = 80;
-
- private final Julia3d julia3d;
- private final GLCanvas canvas;
- private final RenderingConfig config;
- private final FloatBuffer juliaSlice;
- private final UserSceneController usi;
- private final TextRenderer textRenderer;
-
- private TimerTask task;
- private final Timer timer;
-
- public Renderer(Julia3d julia3d) {
- this.julia3d = julia3d;
- this.config = julia3d.config;
-
- timer = new Timer();
-
- juliaSlice = newDirectFloatBuffer(MU_RECT_SIZE * MU_RECT_SIZE * 4);
-
- canvas = new GLCanvas(new GLCapabilities(GLProfile.get(GLProfile.GL2)));
- canvas.addGLEventListener(this);
-
- usi = new UserSceneController();
- usi.init(this, canvas, config);
-
- JFrame frame = new JFrame("Java OpenCL - Julia3D GPU");
- frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
- canvas.setPreferredSize(new Dimension(config.getWidth(), config.getHeight()));
- frame.add(canvas);
- frame.pack();
-
- textRenderer = new TextRenderer(frame.getFont().deriveFont(Font.BOLD, 14), true, true, null, false);
-
- frame.setVisible(true);
- }
-
- public void init(GLAutoDrawable drawable) {
- drawable.getGL().getGL2().glMatrixMode(GL_PROJECTION);
- }
-
- void update() {
- julia3d.update(false);
- canvas.display();
- }
-
- public void display(GLAutoDrawable drawable) {
-
- //compute
- julia3d.compute(config.getActvateFastRendering() == 1);
-
- GL2 gl = drawable.getGL().getGL2();
- gl.glClear(GL_COLOR_BUFFER_BIT);
-
- // draw julia set
- gl.glRasterPos2i(0, 0);
- gl.glDrawPixels(config.getWidth(), config.getHeight(), GL_RGB, GL_FLOAT, julia3d.getPixelBuffer());
-
-
- // Draw Mu constant
- int width = config.getWidth();
- int height = config.getHeight();
- float[] mu = config.getMu();
-
- gl.glEnable(GL_BLEND);
- gl.glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
- int baseMu1 = width - MU_RECT_SIZE - 2;
- int baseMu2 = 1;
- drawJuliaSlice(gl, baseMu1, baseMu2, mu[0], mu[1]);
- int baseMu3 = width - MU_RECT_SIZE - 2;
- int baseMu4 = MU_RECT_SIZE + 2;
- drawJuliaSlice(gl, baseMu3, baseMu4, mu[2], mu[3]);
- gl.glDisable(GL_BLEND);
-
- gl.glColor3f(1, 1, 1);
- int mu1 = (int) (baseMu1 + MU_RECT_SIZE * (mu[0] + 1.5f) / 3.f);
- int mu2 = (int) (baseMu2 + MU_RECT_SIZE * (mu[1] + 1.5f) / 3.f);
- gl.glBegin(GL_LINES);
- gl.glVertex2i(mu1 - 4, mu2);
- gl.glVertex2i(mu1 + 4, mu2);
- gl.glVertex2i(mu1, mu2 - 4);
- gl.glVertex2i(mu1, mu2 + 4);
- gl.glEnd();
-
- int mu3 = (int) (baseMu3 + MU_RECT_SIZE * (mu[2] + 1.5f) / 3.f);
- int mu4 = (int) (baseMu4 + MU_RECT_SIZE * (mu[3] + 1.5f) / 3.f);
- gl.glBegin(GL_LINES);
- gl.glVertex2i(mu3 - 4, mu4);
- gl.glVertex2i(mu3 + 4, mu4);
- gl.glVertex2i(mu3, mu4 - 4);
- gl.glVertex2i(mu3, mu4 + 4);
- gl.glEnd();
-
- // info text
- textRenderer.beginRendering(width, height);
- textRenderer.draw(format("Epsilon %.5f - Max. Iter. %d", config.getEpsilon(), config.getMaxIterations()), 8, 10);
- textRenderer.draw(format("Mu = (%.3f, %.3f, %.3f, %.3f)", mu[0], mu[1], mu[2], mu[3]), 8, 25);
- textRenderer.draw(format("Shadow %s - SuperSampling %dx%d - Fast rendering %s",
- config.getEnableShadow() == 1 ? "on" : "off",
- config.getSuperSamplingSize(), config.getSuperSamplingSize(),
- config.getActvateFastRendering() == 1 ? "on" : "off"), 8, 40);
- textRenderer.endRendering();
-
- // timer task scheduling, delay gpu intensive high quality rendering
- if(task != null) {
- task.cancel();
- }
- if(config.getActvateFastRendering() == 1) {
- task = new TimerTask() {
- @Override
- public void run() {
- config.setActvateFastRendering(0);
- update();
- config.setActvateFastRendering(1);
- }
- };
- timer.schedule(task, 2000);
- }
- }
-
- private void drawJuliaSlice(GL2 gl, int origX, int origY, float cR, float cI) {
-
- int index = 0;
- float invSize = 3.0f / MU_RECT_SIZE;
- for (int i = 0; i < MU_RECT_SIZE; ++i) {
- for (int j = 0; j < MU_RECT_SIZE; ++j) {
-
- float x = i * invSize - 1.5f;
- float y = j * invSize - 1.5f;
-
- int iter;
- for (iter = 0; iter < 64; ++iter) {
- float x2 = x * x;
- float y2 = y * y;
- if (x2 + y2 > 4.0f) {
- break;
- }
-
- float newx = x2 - y2 + cR;
- float newy = 2.f * x * y + cI;
- x = newx;
- y = newy;
- }
-
- juliaSlice.put(index++, iter / 64.0f);
- juliaSlice.put(index++, 0.0f);
- juliaSlice.put(index++, 0.0f);
- juliaSlice.put(index++, 0.5f);
- }
- }
-
- gl.glRasterPos2i(origX, origY);
- gl.glDrawPixels(MU_RECT_SIZE, MU_RECT_SIZE, GL_RGBA, GL_FLOAT, juliaSlice);
- }
-
-
- public void reshape(GLAutoDrawable drawable, int x, int y, int newWidth, int newHeight) {
-
- config.setWidth(newWidth);
- config.setHeight(newHeight);
-
- GL2 gl = drawable.getGL().getGL2();
-
- gl.glViewport(0, 0, newWidth, newHeight);
- gl.glLoadIdentity();
- gl.glOrtho(-0.5f, newWidth - 0.5f, -0.5f, newHeight - 0.5f, -1.0f, 1.0f);
-
- julia3d.update(true);
-
- }
-
- public void dispose(GLAutoDrawable drawable) {
- }
-
-
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/UserSceneController.java b/src/com/mbien/opencl/demos/julia3d/UserSceneController.java
deleted file mode 100644
index 849de3a..0000000
--- a/src/com/mbien/opencl/demos/julia3d/UserSceneController.java
+++ /dev/null
@@ -1,249 +0,0 @@
-package com.mbien.opencl.demos.julia3d;
-
-import com.mbien.opencl.demos.julia3d.structs.RenderingConfig;
-import com.mbien.opencl.demos.julia3d.structs.Vec;
-import java.awt.Component;
-import java.awt.Point;
-import java.awt.event.KeyAdapter;
-import java.awt.event.KeyEvent;
-import java.awt.event.MouseAdapter;
-import java.awt.event.MouseEvent;
-import java.awt.event.MouseWheelEvent;
-
-import static java.lang.Math.*;
-import static com.mbien.opencl.demos.julia3d.Renderer.*;
-
-/**
- * Utility class for interacting with a scene. Supports rotation and zoom around origin.
- * @author Michael Bien
- */
-public class UserSceneController {
-
- private Point dragstart;
- private RenderingConfig model;
- private Renderer view;
-
- private enum MOUSE_MODE { DRAG_ROTATE, DRAG_ZOOM }
- private MOUSE_MODE dragmode = MOUSE_MODE.DRAG_ROTATE;
-
-
- public void init(Renderer view, Component component, RenderingConfig model) {
- initMouseListeners(component);
- this.view = view;
- this.model = model;
- }
-
- private void initMouseListeners(Component component) {
-
- MouseAdapter mouseAdapter = new MouseAdapter() {
- @Override
- public void mouseDragged(MouseEvent e) {
-
- int x = e.getX();
- int y = e.getY();
-
- switch (dragmode) {
- case DRAG_ROTATE:
- if (dragstart != null) {
- int height = model.getHeight();
- int width = model.getWidth();
-
- int ry = height - y - 1;
- int baseMu1 = width - MU_RECT_SIZE - 2;
- int baseMu2 = 1;
- int baseMu3 = width - MU_RECT_SIZE - 2;
- int baseMu4 = MU_RECT_SIZE + 2;
-
- if ((x >= baseMu1 && x <= baseMu1 + MU_RECT_SIZE) && (ry >= baseMu2 && ry <= baseMu2 + MU_RECT_SIZE)) {
- float[] mu = model.getMu();
- mu[0] = 3.f * ( x - baseMu1) / (float)MU_RECT_SIZE - 1.5f;
- mu[1] = 3.f * (ry - baseMu2) / (float)MU_RECT_SIZE - 1.5f;
- model.setMu(mu);
- } else if ((x >= baseMu3 && x <= baseMu3 + MU_RECT_SIZE) && (ry >= baseMu4 && ry <= baseMu4 + MU_RECT_SIZE)) {
- float[] mu = model.getMu();
- mu[2] = 3.f * ( x - baseMu3) / (float)MU_RECT_SIZE - 1.5f;
- mu[3] = 3.f * (ry - baseMu4) / (float)MU_RECT_SIZE - 1.5f;
- model.setMu(mu);
- } else {
- rotateCameraYbyOrig(0.01f * (x - dragstart.getX()));
- rotateCameraXbyOrig(0.01f * (y - dragstart.getY()));
- }
- }
- dragstart = e.getPoint();
- view.update();
- break;
- case DRAG_ZOOM:
- if (dragstart != null) {
- float zoom = (float) ((y - dragstart.getY()) / 10.0f);
- zoom(zoom);
- }
- dragstart = e.getPoint();
- view.update();
- break;
- }
-
- }
-
- @Override
- public void mousePressed(MouseEvent e) {
- switch (e.getButton()) {
- case (MouseEvent.BUTTON1):
- dragmode = MOUSE_MODE.DRAG_ROTATE;
- break;
- case (MouseEvent.BUTTON2):
- dragmode = MOUSE_MODE.DRAG_ZOOM;
- break;
- case (MouseEvent.BUTTON3):
- dragmode = MOUSE_MODE.DRAG_ZOOM;
- break;
- }
- }
-
- @Override
- public void mouseReleased(MouseEvent e) {
- switch (e.getButton()) {
- case (MouseEvent.BUTTON1):
- dragmode = MOUSE_MODE.DRAG_ZOOM;
- break;
- case (MouseEvent.BUTTON2):
- dragmode = MOUSE_MODE.DRAG_ROTATE;
- break;
- case (MouseEvent.BUTTON3):
- dragmode = MOUSE_MODE.DRAG_ROTATE;
- break;
- }
-
- dragstart = null;
- }
-
- @Override
- public void mouseWheelMoved(MouseWheelEvent e) {
- float zoom = e.getWheelRotation() * 0.1f;
- zoom(zoom);
- view.update();
- }
-
- };
-
- KeyAdapter keyAdapter = new KeyAdapter() {
-
- @Override
- public void keyPressed(KeyEvent e) {
-
- switch (e.getKeyChar()) {
- case 'l':
- model.setEnableShadow(model.getEnableShadow()==0 ? 1 : 0);
- break;
- case '1':
- model.setEpsilon(model.getEpsilon() * 0.75f);
- break;
- case '2':
- model.setEpsilon(model.getEpsilon() * 1.f / 0.75f);
- break;
- case '3':
- model.setMaxIterations(max(1, model.getMaxIterations() -1));
- break;
- case '4':
- model.setMaxIterations(min(12, model.getMaxIterations()+1));
- break;
- case '5':
- model.setSuperSamplingSize(max(1, model.getSuperSamplingSize() -1));
- break;
- case '6':
- model.setSuperSamplingSize(min(5, model.getSuperSamplingSize() +1));
- break;
- default:
- break;
- }
- view.update();
-
- }
-
- };
-
- component.addKeyListener(keyAdapter);
-
- component.addMouseListener(mouseAdapter);
- component.addMouseMotionListener(mouseAdapter);
- component.addMouseWheelListener(mouseAdapter);
-
- }
- private void zoom(float zoom) {
- Vec orig = model.getCamera().getOrig();
- orig.setX(orig.getX()+zoom)
- .setY(orig.getY()+zoom)
- .setZ(orig.getZ()+zoom);
- }
-
- private void rotateLightX(float k) {
- float[] light = model.getLight();
- float y = light[1];
- float z = light[2];
- light[1] = (float) ( y * cos(k) + z * sin(k));
- light[2] = (float) (-y * sin(k) + z * cos(k));
- model.setLight(light);
- }
-
- private void rotateLightY(float k) {
- float[] light = model.getLight();
- float x = light[0];
- float z = light[2];
- light[0] = (float) (x * cos(k) - z * sin(k));
- light[2] = (float) (x * sin(k) + z * cos(k));
- model.setLight(light);
- }
-
- private void rotateCameraXbyOrig(double k) {
- Vec orig = model.getCamera().getOrig();
- float y = orig.getY();
- float z = orig.getZ();
- orig.setY((float) ( y * cos(k) + z * sin(k)));
- orig.setZ((float) (-y * sin(k) + z * cos(k)));
- }
-
- private void rotateCameraYbyOrig(double k) {
- Vec orig = model.getCamera().getOrig();
- float x = orig.getX();
- float z = orig.getZ();
- orig.setX((float) (x * cos(k) - z * sin(k)));
- orig.setZ((float) (x * sin(k) + z * cos(k)));
- }
-
-
- public final static void vadd(Vec v, Vec a, Vec b) {
- v.setX(a.getX() + b.getX());
- v.setY(a.getY() + b.getY());
- v.setZ(a.getZ() + b.getZ());
- }
-
- public final static void vsub(Vec v, Vec a, Vec b) {
- v.setX(a.getX() - b.getX());
- v.setY(a.getY() - b.getY());
- v.setZ(a.getZ() - b.getZ());
- }
-
- public final static void vmul(Vec v, float s, Vec b) {
- v.setX(s * b.getX());
- v.setY(s * b.getY());
- v.setZ(s * b.getZ());
- }
-
- public final static float vdot(Vec a, Vec b) {
- return a.getX() * b.getX()
- + a.getY() * b.getY()
- + a.getZ() * b.getZ();
- }
-
- public final static void vnorm(Vec v) {
- float s = (float) (1.0f / sqrt(vdot(v, v)));
- vmul(v, s, v);
- }
-
- public final static void vxcross(Vec v, Vec a, Vec b) {
- v.setX(a.getY() * b.getZ() - a.getZ() * b.getY());
- v.setY(a.getZ() * b.getX() - a.getX() * b.getZ());
- v.setZ(a.getX() * b.getY() - a.getY() * b.getX());
- }
-
-
-} \ No newline at end of file
diff --git a/src/com/mbien/opencl/demos/julia3d/config.h b/src/com/mbien/opencl/demos/julia3d/config.h
deleted file mode 100644
index 72df3ff..0000000
--- a/src/com/mbien/opencl/demos/julia3d/config.h
+++ /dev/null
@@ -1,24 +0,0 @@
-
-typedef struct {
- float x, y, z; // position, also color (r,g,b)
-} Vec;
-
-typedef struct {
- /* User defined values */
- Vec orig, target;
- /* Calculated values */
- Vec dir, x, y;
-} Camera;
-
-typedef struct {
- unsigned int width, height;
- int superSamplingSize;
- int actvateFastRendering;
- int enableShadow;
-
- unsigned int maxIterations;
- float epsilon;
- float mu[4];
- float light[3];
- Camera camera;
-} RenderingConfig;
diff --git a/src/com/mbien/opencl/demos/julia3d/mandelbrot_kernel.cl b/src/com/mbien/opencl/demos/julia3d/mandelbrot_kernel.cl
deleted file mode 100644
index d5acd02..0000000
--- a/src/com/mbien/opencl/demos/julia3d/mandelbrot_kernel.cl
+++ /dev/null
@@ -1,357 +0,0 @@
-/*
-Copyright (c) 2009 David Bucciarelli ([email protected])
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice shall be included
-in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#define GPU_KERNEL
-
-
-typedef struct {
- float x, y, z; // position, also color (r,g,b)
-} Vec;
-
-typedef struct {
- /* User defined values */
- Vec orig, target;
- /* Calculated values */
- Vec dir, x, y;
-} Camera;
-
-typedef struct {
- unsigned int width, height;
- int superSamplingSize;
- int actvateFastRendering;
- int enableShadow;
-
- unsigned int maxIterations;
- float epsilon;
- float mu[4];
- float light[3];
- Camera camera;
-} RenderingConfig;
-
-#define BOUNDING_RADIUS_2 4.f
-
-// Scalar derivative approach by Enforcer:
-// http://www.fractalforums.com/mandelbulb-implementation/realtime-renderingoptimisations/
-static float IterateIntersect(const float4 z0, const float4 c0, const uint maxIterations) {
- float4 z = z0;
- float4 c = c0;
-
- float dr = 1.0f;
- float r2 = dot(z, z);
- float r = sqrt(r2);
- for (int n = 0; (n < maxIterations) && (r < 2.f); ++n) {
- const float zo0 = asin(z.z / r);
- const float zi0 = atan2(z.y, z.x);
- float zr = r2 * r2 * r2 * r;
- const float zo = zo0 * 7.f;
- const float zi = zi0 * 7.f;
- const float czo = cos(zo);
-
- dr = zr * dr * 7.f + 1.f;
- zr *= r;
-
- z = zr * (float4)(czo * cos(zi), czo * sin(zi), sin(zo), 0.f);
- z += c;
-
- r2 = dot(z, z);
- r = sqrt(r2);
- }
-
- return 0.5f * log(r) * r / dr;
-}
-
-static float IntersectBulb(const float4 eyeRayOrig, const float4 eyeRayDir,
- const float4 c, const uint maxIterations, const float epsilon,
- const float maxDist, float4 *hitPoint, uint *steps) {
- float dist;
- float4 r0 = eyeRayOrig;
- float distDone = 0.f;
-
- uint s = 0;
- do {
- dist = IterateIntersect(r0, c, maxIterations);
- distDone += dist;
- // We are inside
- if (dist <= 0.f)
- break;
-
- r0 += eyeRayDir * dist;
- s++;
- } while ((dist > epsilon) && (distDone < maxDist));
-
- *hitPoint = r0;
- *steps = s;
- return dist;
-}
-
-#define WORLD_RADIUS 1000.f
-#define WORLD_CENTER ((float4)(0.f, -WORLD_RADIUS - 2.f, 0.f, 0.f))
-float IntersectFloorSphere(const float4 eyeRayOrig, const float4 eyeRayDir) {
- const float4 op = WORLD_CENTER - eyeRayOrig;
- const float b = dot(op, eyeRayDir);
- float det = b * b - dot(op, op) + WORLD_RADIUS * WORLD_RADIUS;
-
- if (det < 0.f)
- return -1.f;
- else
- det = sqrt(det);
-
- float t = b - det;
- if (t > 0.f)
- return t;
- else {
- // We are inside, avoid the hit
- return -1.f;
- }
-}
-
-int IntersectBoundingSphere(const float4 eyeRayOrig, const float4 eyeRayDir,
- float *tmin, float*tmax) {
- const float4 op = -eyeRayOrig;
- const float b = dot(op, eyeRayDir);
- float det = b * b - dot(op, op) + BOUNDING_RADIUS_2;
-
- if (det < 0.f)
- return 0;
- else
- det = sqrt(det);
-
- float t1 = b - det;
- float t2 = b + det;
- if (t1 > 0.f) {
- *tmin = t1;
- *tmax = t2;
- return 1;
- } else {
- if (t2 > 0.f) {
- // We are inside, start from the ray origin
- *tmin = 0.f;
- *tmax = t2;
-
- return 1;
- } else
- return 0;
- }
-}
-
-static float4 NormEstimate(const float4 p, const float4 c,
- const float delta, const uint maxIterations) {
- const float4 qP = p;
- const float4 gx1 = qP - (float4)(delta, 0.f, 0.f, 0.f);
- const float4 gx2 = qP + (float4)(delta, 0.f, 0.f, 0.f);
- const float4 gy1 = qP - (float4)(0.f, delta, 0.f, 0.f);
- const float4 gy2 = qP + (float4)(0.f, delta, 0.f, 0.f);
- const float4 gz1 = qP - (float4)(0.f, 0.f, delta, 0.f);
- const float4 gz2 = qP + (float4)(0.f, 0.f, delta, 0.f);
-
- const float gradX = length(IterateIntersect(gx2, c, maxIterations)) -
- length(IterateIntersect(gx1, c, maxIterations));
- const float gradY = length(IterateIntersect(gy2, c, maxIterations)) -
- length(IterateIntersect(gy1, c, maxIterations));
- const float gradZ = length(IterateIntersect(gz2, c, maxIterations)) -
- length(IterateIntersect(gz1, c, maxIterations));
-
- const float4 N = normalize((float4)(gradX, gradY, gradZ, 0.f));
-
- return N;
-}
-
-static float4 Phong(const float4 light, const float4 eye, const float4 pt,
- const float4 N, const float4 diffuse) {
- const float4 ambient = (float4) (0.05f, 0.05f, 0.05f, 0.f);
- float4 L = normalize(light - pt);
- float NdotL = dot(N, L);
- if (NdotL < 0.f)
- return diffuse * ambient;
-
- const float specularExponent = 30.f;
- const float specularity = 0.65f;
-
- float4 E = normalize(eye - pt);
- float4 H = (L + E) * (float)0.5f;
-
- return diffuse * NdotL +
- specularity * pow(dot(N, H), specularExponent) +
- diffuse * ambient;
-}
-
-__kernel void MandelbulbGPU(
- __global float *pixels,
- const __global RenderingConfig *config,
- const int enableAccumulation,
- const float sampleX,
- const float sampleY) {
- const int gid = get_global_id(0);
- const unsigned width = config->width;
- const unsigned height = config->height;
-
- const unsigned int x = gid % width;
- const int y = gid / width;
-
- // Check if we have to do something
- if (y >= height)
- return;
-
- const float epsilon = config->actvateFastRendering ? (config->epsilon * (1.5f / 0.75f)) : config->epsilon;
- const uint maxIterations = config->actvateFastRendering ? (max(3u, config->maxIterations) - 2u) : config->maxIterations;
-
- const float4 mu = (float4)(config->mu[0], config->mu[1], config->mu[2], config->mu[3]);
- const float4 light = (float4)(config->light[0], config->light[1], config->light[2], 0.f);
- const __global Camera *camera = &config->camera;
-
- //--------------------------------------------------------------------------
- // Calculate eye ray
- //--------------------------------------------------------------------------
-
- const float invWidth = 1.f / width;
- const float invHeight = 1.f / height;
- const float kcx = (x + sampleX) * invWidth - .5f;
- const float4 kcx4 = (float4)kcx;
- const float kcy = (y + sampleY) * invHeight - .5f;
- const float4 kcy4 = (float4)kcy;
-
- const float4 cameraX = (float4)(camera->x.x, camera->x.y, camera->x.z, 0.f);
- const float4 cameraY = (float4)(camera->y.x, camera->y.y, camera->y.z, 0.f);
- const float4 cameraDir = (float4)(camera->dir.x, camera->dir.y, camera->dir.z, 0.f);
- const float4 cameraOrig = (float4)(camera->orig.x, camera->orig.y, camera->orig.z, 0.f);
-
- const float4 eyeRayDir = normalize(cameraX * kcx4 + cameraY * kcy4 + cameraDir);
- const float4 eyeRayOrig = eyeRayDir * (float4)0.1f + cameraOrig;
-
- //--------------------------------------------------------------------------
- // Check if we hit the bounding sphere
- //--------------------------------------------------------------------------
-
- int useAO = 1;
- float4 diffuse, n, color;
-
- float4 hitPoint;
- float dist, tmin, tmax;
- if (IntersectBoundingSphere(eyeRayOrig, eyeRayDir, &tmin, &tmax)) {
- //--------------------------------------------------------------------------
- // Find the intersection with the set
- //--------------------------------------------------------------------------
-
- uint steps;
- float4 rayOrig = eyeRayOrig + eyeRayDir * (float4)tmin;
- dist = IntersectBulb(rayOrig, eyeRayDir, mu, maxIterations,
- epsilon, tmax - tmin, &hitPoint, &steps);
-
- if (dist <= epsilon) {
- // Set hit
- diffuse = (float4) (1.f, 0.35f, 0.15f, 0.f);
- n = NormEstimate(hitPoint, mu, dist, maxIterations);
- } else
- dist = -1.f;
- } else
- dist = -1.f;
-
- //--------------------------------------------------------------------------
- // Check if we hit the floor
- //--------------------------------------------------------------------------
-
- if (dist < 0.f) {
- dist = IntersectFloorSphere(eyeRayOrig, eyeRayDir);
-
- if (dist >= 0.f) {
- // Floor hit
- hitPoint = eyeRayOrig + eyeRayDir * (float4)dist;
- n = hitPoint - WORLD_CENTER;
- n = normalize(n);
- // The most important feature in a ray tracer: a checker texture !
- const int ix = (hitPoint.x > 0.f) ? hitPoint.x : (1.f - hitPoint.x);
- const int iz = (hitPoint.z > 0.f) ? hitPoint.z : (1.f - hitPoint.z);
- if ((ix + iz) % 2)
- diffuse = (float4) (0.75f, 0.75f, 0.75f, 0.f);
- else
- diffuse = (float4) (0.75f, 0.f, 0.f, 0.f);
- useAO = 0;
- } else {
- // Sky hit
- color = (float4)(0.f, 0.1f, 0.3f, 0.f);
- }
- } else {
- // Sky hit
- color = (float4)(0.f, 0.1f, 0.3f, 0.f);
- }
-
- //--------------------------------------------------------------------------
- // Select the shadow pass
- //--------------------------------------------------------------------------
-
- if (dist >= 0.f) {
- float shadowFactor = 1.f;
- if (config->enableShadow) {
- float4 L = normalize(light - hitPoint);
- float4 rO = hitPoint + n * 1e-2f;
- float4 shadowHitPoint;
-
- // Check bounding sphere
- if (IntersectBoundingSphere(rO, L, &tmin, &tmax)) {
- float shadowDistSet = tmin;
- uint steps;
-
- rO = rO + L * (float4)shadowDistSet;
- shadowDistSet = IntersectBulb(rO, L, mu, maxIterations, epsilon,
- tmax - tmin, &shadowHitPoint, &steps);
- if (shadowDistSet < epsilon) {
- if (useAO) {
- // Use steps count to simulate ambient occlusion
- shadowFactor = 0.6f - min(steps / 255.f, 0.5f);
- } else
- shadowFactor = 0.6f;
- }
- }
- }
-
- //--------------------------------------------------------------------------
- // Direct lighting of hit point
- //--------------------------------------------------------------------------
-
- color = Phong(light, eyeRayOrig, hitPoint, n, diffuse) * shadowFactor;
- }
-
- //--------------------------------------------------------------------------
- // Write pixel
- //--------------------------------------------------------------------------
-
- int offset = 3 * (x + y * width);
- color = clamp(color, (float4)(0.f, 0.f ,0.f, 0.f), (float4)(1.f, 1.f ,1.f, 0.f));
- if (enableAccumulation) {
- pixels[offset++] += color.s0;
- pixels[offset++] += color.s1;
- pixels[offset] += color.s2;
- } else {
- pixels[offset++] = color.s0;
- pixels[offset++] = color.s1;
- pixels[offset] = color.s2;
- }
-}
-
-kernel void multiply(global float *array, const int numElements, const float s) {
- const int gid = get_global_id(0);
- if (gid >= numElements) {
- return;
- }
- array[gid] *= s;
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/rendering_kernel.cl b/src/com/mbien/opencl/demos/julia3d/rendering_kernel.cl
deleted file mode 100644
index 9c25c1b..0000000
--- a/src/com/mbien/opencl/demos/julia3d/rendering_kernel.cl
+++ /dev/null
@@ -1,382 +0,0 @@
-/*
-Copyright (c) 2009 David Bucciarelli ([email protected])
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice shall be included
-in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#define GPU_KERNEL
-
-
-#define BOUNDING_RADIUS_2 4.f
-#define ESCAPE_THRESHOLD 1e1f
-#define DELTA 1e-4f
-
-typedef struct {
- float x, y, z; // position, also color (r,g,b)
-} Vec;
-
-typedef struct {
- Vec orig, target;
- Vec dir, x, y;
-} Camera;
-
-typedef struct {
- unsigned int width, height;
- int superSamplingSize;
- int actvateFastRendering;
- int enableShadow;
-
- unsigned int maxIterations;
- float epsilon;
- float mu[4];
- float light[3];
- Camera camera;
-} RenderingConfig;
-
-
-static float4 QuatMult(const float4 q1, const float4 q2) {
- float4 r;
-
- // a1a2 - b1b2 - c1c2 - d1d2
- r.x = q1.x * q2.x - q1.y * q2.y - q1.z * q2.z - q1.w * q2.w;
- // a1b2 + b1a2 + c1d2 - d1c2
- r.y = q1.x * q2.y + q1.y * q2.x + q1.z * q2.w - q1.w * q2.z;
- // a1c2 - b1d2 + c1a2 + d1b2
- r.z = q1.x * q2.z - q1.y * q2.w + q1.z * q2.x + q1.w * q2.y;
- // a1d2 + b1c2 - c1b2 + d1a2
- r.w = q1.x * q2.w + q1.y * q2.z - q1.z * q2.y + q1.w * q2.x;
-
- return r;
-}
-
-static float4 QuatSqr(const float4 q) {
- float4 r;
-
- r.x = q.x * q.x - q.y * q.y - q.z * q.z - q.w * q.w;
- r.y = 2.f * q.x * q.y;
- r.z = 2.f * q.x * q.z;
- r.w = 2.f * q.x * q.w;
-
- return r;
-}
-
-static void IterateIntersect(float4 *q, float4 *qp, const float4 c, const uint maxIterations) {
- float4 q0 = *q;
- float4 qp0 = *qp;
-
- for (uint i = 0; i < maxIterations; ++i) {
- qp0 = 2.f * QuatMult(q0, qp0);
- q0 = QuatSqr(q0) + c;
-
- if (dot(q0, q0) > ESCAPE_THRESHOLD)
- break;
- }
-
- *q = q0;
- *qp = qp0;
-}
-
-static float IntersectJulia(const float4 eyeRayOrig, const float4 eyeRayDir,
- const float4 c, const uint maxIterations, const float epsilon,
- float4 *hitPoint, uint *steps) {
- float dist;
- float4 r0 = eyeRayOrig;
-
- uint s = 0;
- do {
- float4 z = r0;
- float4 zp = (float4) (1.f, 0.f, 0.f, 0.f);
-
- IterateIntersect(&z, &zp, c, maxIterations);
-
- const float normZP = length(zp);
-
- // We are inside
- if (normZP == 0.f)
- break;
-
- const float normZ = length(z);
- dist = 0.5f * normZ * log(normZ) / normZP;
-
- r0 += eyeRayDir * dist;
- s++;
- } while ((dist > epsilon) && (dot(r0, r0) < BOUNDING_RADIUS_2));
-
- *hitPoint = r0;
- *steps = s;
- return dist;
-}
-
-#define WORLD_RADIUS 1000.f
-#define WORLD_CENTER ((float4)(0.f, -WORLD_RADIUS - 2.f, 0.f, 0.f))
-
-float IntersectFloorSphere(const float4 eyeRayOrig, const float4 eyeRayDir) {
- const float4 op = WORLD_CENTER - eyeRayOrig;
- const float b = dot(op, eyeRayDir);
- float det = b * b - dot(op, op) + WORLD_RADIUS * WORLD_RADIUS;
-
- if (det < 0.f)
- return -1.f;
- else
- det = sqrt(det);
-
- float t = b - det;
- if (t > 0.f)
- return t;
- else {
- // We are inside, avoid the hit
- return -1.f;
- }
-}
-
-float IntersectBoundingSphere(const float4 eyeRayOrig, const float4 eyeRayDir) {
- const float4 op = -eyeRayOrig;
- const float b = dot(op, eyeRayDir);
- float det = b * b - dot(op, op) + BOUNDING_RADIUS_2;
-
- if (det < 0.f)
- return -1.f;
- else
- det = sqrt(det);
-
- float t = b - det;
- if (t > 0.f)
- return t;
- else {
- t = b + det;
-
- if (t > 0.f) {
- // We are inside, start from the ray origin
- return 0.0f;
- } else
- return -1.f;
- }
-}
-
-static float4 NormEstimate(const float4 p, const float4 c,
- const float delta, const uint maxIterations) {
- float4 N;
- float4 qP = p;
- float gradX, gradY, gradZ;
-
- float4 gx1 = qP - (float4) (DELTA, 0.f, 0.f, 0.f);
- float4 gx2 = qP + (float4) (DELTA, 0.f, 0.f, 0.f);
- float4 gy1 = qP - (float4) (0.f, DELTA, 0.f, 0.f);
- float4 gy2 = qP + (float4) (0.f, DELTA, 0.f, 0.f);
- float4 gz1 = qP - (float4) (0.f, 0.f, DELTA, 0.f);
- float4 gz2 = qP + (float4) (0.f, 0.f, DELTA, 0.f);
-
- for (uint i = 0; i < maxIterations; ++i) {
- gx1 = QuatSqr(gx1) + c;
- gx2 = QuatSqr(gx2) + c;
- gy1 = QuatSqr(gy1) + c;
- gy2 = QuatSqr(gy2) + c;
- gz1 = QuatSqr(gz1) + c;
- gz2 = QuatSqr(gz2) + c;
- }
-
- gradX = length(gx2) - length(gx1);
- gradY = length(gy2) - length(gy1);
- gradZ = length(gz2) - length(gz1);
-
- N = normalize((float4) (gradX, gradY, gradZ, 0.f));
-
- return N;
-}
-
-static float4 Phong(const float4 light, const float4 eye, const float4 pt, const float4 N, const float4 diffuse) {
-
- const float4 ambient = (float4) (0.05f, 0.05f, 0.05f, 0.f);
- float4 L = normalize(light - pt);
- float NdotL = dot(N, L);
- if (NdotL < 0.f)
- return diffuse * ambient;
-
- const float specularExponent = 30.f;
- const float specularity = 0.65f;
-
- float4 E = normalize(eye - pt);
- float4 H = (L + E) * (float) 0.5f;
-
- return diffuse * NdotL +
- specularity * pow(dot(N, H), specularExponent) +
- diffuse * ambient;
-}
-
-kernel void JuliaGPU( global float *pixels,
- const global RenderingConfig *config,
- int enableAccumulation,
- float sampleX,
- float sampleY ) {
-
- const int gid = get_global_id(0);
- unsigned width = config->width;
- unsigned height = config->height;
-
- const unsigned int x = gid % width;
- const int y = gid / width;
-
- // Check if we have to do something
- if (y >= height)
- return;
-
- const float epsilon = config->actvateFastRendering ? (config->epsilon * (1.f / 0.75f)) : config->epsilon;
- const uint maxIterations = max(1u, config->actvateFastRendering ? (config->maxIterations - 1) : config->maxIterations);
-
- const float4 mu = (float4)(config->mu[0], config->mu[1], config->mu[2], config->mu[3]);
- const float4 light = (float4) (config->light[0], config->light[1], config->light[2], 0.f);
- const global Camera *camera = &config->camera;
-
- //--------------------------------------------------------------------------
- // Calculate eye ray
- //--------------------------------------------------------------------------
-
- const float invWidth = 1.f / width;
- const float invHeight = 1.f / height;
- const float kcx = (x + sampleX) * invWidth - .5f;
- const float4 kcx4 = (float4) kcx;
- const float kcy = (y + sampleY) * invHeight - .5f;
- const float4 kcy4 = (float4) kcy;
-
- const float4 cameraX = (float4) (camera->x.x, camera->x.y, camera->x.z, 0.f);
- const float4 cameraY = (float4) (camera->y.x, camera->y.y, camera->y.z, 0.f);
- const float4 cameraDir = (float4) (camera->dir.x, camera->dir.y, camera->dir.z, 0.f);
- const float4 cameraOrig = (float4) (camera->orig.x, camera->orig.y, camera->orig.z, 0.f);
-
- const float4 eyeRayDir = normalize(cameraX * kcx4 + cameraY * kcy4 + cameraDir);
- const float4 eyeRayOrig = eyeRayDir * (float4) 0.1f + cameraOrig;
-
- //--------------------------------------------------------------------------
- // Check if we hit the bounding sphere
- //--------------------------------------------------------------------------
-
- float distSet = IntersectBoundingSphere(eyeRayOrig, eyeRayDir);
- float4 hitPoint;
- if (distSet >= 0.f) {
- //--------------------------------------------------------------------------
- // Find the intersection with the set
- //--------------------------------------------------------------------------
-
- uint steps;
- float4 rayOrig = eyeRayOrig + eyeRayDir * (float4) distSet;
- distSet = IntersectJulia(rayOrig, eyeRayDir, mu, maxIterations,
- epsilon, &hitPoint, &steps);
- if (distSet > epsilon)
- distSet = -1.f;
- }
-
- //--------------------------------------------------------------------------
- // Check if we hit the floor
- //--------------------------------------------------------------------------
-
- float distFloor = IntersectFloorSphere(eyeRayOrig, eyeRayDir);
-
- //--------------------------------------------------------------------------
- // Select the hit point
- //--------------------------------------------------------------------------
-
- int doShade = 0;
- int useAO = 1;
- float4 diffuse, n, color;
- if ((distSet < 0.f) && (distFloor < 0.f)) {
- // Sky hit
- color = (float4) (0.f, 0.1f, 0.3f, 0.f);
- } else if ((distSet >= 0.f) && ((distFloor < 0.f) || (distSet <= distFloor))) {
- // Set hit
- diffuse = (float4) (1.f, 0.35f, 0.15f, 0.f);
- n = NormEstimate(hitPoint, mu, distSet, maxIterations);
- doShade = 1;
- } else if ((distFloor >= 0.f) && ((distSet < 0.f) || (distFloor <= distSet))) {
- // Floor hit
- hitPoint = eyeRayOrig + eyeRayDir * (float4) distFloor;
- n = hitPoint - WORLD_CENTER;
- n = normalize(n);
- // The most important feature in a ray tracer: a checker texture !
- const int ix = (hitPoint.x > 0.f) ? hitPoint.x : (1.f - hitPoint.x);
- const int iz = (hitPoint.z > 0.f) ? hitPoint.z : (1.f - hitPoint.z);
- if ((ix + iz) % 2)
- diffuse = (float4) (0.75f, 0.75f, 0.75f, 0.f);
- else
- diffuse = (float4) (0.75f, 0.f, 0.f, 0.f);
- doShade = 1;
- useAO = 0;
- }
-
- //--------------------------------------------------------------------------
- // Select the shadow pass
- //--------------------------------------------------------------------------
-
- if (doShade) {
- float shadowFactor = 1.f;
- if (config->enableShadow) {
- float4 L = normalize(light - hitPoint);
- float4 rO = hitPoint + n * 1e-2f;
- float4 shadowHitPoint;
-
- // Check bounding sphere
- float shadowDistSet = IntersectBoundingSphere(rO, L);
- if (shadowDistSet >= 0.f) {
- uint steps;
-
- rO = rO + L * (float4) shadowDistSet;
- shadowDistSet = IntersectJulia(rO, L, mu, maxIterations, epsilon,
- &shadowHitPoint, &steps);
- if (shadowDistSet < epsilon) {
- if (useAO) {
- // Use steps count to simulate ambient occlusion
- shadowFactor = 0.6f - min(steps / 255.f, 0.5f);
- } else
- shadowFactor = 0.6f;
- }
- } else
- shadowDistSet = -1.f;
- }
-
- //--------------------------------------------------------------------------
- // Direct lighting of hit point
- //--------------------------------------------------------------------------
-
- color = Phong(light, eyeRayOrig, hitPoint, n, diffuse) * shadowFactor;
- }
-
- //--------------------------------------------------------------------------
- // Write pixel
- //--------------------------------------------------------------------------
-
- int offset = 3 * (x + y * width);
- color = clamp(color, (float4) (0.f, 0.f, 0.f, 0.f), (float4) (1.f, 1.f, 1.f, 0.f));
- if (enableAccumulation) {
- pixels[offset++] += color.s0;
- pixels[offset++] += color.s1;
- pixels[offset ] += color.s2;
- } else {
- pixels[offset++] = color.s0;
- pixels[offset++] = color.s1;
- pixels[offset ] = color.s2;
- }
-}
-
-kernel void multiply(global float *array, const int numElements, const float s) {
- const int gid = get_global_id(0);
- if (gid >= numElements) {
- return;
- }
- array[gid] *= s;
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/structs/Camera.java b/src/com/mbien/opencl/demos/julia3d/structs/Camera.java
deleted file mode 100644
index 50d5868..0000000
--- a/src/com/mbien/opencl/demos/julia3d/structs/Camera.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
-
-
-package com.mbien.opencl.demos.julia3d.structs;
-
-import java.nio.*;
-
-import com.jogamp.common.nio.*;
-
-
-public abstract class Camera {
-
- StructAccessor accessor;
-
- public static int size() {
-// if (CPU.is32Bit()) {
-// return Camera32.size();
-// } else {
- return Camera64.size();
-// }
- }
-
- public static Camera create() {
- return create(Buffers.newDirectByteBuffer(size()));
- }
-
- public static Camera create(java.nio.ByteBuffer buf) {
-// if (CPU.is32Bit()) {
-// return new Camera32(buf);
-// } else {
- return new Camera64(buf);
-// }
- }
-
- Camera(java.nio.ByteBuffer buf) {
- accessor = new StructAccessor(buf);
- }
-
- public java.nio.ByteBuffer getBuffer() {
- return accessor.getBuffer();
- }
-
- public abstract Vec getOrig();
-
- public abstract Vec getTarget();
-
- public abstract Vec getDir();
-
- public abstract Vec getX();
-
- public abstract Vec getY();
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/structs/Camera32.java b/src/com/mbien/opencl/demos/julia3d/structs/Camera32.java
deleted file mode 100644
index 85dea51..0000000
--- a/src/com/mbien/opencl/demos/julia3d/structs/Camera32.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
-
-
-package com.mbien.opencl.demos.julia3d.structs;
-
-import java.nio.*;
-
-import com.jogamp.gluegen.runtime.*;
-
-
-class Camera32 extends Camera {
-
- public static int size() {
- return 76;
- }
-
- Camera32(java.nio.ByteBuffer buf) {
- super(buf);
- }
-
-
- public Vec getOrig() {
- return Vec.create(accessor.slice(0, 12));
- }
-
- public Vec getTarget() {
- return Vec.create(accessor.slice(16, 12));
- }
-
- public Vec getDir() {
- return Vec.create(accessor.slice(32, 12));
- }
-
- public Vec getX() {
- return Vec.create(accessor.slice(48, 12));
- }
-
- public Vec getY() {
- return Vec.create(accessor.slice(64, 12));
- }
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/structs/Camera64.java b/src/com/mbien/opencl/demos/julia3d/structs/Camera64.java
deleted file mode 100644
index afe4acd..0000000
--- a/src/com/mbien/opencl/demos/julia3d/structs/Camera64.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
-
-
-package com.mbien.opencl.demos.julia3d.structs;
-
-import java.nio.*;
-
-import com.jogamp.gluegen.runtime.*;
-
-
-class Camera64 extends Camera {
-
- private final Vec orig;
- private final Vec target;
- private final Vec dir;
- private final Vec x;
- private final Vec y;
-
- public static int size() {
- return 60;
- }
-
- Camera64(java.nio.ByteBuffer buf) {
- super(buf);
- orig = Vec.create(accessor.slice(0, 12));
- target = Vec.create(accessor.slice(12, 12));
- dir = Vec.create(accessor.slice(24, 12));
- x = Vec.create(accessor.slice(36, 12));
- y = Vec.create(accessor.slice(48, 12));
- }
-
-
- public Vec getOrig() {
- return orig;
- }
-
- public Vec getTarget() {
- return target;
- }
-
- public Vec getDir() {
- return dir;
- }
-
- public Vec getX() {
- return x;
- }
-
- public Vec getY() {
- return y;
- }
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig.java b/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig.java
deleted file mode 100644
index 5bc65b5..0000000
--- a/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
-
-
-package com.mbien.opencl.demos.julia3d.structs;
-
-import java.nio.*;
-
-import com.jogamp.common.nio.*;
-
-
-public abstract class RenderingConfig {
-
- StructAccessor accessor;
-
- public static int size() {
-// if (CPU.is32Bit()) {
-// return RenderingConfig32.size();
-// } else {
- return RenderingConfig64.size();
-// }
- }
-
- public static RenderingConfig create() {
- return create(Buffers.newDirectByteBuffer(size()));
- }
-
- public static RenderingConfig create(java.nio.ByteBuffer buf) {
-// if (CPU.is32Bit()) {
-// return new RenderingConfig32(buf);
-// } else {
- return new RenderingConfig64(buf);
-// }
- }
-
- RenderingConfig(java.nio.ByteBuffer buf) {
- accessor = new StructAccessor(buf);
- }
-
- public java.nio.ByteBuffer getBuffer() {
- return accessor.getBuffer();
- }
-
- public abstract RenderingConfig setWidth(int val);
-
- public abstract int getWidth();
-
- public abstract RenderingConfig setHeight(int val);
-
- public abstract int getHeight();
-
- public abstract RenderingConfig setSuperSamplingSize(int val);
-
- public abstract int getSuperSamplingSize();
-
- public abstract RenderingConfig setActvateFastRendering(int val);
-
- public abstract int getActvateFastRendering();
-
- public abstract RenderingConfig setEnableShadow(int val);
-
- public abstract int getEnableShadow();
-
- public abstract RenderingConfig setMaxIterations(int val);
-
- public abstract int getMaxIterations();
-
- public abstract RenderingConfig setEpsilon(float val);
-
- public abstract float getEpsilon();
-
- public abstract RenderingConfig setMu(float[] val);
-
- public abstract float[] getMu();
-
- public abstract RenderingConfig setLight(float[] val);
-
- public abstract float[] getLight();
-
- public abstract Camera getCamera();
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig32.java b/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig32.java
deleted file mode 100644
index 3898d60..0000000
--- a/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig32.java
+++ /dev/null
@@ -1,106 +0,0 @@
-/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
-
-
-package com.mbien.opencl.demos.julia3d.structs;
-
-import java.nio.*;
-
-import com.jogamp.gluegen.runtime.*;
-
-
-class RenderingConfig32 extends RenderingConfig {
-
- public static int size() {
- return 140;
- }
-
- RenderingConfig32(java.nio.ByteBuffer buf) {
- super(buf);
- }
-
-
- public RenderingConfig setWidth(int val) {
- accessor.setIntAt(0, val);
- return this;
- }
-
- public int getWidth() {
- return accessor.getIntAt(0);
- }
-
- public RenderingConfig setHeight(int val) {
- accessor.setIntAt(1, val);
- return this;
- }
-
- public int getHeight() {
- return accessor.getIntAt(1);
- }
-
- public RenderingConfig setSuperSamplingSize(int val) {
- accessor.setIntAt(2, val);
- return this;
- }
-
- public int getSuperSamplingSize() {
- return accessor.getIntAt(2);
- }
-
- public RenderingConfig setActvateFastRendering(int val) {
- accessor.setIntAt(3, val);
- return this;
- }
-
- public int getActvateFastRendering() {
- return accessor.getIntAt(3);
- }
-
- public RenderingConfig setEnableShadow(int val) {
- accessor.setIntAt(4, val);
- return this;
- }
-
- public int getEnableShadow() {
- return accessor.getIntAt(4);
- }
-
- public RenderingConfig setMaxIterations(int val) {
- accessor.setIntAt(5, val);
- return this;
- }
-
- public int getMaxIterations() {
- return accessor.getIntAt(5);
- }
-
- public RenderingConfig setEpsilon(float val) {
- accessor.setFloatAt(6, val);
- return this;
- }
-
- public float getEpsilon() {
- return accessor.getFloatAt(6);
- }
-
- public RenderingConfig setMu(float[] val) {
- accessor.setFloatsAt(8, val);
- return this;
- }
-
- public float[] getMu() {
- return accessor.getFloatsAt(8, new float[4]);
- }
-
- public RenderingConfig setLight(float[] val) {
- accessor.setFloatsAt(12, val);
- return this;
- }
-
- public float[] getLight() {
- return accessor.getFloatsAt(12, new float[3]);
- }
-
- public Camera getCamera() {
- return Camera.create(accessor.slice(64, 76));
- }
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig64.java b/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig64.java
deleted file mode 100644
index e3a0267..0000000
--- a/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig64.java
+++ /dev/null
@@ -1,109 +0,0 @@
-/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
-
-
-package com.mbien.opencl.demos.julia3d.structs;
-
-import java.nio.*;
-
-import com.jogamp.gluegen.runtime.*;
-
-
-class RenderingConfig64 extends RenderingConfig {
-
- private final Camera camera;
-
- public static int size() {
- return 116;
- }
-
- RenderingConfig64(java.nio.ByteBuffer buf) {
- super(buf);
- camera = Camera.create(accessor.slice(56, 60));
- }
-
-
- public RenderingConfig setWidth(int val) {
- accessor.setIntAt(0, val);
- return this;
- }
-
- public int getWidth() {
- return accessor.getIntAt(0);
- }
-
- public RenderingConfig setHeight(int val) {
- accessor.setIntAt(1, val);
- return this;
- }
-
- public int getHeight() {
- return accessor.getIntAt(1);
- }
-
- public RenderingConfig setSuperSamplingSize(int val) {
- accessor.setIntAt(2, val);
- return this;
- }
-
- public int getSuperSamplingSize() {
- return accessor.getIntAt(2);
- }
-
- public RenderingConfig setActvateFastRendering(int val) {
- accessor.setIntAt(3, val);
- return this;
- }
-
- public int getActvateFastRendering() {
- return accessor.getIntAt(3);
- }
-
- public RenderingConfig setEnableShadow(int val) {
- accessor.setIntAt(4, val);
- return this;
- }
-
- public int getEnableShadow() {
- return accessor.getIntAt(4);
- }
-
- public RenderingConfig setMaxIterations(int val) {
- accessor.setIntAt(5, val);
- return this;
- }
-
- public int getMaxIterations() {
- return accessor.getIntAt(5);
- }
-
- public RenderingConfig setEpsilon(float val) {
- accessor.setFloatAt(6, val);
- return this;
- }
-
- public float getEpsilon() {
- return accessor.getFloatAt(6);
- }
-
- public RenderingConfig setMu(float[] val) {
- accessor.setFloatsAt(7, val);
- return this;
- }
-
- public float[] getMu() {
- return accessor.getFloatsAt(7, new float[4]);
- }
-
- public RenderingConfig setLight(float[] val) {
- accessor.setFloatsAt(11, val);
- return this;
- }
-
- public float[] getLight() {
- return accessor.getFloatsAt(11, new float[3]);
- }
-
- public Camera getCamera() {
- return camera;
- }
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/structs/Vec.java b/src/com/mbien/opencl/demos/julia3d/structs/Vec.java
deleted file mode 100644
index 3bd6b18..0000000
--- a/src/com/mbien/opencl/demos/julia3d/structs/Vec.java
+++ /dev/null
@@ -1,53 +0,0 @@
-/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
-
-
-package com.mbien.opencl.demos.julia3d.structs;
-
-
-import com.jogamp.common.nio.*;
-
-
-public abstract class Vec {
-
- StructAccessor accessor;
-
- public static int size() {
-// if (CPU.is32Bit()) {
-// return Vec32.size();
-// } else {
- return Vec64.size();
-// }
- }
-
- public static Vec create() {
- return create(Buffers.newDirectByteBuffer(size()));
- }
-
- public static Vec create(java.nio.ByteBuffer buf) {
-// if (CPU.is32Bit()) {
-// return new Vec32(buf);
-// } else {
- return new Vec64(buf);
-// }
- }
-
- Vec(java.nio.ByteBuffer buf) {
- accessor = new StructAccessor(buf);
- }
-
- public java.nio.ByteBuffer getBuffer() {
- return accessor.getBuffer();
- }
-
- public abstract Vec setX(float val);
-
- public abstract float getX();
-
- public abstract Vec setY(float val);
-
- public abstract float getY();
-
- public abstract Vec setZ(float val);
-
- public abstract float getZ();
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/structs/Vec32.java b/src/com/mbien/opencl/demos/julia3d/structs/Vec32.java
deleted file mode 100644
index 3c0ee5d..0000000
--- a/src/com/mbien/opencl/demos/julia3d/structs/Vec32.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
-
-
-package com.mbien.opencl.demos.julia3d.structs;
-
-import java.nio.*;
-
-import com.jogamp.gluegen.runtime.*;
-
-
-class Vec32 extends Vec {
-
- public static int size() {
- return 12;
- }
-
- Vec32(java.nio.ByteBuffer buf) {
- super(buf);
- }
-
-
- public Vec setX(float val) {
- accessor.setFloatAt(0, val);
- return this;
- }
-
- public float getX() {
- return accessor.getFloatAt(0);
- }
-
- public Vec setY(float val) {
- accessor.setFloatAt(1, val);
- return this;
- }
-
- public float getY() {
- return accessor.getFloatAt(1);
- }
-
- public Vec setZ(float val) {
- accessor.setFloatAt(2, val);
- return this;
- }
-
- public float getZ() {
- return accessor.getFloatAt(2);
- }
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/structs/Vec64.java b/src/com/mbien/opencl/demos/julia3d/structs/Vec64.java
deleted file mode 100644
index 5952021..0000000
--- a/src/com/mbien/opencl/demos/julia3d/structs/Vec64.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
-
-
-package com.mbien.opencl.demos.julia3d.structs;
-
-import java.nio.*;
-
-import com.jogamp.gluegen.runtime.*;
-
-
-class Vec64 extends Vec {
-
- public static int size() {
- return 12;
- }
-
- Vec64(java.nio.ByteBuffer buf) {
- super(buf);
- }
-
-
- public Vec setX(float val) {
- accessor.setFloatAt(0, val);
- return this;
- }
-
- public float getX() {
- return accessor.getFloatAt(0);
- }
-
- public Vec setY(float val) {
- accessor.setFloatAt(1, val);
- return this;
- }
-
- public float getY() {
- return accessor.getFloatAt(1);
- }
-
- public Vec setZ(float val) {
- accessor.setFloatAt(2, val);
- return this;
- }
-
- public float getZ() {
- return accessor.getFloatAt(2);
- }
-}
diff --git a/src/com/mbien/opencl/demos/radixsort/RadixSort.cl b/src/com/mbien/opencl/demos/radixsort/RadixSort.cl
deleted file mode 100644
index d014692..0000000
--- a/src/com/mbien/opencl/demos/radixsort/RadixSort.cl
+++ /dev/null
@@ -1,358 +0,0 @@
-/*
-* Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
-*
-* NVIDIA Corporation and its licensors retain all intellectual property and
-* proprietary rights in and to this software and related documentation.
-* Any use, reproduction, disclosure, or distribution of this software
-* and related documentation without an express license agreement from
-* NVIDIA Corporation is strictly prohibited.
-*
-* Please refer to the applicable NVIDIA end user license agreement (EULA)
-* associated with this source code for terms and conditions that govern
-* your use of this NVIDIA software.
-*
-*/
-
-//----------------------------------------------------------------------------
-// Scans each warp in parallel ("warp-scan"), one element per thread.
-// uses 2 numElements of shared memory per thread (64 = elements per warp)
-//----------------------------------------------------------------------------
-//#define WARP_SIZE 32
-uint scanwarp(uint val, __local uint* sData, int maxlevel)
-{
- // The following is the same as 2 * RadixSort::WARP_SIZE * warpId + threadInWarp =
- // 64*(threadIdx.x >> 5) + (threadIdx.x & (RadixSort::WARP_SIZE - 1))
- int localId = get_local_id(0);
- int idx = 2 * localId - (localId & (WARP_SIZE - 1));
- sData[idx] = 0;
- idx += WARP_SIZE;
- sData[idx] = val;
-
- if (0 <= maxlevel) { sData[idx] += sData[idx - 1]; }
- if (1 <= maxlevel) { sData[idx] += sData[idx - 2]; }
- if (2 <= maxlevel) { sData[idx] += sData[idx - 4]; }
- if (3 <= maxlevel) { sData[idx] += sData[idx - 8]; }
- if (4 <= maxlevel) { sData[idx] += sData[idx -16]; }
-
- return sData[idx] - val; // convert inclusive -> exclusive
-}
-
-//----------------------------------------------------------------------------
-// scan4 scans 4*RadixSort::CTA_SIZE numElements in a block (4 per thread), using
-// a warp-scan algorithm
-//----------------------------------------------------------------------------
-uint4 scan4(uint4 idata, __local uint* ptr)
-{
-
- uint idx = get_local_id(0);
-
- uint4 val4 = idata;
- uint sum[3];
- sum[0] = val4.x;
- sum[1] = val4.y + sum[0];
- sum[2] = val4.z + sum[1];
-
- uint val = val4.w + sum[2];
-
- val = scanwarp(val, ptr, 4);
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if ((idx & (WARP_SIZE - 1)) == WARP_SIZE - 1)
- {
- ptr[idx >> 5] = val + val4.w + sum[2];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (idx < WARP_SIZE)
- ptr[idx] = scanwarp(ptr[idx], ptr, 2);
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- val += ptr[idx >> 5];
-
- val4.x = val;
- val4.y = val + sum[0];
- val4.z = val + sum[1];
- val4.w = val + sum[2];
-
- return val4;
-}
-
-#ifdef MAC
-__kernel uint4 rank4(uint4 preds, __local uint* sMem)
-#else
-uint4 rank4(uint4 preds, __local uint* sMem)
-#endif
-{
- int localId = get_local_id(0);
- int localSize = get_local_size(0);
-
- uint4 address = scan4(preds, sMem);
-
- __local uint numtrue;
- if (localId == localSize - 1)
- {
- numtrue = address.w + preds.w;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- uint4 rank;
- int idx = localId*4;
- rank.x = (preds.x) ? address.x : numtrue + idx - address.x;
- rank.y = (preds.y) ? address.y : numtrue + idx + 1 - address.y;
- rank.z = (preds.z) ? address.z : numtrue + idx + 2 - address.z;
- rank.w = (preds.w) ? address.w : numtrue + idx + 3 - address.w;
-
- return rank;
-}
-
-void radixSortBlockKeysOnly(uint4 *key, uint nbits, uint startbit, __local uint* sMem)
-{
- int localId = get_local_id(0);
- int localSize = get_local_size(0);
-
- for(uint shift = startbit; shift < (startbit + nbits); ++shift)
- {
- uint4 lsb;
- lsb.x = !(((*key).x >> shift) & 0x1);
- lsb.y = !(((*key).y >> shift) & 0x1);
- lsb.z = !(((*key).z >> shift) & 0x1);
- lsb.w = !(((*key).w >> shift) & 0x1);
-
- uint4 r;
-
- r = rank4(lsb, sMem);
-
- // This arithmetic strides the ranks across 4 CTA_SIZE regions
- sMem[(r.x & 3) * localSize + (r.x >> 2)] = (*key).x;
- sMem[(r.y & 3) * localSize + (r.y >> 2)] = (*key).y;
- sMem[(r.z & 3) * localSize + (r.z >> 2)] = (*key).z;
- sMem[(r.w & 3) * localSize + (r.w >> 2)] = (*key).w;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // The above allows us to read without 4-way bank conflicts:
- (*key).x = sMem[localId];
- (*key).y = sMem[localId + localSize];
- (*key).z = sMem[localId + 2 * localSize];
- (*key).w = sMem[localId + 3 * localSize];
-
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-}
-
-__kernel void radixSortBlocksKeysOnly(__global uint4* keysIn,
- __global uint4* keysOut,
- uint nbits,
- uint startbit,
- uint numElements,
- uint totalBlocks,
- __local uint* sMem)
-{
- int globalId = get_global_id(0);
-
- uint4 key;
- key = keysIn[globalId];
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- radixSortBlockKeysOnly(&key, nbits, startbit, sMem);
-
- keysOut[globalId] = key;
-}
-
-//----------------------------------------------------------------------------
-// Given an array with blocks sorted according to a 4-bit radix group, each
-// block counts the number of keys that fall into each radix in the group, and
-// finds the starting offset of each radix in the block. It then writes the radix
-// counts to the counters array, and the starting offsets to the blockOffsets array.
-//
-// Template parameters are used to generate efficient code for various special cases
-// For example, we have to handle arrays that are a multiple of the block size
-// (fullBlocks) differently than arrays that are not. "loop" is used when persistent
-// CTAs are used.
-//
-// By persistent CTAs we mean that we launch only as many thread blocks as can
-// be resident in the GPU and no more, rather than launching as many threads as
-// we have elements. Persistent CTAs loop over blocks of elements until all work
-// is complete. This can be faster in some cases. In our tests it is faster
-// for large sorts (and the threshold is higher on compute version 1.1 and earlier
-// GPUs than it is on compute version 1.2 GPUs.
-//
-//----------------------------------------------------------------------------
-__kernel void findRadixOffsets(__global uint2* keys,
- __global uint* counters,
- __global uint* blockOffsets,
- uint startbit,
- uint numElements,
- uint totalBlocks,
- __local uint* sRadix1)
-{
- __local uint sStartPointers[16];
-
- uint groupId = get_group_id(0);
- uint localId = get_local_id(0);
- uint groupSize = get_local_size(0);
-
- uint2 radix2;
-
- radix2 = keys[get_global_id(0)];
-
-
- sRadix1[2 * localId] = (radix2.x >> startbit) & 0xF;
- sRadix1[2 * localId + 1] = (radix2.y >> startbit) & 0xF;
-
- // Finds the position where the sRadix1 entries differ and stores start
- // index for each radix.
- if(localId < 16)
- {
- sStartPointers[localId] = 0;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) )
- {
- sStartPointers[sRadix1[localId]] = localId;
- }
- if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1])
- {
- sStartPointers[sRadix1[localId + groupSize]] = localId + groupSize;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if(localId < 16)
- {
- blockOffsets[groupId*16 + localId] = sStartPointers[localId];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // Compute the sizes of each block.
- if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) )
- {
- sStartPointers[sRadix1[localId - 1]] =
- localId - sStartPointers[sRadix1[localId - 1]];
- }
- if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1] )
- {
- sStartPointers[sRadix1[localId + groupSize - 1]] =
- localId + groupSize - sStartPointers[sRadix1[localId + groupSize - 1]];
- }
-
-
- if(localId == groupSize - 1)
- {
- sStartPointers[sRadix1[2 * groupSize - 1]] =
- 2 * groupSize - sStartPointers[sRadix1[2 * groupSize - 1]];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if(localId < 16)
- {
- counters[localId * totalBlocks + groupId] = sStartPointers[localId];
- }
-}
-
-// a naive scan routine that works only for array that
-// can fit into a single block, just for debugging purpose,
-// not used in the sort now
-__kernel void scanNaive(__global uint *g_odata,
- __global uint *g_idata,
- uint n,
- __local uint* temp)
-{
-
- int localId = get_local_id(0);
-
- int pout = 0;
- int pin = 1;
-
- // Cache the computational window in shared memory
- temp[pout*n + localId] = (localId > 0) ? g_idata[localId-1] : 0;
-
- for (int offset = 1; offset < n; offset *= 2)
- {
- pout = 1 - pout;
- pin = 1 - pout;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- temp[pout*n+localId] = temp[pin*n+localId];
-
- if (localId >= offset)
- temp[pout*n+localId] += temp[pin*n+localId - offset];
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- g_odata[localId] = temp[pout*n+localId];
-}
-
-//----------------------------------------------------------------------------
-// reorderData shuffles data in the array globally after the radix offsets
-// have been found. On compute version 1.1 and earlier GPUs, this code depends
-// on RadixSort::CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits).
-//
-// On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures
-// that all writes are coalesced using extra work in the kernel. On later
-// GPUs coalescing rules have been relaxed, so this extra overhead hurts
-// performance. On these GPUs we set manualCoalesce=false and directly store
-// the results.
-//
-// Template parameters are used to generate efficient code for various special cases
-// For example, we have to handle arrays that are a multiple of the block size
-// (fullBlocks) differently than arrays that are not. "loop" is used when persistent
-// CTAs are used.
-//
-// By persistent CTAs we mean that we launch only as many thread blocks as can
-// be resident in the GPU and no more, rather than launching as many threads as
-// we have elements. Persistent CTAs loop over blocks of elements until all work
-// is complete. This can be faster in some cases. In our tests it is faster
-// for large sorts (and the threshold is higher on compute version 1.1 and earlier
-// GPUs than it is on compute version 1.2 GPUs.
-//----------------------------------------------------------------------------
-__kernel void reorderDataKeysOnly(__global uint *outKeys,
- __global uint2 *keys,
- __global uint *blockOffsets,
- __global uint *offsets,
- __global uint *sizes,
- uint startbit,
- uint numElements,
- uint totalBlocks,
- __local uint2* sKeys2)
-{
- __local uint sOffsets[16];
- __local uint sBlockOffsets[16];
-
- __local uint *sKeys1 = (__local uint*)sKeys2;
-
- uint groupId = get_group_id(0);
-
- uint globalId = get_global_id(0);
- uint localId = get_local_id(0);
- uint groupSize = get_local_size(0);
-
- sKeys2[localId] = keys[globalId];
-
- if(localId < 16)
- {
- sOffsets[localId] = offsets[localId * totalBlocks + groupId];
- sBlockOffsets[localId] = blockOffsets[groupId * 16 + localId];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- uint radix = (sKeys1[localId] >> startbit) & 0xF;
- uint globalOffset = sOffsets[radix] + localId - sBlockOffsets[radix];
-
- if (globalOffset < numElements)
- {
- outKeys[globalOffset] = sKeys1[localId];
- }
-
- radix = (sKeys1[localId + groupSize] >> startbit) & 0xF;
- globalOffset = sOffsets[radix] + localId + groupSize - sBlockOffsets[radix];
-
- if (globalOffset < numElements)
- {
- outKeys[globalOffset] = sKeys1[localId + groupSize];
- }
-
-
-}
diff --git a/src/com/mbien/opencl/demos/radixsort/RadixSort.java b/src/com/mbien/opencl/demos/radixsort/RadixSort.java
deleted file mode 100644
index 783c094..0000000
--- a/src/com/mbien/opencl/demos/radixsort/RadixSort.java
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * 20:38 Sunday, February 28 2010
- */
-
-package com.mbien.opencl.demos.radixsort;
-
-import com.mbien.opencl.CLBuffer;
-import com.mbien.opencl.CLCommandQueue;
-import com.mbien.opencl.CLContext;
-import com.mbien.opencl.CLKernel;
-import com.mbien.opencl.CLProgram;
-import com.mbien.opencl.CLResource;
-import java.io.IOException;
-import java.nio.IntBuffer;
-
-import static com.mbien.opencl.CLMemory.Mem.*;
-import static com.mbien.opencl.CLProgram.*;
-import static com.mbien.opencl.CLProgram.CompilerOptions.*;
-import static java.lang.System.*;
-
-/**
- *
- * @author Michael Bien
- */
-public class RadixSort implements CLResource {
-
- private static final int NUM_BANKS = 16;
- private static final int WARP_SIZE = 32;
- private static final int bitStep = 4;
-
- private final int CTA_SIZE;
-
- private final CLKernel ckRadixSortBlocksKeysOnly;
- private final CLKernel ckFindRadixOffsets;
- private final CLKernel ckScanNaive;
- private final CLKernel ckReorderDataKeysOnly;
-
- private final CLBuffer<?> tempKeys;
- private final CLBuffer<?> mCounters;
- private final CLBuffer<?> mCountersSum;
- private final CLBuffer<?> mBlockOffsets;
-
- private final CLCommandQueue queue;
- private final Scan scan;
- private final CLProgram program;
-
- public RadixSort(CLCommandQueue queue, int maxElements, int CTA_SIZE) throws IOException {
-
- this.CTA_SIZE = CTA_SIZE;
- scan = new Scan(queue, maxElements / 2 / CTA_SIZE * 16);
-
- int numBlocks = ((maxElements % (CTA_SIZE * 4)) == 0)
- ? (maxElements / (CTA_SIZE * 4)) : (maxElements / (CTA_SIZE * 4) + 1);
-
- this.queue = queue;
-
- CLContext context = queue.getContext();
- this.tempKeys = context.createBuffer(4 * maxElements, READ_WRITE);
- this.mCounters = context.createBuffer(4 * WARP_SIZE * numBlocks, READ_WRITE);
- this.mCountersSum = context.createBuffer(4 * WARP_SIZE * numBlocks, READ_WRITE);
- this.mBlockOffsets = context.createBuffer(4 * WARP_SIZE * numBlocks, READ_WRITE);
-
- program = context.createProgram(getClass().getResourceAsStream("RadixSort.cl"))
- .build(ENABLE_MAD, define("WARP_SIZE", WARP_SIZE));
-
-// out.println(program.getBuildLog());
-
- ckRadixSortBlocksKeysOnly = program.createCLKernel("radixSortBlocksKeysOnly");
- ckFindRadixOffsets = program.createCLKernel("findRadixOffsets");
- ckScanNaive = program.createCLKernel("scanNaive");
- ckReorderDataKeysOnly = program.createCLKernel("reorderDataKeysOnly");
-
- }
-
- void sort(CLBuffer<IntBuffer> d_keys, int numElements, int keyBits) {
- radixSortKeysOnly(d_keys, numElements, keyBits);
- }
-
- //----------------------------------------------------------------------------
- // Main key-only radix sort function. Sorts in place in the keys and values
- // arrays, but uses the other device arrays as temporary storage. All pointer
- // parameters are device pointers. Uses cudppScan() for the prefix sum of
- // radix counters.
- //----------------------------------------------------------------------------
- void radixSortKeysOnly(CLBuffer<IntBuffer> keys, int numElements, int keyBits) {
- int i = 0;
- while (keyBits > i * bitStep) {
- radixSortStepKeysOnly(keys, bitStep, i * bitStep, numElements);
- i++;
- }
- }
-
- //----------------------------------------------------------------------------
- // Perform one step of the radix sort. Sorts by nbits key bits per step,
- // starting at startbit.
- //----------------------------------------------------------------------------
- void radixSortStepKeysOnly(CLBuffer<IntBuffer> keys, int nbits, int startbit, int numElements) {
-
- // Four step algorithms from Satish, Harris & Garland
- radixSortBlocksKeysOnlyOCL(keys, nbits, startbit, numElements);
-
- findRadixOffsetsOCL(startbit, numElements);
-
- scan.scanExclusiveLarge(mCountersSum, mCounters, 1, numElements / 2 / CTA_SIZE * 16);
-
- reorderDataKeysOnlyOCL(keys, startbit, numElements);
- }
-
- //----------------------------------------------------------------------------
- // Wrapper for the kernels of the four steps
- //----------------------------------------------------------------------------
- void radixSortBlocksKeysOnlyOCL(CLBuffer<IntBuffer> keys, int nbits, int startbit, int numElements) {
-
- int totalBlocks = numElements / 4 / CTA_SIZE;
- int globalWorkSize = CTA_SIZE * totalBlocks;
- int localWorkSize = CTA_SIZE;
-
- ckRadixSortBlocksKeysOnly.putArg(keys).putArg(tempKeys).putArg(nbits).putArg(startbit)
- .putArg(numElements).putArg(totalBlocks).putNullArg(4 * CTA_SIZE * 4)
- .rewind();
-
- queue.put1DRangeKernel(ckRadixSortBlocksKeysOnly, 0, globalWorkSize, localWorkSize);
- }
-
- void findRadixOffsetsOCL(int startbit, int numElements) {
-
- int totalBlocks = numElements / 2 / CTA_SIZE;
- int globalWorkSize = CTA_SIZE * totalBlocks;
- int localWorkSize = CTA_SIZE;
-
- ckFindRadixOffsets.putArg(tempKeys).putArg(mCounters).putArg(mBlockOffsets)
- .putArg(startbit).putArg(numElements).putArg(totalBlocks).putNullArg(2 * CTA_SIZE * 4)
- .rewind();
-
- queue.put1DRangeKernel(ckFindRadixOffsets, 0, globalWorkSize, localWorkSize);
- }
-
- void scanNaiveOCL(int numElements) {
-
- int nHist = numElements / 2 / CTA_SIZE * 16;
- int globalWorkSize = nHist;
- int localWorkSize = nHist;
- int extra_space = nHist / NUM_BANKS;
- int shared_mem_size = 4 * (nHist + extra_space);
-
- ckScanNaive.putArg(mCountersSum).putArg(mCounters).putArg(nHist).putNullArg(2 * shared_mem_size).rewind();
-
- queue.put1DRangeKernel(ckScanNaive, 0, globalWorkSize, localWorkSize);
- }
-
- void reorderDataKeysOnlyOCL(CLBuffer<IntBuffer> keys, int startbit, int numElements) {
-
- int totalBlocks = numElements / 2 / CTA_SIZE;
- int globalWorkSize = CTA_SIZE * totalBlocks;
- int localWorkSize = CTA_SIZE;
-
- ckReorderDataKeysOnly.putArg(keys).putArg(tempKeys).putArg(mBlockOffsets).putArg(mCountersSum).putArg(mCounters)
- .putArg(startbit).putArg(numElements).putArg(totalBlocks).putNullArg(2 * CTA_SIZE * 4).rewind();
-
- queue.put1DRangeKernel(ckReorderDataKeysOnly, 0, globalWorkSize, localWorkSize);
- }
-
- public void release() {
-
- scan.release();
-
- //program & kernels
- program.release();
-
- //buffers
- tempKeys.release();
- mCounters.release();
- mCountersSum.release();
- mBlockOffsets.release();
- }
-
- public void close() {
- release();
- }
-
-
-
-}
diff --git a/src/com/mbien/opencl/demos/radixsort/RadixSortDemo.java b/src/com/mbien/opencl/demos/radixsort/RadixSortDemo.java
deleted file mode 100644
index 93bc071..0000000
--- a/src/com/mbien/opencl/demos/radixsort/RadixSortDemo.java
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * 20:48 Sunday, February 28 2010
- */
-
-package com.mbien.opencl.demos.radixsort;
-
-import com.mbien.opencl.CLBuffer;
-import com.mbien.opencl.CLCommandQueue;
-import com.mbien.opencl.CLContext;
-import com.mbien.opencl.CLPlatform;
-import java.io.IOException;
-import java.nio.IntBuffer;
-import java.util.Random;
-
-import static com.mbien.opencl.CLMemory.Mem.*;
-import static java.lang.System.*;
-import static com.mbien.opencl.CLDevice.Type.*;
-
-/**
- * GPU radix sort demo.
- * @author Michael Bien
- */
-public class RadixSortDemo {
-
- public RadixSortDemo() throws IOException {
-
- CLContext context = null;
- try{
- //single GPU setup
- context = CLContext.create(CLPlatform.getDefault().getMaxFlopsDevice(GPU));
- CLCommandQueue queue = context.getDevices()[0].createCommandQueue();
-
- int maxValue = Integer.MAX_VALUE;
- int samples = 10;
-
- int[] workgroupSizes = new int[] {128, 256};
-
- int[] runs = new int[] { 32768,
- 65536,
- 131072,
- 262144,
- 524288,
- 1048576,
- 2097152,
- 4194304,
- 8388608 };
-
- for (int i = 0; i < workgroupSizes.length; i++) {
-
- int workgroupSize = workgroupSizes[i];
-
- out.println("\n = = = workgroup size: "+workgroupSize+" = = = ");
-
- for(int run = 0; run < runs.length; run++) {
-
- if( workgroupSize==128 && runs[run] >= 8388608
- || workgroupSize==256 && runs[run] <= 32768) {
- continue; // we can only sort up to 4MB with wg size of 128
- }
-
- int numElements = runs[run];
-
- CLBuffer<IntBuffer> array = context.createIntBuffer(numElements, READ_WRITE);
- out.print("array size: " + array.getCLSize()/1000000.0f+"MB; ");
- out.println("elements: " + array.getCapacity()/1000+"K");
-
- fillBuffer(array, maxValue);
-
- RadixSort radixSort = new RadixSort(queue, numElements, workgroupSize);
- for(int a = 0; a < samples; a++) {
-
- queue.finish();
-
- long time = nanoTime();
-
- queue.putWriteBuffer(array, false);
- radixSort.sort(array, numElements, 32);
- queue.putReadBuffer(array, true);
-
- out.println("time: " + (nanoTime() - time)/1000000.0f+"ms");
- }
-
- out.print("snapshot: ");
- printSnapshot(array.getBuffer(), 20);
-
- out.println("validating...");
- checkIfSorted(array.getBuffer());
- out.println("values sorted");
-
- array.release();
- radixSort.release();
- }
- }
-
- }finally{
- if(context != null) {
- context.release();
- }
- }
-
- }
-
- private void fillBuffer(CLBuffer<IntBuffer> array, int maxValue) {
- Random random = new Random(42);
- for (int n = 0; n < array.getBuffer().capacity(); n++) {
- int rnd = random.nextInt(maxValue);
- array.getBuffer().put(n, rnd);
- }
- }
-
- private void printSnapshot(IntBuffer buffer, int snapshot) {
- for(int i = 0; i < snapshot; i++)
- out.print(buffer.get() + ", ");
- out.println("...; " + buffer.remaining() + " more");
- buffer.rewind();
- }
-
- private void checkIfSorted(IntBuffer keys) {
- for (int i = 1; i < keys.capacity(); i++) {
- if (keys.get(i - 1) > keys.get(i)) {
- throw new RuntimeException("not sorted "+ keys.get(i - 1) +" !> "+ keys.get(i));
- }
- }
- }
-
- public static void main(String[] args) throws IOException {
- new RadixSortDemo();
- }
-}
diff --git a/src/com/mbien/opencl/demos/radixsort/Scan.java b/src/com/mbien/opencl/demos/radixsort/Scan.java
deleted file mode 100644
index 1426e9b..0000000
--- a/src/com/mbien/opencl/demos/radixsort/Scan.java
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * 22:12 Sunday, February 28 2010
- */
-package com.mbien.opencl.demos.radixsort;
-
-import com.mbien.opencl.CLBuffer;
-import com.mbien.opencl.CLCommandQueue;
-import com.mbien.opencl.CLContext;
-import com.mbien.opencl.CLKernel;
-import com.mbien.opencl.CLProgram;
-import com.mbien.opencl.CLResource;
-import java.io.IOException;
-
-import static com.mbien.opencl.CLMemory.Mem.*;
-import static com.mbien.opencl.CLProgram.CompilerOptions.*;
-
-/**
- *
- * @author Michael Bien
- */
-public class Scan implements CLResource {
-
- private final static int MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE = 1024;
- private final static int MAX_LOCAL_GROUP_SIZE = 256;
- private final static int WORKGROUP_SIZE = 256;
- private final static int MAX_BATCH_ELEMENTS = 64 * 1048576;
- private final static int MIN_SHORT_ARRAY_SIZE = 4;
- private final static int MAX_SHORT_ARRAY_SIZE = 4 * WORKGROUP_SIZE;
- private final static int MIN_LARGE_ARRAY_SIZE = 8 * WORKGROUP_SIZE;
- private final static int MAX_LARGE_ARRAY_SIZE = 4 * WORKGROUP_SIZE * WORKGROUP_SIZE;
-
- private final CLKernel ckScanExclusiveLocal1;
- private final CLKernel ckScanExclusiveLocal2;
- private final CLKernel ckUniformUpdate;
-
- private final CLCommandQueue queue;
- private final CLProgram program;
- private CLBuffer<?> buffer;
-
- public Scan(CLCommandQueue queue, int numElements) throws IOException {
-
- this.queue = queue;
-
- CLContext context = queue.getContext();
- if (numElements > MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE) {
- buffer = context.createBuffer(numElements / MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE * 4, READ_WRITE);
- }
- program = context.createProgram(getClass().getResourceAsStream("Scan_b.cl"))
- .build(ENABLE_MAD);
-
- ckScanExclusiveLocal1 = program.createCLKernel("scanExclusiveLocal1");
- ckScanExclusiveLocal2 = program.createCLKernel("scanExclusiveLocal2");
- ckUniformUpdate = program.createCLKernel("uniformUpdate");
- }
-
- // main exclusive scan routine
- void scanExclusiveLarge(CLBuffer<?> dst, CLBuffer<?> src, int batchSize, int arrayLength) {
-
- //Check power-of-two factorization
- if(!isPowerOf2(arrayLength)) {
- throw new RuntimeException();
- }
-
- //Check supported size range
- if (!((arrayLength >= MIN_LARGE_ARRAY_SIZE) && (arrayLength <= MAX_LARGE_ARRAY_SIZE))) {
- throw new RuntimeException();
- }
-
- //Check total batch size limit
- if (!((batchSize * arrayLength) <= MAX_BATCH_ELEMENTS)) {
- throw new RuntimeException();
- }
-
- scanExclusiveLocal1(dst, src, (batchSize * arrayLength) / (4 * WORKGROUP_SIZE), 4 * WORKGROUP_SIZE);
- scanExclusiveLocal2(buffer, dst, src, batchSize, arrayLength / (4 * WORKGROUP_SIZE));
- uniformUpdate(dst, buffer, (batchSize * arrayLength) / (4 * WORKGROUP_SIZE));
- }
-
- void scanExclusiveLocal1(CLBuffer<?> dst, CLBuffer<?> src, int n, int size) {
-
- ckScanExclusiveLocal1.putArg(dst).putArg(src).putNullArg(2 * WORKGROUP_SIZE * 4).putArg(size)
- .rewind();
-
- int localWorkSize = WORKGROUP_SIZE;
- int globalWorkSize = (n * size) / 4;
-
- queue.put1DRangeKernel(ckScanExclusiveLocal1, 0, globalWorkSize, localWorkSize);
- }
-
- void scanExclusiveLocal2(CLBuffer<?> buffer, CLBuffer<?> dst, CLBuffer<?> src, int n, int size) {
-
- int elements = n * size;
- ckScanExclusiveLocal2.putArg(buffer).putArg(dst).putArg(src).putNullArg(2 * WORKGROUP_SIZE * 4)
- .putArg(elements).putArg(size).rewind();
-
- int localWorkSize = WORKGROUP_SIZE;
- int globalWorkSize = iSnapUp(elements, WORKGROUP_SIZE);
-
- queue.put1DRangeKernel(ckScanExclusiveLocal2, 0, globalWorkSize, localWorkSize);
- }
-
- void uniformUpdate(CLBuffer<?> dst, CLBuffer<?> buffer, int n) {
-
- ckUniformUpdate.setArgs(dst, buffer);
-
- int localWorkSize = WORKGROUP_SIZE;
- int globalWorkSize = n * WORKGROUP_SIZE;
-
- queue.put1DRangeKernel(ckUniformUpdate, 0, globalWorkSize, localWorkSize);
- }
-
- private int iSnapUp(int dividend, int divisor) {
- return ((dividend % divisor) == 0) ? dividend : (dividend - dividend % divisor + divisor);
- }
-
- public static boolean isPowerOf2(int x) {
- return ((x - 1) & x) == 0;
- }
-
- public void release() {
- program.release();
-
- if(buffer!=null) {
- buffer.release();
- }
- }
-
- public void close() {
- release();
- }
-}
diff --git a/src/com/mbien/opencl/demos/radixsort/Scan_b.cl b/src/com/mbien/opencl/demos/radixsort/Scan_b.cl
deleted file mode 100644
index 32fd4dd..0000000
--- a/src/com/mbien/opencl/demos/radixsort/Scan_b.cl
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
- *
- * NVIDIA Corporation and its licensors retain all intellectual property and
- * proprietary rights in and to this software and related documentation.
- * Any use, reproduction, disclosure, or distribution of this software
- * and related documentation without an express license agreement from
- * NVIDIA Corporation is strictly prohibited.
- *
- * Please refer to the applicable NVIDIA end user license agreement (EULA)
- * associated with this source code for terms and conditions that govern
- * your use of this NVIDIA software.
- *
- */
-
-
-
-//All three kernels run 512 threads per workgroup
-//Must be a power of two
-#define WORKGROUP_SIZE 256
-
-
-
-////////////////////////////////////////////////////////////////////////////////
-// Scan codelets
-////////////////////////////////////////////////////////////////////////////////
-#if(1)
- //Naive inclusive scan: O(N * log2(N)) operations
- //Allocate 2 * 'size' local memory, initialize the first half
- //with 'size' zeros avoiding if(pos >= offset) condition evaluation
- //and saving instructions
- inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){
- uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1));
- l_Data[pos] = 0;
- pos += size;
- l_Data[pos] = idata;
-
- for(uint offset = 1; offset < size; offset <<= 1){
- barrier(CLK_LOCAL_MEM_FENCE);
- uint t = l_Data[pos] + l_Data[pos - offset];
- barrier(CLK_LOCAL_MEM_FENCE);
- l_Data[pos] = t;
- }
-
- return l_Data[pos];
- }
-
- inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){
- return scan1Inclusive(idata, l_Data, size) - idata;
- }
-
-#else
- #define LOG2_WARP_SIZE 5U
- #define WARP_SIZE (1U << LOG2_WARP_SIZE)
-
- //Almost the same as naiveScan1 but doesn't need barriers
- //assuming size <= WARP_SIZE
- inline uint warpScanInclusive(uint idata, __local uint *l_Data, uint size){
- uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1));
- l_Data[pos] = 0;
- pos += size;
- l_Data[pos] = idata;
-
- for(uint offset = 1; offset < size; offset <<= 1)
- l_Data[pos] += l_Data[pos - offset];
-
- return l_Data[pos];
- }
-
- inline uint warpScanExclusive(uint idata, __local uint *l_Data, uint size){
- return warpScanInclusive(idata, l_Data, size) - idata;
- }
-
- inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){
- if(size > WARP_SIZE){
- //Bottom-level inclusive warp scan
- uint warpResult = warpScanInclusive(idata, l_Data, WARP_SIZE);
-
- //Save top elements of each warp for exclusive warp scan
- //sync to wait for warp scans to complete (because l_Data is being overwritten)
- barrier(CLK_LOCAL_MEM_FENCE);
- if( (get_local_id(0) & (WARP_SIZE - 1)) == (WARP_SIZE - 1) )
- l_Data[get_local_id(0) >> LOG2_WARP_SIZE] = warpResult;
-
- //wait for warp scans to complete
- barrier(CLK_LOCAL_MEM_FENCE);
- if( get_local_id(0) < (WORKGROUP_SIZE / WARP_SIZE) ){
- //grab top warp elements
- uint val = l_Data[get_local_id(0)];
- //calculate exclsive scan and write back to shared memory
- l_Data[get_local_id(0)] = warpScanExclusive(val, l_Data, size >> LOG2_WARP_SIZE);
- }
-
- //return updated warp scans with exclusive scan results
- barrier(CLK_LOCAL_MEM_FENCE);
- return warpResult + l_Data[get_local_id(0) >> LOG2_WARP_SIZE];
- }else{
- return warpScanInclusive(idata, l_Data, size);
- }
- }
-
- inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){
- return scan1Inclusive(idata, l_Data, size) - idata;
- }
-#endif
-
-
-//Vector scan: the array to be scanned is stored
-//in work-item private memory as uint4
-inline uint4 scan4Inclusive(uint4 data4, __local uint *l_Data, uint size){
- //Level-0 inclusive scan
- data4.y += data4.x;
- data4.z += data4.y;
- data4.w += data4.z;
-
- //Level-1 exclusive scan
- uint val = scan1Inclusive(data4.w, l_Data, size / 4) - data4.w;
-
- return (data4 + (uint4)val);
-}
-
-inline uint4 scan4Exclusive(uint4 data4, __local uint *l_Data, uint size){
- return scan4Inclusive(data4, l_Data, size) - data4;
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-// Scan kernels
-////////////////////////////////////////////////////////////////////////////////
-__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
-void scanExclusiveLocal1(
- __global uint4 *d_Dst,
- __global uint4 *d_Src,
- __local uint* l_Data,
- uint size
-){
- //Load data
- uint4 idata4 = d_Src[get_global_id(0)];
-
- //Calculate exclusive scan
- uint4 odata4 = scan4Exclusive(idata4, l_Data, size);
-
- //Write back
- d_Dst[get_global_id(0)] = odata4;
-}
-
-//Exclusive scan of top elements of bottom-level scans (4 * THREADBLOCK_SIZE)
-__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
-void scanExclusiveLocal2(
- __global uint *d_Buf,
- __global uint *d_Dst,
- __global uint *d_Src,
- __local uint* l_Data,
- uint N,
- uint arrayLength
-){
- //Load top elements
- //Convert results of bottom-level scan back to inclusive
- //Skip loads and stores for inactive work-items of the work-group with highest index(pos >= N)
- uint data = 0;
- if(get_global_id(0) < N)
- data =
- d_Dst[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)] +
- d_Src[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)];
-
- //Compute
- uint odata = scan1Exclusive(data, l_Data, arrayLength);
-
- //Avoid out-of-bound access
- if(get_global_id(0) < N)
- d_Buf[get_global_id(0)] = odata;
-}
-
-//Final step of large-array scan: combine basic inclusive scan with exclusive scan of top elements of input arrays
-__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
-void uniformUpdate(
- __global uint4 *d_Data,
- __global uint *d_Buf
-){
- __local uint buf[1];
-
- uint4 data4 = d_Data[get_global_id(0)];
-
- if(get_local_id(0) == 0)
- buf[0] = d_Buf[get_group_id(0)];
-
- barrier(CLK_LOCAL_MEM_FENCE);
- data4 += (uint4)buf[0];
- d_Data[get_global_id(0)] = data4;
-}