From 2c85c416d85205ab98b33e1a0b0daab32d4d81ff Mon Sep 17 00:00:00 2001
From: Michael Bien <mbien@fh-landshut.de>
Date: Mon, 12 Apr 2010 22:27:03 +0200
Subject: changes due to package renaming in jocl.

---
 nbproject/configs/BitonicSort.properties           |   2 +-
 nbproject/configs/GLCLMandel.properties            |   2 +-
 nbproject/configs/GLCLinterop.properties           |   2 +-
 nbproject/configs/Julia3d.properties               |   2 +-
 nbproject/configs/RadixSort.properties             |   2 +-
 nbproject/project.properties                       |   4 +-
 .../jogamp/opencl/demos/bitonicsort/BitonicSort.cl | 214 +++++++++
 .../opencl/demos/bitonicsort/BitonicSort.java      | 201 +++++++++
 src/com/jogamp/opencl/demos/fractal/Mandelbrot.cl  |  51 +++
 .../opencl/demos/fractal/MultiDeviceFractal.java   | 485 ++++++++++++++++++++
 .../demos/fractal/MultiDeviceFractal.java.orig     | 484 ++++++++++++++++++++
 .../jogamp/opencl/demos/hellojocl/HelloJOCL.java   |  91 ++++
 src/com/jogamp/opencl/demos/hellojocl/VectorAdd.cl |  15 +
 .../joglinterop/GLCLInteroperabilityDemo.java      | 277 ++++++++++++
 .../jogamp/opencl/demos/joglinterop/JoglInterop.cl |  23 +
 .../demos/joglinterop/UserSceneInteraction.java    | 103 +++++
 src/com/jogamp/opencl/demos/julia3d/Julia3d.java   | 212 +++++++++
 src/com/jogamp/opencl/demos/julia3d/Renderer.java  | 203 +++++++++
 .../opencl/demos/julia3d/UserSceneController.java  | 249 +++++++++++
 src/com/jogamp/opencl/demos/julia3d/config.h       |  24 +
 .../opencl/demos/julia3d/mandelbrot_kernel.cl      | 357 +++++++++++++++
 .../opencl/demos/julia3d/rendering_kernel.cl       | 382 ++++++++++++++++
 .../opencl/demos/julia3d/structs/Camera.java       |  50 +++
 .../opencl/demos/julia3d/structs/Camera32.java     |  37 ++
 .../opencl/demos/julia3d/structs/Camera64.java     |  48 ++
 .../demos/julia3d/structs/RenderingConfig.java     |  78 ++++
 .../demos/julia3d/structs/RenderingConfig32.java   | 102 +++++
 .../demos/julia3d/structs/RenderingConfig64.java   | 105 +++++
 .../jogamp/opencl/demos/julia3d/structs/Vec.java   |  53 +++
 .../jogamp/opencl/demos/julia3d/structs/Vec32.java |  44 ++
 .../jogamp/opencl/demos/julia3d/structs/Vec64.java |  44 ++
 src/com/jogamp/opencl/demos/radixsort/RadixSort.cl | 358 +++++++++++++++
 .../jogamp/opencl/demos/radixsort/RadixSort.java   | 182 ++++++++
 .../opencl/demos/radixsort/RadixSortDemo.java      | 129 ++++++
 src/com/jogamp/opencl/demos/radixsort/Scan.java    | 131 ++++++
 src/com/jogamp/opencl/demos/radixsort/Scan_b.cl    | 190 ++++++++
 .../mbien/opencl/demos/bitonicsort/BitonicSort.cl  | 214 ---------
 .../opencl/demos/bitonicsort/BitonicSort.java      | 201 ---------
 src/com/mbien/opencl/demos/fractal/Mandelbrot.cl   |  51 ---
 .../opencl/demos/fractal/MultiDeviceFractal.java   | 486 ---------------------
 .../demos/fractal/MultiDeviceFractal.java.orig     | 484 --------------------
 .../mbien/opencl/demos/hellojocl/HelloJOCL.java    |  91 ----
 src/com/mbien/opencl/demos/hellojocl/VectorAdd.cl  |  15 -
 .../joglinterop/GLCLInteroperabilityDemo.java      | 277 ------------
 .../mbien/opencl/demos/joglinterop/JoglInterop.cl  |  23 -
 .../demos/joglinterop/UserSceneInteraction.java    | 103 -----
 src/com/mbien/opencl/demos/julia3d/Julia3d.java    | 212 ---------
 src/com/mbien/opencl/demos/julia3d/Renderer.java   | 203 ---------
 .../opencl/demos/julia3d/UserSceneController.java  | 249 -----------
 src/com/mbien/opencl/demos/julia3d/config.h        |  24 -
 .../opencl/demos/julia3d/mandelbrot_kernel.cl      | 357 ---------------
 .../mbien/opencl/demos/julia3d/rendering_kernel.cl | 382 ----------------
 .../mbien/opencl/demos/julia3d/structs/Camera.java |  52 ---
 .../opencl/demos/julia3d/structs/Camera32.java     |  41 --
 .../opencl/demos/julia3d/structs/Camera64.java     |  52 ---
 .../demos/julia3d/structs/RenderingConfig.java     |  80 ----
 .../demos/julia3d/structs/RenderingConfig32.java   | 106 -----
 .../demos/julia3d/structs/RenderingConfig64.java   | 109 -----
 .../mbien/opencl/demos/julia3d/structs/Vec.java    |  53 ---
 .../mbien/opencl/demos/julia3d/structs/Vec32.java  |  48 --
 .../mbien/opencl/demos/julia3d/structs/Vec64.java  |  48 --
 src/com/mbien/opencl/demos/radixsort/RadixSort.cl  | 358 ---------------
 .../mbien/opencl/demos/radixsort/RadixSort.java    | 183 --------
 .../opencl/demos/radixsort/RadixSortDemo.java      | 129 ------
 src/com/mbien/opencl/demos/radixsort/Scan.java     | 131 ------
 src/com/mbien/opencl/demos/radixsort/Scan_b.cl     | 190 --------
 66 files changed, 4929 insertions(+), 4959 deletions(-)
 create mode 100644 src/com/jogamp/opencl/demos/bitonicsort/BitonicSort.cl
 create mode 100644 src/com/jogamp/opencl/demos/bitonicsort/BitonicSort.java
 create mode 100644 src/com/jogamp/opencl/demos/fractal/Mandelbrot.cl
 create mode 100644 src/com/jogamp/opencl/demos/fractal/MultiDeviceFractal.java
 create mode 100644 src/com/jogamp/opencl/demos/fractal/MultiDeviceFractal.java.orig
 create mode 100644 src/com/jogamp/opencl/demos/hellojocl/HelloJOCL.java
 create mode 100644 src/com/jogamp/opencl/demos/hellojocl/VectorAdd.cl
 create mode 100644 src/com/jogamp/opencl/demos/joglinterop/GLCLInteroperabilityDemo.java
 create mode 100644 src/com/jogamp/opencl/demos/joglinterop/JoglInterop.cl
 create mode 100644 src/com/jogamp/opencl/demos/joglinterop/UserSceneInteraction.java
 create mode 100644 src/com/jogamp/opencl/demos/julia3d/Julia3d.java
 create mode 100644 src/com/jogamp/opencl/demos/julia3d/Renderer.java
 create mode 100644 src/com/jogamp/opencl/demos/julia3d/UserSceneController.java
 create mode 100644 src/com/jogamp/opencl/demos/julia3d/config.h
 create mode 100644 src/com/jogamp/opencl/demos/julia3d/mandelbrot_kernel.cl
 create mode 100644 src/com/jogamp/opencl/demos/julia3d/rendering_kernel.cl
 create mode 100644 src/com/jogamp/opencl/demos/julia3d/structs/Camera.java
 create mode 100644 src/com/jogamp/opencl/demos/julia3d/structs/Camera32.java
 create mode 100644 src/com/jogamp/opencl/demos/julia3d/structs/Camera64.java
 create mode 100644 src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig.java
 create mode 100644 src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig32.java
 create mode 100644 src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig64.java
 create mode 100644 src/com/jogamp/opencl/demos/julia3d/structs/Vec.java
 create mode 100644 src/com/jogamp/opencl/demos/julia3d/structs/Vec32.java
 create mode 100644 src/com/jogamp/opencl/demos/julia3d/structs/Vec64.java
 create mode 100644 src/com/jogamp/opencl/demos/radixsort/RadixSort.cl
 create mode 100644 src/com/jogamp/opencl/demos/radixsort/RadixSort.java
 create mode 100644 src/com/jogamp/opencl/demos/radixsort/RadixSortDemo.java
 create mode 100644 src/com/jogamp/opencl/demos/radixsort/Scan.java
 create mode 100644 src/com/jogamp/opencl/demos/radixsort/Scan_b.cl
 delete mode 100644 src/com/mbien/opencl/demos/bitonicsort/BitonicSort.cl
 delete mode 100644 src/com/mbien/opencl/demos/bitonicsort/BitonicSort.java
 delete mode 100644 src/com/mbien/opencl/demos/fractal/Mandelbrot.cl
 delete mode 100644 src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java
 delete mode 100644 src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java.orig
 delete mode 100644 src/com/mbien/opencl/demos/hellojocl/HelloJOCL.java
 delete mode 100644 src/com/mbien/opencl/demos/hellojocl/VectorAdd.cl
 delete mode 100644 src/com/mbien/opencl/demos/joglinterop/GLCLInteroperabilityDemo.java
 delete mode 100644 src/com/mbien/opencl/demos/joglinterop/JoglInterop.cl
 delete mode 100644 src/com/mbien/opencl/demos/joglinterop/UserSceneInteraction.java
 delete mode 100644 src/com/mbien/opencl/demos/julia3d/Julia3d.java
 delete mode 100644 src/com/mbien/opencl/demos/julia3d/Renderer.java
 delete mode 100644 src/com/mbien/opencl/demos/julia3d/UserSceneController.java
 delete mode 100644 src/com/mbien/opencl/demos/julia3d/config.h
 delete mode 100644 src/com/mbien/opencl/demos/julia3d/mandelbrot_kernel.cl
 delete mode 100644 src/com/mbien/opencl/demos/julia3d/rendering_kernel.cl
 delete mode 100644 src/com/mbien/opencl/demos/julia3d/structs/Camera.java
 delete mode 100644 src/com/mbien/opencl/demos/julia3d/structs/Camera32.java
 delete mode 100644 src/com/mbien/opencl/demos/julia3d/structs/Camera64.java
 delete mode 100644 src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig.java
 delete mode 100644 src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig32.java
 delete mode 100644 src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig64.java
 delete mode 100644 src/com/mbien/opencl/demos/julia3d/structs/Vec.java
 delete mode 100644 src/com/mbien/opencl/demos/julia3d/structs/Vec32.java
 delete mode 100644 src/com/mbien/opencl/demos/julia3d/structs/Vec64.java
 delete mode 100644 src/com/mbien/opencl/demos/radixsort/RadixSort.cl
 delete mode 100644 src/com/mbien/opencl/demos/radixsort/RadixSort.java
 delete mode 100644 src/com/mbien/opencl/demos/radixsort/RadixSortDemo.java
 delete mode 100644 src/com/mbien/opencl/demos/radixsort/Scan.java
 delete mode 100644 src/com/mbien/opencl/demos/radixsort/Scan_b.cl

diff --git a/nbproject/configs/BitonicSort.properties b/nbproject/configs/BitonicSort.properties
index afddef4..87fc951 100644
--- a/nbproject/configs/BitonicSort.properties
+++ b/nbproject/configs/BitonicSort.properties
@@ -1 +1 @@
-main.class=com.mbien.opencl.demos.bitonicsort.BitonicSort
+main.class=com.jogamp.opencl.demos.bitonicsort.BitonicSort
diff --git a/nbproject/configs/GLCLMandel.properties b/nbproject/configs/GLCLMandel.properties
index fac72c7..fab0042 100644
--- a/nbproject/configs/GLCLMandel.properties
+++ b/nbproject/configs/GLCLMandel.properties
@@ -1 +1 @@
-main.class=com.mbien.opencl.demos.fractal.MultiDeviceFractal
+main.class=com.jogamp.opencl.demos.fractal.MultiDeviceFractal
diff --git a/nbproject/configs/GLCLinterop.properties b/nbproject/configs/GLCLinterop.properties
index 710c784..eff8ca4 100644
--- a/nbproject/configs/GLCLinterop.properties
+++ b/nbproject/configs/GLCLinterop.properties
@@ -1 +1 @@
-main.class=com.mbien.opencl.demos.joglinterop.GLCLInteroperabilityDemo
+main.class=com.jogamp.opencl.demos.joglinterop.GLCLInteroperabilityDemo
diff --git a/nbproject/configs/Julia3d.properties b/nbproject/configs/Julia3d.properties
index 3fa79e8..e11756d 100644
--- a/nbproject/configs/Julia3d.properties
+++ b/nbproject/configs/Julia3d.properties
@@ -1 +1 @@
-main.class=com.mbien.opencl.demos.julia3d.Julia3d
+main.class=com.jogamp.opencl.demos.julia3d.Julia3d
diff --git a/nbproject/configs/RadixSort.properties b/nbproject/configs/RadixSort.properties
index 41e68f5..d3c5458 100644
--- a/nbproject/configs/RadixSort.properties
+++ b/nbproject/configs/RadixSort.properties
@@ -1 +1 @@
-main.class=com.mbien.opencl.demos.radixsort.RadixSortDemo
+main.class=com.jogamp.opencl.demos.radixsort.RadixSortDemo
diff --git a/nbproject/project.properties b/nbproject/project.properties
index a21489a..c1425c6 100644
--- a/nbproject/project.properties
+++ b/nbproject/project.properties
@@ -1,5 +1,5 @@
 application.title=JOCL Demos
-application.vendor=mbien
+application.vendor=jogamp
 build.classes.dir=${build.dir}/classes
 build.classes.excludes=**/*.java,**/*.form
 # This directory is removed when the project is cleaned:
@@ -52,7 +52,7 @@ javadoc.use=true
 javadoc.version=false
 javadoc.windowtitle=
 jaxbwiz.endorsed.dirs="${netbeans.home}/../ide12/modules/ext/jaxb/api"
-main.class=com.mbien.opencl.demos.hellojocl.HelloJOCL
+main.class=com.jogamp.opencl.demos.hellojocl.HelloJOCL
 manifest.file=manifest.mf
 meta.inf.dir=${src.dir}/META-INF
 no.dependencies=true
diff --git a/src/com/jogamp/opencl/demos/bitonicsort/BitonicSort.cl b/src/com/jogamp/opencl/demos/bitonicsort/BitonicSort.cl
new file mode 100644
index 0000000..a8d0e1d
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/bitonicsort/BitonicSort.cl
@@ -0,0 +1,214 @@
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation. 
+ * Any use, reproduction, disclosure, or distribution of this software 
+ * and related documentation without an express license agreement from
+ * NVIDIA Corporation is strictly prohibited.
+ *
+ * Please refer to the applicable NVIDIA end user license agreement (EULA) 
+ * associated with this source code for terms and conditions that govern 
+ * your use of this NVIDIA software.
+ * 
+ */
+
+
+
+//Passed down by clBuildProgram
+//#define LOCAL_SIZE_LIMIT 1024
+
+
+
+inline void ComparatorPrivate(
+    uint *keyA,
+    uint *keyB,
+    uint arrowDir
+){
+    if( (*keyA > *keyB) == arrowDir ){
+        uint t;
+        t = *keyA; *keyA = *keyB; *keyB = t;
+    }
+}
+
+inline void ComparatorLocal(
+    __local uint *keyA,
+    __local uint *keyB,
+    uint arrowDir
+){
+    if( (*keyA > *keyB) == arrowDir ){
+        uint t;
+        t = *keyA; *keyA = *keyB; *keyB = t;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Monolithic bitonic sort kernel for short arrays fitting into local memory
+////////////////////////////////////////////////////////////////////////////////
+__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
+void bitonicSortLocal(
+    __global uint *d_DstKey,
+    __global uint *d_SrcKey,
+    uint arrayLength,
+    uint sortDir
+){
+    __local  uint l_key[LOCAL_SIZE_LIMIT];
+
+    //Offset to the beginning of subbatch and load data
+    d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    l_key[get_local_id(0) +                      0] = d_SrcKey[                     0];
+    l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
+
+    for(uint size = 2; size < arrayLength; size <<= 1){
+        //Bitonic merge
+        uint dir = ( (get_local_id(0) & (size / 2)) != 0 );
+        for(uint stride = size / 2; stride > 0; stride >>= 1){
+            barrier(CLK_LOCAL_MEM_FENCE);
+            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+            ComparatorLocal(
+                &l_key[pos +      0],
+                &l_key[pos + stride],
+                dir
+            );
+        }
+    }
+
+    //dir == sortDir for the last bitonic merge step
+    {
+        for(uint stride = arrayLength / 2; stride > 0; stride >>= 1){
+            barrier(CLK_LOCAL_MEM_FENCE);
+            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+            ComparatorLocal(
+                &l_key[pos +      0],
+                &l_key[pos + stride],
+                sortDir
+            );
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    d_DstKey[                     0] = l_key[get_local_id(0) +                      0];
+    d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Bitonic sort kernel for large arrays (not fitting into local memory)
+////////////////////////////////////////////////////////////////////////////////
+//Bottom-level bitonic sort
+//Almost the same as bitonicSortLocal with the only exception
+//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being
+//sorted in opposite directions
+__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
+void bitonicSortLocal1(
+    __global uint *d_DstKey,
+    __global uint *d_SrcKey
+){
+    __local uint l_key[LOCAL_SIZE_LIMIT];
+
+    //Offset to the beginning of subarray and load data
+    d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    l_key[get_local_id(0) +                      0] = d_SrcKey[                     0];
+    l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
+
+    uint comparatorI = get_global_id(0) & ((LOCAL_SIZE_LIMIT / 2) - 1);
+
+    for(uint size = 2; size < LOCAL_SIZE_LIMIT; size <<= 1){
+        //Bitonic merge
+        uint dir = (comparatorI & (size / 2)) != 0;
+        for(uint stride = size / 2; stride > 0; stride >>= 1){
+            barrier(CLK_LOCAL_MEM_FENCE);
+            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+            ComparatorLocal(
+                &l_key[pos +      0],
+                &l_key[pos + stride],
+                dir
+            );
+        }
+    }
+
+    //Odd / even arrays of LOCAL_SIZE_LIMIT elements
+    //sorted in opposite directions
+    {
+        uint dir = (get_group_id(0) & 1);
+        for(uint stride = LOCAL_SIZE_LIMIT / 2; stride > 0; stride >>= 1){
+            barrier(CLK_LOCAL_MEM_FENCE);
+            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+            ComparatorLocal(
+                &l_key[pos +      0],
+                &l_key[pos + stride],
+               dir
+            );
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    d_DstKey[                     0] = l_key[get_local_id(0) +                      0];
+    d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
+}
+
+//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
+__kernel void bitonicMergeGlobal(
+    __global uint *d_DstKey,
+    __global uint *d_SrcKey,
+    uint arrayLength,
+    uint size,
+    uint stride,
+    uint sortDir
+){
+    uint global_comparatorI = get_global_id(0);
+    uint        comparatorI = global_comparatorI & (arrayLength / 2 - 1);
+
+    //Bitonic merge
+    uint dir = sortDir ^ ( (comparatorI & (size / 2)) != 0 );
+    uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
+
+    uint keyA = d_SrcKey[pos +      0];
+    uint keyB = d_SrcKey[pos + stride];
+
+    ComparatorPrivate(
+        &keyA, 
+        &keyB, 
+        dir
+    );
+
+    d_DstKey[pos +      0] = keyA;
+    d_DstKey[pos + stride] = keyB;
+}
+
+//Combined bitonic merge steps for
+//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2]
+__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
+void bitonicMergeLocal(
+    __global uint *d_DstKey,
+    __global uint *d_SrcKey,
+    uint arrayLength,
+    uint stride,
+    uint size,
+    uint sortDir
+){
+    __local uint l_key[LOCAL_SIZE_LIMIT];
+
+    d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
+    l_key[get_local_id(0) +                      0] = d_SrcKey[                     0];
+    l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
+
+    //Bitonic merge
+    uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1);
+    uint         dir = sortDir ^ ( (comparatorI & (size / 2)) != 0 );
+    for(; stride > 0; stride >>= 1){
+        barrier(CLK_LOCAL_MEM_FENCE);
+        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+        ComparatorLocal(
+            &l_key[pos +      0],
+            &l_key[pos + stride],
+            dir
+        );
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    d_DstKey[                     0] = l_key[get_local_id(0) +                      0];
+    d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
+}
diff --git a/src/com/jogamp/opencl/demos/bitonicsort/BitonicSort.java b/src/com/jogamp/opencl/demos/bitonicsort/BitonicSort.java
new file mode 100644
index 0000000..3d954f2
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/bitonicsort/BitonicSort.java
@@ -0,0 +1,201 @@
+/*
+ * 18:42 Saturday, February 27 2010
+ */
+package com.jogamp.opencl.demos.bitonicsort;
+
+import com.jogamp.opencl.CLBuffer;
+import com.jogamp.opencl.CLCommandQueue;
+import com.jogamp.opencl.CLContext;
+import com.jogamp.opencl.CLDevice;
+import com.jogamp.opencl.CLKernel;
+import com.jogamp.opencl.CLProgram;
+import java.io.IOException;
+import java.nio.IntBuffer;
+import java.util.Map;
+import java.util.Random;
+
+import static java.lang.System.*;
+import static com.jogamp.opencl.CLMemory.Mem.*;
+import static com.jogamp.opencl.CLProgram.*;
+
+/**
+ * Bitonic sort optimized for GPUs.
+ * Uses NVIDIA's bitonic merge sort kernel.
+ * @author Michael Bien
+ */
+public class BitonicSort {
+
+    private static final String BITONIC_MERGE_GLOBAL = "bitonicMergeGlobal";
+    private static final String BITONIC_MERGE_LOCAL  = "bitonicMergeLocal";
+    private static final String BITONIC_SORT_LOCAL   = "bitonicSortLocal";
+    private static final String BITONIC_SORT_LOCAL1  = "bitonicSortLocal1";
+
+    private final static int LOCAL_SIZE_LIMIT = 1024;
+    private final Map<String, CLKernel> kernels;
+
+    public BitonicSort() throws IOException {
+
+        final int sortDir  = 1;
+        final int elements = 1048576;
+        final int maxvalue = 1000000;
+
+        out.println("Initializing OpenCL...");
+
+        //Create the context
+        CLContext context = null;
+
+        try{
+
+            context = CLContext.create();
+            CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue();
+
+            out.println("Initializing OpenCL bitonic sorter...");
+            kernels = initBitonicSort(queue);
+
+            out.println("Creating OpenCL memory objects...");
+            CLBuffer<IntBuffer> keyBuffer = context.createIntBuffer(elements, READ_ONLY, USE_BUFFER);
+            System.out.println(keyBuffer.getCLSize()/1000000.0f);
+
+            out.println("Initializing data...\n");
+            Random random = new Random();
+            for (int i = 0; i < elements; i++) {
+                int rnd = random.nextInt(maxvalue);
+                keyBuffer.getBuffer().put(i, rnd);
+            }
+
+            int arrayLength = elements;
+            int batch = elements / arrayLength;
+
+            out.printf("Test array length %d (%d arrays in the batch)...\n", arrayLength, batch);
+
+            long time = currentTimeMillis();
+
+            bitonicSort(queue, keyBuffer, keyBuffer, batch, arrayLength, sortDir);
+            queue.putReadBuffer(keyBuffer, true);
+
+            out.println(currentTimeMillis() - time+"ms");
+
+            IntBuffer keys = keyBuffer.getBuffer();
+            printSnapshot(keys, 20);
+            checkIfSorted(keys);
+
+            out.println("\nTEST PASSED");
+        
+        }finally{
+            if(context!=null) {
+                context.release();
+            }
+        }
+
+    }
+    
+    private Map<String, CLKernel> initBitonicSort(CLCommandQueue queue) throws IOException {
+
+        out.println("    creating bitonic sort program");
+
+        CLContext context = queue.getContext();
+
+        CLProgram program = context.createProgram(getClass().getResourceAsStream("BitonicSort.cl"))
+                                   .build(define("LOCAL_SIZE_LIMIT", LOCAL_SIZE_LIMIT));
+
+        Map<String, CLKernel> kernelMap = program.createCLKernels();
+
+        out.println("    checking minimum supported workgroup size");
+        //Check for work group size
+        CLDevice device = queue.getDevice();
+        long szBitonicSortLocal  = kernelMap.get(BITONIC_SORT_LOCAL).getWorkGroupSize(device);
+        long szBitonicSortLocal1 = kernelMap.get(BITONIC_SORT_LOCAL1).getWorkGroupSize(device);
+        long szBitonicMergeLocal = kernelMap.get(BITONIC_MERGE_LOCAL).getWorkGroupSize(device);
+
+        if (    (szBitonicSortLocal < (LOCAL_SIZE_LIMIT / 2))
+             || (szBitonicSortLocal1 < (LOCAL_SIZE_LIMIT / 2))
+             || (szBitonicMergeLocal < (LOCAL_SIZE_LIMIT / 2))  ) {
+            throw new RuntimeException("Minimum work-group size "+LOCAL_SIZE_LIMIT/2
+                    +" required by this application is not supported on this device.");
+        }
+
+        return kernelMap;
+
+    }
+
+    public void bitonicSort(CLCommandQueue queue, CLBuffer<?> dstKey, CLBuffer<?> srcKey, int batch, int arrayLength, int dir) {
+
+        if (arrayLength < 2) {
+            throw new IllegalArgumentException("arrayLength was "+arrayLength);
+        }
+
+        // TODO Only power-of-two array lengths are supported so far
+
+        dir = (dir != 0) ? 1 : 0;
+
+        CLKernel sortlocal1  = kernels.get(BITONIC_SORT_LOCAL1);
+        CLKernel sortlocal   = kernels.get(BITONIC_SORT_LOCAL);
+        CLKernel mergeGlobal = kernels.get(BITONIC_MERGE_GLOBAL);
+        CLKernel mergeLocal  = kernels.get(BITONIC_MERGE_LOCAL);
+
+        if (arrayLength <= LOCAL_SIZE_LIMIT) {
+
+            //        oclCheckError( (batch * arrayLength) % LOCAL_SIZE_LIMIT == 0, shrTRUE );
+
+            //Launch bitonicSortLocal
+            sortlocal.putArgs(dstKey, srcKey)
+                     .putArg(arrayLength).putArg(dir).rewind();
+
+            int localWorkSize = LOCAL_SIZE_LIMIT / 2;
+            int globalWorkSize = batch * arrayLength / 2;
+            queue.put1DRangeKernel(sortlocal, 0, globalWorkSize, localWorkSize);
+
+        } else {
+
+            //Launch bitonicSortLocal1
+            sortlocal1.setArgs(dstKey, srcKey);
+
+            int localWorkSize = LOCAL_SIZE_LIMIT / 2;
+            int globalWorkSize = batch * arrayLength / 2;
+
+            queue.put1DRangeKernel(sortlocal1, 0, globalWorkSize, localWorkSize);
+
+            for (int size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1) {
+                for (int stride = size / 2; stride > 0; stride >>= 1) {
+                    if (stride >= LOCAL_SIZE_LIMIT) {
+                        //Launch bitonicMergeGlobal
+                        mergeGlobal.putArgs(dstKey, dstKey)
+                                   .putArg(arrayLength).putArg(size).putArg(stride).putArg(dir).rewind();
+
+                        globalWorkSize = batch * arrayLength / 2;
+                        queue.put1DRangeKernel(mergeGlobal, 0, globalWorkSize, 0);
+                    } else {
+                        //Launch bitonicMergeLocal
+                        mergeLocal.putArgs(dstKey, dstKey)
+                                  .putArg(arrayLength).putArg(stride).putArg(size).putArg(dir).rewind();
+
+                        localWorkSize = LOCAL_SIZE_LIMIT / 2;
+                        globalWorkSize = batch * arrayLength / 2;
+
+                        queue.put1DRangeKernel(mergeLocal, 0, globalWorkSize, localWorkSize);
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    private void printSnapshot(IntBuffer buffer, int snapshot) {
+        for(int i = 0; i < snapshot; i++)
+            out.print(buffer.get() + ", ");
+        out.println("...; " + buffer.remaining() + " more");
+        buffer.rewind();
+    }
+
+    private void checkIfSorted(IntBuffer keys) {
+        for (int i = 1; i < keys.capacity(); i++) {
+            if (keys.get(i - 1) > keys.get(i)) {
+                throw new RuntimeException("not sorted "+ keys.get(i - 1) +"!> "+ keys.get(i));
+            }
+        }
+    }
+
+    public static void main(String[] args) throws IOException {
+        new BitonicSort();
+    }
+}
diff --git a/src/com/jogamp/opencl/demos/fractal/Mandelbrot.cl b/src/com/jogamp/opencl/demos/fractal/Mandelbrot.cl
new file mode 100644
index 0000000..640c775
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/fractal/Mandelbrot.cl
@@ -0,0 +1,51 @@
+#ifdef DOUBLE_FP
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    typedef double varfloat;
+#else
+    typedef float varfloat;
+#endif
+
+/**
+ * For a description of this algorithm please refer to
+ * http://en.wikipedia.org/wiki/Mandelbrot_set
+ * @author Michael Bien
+ */
+kernel void mandelbrot(
+        const int width,        const int height,
+        const varfloat x0,      const varfloat y0,
+        const varfloat rangeX,  const varfloat rangeY,
+        global uint *output,    global uint *colorMap,
+        const int colorMapSize, const int maxIterations) {
+
+    unsigned int ix = get_global_id(0);
+    unsigned int iy = get_global_id(1);
+
+    varfloat r = x0 + ix * rangeX / width;
+    varfloat i = y0 + iy * rangeY / height;
+
+    varfloat x = 0;
+    varfloat y = 0;
+
+    varfloat magnitudeSquared = 0;
+    int iteration = 0;
+
+    while (magnitudeSquared < 4 && iteration < maxIterations) {
+        varfloat x2 = x*x;
+        varfloat y2 = y*y;
+        y = 2 * x * y + i;
+        x = x2 - y2 + r;
+        magnitudeSquared = x2+y2;
+        iteration++;
+    }
+
+    if (iteration == maxIterations)  {
+        output[iy * width + ix] = 0;
+    }else {
+        varfloat alpha = (varfloat)iteration / maxIterations;
+        int colorIndex = (int)(alpha * colorMapSize);
+        output[iy * width + ix] = colorMap[colorIndex];
+      // monochrom
+      //  output[iy * width + ix] = 255*iteration/maxIterations;
+    }
+
+}
\ No newline at end of file
diff --git a/src/com/jogamp/opencl/demos/fractal/MultiDeviceFractal.java b/src/com/jogamp/opencl/demos/fractal/MultiDeviceFractal.java
new file mode 100644
index 0000000..26770b6
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/fractal/MultiDeviceFractal.java
@@ -0,0 +1,485 @@
+package com.jogamp.opencl.demos.fractal;
+
+import com.jogamp.opencl.CLBuffer;
+import com.jogamp.opencl.CLCommandQueue;
+import com.jogamp.opencl.CLDevice;
+import com.jogamp.opencl.CLEvent;
+import com.jogamp.opencl.CLEventList;
+import com.jogamp.opencl.CLException;
+import com.jogamp.opencl.gl.CLGLBuffer;
+import com.jogamp.opencl.gl.CLGLContext;
+import com.jogamp.opencl.CLKernel;
+import com.jogamp.opencl.CLPlatform;
+import com.jogamp.opencl.CLProgram;
+import com.jogamp.opencl.CLProgram.CompilerOptions;
+import com.jogamp.opengl.util.awt.TextRenderer;
+import java.awt.Color;
+import java.awt.Dimension;
+import java.awt.Font;
+import java.awt.Point;
+import java.awt.event.KeyAdapter;
+import java.awt.event.KeyEvent;
+import java.awt.event.MouseAdapter;
+import java.awt.event.MouseEvent;
+import java.awt.event.MouseWheelEvent;
+import java.io.IOException;
+import java.nio.IntBuffer;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import javax.media.opengl.DebugGL2;
+import javax.media.opengl.GL;
+import javax.media.opengl.GL2;
+import javax.media.opengl.GLAutoDrawable;
+import javax.media.opengl.GLCapabilities;
+import javax.media.opengl.GLContext;
+import javax.media.opengl.GLEventListener;
+import javax.media.opengl.GLProfile;
+import javax.media.opengl.awt.GLCanvas;
+import javax.swing.JFrame;
+import javax.swing.SwingUtilities;
+
+import static com.jogamp.common.nio.Buffers.*;
+import static javax.media.opengl.GL2.*;
+import static com.jogamp.opencl.CLMemory.Mem.*;
+import static com.jogamp.opencl.CLEvent.ProfilingCommand.*;
+import static com.jogamp.opencl.CLCommandQueue.Mode.*;
+import static java.lang.Math.*;
+
+/**
+ * Computes the Mandelbrot set with OpenCL using multiple GPUs and renders the result with OpenGL.
+ * A shared PBO is used as storage for the fractal image.<br/>
+ * http://en.wikipedia.org/wiki/Mandelbrot_set
+ * <p>
+ * controls:<br/>
+ * keys 1-9 control parallelism level<br/>
+ * space enables/disables slice seperator<br/>
+ * 'd' toggles between 32/64bit floatingpoint precision<br/>
+ * mouse/mousewheel to drag and zoom<br/>
+ * </p>
+ * @author Michael Bien
+ */
+public class MultiDeviceFractal implements GLEventListener {
+
+    // max number of used GPUs
+    private static final int MAX_PARRALLELISM_LEVEL = 8;
+
+    // max per pixel iterations to compute the fractal
+    private static final int MAX_ITERATIONS         = 500;
+
+    private GLCanvas canvas;
+
+    private CLGLContext clContext;
+    private CLCommandQueue[] queues;
+    private CLKernel[] kernels;
+    private CLProgram program;
+    private CLEventList probes;
+    private CLGLBuffer<?>[] pboBuffers;
+    private CLBuffer<IntBuffer>[] colorMap;
+
+    private int width  = 0;
+    private int height = 0;
+
+    private double minX = -2f;
+    private double minY = -1.2f;
+    private double maxX  = 0.6f;
+    private double maxY  = 1.3f;
+
+    private int slices;
+
+    private boolean drawSeperator;
+    private boolean doublePrecision;
+    private boolean buffersInitialized;
+    private boolean rebuild;
+
+    private final TextRenderer textRenderer;
+
+    public MultiDeviceFractal(int width, int height) {
+
+        this.width = width;
+        this.height = height;
+
+        canvas = new GLCanvas(new GLCapabilities(GLProfile.get(GLProfile.GL2)));
+        canvas.addGLEventListener(this);
+        initSceneInteraction();
+
+        JFrame frame = new JFrame("JOCL Multi GPU Mandelbrot Set");
+        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
+        canvas.setPreferredSize(new Dimension(width, height));
+        frame.add(canvas);
+        frame.pack();
+
+        frame.setVisible(true);
+
+        textRenderer = new TextRenderer(frame.getFont().deriveFont(Font.BOLD, 14), true, true, null, false);
+    }
+
+    public void init(GLAutoDrawable drawable) {
+
+        if(clContext == null) {
+            // enable GL error checking using the composable pipeline
+            drawable.setGL(new DebugGL2(drawable.getGL().getGL2()));
+
+            drawable.getGL().glFinish();
+            initCL(drawable.getContext());
+
+            GL2 gl = drawable.getGL().getGL2();
+
+            gl.setSwapInterval(0);
+            gl.glDisable(GL_DEPTH_TEST);
+            gl.glClearColor(0.0f, 0.0f, 0.0f, 1.0f);
+
+            initView(gl, drawable.getWidth(), drawable.getHeight());
+
+            initPBO(gl);
+            drawable.getGL().glFinish();
+
+            setKernelConstants();
+        }
+    }
+
+    private void initCL(GLContext glCtx){
+        try {
+            // create context managing all available GPUs
+//            clContext = CLGLContext.create(glCtx, GPU);
+            clContext = CLGLContext.create(glCtx, CLPlatform.getDefault().listCLDevices()[0]);
+
+
+            CLDevice[] devices = clContext.getDevices();
+
+            slices = min(devices.length, MAX_PARRALLELISM_LEVEL);
+
+            // create command queues for every GPU, setup colormap and init kernels
+            queues = new CLCommandQueue[slices];
+            kernels = new CLKernel[slices];
+            probes = new CLEventList(slices);
+            colorMap = new CLBuffer[slices];
+
+            for (int i = 0; i < slices; i++) {
+
+                colorMap[i] = clContext.createIntBuffer(32*2, READ_ONLY);
+                initColorMap(colorMap[i].getBuffer(), 32, Color.BLUE, Color.GREEN, Color.RED);
+
+                // create command queue and upload color map buffer on each used device
+                queues[i] = devices[i].createCommandQueue(PROFILING_MODE).putWriteBuffer(colorMap[i], true); // blocking upload
+
+            }
+
+            // load and build program
+            program = clContext.createProgram(getClass().getResourceAsStream("Mandelbrot.cl"));
+            buildProgram();
+
+        } catch (IOException ex) {
+            Logger.getLogger(getClass().getName()).log(Level.SEVERE, "can not find 'Mandelbrot.cl' in classpath.", ex);
+        } catch (CLException ex) {
+            Logger.getLogger(getClass().getName()).log(Level.SEVERE, "something went wrong, hopefully no one got hurt", ex);
+        }
+
+    }
+
+    private void initColorMap(IntBuffer colorMap, int stepSize, Color... colors) {
+        
+        for (int n = 0; n < colors.length - 1; n++) {
+
+            Color color = colors[n];
+            int r0 = color.getRed();
+            int g0 = color.getGreen();
+            int b0 = color.getBlue();
+
+            color = colors[n + 1];
+            int r1 = color.getRed();
+            int g1 = color.getGreen();
+            int b1 = color.getBlue();
+
+            int deltaR = r1 - r0;
+            int deltaG = g1 - g0;
+            int deltaB = b1 - b0;
+
+            for (int step = 0; step < stepSize; step++) {
+                float alpha = (float) step / (stepSize - 1);
+                int r = (int) (r0 + alpha * deltaR);
+                int g = (int) (g0 + alpha * deltaG);
+                int b = (int) (b0 + alpha * deltaB);
+                colorMap.put((r << 16) | (g << 8) | (b << 0));
+            }
+        }
+        colorMap.rewind();
+
+    }
+
+    private void initView(GL2 gl, int width, int height) {
+
+        gl.glViewport(0, 0, width, height);
+
+        gl.glMatrixMode(GL_MODELVIEW);
+        gl.glLoadIdentity();
+
+        gl.glMatrixMode(GL_PROJECTION);
+        gl.glLoadIdentity();
+        gl.glOrtho(0.0, width, 0.0, height, 0.0, 1.0);
+    }
+
+    @SuppressWarnings("unchecked")
+    private void initPBO(GL gl) {
+
+        if(pboBuffers != null) {
+            int[] oldPbos = new int[pboBuffers.length];
+            for (int i = 0; i < pboBuffers.length; i++) {
+                CLGLBuffer<?> buffer = pboBuffers[i];
+                oldPbos[i] = buffer.GLID;
+                buffer.release();
+            }
+            gl.glDeleteBuffers(oldPbos.length, oldPbos, 0);
+        }
+
+        pboBuffers = new CLGLBuffer[slices];
+
+        int[] pbo = new int[slices];
+        gl.glGenBuffers(slices, pbo, 0);
+
+        // setup one empty PBO per slice
+        for (int i = 0; i < slices; i++) {
+
+            gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo[i]);
+            gl.glBufferData(GL_PIXEL_UNPACK_BUFFER, width*height * SIZEOF_INT / slices, null, GL_STREAM_DRAW);
+            gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+            pboBuffers[i] = clContext.createFromGLBuffer(pbo[i], WRITE_ONLY);
+
+        }
+
+        buffersInitialized = true;
+    }
+
+    private void buildProgram() {
+
+        /*
+         * workaround: The driver keeps using the old binaries for some reason.
+         * to solve this we simple create a new program and release the old.
+         * however rebuilding programs should be possible -> remove when drivers are fixed.
+         */
+        if(program != null && rebuild) {
+            String source = program.getSource();
+            program.release();
+            program = clContext.createProgram(source);
+        }
+
+        // disable 64bit floating point math if not available
+        if(doublePrecision) {
+            for (CLDevice device : program.getCLDevices()) {
+                if(!device.isDoubleFPAvailable()) {
+                    doublePrecision = false;
+                    break;
+                }
+            }
+        }
+
+        if(doublePrecision) {
+            program.build(CompilerOptions.FAST_RELAXED_MATH, "-D DOUBLE_FP");
+        }else{
+            program.build(CompilerOptions.FAST_RELAXED_MATH);
+        }
+        rebuild = false;
+
+        for (int i = 0; i < kernels.length; i++) {
+            // init kernel with constants
+            kernels[i] = program.createCLKernel("mandelbrot");
+        }
+
+    }
+
+    // init kernels with constants
+    private void setKernelConstants() {
+        for (int i = 0; i < slices; i++) {
+            kernels[i].setForce32BitArgs(!doublePrecision)
+                      .setArg(6, pboBuffers[i])
+                      .setArg(7, colorMap[i])
+                      .setArg(8, colorMap[i].getBuffer().capacity())
+                      .setArg(9, MAX_ITERATIONS);
+        }
+    }
+
+    // rendering cycle
+    public void display(GLAutoDrawable drawable) {
+        GL gl = drawable.getGL();
+
+        // make sure GL does not use our objects before we start computeing
+        gl.glFinish();
+        if(!buffersInitialized) {
+            initPBO(gl);
+            setKernelConstants();
+        }
+        if(rebuild) {
+            buildProgram();
+            setKernelConstants();
+        }
+        compute();
+
+        render(gl.getGL2());
+    }
+
+    // OpenCL
+    private void compute() {
+
+        int sliceWidth = width / slices;
+        double rangeX   = (maxX - minX) / slices;
+        double rangeY   = (maxY - minY);
+
+        // release all old events, you can't reuse events in OpenCL
+        probes.release();
+
+        // start computation
+        for (int i = 0; i < slices; i++) {
+
+            kernels[i].putArg(     sliceWidth).putArg(height)
+                      .putArg(minX + rangeX*i).putArg(  minY)
+                      .putArg(       rangeX  ).putArg(rangeY)
+                      .rewind();
+
+            // aquire GL objects, and enqueue a kernel with a probe from the list
+            queues[i].putAcquireGLObject(pboBuffers[i].ID)
+                     .put2DRangeKernel(kernels[i], 0, 0, sliceWidth, height, 0, 0, probes)
+                     .putReleaseGLObject(pboBuffers[i].ID);
+
+        }
+
+        // block until done (important: finish before doing further gl work)
+        for (int i = 0; i < slices; i++) {
+            queues[i].finish();
+        }
+
+    }
+
+    // OpenGL
+    private void render(GL2 gl) {
+
+        gl.glClear(GL_COLOR_BUFFER_BIT);
+
+        //draw slices
+        int sliceWidth = width / slices;
+
+        for (int i = 0; i < slices; i++) {
+
+            int seperatorOffset = drawSeperator?i:0;
+
+            gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pboBuffers[i].GLID);
+            gl.glRasterPos2i(sliceWidth*i + seperatorOffset, 0);
+
+            gl.glDrawPixels(sliceWidth, height, GL_BGRA, GL_UNSIGNED_BYTE, 0);
+
+        }
+        gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+        //draw info text
+        textRenderer.beginRendering(width, height, false);
+
+            textRenderer.draw("precision: "+ (doublePrecision?"64bit":"32bit"), 10, height-15);
+
+            for (int i = 0; i < slices; i++) {
+                CLEvent event = probes.getEvent(i);
+                long start = event.getProfilingInfo(START);
+                long end = event.getProfilingInfo(END);
+                textRenderer.draw("GPU"+i +" "+(int)((end-start)/1000000.0f)+"ms", 10, height-(20+16*(slices-i)));
+            }
+
+        textRenderer.endRendering();
+    }
+
+    public void reshape(GLAutoDrawable drawable, int x, int y, int width, int height) {
+
+        if(this.width == width && this.height == height)
+            return;
+
+        this.width = width;
+        this.height = height;
+
+        initPBO(drawable.getGL());
+
+        initView(drawable.getGL().getGL2(), drawable.getWidth(), drawable.getHeight());
+    }
+
+    private void initSceneInteraction() {
+
+        MouseAdapter mouseAdapter = new MouseAdapter() {
+
+            Point lastpos = new Point();
+
+            @Override
+            public void mouseDragged(MouseEvent e) {
+                
+                double offsetX = (lastpos.x - e.getX()) * (maxX - minX) / width;
+                double offsetY = (lastpos.y - e.getY()) * (maxY - minY) / height;
+
+                minX += offsetX;
+                minY -= offsetY;
+
+                maxX += offsetX;
+                maxY -= offsetY;
+
+                lastpos = e.getPoint();
+
+                canvas.display();
+
+            }
+
+            @Override
+            public void mouseMoved(MouseEvent e) {
+                lastpos = e.getPoint();
+            }
+            
+            @Override
+            public void mouseWheelMoved(MouseWheelEvent e) {
+                float rotation = e.getWheelRotation() / 25.0f;
+
+                double deltaX = rotation * (maxX - minX);
+                double deltaY = rotation * (maxY - minY);
+
+                // offset for "zoom to cursor"
+                double offsetX = (e.getX() / (float)width - 0.5f) * deltaX * 2;
+                double offsetY = (e.getY() / (float)height- 0.5f) * deltaY * 2;
+
+                minX += deltaX+offsetX;
+                minY += deltaY-offsetY;
+
+                maxX +=-deltaX+offsetX;
+                maxY +=-deltaY-offsetY;
+
+                canvas.display();
+            }
+        };
+
+        KeyAdapter keyAdapter = new KeyAdapter() {
+
+            @Override
+            public void keyPressed(KeyEvent e) {
+                if(e.getKeyCode() == KeyEvent.VK_SPACE) {
+                    drawSeperator = !drawSeperator;
+                }else if(e.getKeyChar() > '0' && e.getKeyChar() < '9') {
+                    int number = e.getKeyChar()-'0';
+                    slices = min(number, min(queues.length, MAX_PARRALLELISM_LEVEL));
+                    buffersInitialized = false;
+                }else if(e.getKeyCode() == KeyEvent.VK_D) {
+                    doublePrecision = !doublePrecision;
+                    rebuild = true;
+                }
+                canvas.display();
+            }
+
+        };
+
+        canvas.addMouseMotionListener(mouseAdapter);
+        canvas.addMouseWheelListener(mouseAdapter);
+        canvas.addKeyListener(keyAdapter);
+    }
+
+    public void dispose(GLAutoDrawable drawable) {
+    }
+
+    public static void main(String args[]) {
+        SwingUtilities.invokeLater(new Runnable() {
+            public void run() {
+                new MultiDeviceFractal(512, 512);
+            }
+        });
+    }
+
+}
diff --git a/src/com/jogamp/opencl/demos/fractal/MultiDeviceFractal.java.orig b/src/com/jogamp/opencl/demos/fractal/MultiDeviceFractal.java.orig
new file mode 100644
index 0000000..403aae3
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/fractal/MultiDeviceFractal.java.orig
@@ -0,0 +1,484 @@
+package com.jogamp.opencl.demos.fractal;
+
+import com.jogamp.opencl.CLBuffer;
+import com.jogamp.opencl.CLCommandQueue;
+import com.jogamp.opencl.CLContext;
+import com.jogamp.opencl.CLDevice;
+import com.jogamp.opencl.CLEvent;
+import com.jogamp.opencl.CLEventList;
+import com.jogamp.opencl.CLException;
+import com.jogamp.opencl.CLGLBuffer;
+import com.jogamp.opencl.CLGLContext;
+import com.jogamp.opencl.CLKernel;
+import com.jogamp.opencl.CLProgram;
+import com.jogamp.opencl.CLProgram.CompilerOptions;
+import com.sun.opengl.util.awt.TextRenderer;
+import java.awt.Color;
+import java.awt.Dimension;
+import java.awt.Font;
+import java.awt.Point;
+import java.awt.event.KeyAdapter;
+import java.awt.event.KeyEvent;
+import java.awt.event.MouseAdapter;
+import java.awt.event.MouseEvent;
+import java.awt.event.MouseWheelEvent;
+import java.io.IOException;
+import java.nio.IntBuffer;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import javax.media.opengl.DebugGL2;
+import javax.media.opengl.GL;
+import javax.media.opengl.GL2;
+import javax.media.opengl.GLAutoDrawable;
+import javax.media.opengl.GLCapabilities;
+import javax.media.opengl.GLContext;
+import javax.media.opengl.GLEventListener;
+import javax.media.opengl.GLProfile;
+import javax.media.opengl.awt.GLCanvas;
+import javax.swing.JFrame;
+import javax.swing.SwingUtilities;
+
+import static com.sun.gluegen.runtime.BufferFactory.*;
+import static javax.media.opengl.GL2.*;
+import static com.jogamp.opencl.CLMemory.Mem.*;
+import static com.jogamp.opencl.CLEvent.ProfilingCommand.*;
+import static com.jogamp.opencl.CLCommandQueue.Mode.*;
+import static com.jogamp.opencl.CLDevice.Type.*;
+import static java.lang.Math.*;
+
+/**
+ * Computes the Mandelbrot set with OpenCL using multiple GPUs and renders the result with OpenGL.
+ * A shared PBO is used as storage for the fractal image.<br/>
+ * http://en.wikipedia.org/wiki/Mandelbrot_set
+ * <p>
+ * controls:<br/>
+ * keys 1-9 control parallelism level<br/>
+ * space enables/disables slice seperator<br/>
+ * 'd' toggles between 32/64bit floatingpoint precision<br/>
+ * mouse/mousewheel to drag and zoom<br/>
+ * </p>
+ * @author Michael Bien
+ */
+public class MultiDeviceFractal implements GLEventListener {
+
+    // max number of used GPUs
+    private static final int MAX_PARRALLELISM_LEVEL = 8;
+
+    // max per pixel iterations to compute the fractal
+    private static final int MAX_ITERATIONS         = 1000;
+
+    private GLCanvas canvas;
+
+    private CLContext clContext;
+    private CLCommandQueue[] queues;
+    private CLKernel[] kernels;
+    private CLProgram program;
+    private CLEventList probes;
+    private CLBuffer<?>[] pboBuffers;
+    private CLBuffer<IntBuffer>[] colorMap;
+
+    private int width  = 0;
+    private int height = 0;
+
+    private double minX = -2f;
+    private double minY = -1.2f;
+    private double maxX  = 0.6f;
+    private double maxY  = 1.3f;
+
+    private int slices;
+
+    private boolean drawSeperator;
+    private boolean doublePrecision;
+    private boolean buffersInitialized;
+    private boolean rebuild;
+
+    private final TextRenderer textRenderer;
+
+    public MultiDeviceFractal(int width, int height) {
+
+        this.width = width;
+        this.height = height;
+
+        canvas = new GLCanvas(new GLCapabilities(GLProfile.get(GLProfile.GL2)));
+        canvas.addGLEventListener(this);
+        initSceneInteraction();
+
+        JFrame frame = new JFrame("JOCL Multi GPU Mandelbrot Set");
+        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
+        canvas.setPreferredSize(new Dimension(width, height));
+        frame.add(canvas);
+        frame.pack();
+
+        frame.setVisible(true);
+
+        textRenderer = new TextRenderer(frame.getFont().deriveFont(Font.BOLD, 14), true, true, null, false);
+    }
+
+    public void init(GLAutoDrawable drawable) {
+
+        // enable GL error checking using the composable pipeline
+        drawable.setGL(new DebugGL2(drawable.getGL().getGL2()));
+
+        initCL(drawable.getContext());
+
+        GL2 gl = drawable.getGL().getGL2();
+
+        gl.setSwapInterval(0);
+        gl.glDisable(GL_DEPTH_TEST);
+        gl.glClearColor(0.0f, 0.0f, 0.0f, 1.0f);
+
+        initView(gl, drawable.getWidth(), drawable.getHeight());
+
+        initPBO(gl);
+        setKernelConstants();
+    }
+
+    private void initCL(GLContext glCtx){
+        try {
+            // create context managing all available GPUs
+            clContext = CLContext.create(GPU);
+
+            CLDevice[] devices = clContext.getCLDevices();
+
+            slices = min(devices.length, MAX_PARRALLELISM_LEVEL);
+
+            // create command queues for every GPU, setup colormap and init kernels
+            queues = new CLCommandQueue[slices];
+            kernels = new CLKernel[slices];
+            probes = new CLEventList(slices);
+            colorMap = new CLBuffer[slices];
+
+            for (int i = 0; i < slices; i++) {
+
+                colorMap[i] = clContext.createIntBuffer(32*2, READ_ONLY);
+                initColorMap(colorMap[i].getBuffer(), 32, Color.BLUE, Color.GREEN, Color.RED);
+
+                // create command queue and upload color map buffer on each used device
+                queues[i] = devices[i].createCommandQueue(PROFILING_MODE).putWriteBuffer(colorMap[i], true); // blocking upload
+
+            }
+
+            // load and build program
+            program = clContext.createProgram(getClass().getResourceAsStream("Mandelbrot.cl"));
+            buildProgram();
+
+        } catch (IOException ex) {
+            Logger.getLogger(getClass().getName()).log(Level.SEVERE, "can not find 'Mandelbrot.cl' in classpath.", ex);
+        } catch (CLException ex) {
+            Logger.getLogger(getClass().getName()).log(Level.SEVERE, "something went wrong, hopefully no one got hurt", ex);
+        }
+
+    }
+
+    private void initColorMap(IntBuffer colorMap, int stepSize, Color... colors) {
+        
+        for (int n = 0; n < colors.length - 1; n++) {
+
+            Color color = colors[n];
+            int r0 = color.getRed();
+            int g0 = color.getGreen();
+            int b0 = color.getBlue();
+
+            color = colors[n + 1];
+            int r1 = color.getRed();
+            int g1 = color.getGreen();
+            int b1 = color.getBlue();
+
+            int deltaR = r1 - r0;
+            int deltaG = g1 - g0;
+            int deltaB = b1 - b0;
+
+            for (int step = 0; step < stepSize; step++) {
+                float alpha = (float) step / (stepSize - 1);
+                int r = (int) (r0 + alpha * deltaR);
+                int g = (int) (g0 + alpha * deltaG);
+                int b = (int) (b0 + alpha * deltaB);
+                colorMap.put((r << 16) | (g << 8) | (b << 0));
+            }
+        }
+        colorMap.rewind();
+
+    }
+
+    private void initView(GL2 gl, int width, int height) {
+
+        gl.glViewport(0, 0, width, height);
+
+        gl.glMatrixMode(GL_MODELVIEW);
+        gl.glLoadIdentity();
+
+        gl.glMatrixMode(GL_PROJECTION);
+        gl.glLoadIdentity();
+        gl.glOrtho(0.0, width, 0.0, height, 0.0, 1.0);
+    }
+
+    @SuppressWarnings("unchecked")
+    private void initPBO(GL gl) {
+
+        if(pboBuffers != null) {
+            int[] oldPbos = new int[pboBuffers.length];
+            for (int i = 0; i < pboBuffers.length; i++) {
+                CLBuffer<?> buffer = pboBuffers[i];
+//                oldPbos[i] = buffer.GLID;
+                buffer.release();
+            }
+//            gl.glDeleteBuffers(oldPbos.length, oldPbos, 0);
+        }
+
+        pboBuffers = new CLBuffer[slices];
+
+//        int[] pbo = new int[slices];
+//        gl.glGenBuffers(slices, pbo, 0);
+
+        // setup one empty PBO per slice
+        for (int i = 0; i < slices; i++) {
+
+//            gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo[i]);
+//            gl.glBufferData(GL_PIXEL_UNPACK_BUFFER, width*height * SIZEOF_INT / slices, null, GL_STREAM_DRAW);
+//            gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+            pboBuffers[i] = clContext.createByteBuffer(width*height * SIZEOF_INT / slices, WRITE_ONLY);
+//            pboBuffers[i] = clContext.createFromGLBuffer(null, pbo[i], WRITE_ONLY);
+
+        }
+
+        buffersInitialized = true;
+    }
+
+    private void buildProgram() {
+
+        /*
+         * workaround: The driver keeps using the old binaries for some reason.
+         * to solve this we simple create a new program and release the old.
+         * however rebuilding programs should be possible -> remove when drivers are fixed.
+         */
+        if(program != null && rebuild) {
+            String source = program.getSource();
+            program.release();
+            program = clContext.createProgram(source);
+        }
+
+        // disable 64bit floating point math if not available
+        if(doublePrecision) {
+            for (CLDevice device : program.getCLDevices()) {
+                if(!device.isDoubleFPAvailable()) {
+                    doublePrecision = false;
+                    break;
+                }
+            }
+        }
+
+        if(doublePrecision) {
+            program.build(CompilerOptions.FAST_RELAXED_MATH, "-D DOUBLE_FP");
+        }else{
+            program.build(CompilerOptions.FAST_RELAXED_MATH);
+        }
+        rebuild = false;
+
+        for (int i = 0; i < kernels.length; i++) {
+            // init kernel with constants
+            kernels[i] = program.createCLKernel("mandelbrot");
+        }
+
+    }
+
+    // init kernels with constants
+    private void setKernelConstants() {
+        for (int i = 0; i < slices; i++) {
+            kernels[i].setForce32BitArgs(!doublePrecision)
+                      .setArg(6, pboBuffers[i])
+                      .setArg(7, colorMap[i])
+                      .setArg(8, colorMap[i].getBuffer().capacity())
+                      .setArg(9, MAX_ITERATIONS);
+        }
+    }
+
+    // rendering cycle
+    public void display(GLAutoDrawable drawable) {
+        GL gl = drawable.getGL();
+
+        if(!buffersInitialized) {
+            initPBO(gl);
+            setKernelConstants();
+        }
+        if(rebuild) {
+            buildProgram();
+            setKernelConstants();
+        }
+        // make sure GL does not use our objects before we start computeing
+        gl.glFinish();
+        compute();
+
+        render(gl.getGL2());
+    }
+
+    // OpenCL
+    private void compute() {
+
+        int sliceWidth = width / slices;
+        double rangeX   = (maxX - minX) / slices;
+        double rangeY   = (maxY - minY);
+
+        // release all old events, you can't reuse events in OpenCL
+        probes.release();
+
+        long time = System.currentTimeMillis();
+        // start computation
+        for (int i = 0; i < slices; i++) {
+
+            kernels[i].putArg(     sliceWidth).putArg(height)
+                      .putArg(minX + rangeX*i).putArg(  minY)
+                      .putArg(       rangeX  ).putArg(rangeY)
+                      .rewind();
+
+            // aquire GL objects, and enqueue a kernel with a probe from the list
+            queues[i]
+//                    .putAcquireGLObject(pboBuffers[i].ID)
+                     .put2DRangeKernel(kernels[i], 0, 0, sliceWidth, height, 0, 0, probes)
+//                     .putReleaseGLObject(pboBuffers[i].ID)
+                     ;
+
+        }
+
+        // block until done
+        for (int i = 0; i < slices; i++) {
+            queues[i].finish();
+        }
+        System.out.println((System.currentTimeMillis()-time)/1000.0f);
+
+    }
+
+    // OpenGL
+    private void render(GL2 gl) {
+
+        gl.glClear(GL_COLOR_BUFFER_BIT);
+
+        //draw slices
+        int sliceWidth = width / slices;
+
+//        for (int i = 0; i < slices; i++) {
+//
+//            int seperatorOffset = drawSeperator?i:0;
+//
+//            gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pboBuffers[i].GLID);
+//            gl.glRasterPos2i(sliceWidth*i + seperatorOffset, 0);
+//
+//            gl.glDrawPixels(sliceWidth, height, GL_BGRA, GL_UNSIGNED_BYTE, 0);
+//
+//        }
+//        gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+        //draw info text
+        textRenderer.beginRendering(width, height, false);
+
+            textRenderer.draw("precision: "+ (doublePrecision?"64bit":"32bit"), 10, height-15);
+
+            for (int i = 0; i < slices; i++) {
+                CLEvent event = probes.getEvent(i);
+                long start = event.getProfilingInfo(START);
+                long end = event.getProfilingInfo(END);
+                textRenderer.draw("GPU"+i +" "+((end-start)/1000000000.0f)+"s", 10, height-(20+16*(slices-i)));
+            }
+
+        textRenderer.endRendering();
+    }
+
+    public void reshape(GLAutoDrawable drawable, int x, int y, int width, int height) {
+
+        if(this.width == width && this.height == height)
+            return;
+
+        this.width = width;
+        this.height = height;
+
+        initPBO(drawable.getGL());
+
+        initView(drawable.getGL().getGL2(), drawable.getWidth(), drawable.getHeight());
+    }
+
+    private void initSceneInteraction() {
+
+        MouseAdapter mouseAdapter = new MouseAdapter() {
+
+            Point lastpos = new Point();
+
+            @Override
+            public void mouseDragged(MouseEvent e) {
+                
+                double offsetX = (lastpos.x - e.getX()) * (maxX - minX) / width;
+                double offsetY = (lastpos.y - e.getY()) * (maxY - minY) / height;
+
+                minX += offsetX;
+                minY -= offsetY;
+
+                maxX += offsetX;
+                maxY -= offsetY;
+
+                lastpos = e.getPoint();
+
+                canvas.display();
+
+            }
+
+            @Override
+            public void mouseMoved(MouseEvent e) {
+                lastpos = e.getPoint();
+            }
+            
+            @Override
+            public void mouseWheelMoved(MouseWheelEvent e) {
+                float rotation = e.getWheelRotation() / 25.0f;
+
+                double deltaX = rotation * (maxX - minX);
+                double deltaY = rotation * (maxY - minY);
+
+                // offset for "zoom to cursor"
+                double offsetX = (e.getX() / (float)width - 0.5f) * deltaX * 2;
+                double offsetY = (e.getY() / (float)height- 0.5f) * deltaY * 2;
+
+                minX += deltaX+offsetX;
+                minY += deltaY-offsetY;
+
+                maxX +=-deltaX+offsetX;
+                maxY +=-deltaY-offsetY;
+
+                canvas.display();
+            }
+        };
+
+        KeyAdapter keyAdapter = new KeyAdapter() {
+
+            @Override
+            public void keyPressed(KeyEvent e) {
+                if(e.getKeyCode() == KeyEvent.VK_SPACE) {
+                    drawSeperator = !drawSeperator;
+                }else if(e.getKeyChar() > '0' && e.getKeyChar() < '9') {
+                    int number = e.getKeyChar()-'0';
+                    slices = min(number, min(queues.length, MAX_PARRALLELISM_LEVEL));
+                    buffersInitialized = false;
+                }else if(e.getKeyCode() == KeyEvent.VK_D) {
+                    doublePrecision = !doublePrecision;
+                    rebuild = true;
+                }
+                canvas.display();
+            }
+
+        };
+
+        canvas.addMouseMotionListener(mouseAdapter);
+        canvas.addMouseWheelListener(mouseAdapter);
+        canvas.addKeyListener(keyAdapter);
+    }
+
+    public void dispose(GLAutoDrawable drawable) {
+    }
+
+    public static void main(String args[]) {
+        SwingUtilities.invokeLater(new Runnable() {
+            public void run() {
+                new MultiDeviceFractal(512, 512);
+            }
+        });
+    }
+
+}
diff --git a/src/com/jogamp/opencl/demos/hellojocl/HelloJOCL.java b/src/com/jogamp/opencl/demos/hellojocl/HelloJOCL.java
new file mode 100644
index 0000000..31fabab
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/hellojocl/HelloJOCL.java
@@ -0,0 +1,91 @@
+package com.jogamp.opencl.demos.hellojocl;
+
+import com.jogamp.opencl.CLBuffer;
+import com.jogamp.opencl.CLCommandQueue;
+import com.jogamp.opencl.CLContext;
+import com.jogamp.opencl.CLKernel;
+import com.jogamp.opencl.CLProgram;
+import java.io.IOException;
+import java.nio.FloatBuffer;
+import java.util.Random;
+
+import static java.lang.System.*;
+import static com.jogamp.opencl.CLMemory.Mem.*;
+
+/**
+ * Hello Java OpenCL example. Adds all elements of buffer A to buffer B
+ * and stores the result in buffer C.<br/>
+ * Sample was inspired by the Nvidia VectorAdd example written in C/C++
+ * which is bundled in the Nvidia OpenCL SDK.
+ * @author Michael Bien
+ */
+public class HelloJOCL {
+
+    public static void main(String[] args) throws IOException {
+
+        int elementCount = 11444777;                                // Length of arrays to process
+        int localWorkSize = 256;                                    // Local work size dimensions
+        int globalWorkSize = roundUp(localWorkSize, elementCount);  // rounded up to the nearest multiple of the localWorkSize
+
+        // set up
+        CLContext context = CLContext.create();
+
+        CLProgram program = context.createProgram(HelloJOCL.class.getResourceAsStream("VectorAdd.cl")).build();
+
+        CLBuffer<FloatBuffer> clBufferA = context.createFloatBuffer(globalWorkSize, READ_ONLY);
+        CLBuffer<FloatBuffer> clBufferB = context.createFloatBuffer(globalWorkSize, READ_ONLY);
+        CLBuffer<FloatBuffer> clBufferC = context.createFloatBuffer(globalWorkSize, WRITE_ONLY);
+
+        out.println("used device memory: "
+            + (clBufferA.getSize()+clBufferB.getSize()+clBufferC.getSize())/1000000 +"MB");
+
+        // fill read buffers with random numbers (just to have test data; seed is fixed -> results will not change between runs).
+        fillBuffer(clBufferA.getBuffer(), 12345);
+        fillBuffer(clBufferB.getBuffer(), 67890);
+
+        // get a reference to the kernel functon with the name 'VectorAdd'
+        // and map the buffers to its input parameters.
+        CLKernel kernel = program.createCLKernel("VectorAdd");
+        kernel.putArgs(clBufferA, clBufferB, clBufferC).putArg(elementCount);
+
+        // create command queue on fastest device.
+        CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue();
+
+        // asynchronous write of data to GPU device, blocking read later to get the computed results back.
+        long time = nanoTime();
+        queue.putWriteBuffer(clBufferA, false)
+             .putWriteBuffer(clBufferB, false)
+             .put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize)
+             .putReadBuffer(clBufferC, true);
+        time = nanoTime() - time;
+
+        // cleanup all resources associated with this context.
+        context.release();
+
+        // print first few elements of the resulting buffer to the console.
+        out.println("a+b=c results snapshot: ");
+        for(int i = 0; i < 10; i++)
+            out.print(clBufferC.getBuffer().get() + ", ");
+        out.println("...; " + clBufferC.getBuffer().remaining() + " more");
+
+        out.println("computation took: "+(time/1000000)+"ms");
+
+    }
+
+    private static final void fillBuffer(FloatBuffer buffer, int seed) {
+        Random rnd = new Random(seed);
+        while(buffer.remaining() != 0)
+            buffer.put(rnd.nextFloat()*100);
+        buffer.rewind();
+    }
+
+    private static final int roundUp(int groupSize, int globalSize) {
+        int r = globalSize % groupSize;
+        if (r == 0) {
+            return globalSize;
+        } else {
+            return globalSize + groupSize - r;
+        }
+    }
+
+}
\ No newline at end of file
diff --git a/src/com/jogamp/opencl/demos/hellojocl/VectorAdd.cl b/src/com/jogamp/opencl/demos/hellojocl/VectorAdd.cl
new file mode 100644
index 0000000..ac9dde2
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/hellojocl/VectorAdd.cl
@@ -0,0 +1,15 @@
+
+    // OpenCL Kernel Function for element by element vector addition
+    kernel void VectorAdd(global const float* a, global const float* b, global float* c, int numElements) {
+
+        // get index into global data array
+        int iGID = get_global_id(0);
+
+        // bound check (equivalent to the limit on a 'for' loop for standard/serial C code
+        if (iGID >= numElements)  {
+            return;
+        }
+
+        // add the vector elements
+        c[iGID] = a[iGID] + b[iGID];
+    }
\ No newline at end of file
diff --git a/src/com/jogamp/opencl/demos/joglinterop/GLCLInteroperabilityDemo.java b/src/com/jogamp/opencl/demos/joglinterop/GLCLInteroperabilityDemo.java
new file mode 100644
index 0000000..24af1fe
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/joglinterop/GLCLInteroperabilityDemo.java
@@ -0,0 +1,277 @@
+package com.jogamp.opencl.demos.joglinterop;
+
+import com.jogamp.opencl.CLCommandQueue;
+import com.jogamp.opencl.CLDevice;
+import com.jogamp.opencl.gl.CLGLBuffer;
+import com.jogamp.opencl.gl.CLGLContext;
+import com.jogamp.opencl.CLKernel;
+import com.jogamp.opencl.CLPlatform;
+import com.jogamp.opencl.CLProgram;
+import com.jogamp.opengl.util.Animator;
+import java.io.IOException;
+import javax.media.opengl.DebugGL2;
+import javax.media.opengl.GL2;
+import javax.media.opengl.GLAutoDrawable;
+import javax.media.opengl.GLCapabilities;
+import javax.media.opengl.GLEventListener;
+import javax.media.opengl.GLProfile;
+import javax.media.opengl.awt.GLCanvas;
+import javax.media.opengl.glu.gl2.GLUgl2;
+import javax.swing.JFrame;
+import javax.swing.SwingUtilities;
+
+import static com.jogamp.common.nio.Buffers.*;
+
+/**
+ * JOCL - JOGL interoperability example.
+ * @author Michael Bien
+ */
+public class GLCLInteroperabilityDemo implements GLEventListener {
+
+    private final GLUgl2 glu = new GLUgl2();
+
+    private final int MESH_SIZE = 256;
+
+    private int width;
+    private int height;
+
+//    private final FloatBuffer vb;
+//    private final IntBuffer ib;
+
+    private final int[] glObjects = new int[2];
+    private final int VERTICES = 0;
+//    private final int INDICES  = 1;
+
+    private final UserSceneInteraction usi;
+
+    private CLGLContext clContext;
+    private CLKernel kernel;
+    private CLCommandQueue commandQueue;
+    private CLGLBuffer<?> clBuffer;
+
+    private float step = 0;
+
+    public GLCLInteroperabilityDemo() {
+
+        this.usi = new UserSceneInteraction();
+
+        // create direct memory buffers
+//        vb = newFloatBuffer(MESH_SIZE * MESH_SIZE * 4);
+//        ib = newIntBuffer((MESH_SIZE - 1) * (MESH_SIZE - 1) * 2 * 3);
+//
+//        // build indices
+//        //    0---3
+//        //    | \ |
+//        //    1---2
+//        for (int h = 0; h < MESH_SIZE - 1; h++) {
+//            for (int w = 0; w < MESH_SIZE - 1; w++) {
+//
+//                // 0 - 3 - 2
+//                ib.put(w * 6 + h * (MESH_SIZE - 1) * 6,      w + (h    ) * (MESH_SIZE)    );
+//                ib.put(w * 6 + h * (MESH_SIZE - 1) * 6 + 1,  w + (h    ) * (MESH_SIZE) + 1);
+//                ib.put(w * 6 + h * (MESH_SIZE - 1) * 6 + 2,  w + (h + 1) * (MESH_SIZE) + 1);
+//
+//                // 0 - 2 - 1
+//                ib.put(w * 6 + h * (MESH_SIZE - 1) * 6 + 3,  w + (h    ) * (MESH_SIZE)    );
+//                ib.put(w * 6 + h * (MESH_SIZE - 1) * 6 + 4,  w + (h + 1) * (MESH_SIZE) + 1);
+//                ib.put(w * 6 + h * (MESH_SIZE - 1) * 6 + 5,  w + (h + 1) * (MESH_SIZE)    );
+//
+//            }
+//        }
+//        ib.rewind();
+
+        SwingUtilities.invokeLater(new Runnable() {
+            public void run() {
+                initUI();
+            }
+        });
+
+    }
+
+    private void initUI() {
+
+        this.width  = 600;
+        this.height = 400;
+
+        GLCapabilities config = new GLCapabilities(GLProfile.get(GLProfile.GL2));
+        config.setSampleBuffers(true);
+        config.setNumSamples(4);
+
+        GLCanvas canvas = new GLCanvas(config);
+        canvas.addGLEventListener(this);
+        usi.init(canvas);
+
+        JFrame frame = new JFrame("JOGL-JOCL Interoperability Example");
+        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
+        frame.add(canvas);
+        frame.setSize(width, height);
+
+        frame.setVisible(true);
+
+    }
+
+
+    public void init(GLAutoDrawable drawable) {
+
+        if(clContext == null) {
+
+            // find gl compatible device
+            CLDevice[] devices = CLPlatform.getDefault().listCLDevices();
+            CLDevice device = null;
+            for (CLDevice d : devices) {
+                if(d.isGLMemorySharingSupported()) {
+                    device = d;
+                    break;
+                }
+            }
+            // create OpenCL context before creating any OpenGL objects
+            // you want to share with OpenCL (AMD driver requirement)
+            clContext = CLGLContext.create(drawable.getContext(), device);
+
+            // enable GL error checking using the composable pipeline
+            drawable.setGL(new DebugGL2(drawable.getGL().getGL2()));
+
+            // OpenGL initialization
+            GL2 gl = drawable.getGL().getGL2();
+
+            gl.setSwapInterval(1);
+
+            gl.glPolygonMode(GL2.GL_FRONT_AND_BACK, GL2.GL_LINE);
+
+            gl.glGenBuffers(glObjects.length, glObjects, 0);
+
+    //        gl.glBindBuffer(GL2.GL_ELEMENT_ARRAY_BUFFER, glObjects[INDICES]);
+    //        gl.glBufferData(GL2.GL_ELEMENT_ARRAY_BUFFER, ib.capacity() * SIZEOF_INT, ib, GL2.GL_STATIC_DRAW);
+    //        gl.glBindBuffer(GL2.GL_ELEMENT_ARRAY_BUFFER, 0);
+
+            gl.glEnableClientState(GL2.GL_VERTEX_ARRAY);
+                gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, glObjects[VERTICES]);
+                gl.glBufferData(GL2.GL_ARRAY_BUFFER, MESH_SIZE * MESH_SIZE * 4 * SIZEOF_FLOAT, null, GL2.GL_DYNAMIC_DRAW);
+                gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, 0);
+            gl.glDisableClientState(GL2.GL_VERTEX_ARRAY);
+
+            pushPerspectiveView(gl);
+            gl.glFinish();
+
+            // init OpenCL
+            initCL();
+
+            // start rendering thread
+            Animator animator = new Animator(drawable);
+            animator.start();
+
+        }
+    }
+
+    private void initCL() {
+
+        CLProgram program;
+        try {
+            program = clContext.createProgram(getClass().getResourceAsStream("JoglInterop.cl"));
+            program.build();
+            System.out.println(program.getBuildStatus());
+            System.out.println(program.isExecutable());
+            System.out.println(program.getBuildLog());
+        } catch (IOException ex) {
+            throw new RuntimeException("can not handle exception", ex);
+        }
+
+        commandQueue = clContext.getMaxFlopsDevice().createCommandQueue();
+
+        clBuffer = clContext.createFromGLBuffer(glObjects[VERTICES], CLGLBuffer.Mem.WRITE_ONLY);
+
+        System.out.println("cl buffer type: " + clBuffer.getGLObjectType());
+        System.out.println("shared with gl buffer: " + clBuffer.getGLObjectID());
+
+        kernel = program.createCLKernel("sineWave")
+                        .putArg(clBuffer)
+                        .putArg(MESH_SIZE)
+                        .rewind();
+
+        System.out.println("cl initialised");
+    }
+
+
+    public void display(GLAutoDrawable drawable) {
+
+        GL2 gl = drawable.getGL().getGL2();
+
+        // ensure pipeline is clean before doing cl work
+        gl.glFinish();
+
+        computeHeightfield();
+
+        gl.glClear(GL2.GL_COLOR_BUFFER_BIT | GL2.GL_DEPTH_BUFFER_BIT);
+        gl.glLoadIdentity();
+
+        usi.interact(gl);
+
+        gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, glObjects[VERTICES]);
+        gl.glVertexPointer(4, GL2.GL_FLOAT, 0, 0);
+
+//            gl.glBindBuffer(GL2.GL_ELEMENT_ARRAY_BUFFER, glObjects[INDICES]);
+
+        gl.glEnableClientState(GL2.GL_VERTEX_ARRAY);
+        gl.glDrawArrays(GL2.GL_POINTS, 0, MESH_SIZE * MESH_SIZE);
+//            gl.glDrawElements(GL2.GL_TRIANGLES, ib.capacity(), GL2.GL_UNSIGNED_INT, 0);
+        gl.glDisableClientState(GL2.GL_VERTEX_ARRAY);
+
+//            gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, 0);
+
+    }
+
+    /*
+     * Computes a heightfield using a OpenCL kernel.
+     */
+    private void computeHeightfield() {
+
+        kernel.setArg(2, step += 0.05f);
+
+        commandQueue.putAcquireGLObject(clBuffer.ID)
+                    .put2DRangeKernel(kernel, 0, 0, MESH_SIZE, MESH_SIZE, 0, 0)
+                    .putReleaseGLObject(clBuffer.ID)
+                    .finish();
+
+    }
+
+    private void pushPerspectiveView(GL2 gl) {
+
+        gl.glMatrixMode(GL2.GL_PROJECTION);
+        gl.glPushMatrix();
+
+            gl.glLoadIdentity();
+
+            glu.gluPerspective(60, width / (float)height, 1, 1000);
+            gl.glMatrixMode(GL2.GL_MODELVIEW);
+
+            gl.glPushMatrix();
+                gl.glLoadIdentity();
+
+    }
+
+    private void popView(GL2 gl) {
+
+                gl.glMatrixMode(GL2.GL_PROJECTION);
+            gl.glPopMatrix();
+
+            gl.glMatrixMode(GL2.GL_MODELVIEW);
+        gl.glPopMatrix();
+
+    }
+
+
+    public void reshape(GLAutoDrawable drawable, int arg1, int arg2, int width, int height) {
+        this.width = width;
+        this.height = height;
+        GL2 gl = drawable.getGL().getGL2();
+        popView(gl);
+        pushPerspectiveView(gl);
+    }
+
+    public void dispose(GLAutoDrawable drawable) {  }
+
+    public static void main(String[] args) {
+        new GLCLInteroperabilityDemo();
+    }
+
+}
\ No newline at end of file
diff --git a/src/com/jogamp/opencl/demos/joglinterop/JoglInterop.cl b/src/com/jogamp/opencl/demos/joglinterop/JoglInterop.cl
new file mode 100644
index 0000000..0f0bcfc
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/joglinterop/JoglInterop.cl
@@ -0,0 +1,23 @@
+
+/**
+* animated 2D sine pattern.
+*/
+kernel void sineWave(global float4 * vertex, int size, float time) {
+
+    unsigned int x = get_global_id(0);
+    unsigned int y = get_global_id(1);
+
+    // calculate uv coordinates
+    float u = x / (float) size;
+    float v = y / (float) size;
+
+    u = u*2.0f - 1.0f;
+    v = v*2.0f - 1.0f;
+
+    // calculate simple sine wave pattern
+    float freq = 4.0f;
+    float w = sin(u*freq + time) * cos(v*freq + time) * 0.5f;
+
+    // write output vertex
+    vertex[y*size + x] = (float4)(u*10.0f, w*10.0f, v*10.0f, 1.0f);
+}
diff --git a/src/com/jogamp/opencl/demos/joglinterop/UserSceneInteraction.java b/src/com/jogamp/opencl/demos/joglinterop/UserSceneInteraction.java
new file mode 100644
index 0000000..fc0f054
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/joglinterop/UserSceneInteraction.java
@@ -0,0 +1,103 @@
+package com.jogamp.opencl.demos.joglinterop;
+
+import java.awt.Component;
+import java.awt.Point;
+import java.awt.event.MouseAdapter;
+import java.awt.event.MouseEvent;
+import java.awt.event.MouseMotionAdapter;
+import java.awt.event.MouseWheelEvent;
+import java.awt.event.MouseWheelListener;
+import javax.media.opengl.GL2;
+
+/**
+ * Utility class for interacting with a scene. Supports rotation and zoom around origin.
+ * @author Michael Bien
+ */
+public class UserSceneInteraction {
+
+    private float z = -20;
+    private float rotx = 45;
+    private float roty = 30;
+
+    private Point dragstart;
+    private enum MOUSE_MODE { DRAG_ROTATE, DRAG_ZOOM }
+    private MOUSE_MODE dragmode = MOUSE_MODE.DRAG_ROTATE;
+
+
+    public void init(Component component) {
+        initMouseListeners(component);
+    }
+
+    private void initMouseListeners(Component component) {
+        component.addMouseMotionListener(new MouseMotionAdapter() {
+
+            @Override
+            public void mouseDragged(MouseEvent e) {
+
+                if (dragstart != null) {
+                    switch (dragmode) {
+                        case DRAG_ROTATE:
+                            rotx += e.getY() - dragstart.getY();
+                            roty += e.getX() - dragstart.getX();
+                            break;
+                        case DRAG_ZOOM:
+                            z += (e.getY() - dragstart.getY()) / 5.0f;
+                            break;
+                    }
+                }
+
+                dragstart = e.getPoint();
+            }
+        });
+        component.addMouseWheelListener(new MouseWheelListener() {
+
+            public void mouseWheelMoved(MouseWheelEvent e) {
+                z += e.getWheelRotation()*5;
+            }
+
+        });
+        component.addMouseListener(new MouseAdapter() {
+
+            @Override
+            public void mousePressed(MouseEvent e) {
+                switch (e.getButton()) {
+                    case (MouseEvent.BUTTON1):
+                        dragmode = MOUSE_MODE.DRAG_ROTATE;
+                        break;
+                    case (MouseEvent.BUTTON2):
+                        dragmode = MOUSE_MODE.DRAG_ZOOM;
+                        break;
+                    case (MouseEvent.BUTTON3):
+                        dragmode = MOUSE_MODE.DRAG_ZOOM;
+                        break;
+                }
+            }
+
+            @Override
+            public void mouseReleased(MouseEvent e) {
+                switch (e.getButton()) {
+                    case (MouseEvent.BUTTON1):
+                        dragmode = MOUSE_MODE.DRAG_ZOOM;
+                        break;
+                    case (MouseEvent.BUTTON2):
+                        dragmode = MOUSE_MODE.DRAG_ROTATE;
+                        break;
+                    case (MouseEvent.BUTTON3):
+                        dragmode = MOUSE_MODE.DRAG_ROTATE;
+                        break;
+                }
+
+                dragstart = null;
+            }
+        });
+    }
+
+
+    public void interact(GL2 gl) {
+        gl.glTranslatef(0, 0, z);
+        gl.glRotatef(rotx, 1f, 0f, 0f);
+        gl.glRotatef(roty, 0f, 1.0f, 0f);
+    }
+
+
+}
\ No newline at end of file
diff --git a/src/com/jogamp/opencl/demos/julia3d/Julia3d.java b/src/com/jogamp/opencl/demos/julia3d/Julia3d.java
new file mode 100644
index 0000000..38633c6
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/julia3d/Julia3d.java
@@ -0,0 +1,212 @@
+package com.jogamp.opencl.demos.julia3d;
+
+import com.jogamp.opencl.CLBuffer;
+import com.jogamp.opencl.CLCommandQueue;
+import com.jogamp.opencl.CLContext;
+import com.jogamp.opencl.CLDevice;
+import com.jogamp.opencl.CLKernel;
+import com.jogamp.opencl.CLPlatform;
+import com.jogamp.opencl.CLProgram;
+import com.jogamp.opencl.demos.julia3d.structs.Camera;
+import com.jogamp.opencl.demos.julia3d.structs.RenderingConfig;
+import com.jogamp.opencl.demos.julia3d.structs.Vec;
+import java.io.IOException;
+import java.nio.Buffer;
+import java.nio.ByteBuffer;
+import java.nio.FloatBuffer;
+import javax.swing.SwingUtilities;
+
+import static com.jogamp.opencl.CLMemory.Mem.*;
+import static com.jogamp.opencl.CLProgram.CompilerOptions.*;
+import static com.jogamp.opencl.demos.julia3d.UserSceneController.*;
+
+/**
+ * This sample has been ported from David Buciarelli's juliaGPU v1.2 written in C.
+ * @author Michael Bien
+ */
+public class Julia3d {
+
+    private final CLContext context;
+    private CLBuffer<FloatBuffer> pixelBuffer;
+    private final CLBuffer<ByteBuffer> configBuffer;
+    private final CLCommandQueue commandQueue;
+    private final CLProgram program;
+    private final CLKernel julia;
+    private final CLKernel multiply;
+
+    private final int workGroupSize;
+    private final String kernelFileName = "rendering_kernel.cl";
+
+    final RenderingConfig config;
+
+    private Julia3d(RenderingConfig renderConfig) {
+        this.config = renderConfig;
+        updateCamera();
+
+        //setup
+        CLDevice gpu = CLPlatform.getDefault().getMaxFlopsDevice();
+        context = CLContext.create(gpu);
+
+        workGroupSize = 256;
+
+        //allocate buffers
+        configBuffer = context.createBuffer(config.getBuffer(), READ_ONLY);
+        commandQueue = gpu.createCommandQueue();
+//        update(true);
+
+        try {
+            program = context.createProgram(Julia3d.class.getResourceAsStream(kernelFileName))
+                             .build(FAST_RELAXED_MATH);
+        } catch (IOException ex) {
+            throw new RuntimeException("unable to load program from source", ex);
+        }
+
+        julia = program.createCLKernel("JuliaGPU");
+        multiply = program.createCLKernel("multiply");
+        System.out.println(program.getBuildStatus(gpu));
+        System.out.println(program.getBuildLog());
+
+    }
+
+    void update(boolean reallocate) {
+
+        updateCamera();
+
+        int bufferSize = config.getWidth() * config.getHeight() * 3;
+        if(reallocate) {
+            if(pixelBuffer != null) {
+                pixelBuffer.release();
+            }
+
+            pixelBuffer = context.createFloatBuffer(bufferSize, READ_WRITE, USE_BUFFER);
+        }
+
+        commandQueue.putWriteBuffer(configBuffer, true);
+        
+        julia.putArg(pixelBuffer)
+             .putArg(configBuffer)
+             .rewind();
+
+        multiply.putArg(pixelBuffer)
+                .putArg(bufferSize)
+                .rewind();
+    }
+
+
+    void compute(boolean fastRendering) {
+
+        // calculate workgroup size
+        int globalThreads = config.getWidth() * config.getHeight();
+        if(globalThreads % workGroupSize != 0)
+            globalThreads = (globalThreads / workGroupSize + 1) * workGroupSize;
+
+        int localThreads = workGroupSize;
+        int superSamplingSize = config.getSuperSamplingSize();
+
+        if (!fastRendering && superSamplingSize > 1) {
+            
+            for (int y = 0; y < superSamplingSize; ++y) {
+                for (int x = 0; x < superSamplingSize; ++x) {
+                    
+                    float sampleX = (x + 0.5f) / superSamplingSize;
+                    float sampleY = (y + 0.5f) / superSamplingSize;
+                    
+                    if (x == 0 && y == 0) {
+                        // First pass
+                        julia.setArg(2, 0)
+                             .setArg(3, sampleX)
+                             .setArg(4, sampleY);
+
+                        commandQueue.put1DRangeKernel(julia, 0, globalThreads, localThreads);
+                            
+                    } else if (x == (superSamplingSize - 1) && y == (superSamplingSize - 1)) {
+                        // Last pass
+                        julia.setArg(2, 1)
+                             .setArg(3, sampleX)
+                             .setArg(4, sampleY);
+
+                        // normalize the values we accumulated
+                        multiply.setArg(2, 1.0f/(superSamplingSize*superSamplingSize));
+                        
+                        commandQueue.put1DRangeKernel(julia,    0, globalThreads,   localThreads)
+                                    .put1DRangeKernel(multiply, 0, globalThreads*3, localThreads);
+                    } else {
+                        julia.setArg(2, 1)
+                             .setArg(3, sampleX)
+                             .setArg(4, sampleY);
+                        
+                        commandQueue.put1DRangeKernel(julia, 0, globalThreads, localThreads);
+                        
+                    }
+                }
+            }
+            
+        }else{
+            
+            //fast rendering
+            julia.setArg(2, 0)
+                 .setArg(3, 0.0f)
+                 .setArg(4, 0.0f);
+
+            commandQueue.put1DRangeKernel(julia, 0, globalThreads, localThreads);
+        }
+        
+        commandQueue.putBarrier()
+                    .putReadBuffer(pixelBuffer, true);
+
+    }
+
+    private void updateCamera() {
+
+        Camera camera = config.getCamera();
+
+        Vec dir    = camera.getDir();
+        Vec target = camera.getTarget();
+        Vec camX   = camera.getX();
+        Vec camY   = camera.getY();
+        Vec orig   = camera.getOrig();
+
+        vsub(dir, target, orig);
+        vnorm(dir);
+
+        Vec up = Vec.create().setX(0).setY(1).setZ(0);
+        vxcross(camX, dir, up);
+        vnorm(camX);
+        vmul(camX, config.getWidth() * .5135f / config.getHeight(), camX);
+
+        vxcross(camY, camX, dir);
+        vnorm(camY);
+        vmul(camY, .5135f, camY);
+    }
+
+
+    public static void main(String[] args) {
+
+        RenderingConfig config = RenderingConfig.create()
+            .setWidth(640).setHeight(480)
+            .setEnableShadow(1)
+            .setSuperSamplingSize(2)
+            .setActvateFastRendering(1)
+            .setMaxIterations(9)
+            .setEpsilon(0.003f * 0.75f)
+            .setLight(new float[] {5, 10, 15})
+            .setMu(new float[] {-0.2f, 0.4f, -0.4f, -0.4f});
+
+        config.getCamera().getOrig()  .setX(1).setY(2).setZ(8);
+        config.getCamera().getTarget().setX(0).setY(0).setZ(0);
+
+        final Julia3d julia3d = new Julia3d(config);
+
+        SwingUtilities.invokeLater(new Runnable() {
+            public void run() {
+                new Renderer(julia3d);
+            }
+        });
+    }
+
+    Buffer getPixelBuffer() {
+        return pixelBuffer.getBuffer();
+    }
+
+
+}
diff --git a/src/com/jogamp/opencl/demos/julia3d/Renderer.java b/src/com/jogamp/opencl/demos/julia3d/Renderer.java
new file mode 100644
index 0000000..ce97e4a
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/julia3d/Renderer.java
@@ -0,0 +1,203 @@
+package com.jogamp.opencl.demos.julia3d;
+
+import com.jogamp.opencl.demos.julia3d.structs.RenderingConfig;
+import com.jogamp.opengl.util.awt.TextRenderer;
+import java.awt.Dimension;
+import java.awt.Font;
+import java.nio.FloatBuffer;
+import java.util.Timer;
+import java.util.TimerTask;
+import javax.media.opengl.GL2;
+import javax.media.opengl.GLAutoDrawable;
+import javax.media.opengl.GLCapabilities;
+import javax.media.opengl.GLEventListener;
+import javax.media.opengl.GLProfile;
+import javax.media.opengl.awt.GLCanvas;
+import javax.swing.JFrame;
+
+import static com.jogamp.common.nio.Buffers.*;
+import static javax.media.opengl.GL2.*;
+import static java.lang.String.*;
+
+/**
+ * JOGL renderer for displaying the julia set.
+ * @author Michael Bien
+ */
+public class Renderer implements GLEventListener {
+
+    public final static int MU_RECT_SIZE = 80;
+
+    private final Julia3d julia3d;
+    private final GLCanvas canvas;
+    private final RenderingConfig config;
+    private final FloatBuffer juliaSlice;
+    private final UserSceneController usi;
+    private final TextRenderer textRenderer;
+
+    private TimerTask task;
+    private final Timer timer;
+
+    public Renderer(Julia3d julia3d) {
+        this.julia3d = julia3d;
+        this.config = julia3d.config;
+
+        timer = new Timer();
+
+        juliaSlice = newDirectFloatBuffer(MU_RECT_SIZE * MU_RECT_SIZE * 4);
+
+        canvas = new GLCanvas(new GLCapabilities(GLProfile.get(GLProfile.GL2)));
+        canvas.addGLEventListener(this);
+
+        usi = new UserSceneController();
+        usi.init(this, canvas, config);
+
+        JFrame frame = new JFrame("Java OpenCL - Julia3D GPU");
+        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
+        canvas.setPreferredSize(new Dimension(config.getWidth(), config.getHeight()));
+        frame.add(canvas);
+        frame.pack();
+
+        textRenderer = new TextRenderer(frame.getFont().deriveFont(Font.BOLD, 14), true, true, null, false);
+
+        frame.setVisible(true);
+    }
+
+    public void init(GLAutoDrawable drawable) {
+        drawable.getGL().getGL2().glMatrixMode(GL_PROJECTION);
+    }
+
+    void update() {
+        julia3d.update(false);
+        canvas.display();
+    }
+
+    public void display(GLAutoDrawable drawable) {
+
+        //compute
+        julia3d.compute(config.getActvateFastRendering() == 1);
+
+        GL2 gl = drawable.getGL().getGL2();
+        gl.glClear(GL_COLOR_BUFFER_BIT);
+
+        // draw julia set
+	gl.glRasterPos2i(0, 0);
+	gl.glDrawPixels(config.getWidth(), config.getHeight(), GL_RGB, GL_FLOAT, julia3d.getPixelBuffer());
+
+
+        // Draw Mu constant
+        int width = config.getWidth();
+        int height = config.getHeight();
+        float[] mu = config.getMu();
+
+	gl.glEnable(GL_BLEND);
+            gl.glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
+            int baseMu1 = width - MU_RECT_SIZE - 2;
+            int baseMu2 = 1;
+            drawJuliaSlice(gl, baseMu1, baseMu2, mu[0], mu[1]);
+            int baseMu3 = width - MU_RECT_SIZE - 2;
+            int baseMu4 = MU_RECT_SIZE + 2;
+            drawJuliaSlice(gl, baseMu3, baseMu4, mu[2], mu[3]);
+	gl.glDisable(GL_BLEND);
+
+	gl.glColor3f(1, 1, 1);
+	int mu1 = (int) (baseMu1 + MU_RECT_SIZE * (mu[0] + 1.5f) / 3.f);
+	int mu2 = (int) (baseMu2 + MU_RECT_SIZE * (mu[1] + 1.5f) / 3.f);
+	gl.glBegin(GL_LINES);
+            gl.glVertex2i(mu1 - 4, mu2);
+            gl.glVertex2i(mu1 + 4, mu2);
+            gl.glVertex2i(mu1, mu2 - 4);
+            gl.glVertex2i(mu1, mu2 + 4);
+	gl.glEnd();
+
+	int mu3 = (int) (baseMu3 + MU_RECT_SIZE * (mu[2] + 1.5f) / 3.f);
+	int mu4 = (int) (baseMu4 + MU_RECT_SIZE * (mu[3] + 1.5f) / 3.f);
+	gl.glBegin(GL_LINES);
+            gl.glVertex2i(mu3 - 4, mu4);
+            gl.glVertex2i(mu3 + 4, mu4);
+            gl.glVertex2i(mu3, mu4 - 4);
+            gl.glVertex2i(mu3, mu4 + 4);
+	gl.glEnd();
+
+        // info text
+        textRenderer.beginRendering(width, height);
+        textRenderer.draw(format("Epsilon %.5f - Max. Iter. %d", config.getEpsilon(), config.getMaxIterations()), 8, 10);
+        textRenderer.draw(format("Mu = (%.3f, %.3f, %.3f, %.3f)", mu[0], mu[1], mu[2], mu[3]), 8, 25);
+        textRenderer.draw(format("Shadow %s - SuperSampling %dx%d - Fast rendering %s",
+			config.getEnableShadow() == 1 ? "on" : "off",
+                        config.getSuperSamplingSize(), config.getSuperSamplingSize(),
+			config.getActvateFastRendering() == 1 ? "on" : "off"), 8, 40);
+        textRenderer.endRendering();
+
+        // timer task scheduling, delay gpu intensive high quality rendering
+        if(task != null) {
+            task.cancel();
+        }
+        if(config.getActvateFastRendering() == 1) {
+            task = new TimerTask() {
+                @Override
+                public void run() {
+                    config.setActvateFastRendering(0);
+                    update();
+                    config.setActvateFastRendering(1);
+                }
+            };
+            timer.schedule(task, 2000);
+        }
+    }
+
+    private void drawJuliaSlice(GL2 gl, int origX, int origY, float cR, float cI) {
+
+        int index = 0;
+        float invSize = 3.0f / MU_RECT_SIZE;
+        for (int i = 0; i < MU_RECT_SIZE; ++i) {
+            for (int j = 0; j < MU_RECT_SIZE; ++j) {
+
+                float x = i * invSize - 1.5f;
+                float y = j * invSize - 1.5f;
+
+                int iter;
+                for (iter = 0; iter < 64; ++iter) {
+                    float x2 = x * x;
+                    float y2 = y * y;
+                    if (x2 + y2 > 4.0f) {
+                        break;
+                    }
+
+                    float newx = x2 - y2 + cR;
+                    float newy = 2.f * x * y + cI;
+                    x = newx;
+                    y = newy;
+                }
+                
+                juliaSlice.put(index++, iter / 64.0f);
+                juliaSlice.put(index++, 0.0f);
+                juliaSlice.put(index++, 0.0f);
+                juliaSlice.put(index++, 0.5f);
+            }
+        }
+
+	gl.glRasterPos2i(origX, origY);
+	gl.glDrawPixels(MU_RECT_SIZE, MU_RECT_SIZE, GL_RGBA, GL_FLOAT, juliaSlice);
+    }
+
+
+    public void reshape(GLAutoDrawable drawable, int x, int y, int newWidth, int newHeight) {
+
+        config.setWidth(newWidth);
+	config.setHeight(newHeight);
+
+        GL2 gl = drawable.getGL().getGL2();
+
+	gl.glViewport(0, 0, newWidth, newHeight);
+	gl.glLoadIdentity();
+	gl.glOrtho(-0.5f, newWidth - 0.5f, -0.5f, newHeight - 0.5f, -1.0f, 1.0f);
+
+        julia3d.update(true);
+
+    }
+
+    public void dispose(GLAutoDrawable drawable) {
+    }
+
+
+}
diff --git a/src/com/jogamp/opencl/demos/julia3d/UserSceneController.java b/src/com/jogamp/opencl/demos/julia3d/UserSceneController.java
new file mode 100644
index 0000000..fda54be
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/julia3d/UserSceneController.java
@@ -0,0 +1,249 @@
+package com.jogamp.opencl.demos.julia3d;
+
+import com.jogamp.opencl.demos.julia3d.structs.RenderingConfig;
+import com.jogamp.opencl.demos.julia3d.structs.Vec;
+import java.awt.Component;
+import java.awt.Point;
+import java.awt.event.KeyAdapter;
+import java.awt.event.KeyEvent;
+import java.awt.event.MouseAdapter;
+import java.awt.event.MouseEvent;
+import java.awt.event.MouseWheelEvent;
+
+import static java.lang.Math.*;
+import static com.jogamp.opencl.demos.julia3d.Renderer.*;
+
+/**
+ * Utility class for interacting with a scene. Supports rotation and zoom around origin.
+ * @author Michael Bien
+ */
+public class UserSceneController {
+
+    private Point dragstart;
+    private RenderingConfig model;
+    private Renderer view;
+
+    private enum MOUSE_MODE { DRAG_ROTATE, DRAG_ZOOM }
+    private MOUSE_MODE dragmode = MOUSE_MODE.DRAG_ROTATE;
+
+
+    public void init(Renderer view, Component component, RenderingConfig model) {
+        initMouseListeners(component);
+        this.view = view;
+        this.model = model;
+    }
+
+    private void initMouseListeners(Component component) {
+
+        MouseAdapter mouseAdapter = new MouseAdapter() {
+            @Override
+            public void mouseDragged(MouseEvent e) {
+
+                int x = e.getX();
+                int y = e.getY();
+
+                switch (dragmode) {
+                    case DRAG_ROTATE:
+                        if (dragstart != null) {
+                            int height = model.getHeight();
+                            int width = model.getWidth();
+
+                            int ry = height - y - 1;
+                            int baseMu1 = width - MU_RECT_SIZE - 2;
+                            int baseMu2 = 1;
+                            int baseMu3 = width - MU_RECT_SIZE - 2;
+                            int baseMu4 = MU_RECT_SIZE + 2;
+
+                            if ((x >= baseMu1 && x <= baseMu1 + MU_RECT_SIZE) && (ry >= baseMu2 && ry <= baseMu2 + MU_RECT_SIZE)) {
+                                float[] mu = model.getMu();
+                                mu[0] = 3.f * ( x - baseMu1) / (float)MU_RECT_SIZE - 1.5f;
+                                mu[1] = 3.f * (ry - baseMu2) / (float)MU_RECT_SIZE - 1.5f;
+                                model.setMu(mu);
+                            } else if ((x >= baseMu3 && x <= baseMu3 + MU_RECT_SIZE) && (ry >= baseMu4 && ry <= baseMu4 + MU_RECT_SIZE)) {
+                                float[] mu = model.getMu();
+                                mu[2] = 3.f * ( x - baseMu3) / (float)MU_RECT_SIZE - 1.5f;
+                                mu[3] = 3.f * (ry - baseMu4) / (float)MU_RECT_SIZE - 1.5f;
+                                model.setMu(mu);
+                            } else {
+                                rotateCameraYbyOrig(0.01f * (x - dragstart.getX()));
+                                rotateCameraXbyOrig(0.01f * (y - dragstart.getY()));
+                            }
+                        }
+                        dragstart = e.getPoint();
+                        view.update();
+                        break;
+                    case DRAG_ZOOM:
+                        if (dragstart != null) {
+                            float zoom = (float) ((y - dragstart.getY()) / 10.0f);
+                            zoom(zoom);
+                        }
+                        dragstart = e.getPoint();
+                        view.update();
+                        break;
+                }
+
+            }
+
+            @Override
+            public void mousePressed(MouseEvent e) {
+                switch (e.getButton()) {
+                    case (MouseEvent.BUTTON1):
+                        dragmode = MOUSE_MODE.DRAG_ROTATE;
+                        break;
+                    case (MouseEvent.BUTTON2):
+                        dragmode = MOUSE_MODE.DRAG_ZOOM;
+                        break;
+                    case (MouseEvent.BUTTON3):
+                        dragmode = MOUSE_MODE.DRAG_ZOOM;
+                        break;
+                }
+            }
+
+            @Override
+            public void mouseReleased(MouseEvent e) {
+                switch (e.getButton()) {
+                    case (MouseEvent.BUTTON1):
+                        dragmode = MOUSE_MODE.DRAG_ZOOM;
+                        break;
+                    case (MouseEvent.BUTTON2):
+                        dragmode = MOUSE_MODE.DRAG_ROTATE;
+                        break;
+                    case (MouseEvent.BUTTON3):
+                        dragmode = MOUSE_MODE.DRAG_ROTATE;
+                        break;
+                }
+
+                dragstart = null;
+            }
+
+            @Override
+            public void mouseWheelMoved(MouseWheelEvent e) {
+                float zoom = e.getWheelRotation() * 0.1f;
+                zoom(zoom);
+                view.update();
+            }
+
+        };
+
+        KeyAdapter keyAdapter = new KeyAdapter() {
+
+            @Override
+            public void keyPressed(KeyEvent e) {
+
+                switch (e.getKeyChar()) {
+                    case 'l':
+                        model.setEnableShadow(model.getEnableShadow()==0 ? 1 : 0);
+                        break;
+                    case '1':
+                        model.setEpsilon(model.getEpsilon() * 0.75f);
+                        break;
+                    case '2':
+                        model.setEpsilon(model.getEpsilon() * 1.f / 0.75f);
+                        break;
+                    case '3':
+                        model.setMaxIterations(max(1, model.getMaxIterations() -1));
+                        break;
+                    case '4':
+                        model.setMaxIterations(min(12, model.getMaxIterations()+1));
+                        break;
+                    case '5':
+                        model.setSuperSamplingSize(max(1, model.getSuperSamplingSize() -1));
+                        break;
+                    case '6':
+                        model.setSuperSamplingSize(min(5, model.getSuperSamplingSize() +1));
+                        break;
+                    default:
+                        break;
+                }
+                view.update();
+
+            }
+
+        };
+
+        component.addKeyListener(keyAdapter);
+
+        component.addMouseListener(mouseAdapter);
+        component.addMouseMotionListener(mouseAdapter);
+        component.addMouseWheelListener(mouseAdapter);
+
+    }
+    private void zoom(float zoom) {
+        Vec orig = model.getCamera().getOrig();
+        orig.setX(orig.getX()+zoom)
+            .setY(orig.getY()+zoom)
+            .setZ(orig.getZ()+zoom);
+    }
+
+    private void rotateLightX(float k) {
+        float[] light = model.getLight();
+        float y = light[1];
+        float z = light[2];
+        light[1] = (float) ( y * cos(k) + z * sin(k));
+        light[2] = (float) (-y * sin(k) + z * cos(k));
+        model.setLight(light);
+    }
+
+    private void rotateLightY(float k) {
+        float[] light = model.getLight();
+        float x = light[0];
+        float z = light[2];
+        light[0] = (float) (x * cos(k) - z * sin(k));
+        light[2] = (float) (x * sin(k) + z * cos(k));
+        model.setLight(light);
+    }
+
+    private void rotateCameraXbyOrig(double k) {
+        Vec orig = model.getCamera().getOrig();
+        float y = orig.getY();
+        float z = orig.getZ();
+        orig.setY((float) ( y * cos(k) + z * sin(k)));
+        orig.setZ((float) (-y * sin(k) + z * cos(k)));
+    }
+
+    private void rotateCameraYbyOrig(double k) {
+        Vec orig = model.getCamera().getOrig();
+        float x = orig.getX();
+        float z = orig.getZ();
+        orig.setX((float) (x * cos(k) - z * sin(k)));
+        orig.setZ((float) (x * sin(k) + z * cos(k)));
+    }
+
+
+    public final static void vadd(Vec v, Vec a, Vec b) {
+        v.setX(a.getX() + b.getX());
+        v.setY(a.getY() + b.getY());
+        v.setZ(a.getZ() + b.getZ());
+    }
+
+    public final static void vsub(Vec v, Vec a, Vec b) {
+        v.setX(a.getX() - b.getX());
+        v.setY(a.getY() - b.getY());
+        v.setZ(a.getZ() - b.getZ());
+    }
+
+    public final static void vmul(Vec v, float s, Vec b) {
+        v.setX(s * b.getX());
+        v.setY(s * b.getY());
+        v.setZ(s * b.getZ());
+    }
+
+    public final static float vdot(Vec a, Vec b) {
+        return a.getX() * b.getX()
+             + a.getY() * b.getY()
+             + a.getZ() * b.getZ();
+    }
+
+    public final static void vnorm(Vec v) {
+        float s = (float) (1.0f / sqrt(vdot(v, v)));
+        vmul(v, s, v);
+    }
+
+    public final static void vxcross(Vec v, Vec a, Vec b) {
+        v.setX(a.getY() * b.getZ() - a.getZ() * b.getY());
+        v.setY(a.getZ() * b.getX() - a.getX() * b.getZ());
+        v.setZ(a.getX() * b.getY() - a.getY() * b.getX());
+    }
+
+
+}
\ No newline at end of file
diff --git a/src/com/jogamp/opencl/demos/julia3d/config.h b/src/com/jogamp/opencl/demos/julia3d/config.h
new file mode 100644
index 0000000..72df3ff
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/julia3d/config.h
@@ -0,0 +1,24 @@
+
+typedef struct {
+	float x, y, z; // position, also color (r,g,b)
+} Vec;
+
+typedef struct {
+	/* User defined values */
+	Vec orig, target;
+	/* Calculated values */
+	Vec dir, x, y;
+} Camera;
+
+typedef struct {
+	unsigned int width, height;
+	int superSamplingSize;
+	int actvateFastRendering;
+	int enableShadow;
+
+	unsigned int maxIterations;
+	float epsilon;
+	float mu[4];
+	float light[3];
+	Camera camera;
+} RenderingConfig;
diff --git a/src/com/jogamp/opencl/demos/julia3d/mandelbrot_kernel.cl b/src/com/jogamp/opencl/demos/julia3d/mandelbrot_kernel.cl
new file mode 100644
index 0000000..d5acd02
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/julia3d/mandelbrot_kernel.cl
@@ -0,0 +1,357 @@
+/*
+Copyright (c) 2009 David Bucciarelli (davibu@interfree.it)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#define GPU_KERNEL
+
+
+typedef struct {
+	float x, y, z; // position, also color (r,g,b)
+} Vec;
+
+typedef struct {
+	/* User defined values */
+	Vec orig, target;
+	/* Calculated values */
+	Vec dir, x, y;
+} Camera;
+
+typedef struct {
+	unsigned int width, height;
+	int superSamplingSize;
+	int actvateFastRendering;
+	int enableShadow;
+
+	unsigned int maxIterations;
+	float epsilon;
+	float mu[4];
+	float light[3];
+	Camera camera;
+} RenderingConfig;
+
+#define BOUNDING_RADIUS_2 4.f
+
+// Scalar derivative approach by Enforcer:
+// http://www.fractalforums.com/mandelbulb-implementation/realtime-renderingoptimisations/
+static float IterateIntersect(const float4 z0, const float4 c0, const uint maxIterations) {
+	float4 z = z0;
+	float4 c = c0;
+
+	float dr = 1.0f;
+	float r2 = dot(z, z);
+	float r = sqrt(r2);
+	for (int n = 0; (n < maxIterations) && (r < 2.f); ++n) {
+		const float zo0 = asin(z.z / r);
+		const float zi0 = atan2(z.y, z.x);
+		float zr = r2 * r2 * r2 * r;
+		const float zo = zo0 * 7.f;
+		const float zi = zi0 * 7.f;
+		const float czo = cos(zo);
+
+		dr = zr * dr * 7.f + 1.f;
+		zr *= r;
+
+		z = zr * (float4)(czo * cos(zi), czo * sin(zi), sin(zo), 0.f);
+		z += c;
+
+		r2 = dot(z, z);
+		r = sqrt(r2);
+	}
+
+	return 0.5f * log(r) * r / dr;
+}
+
+static float IntersectBulb(const float4 eyeRayOrig, const float4 eyeRayDir,
+		const float4 c, const uint maxIterations, const float epsilon,
+		const float maxDist, float4 *hitPoint, uint *steps) {
+	float dist;
+	float4 r0 = eyeRayOrig;
+	float distDone = 0.f;
+
+	uint s = 0;
+	do {
+		dist = IterateIntersect(r0, c, maxIterations);
+		distDone += dist;
+		// We are inside
+		if (dist <= 0.f)
+			break;
+
+		r0 += eyeRayDir * dist;
+		s++;
+	} while ((dist > epsilon) && (distDone < maxDist));
+
+	*hitPoint = r0;
+	*steps = s;
+	return dist;
+}
+
+#define WORLD_RADIUS 1000.f
+#define WORLD_CENTER ((float4)(0.f, -WORLD_RADIUS - 2.f, 0.f, 0.f))
+float IntersectFloorSphere(const float4 eyeRayOrig, const float4 eyeRayDir) {
+	const float4 op = WORLD_CENTER - eyeRayOrig;
+	const float b = dot(op, eyeRayDir);
+	float det = b * b - dot(op, op) + WORLD_RADIUS * WORLD_RADIUS;
+
+	if (det < 0.f)
+		return -1.f;
+	else
+		det = sqrt(det);
+
+	float t = b - det;
+	if (t > 0.f)
+		return t;
+	else {
+		// We are inside, avoid the hit
+		return -1.f;
+	}
+}
+
+int IntersectBoundingSphere(const float4 eyeRayOrig, const float4 eyeRayDir,
+		float *tmin, float*tmax) {
+	const float4 op = -eyeRayOrig;
+	const float b = dot(op, eyeRayDir);
+	float det = b * b - dot(op, op) + BOUNDING_RADIUS_2;
+
+	if (det < 0.f)
+		return 0;
+	else
+		det = sqrt(det);
+
+	float t1 = b - det;
+	float t2 = b + det;
+	if (t1 > 0.f) {
+		*tmin = t1;
+		*tmax = t2;
+		return 1;
+	} else {
+		if (t2 > 0.f) {
+			// We are inside, start from the ray origin
+			*tmin = 0.f;
+			*tmax = t2;
+
+			return 1;
+		} else
+			return 0;
+	}
+}
+
+static float4 NormEstimate(const float4 p, const float4 c,
+		const float delta, const uint maxIterations) {
+	const float4 qP = p;
+	const float4 gx1 = qP - (float4)(delta, 0.f, 0.f, 0.f);
+	const float4 gx2 = qP + (float4)(delta, 0.f, 0.f, 0.f);
+	const float4 gy1 = qP - (float4)(0.f, delta, 0.f, 0.f);
+	const float4 gy2 = qP + (float4)(0.f, delta, 0.f, 0.f);
+	const float4 gz1 = qP - (float4)(0.f, 0.f, delta, 0.f);
+	const float4 gz2 = qP + (float4)(0.f, 0.f, delta, 0.f);
+
+	const float gradX = length(IterateIntersect(gx2, c, maxIterations)) -
+		length(IterateIntersect(gx1, c, maxIterations));
+	const float gradY = length(IterateIntersect(gy2, c, maxIterations)) -
+		length(IterateIntersect(gy1, c, maxIterations));
+	const float gradZ = length(IterateIntersect(gz2, c, maxIterations)) -
+		length(IterateIntersect(gz1, c, maxIterations));
+
+	const float4 N = normalize((float4)(gradX, gradY, gradZ, 0.f));
+
+	return N;
+}
+
+static float4 Phong(const float4 light, const float4 eye, const float4 pt,
+		const float4 N, const float4 diffuse) {
+	const float4 ambient = (float4) (0.05f, 0.05f, 0.05f, 0.f);
+	float4 L = normalize(light - pt);
+	float NdotL = dot(N, L);
+	if (NdotL < 0.f)
+		return diffuse * ambient;
+
+	const float specularExponent = 30.f;
+	const float specularity = 0.65f;
+
+	float4 E = normalize(eye - pt);
+	float4 H = (L + E) * (float)0.5f;
+
+	return diffuse * NdotL +
+			specularity * pow(dot(N, H), specularExponent) +
+			diffuse * ambient;
+}
+
+__kernel void MandelbulbGPU(
+	__global float *pixels,
+	const __global RenderingConfig *config,
+	const int enableAccumulation,
+	const float sampleX,
+	const float sampleY) {
+    const int gid = get_global_id(0);
+	const unsigned width = config->width;
+	const unsigned height = config->height;
+
+	const unsigned int x = gid % width;
+	const int y = gid / width;
+
+	// Check if we have to do something
+	if (y >= height)
+		return;
+
+	const float epsilon = config->actvateFastRendering ? (config->epsilon * (1.5f / 0.75f)) : config->epsilon;
+	const uint maxIterations = config->actvateFastRendering ? (max(3u, config->maxIterations) - 2u) : config->maxIterations;
+
+	const float4 mu = (float4)(config->mu[0], config->mu[1], config->mu[2], config->mu[3]);
+	const float4 light = (float4)(config->light[0], config->light[1], config->light[2], 0.f);
+	const __global Camera *camera = &config->camera;
+
+	//--------------------------------------------------------------------------
+	// Calculate eye ray
+	//--------------------------------------------------------------------------
+
+	const float invWidth = 1.f / width;
+	const float invHeight = 1.f / height;
+	const float kcx = (x + sampleX) * invWidth - .5f;
+	const float4 kcx4 = (float4)kcx;
+	const float kcy = (y + sampleY) * invHeight - .5f;
+	const float4 kcy4 = (float4)kcy;
+
+	const float4 cameraX = (float4)(camera->x.x, camera->x.y, camera->x.z, 0.f);
+	const float4 cameraY = (float4)(camera->y.x, camera->y.y, camera->y.z, 0.f);
+	const float4 cameraDir = (float4)(camera->dir.x, camera->dir.y, camera->dir.z, 0.f);
+	const float4 cameraOrig = (float4)(camera->orig.x, camera->orig.y, camera->orig.z, 0.f);
+
+	const float4 eyeRayDir =  normalize(cameraX * kcx4 + cameraY * kcy4 + cameraDir);
+	const float4 eyeRayOrig = eyeRayDir * (float4)0.1f + cameraOrig;
+
+	//--------------------------------------------------------------------------
+	// Check if we hit the bounding sphere
+	//--------------------------------------------------------------------------
+
+	int useAO = 1;
+	float4 diffuse, n, color;
+
+	float4 hitPoint;
+	float dist, tmin, tmax;
+	if (IntersectBoundingSphere(eyeRayOrig, eyeRayDir, &tmin, &tmax)) {
+		//--------------------------------------------------------------------------
+		// Find the intersection with the set
+		//--------------------------------------------------------------------------
+
+		uint steps;
+		float4 rayOrig = eyeRayOrig + eyeRayDir * (float4)tmin;
+		dist = IntersectBulb(rayOrig, eyeRayDir, mu, maxIterations,
+				epsilon, tmax - tmin, &hitPoint, &steps);
+
+		if (dist <= epsilon) {
+			// Set hit
+			diffuse = (float4) (1.f, 0.35f, 0.15f, 0.f);
+			n = NormEstimate(hitPoint, mu, dist, maxIterations);
+		} else
+			dist = -1.f;
+	} else
+		dist = -1.f;
+
+	//--------------------------------------------------------------------------
+	// Check if we hit the floor
+	//--------------------------------------------------------------------------
+
+	if (dist < 0.f) {
+		dist = IntersectFloorSphere(eyeRayOrig, eyeRayDir);
+
+		if (dist >= 0.f) {
+			// Floor hit
+			hitPoint = eyeRayOrig + eyeRayDir * (float4)dist;
+			n = hitPoint - WORLD_CENTER;
+			n = normalize(n);
+			// The most important feature in a ray tracer: a checker texture !
+			const int ix = (hitPoint.x > 0.f) ? hitPoint.x : (1.f - hitPoint.x);
+			const int iz = (hitPoint.z > 0.f) ? hitPoint.z : (1.f - hitPoint.z);
+			if ((ix + iz) % 2)
+				diffuse = (float4) (0.75f, 0.75f, 0.75f, 0.f);
+			else
+				diffuse = (float4) (0.75f, 0.f, 0.f, 0.f);
+			useAO = 0;
+		} else {
+			// Sky hit
+			color = (float4)(0.f, 0.1f, 0.3f, 0.f);
+		}
+	} else {
+		// Sky hit
+		color = (float4)(0.f, 0.1f, 0.3f, 0.f);
+	}
+
+	//--------------------------------------------------------------------------
+	// Select the shadow pass
+	//--------------------------------------------------------------------------
+
+	if (dist >= 0.f) {
+		float shadowFactor = 1.f;
+		if (config->enableShadow) {
+			float4 L = normalize(light -  hitPoint);
+			float4 rO = hitPoint + n * 1e-2f;
+			float4 shadowHitPoint;
+
+			// Check bounding sphere
+			if (IntersectBoundingSphere(rO, L, &tmin, &tmax)) {
+				float shadowDistSet = tmin;
+				uint steps;
+
+				rO = rO + L * (float4)shadowDistSet;
+				shadowDistSet = IntersectBulb(rO, L, mu, maxIterations, epsilon,
+						tmax - tmin, &shadowHitPoint, &steps);
+				if (shadowDistSet < epsilon) {
+					if (useAO) {
+						// Use steps count to simulate ambient occlusion
+						shadowFactor = 0.6f - min(steps / 255.f, 0.5f);
+					} else
+						shadowFactor = 0.6f;
+				}
+			}
+		}
+
+		//--------------------------------------------------------------------------
+		// Direct lighting of hit point
+		//--------------------------------------------------------------------------
+
+		color = Phong(light, eyeRayOrig, hitPoint, n, diffuse) * shadowFactor;
+	}
+
+	//--------------------------------------------------------------------------
+	// Write pixel
+	//--------------------------------------------------------------------------
+
+	int offset = 3 * (x + y * width);
+	color = clamp(color, (float4)(0.f, 0.f ,0.f, 0.f), (float4)(1.f, 1.f ,1.f, 0.f));
+	if (enableAccumulation) {
+		pixels[offset++] += color.s0;
+		pixels[offset++] += color.s1;
+		pixels[offset] += color.s2;
+	} else {
+		pixels[offset++] = color.s0;
+		pixels[offset++] = color.s1;
+		pixels[offset] = color.s2;
+	}
+}
+
+kernel void multiply(global float *array, const int numElements, const float s) {
+    const int gid = get_global_id(0);
+    if (gid >= numElements)  {
+        return;
+    }
+    array[gid] *= s;
+}
diff --git a/src/com/jogamp/opencl/demos/julia3d/rendering_kernel.cl b/src/com/jogamp/opencl/demos/julia3d/rendering_kernel.cl
new file mode 100644
index 0000000..9c25c1b
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/julia3d/rendering_kernel.cl
@@ -0,0 +1,382 @@
+/*
+Copyright (c) 2009 David Bucciarelli (davibu@interfree.it)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#define GPU_KERNEL
+
+
+#define BOUNDING_RADIUS_2 4.f
+#define ESCAPE_THRESHOLD 1e1f
+#define DELTA 1e-4f
+
+typedef struct {
+    float x, y, z; // position, also color (r,g,b)
+} Vec;
+
+typedef struct {
+    Vec orig, target;
+    Vec dir, x, y;
+} Camera;
+
+typedef struct {
+    unsigned int width, height;
+    int superSamplingSize;
+    int actvateFastRendering;
+    int enableShadow;
+
+    unsigned int maxIterations;
+    float epsilon;
+    float mu[4];
+    float light[3];
+    Camera camera;
+} RenderingConfig;
+
+
+static float4 QuatMult(const float4 q1, const float4 q2) {
+    float4 r;
+
+    // a1a2 - b1b2 - c1c2 - d1d2
+    r.x = q1.x * q2.x - q1.y * q2.y - q1.z * q2.z - q1.w * q2.w;
+    // a1b2 + b1a2 + c1d2 - d1c2
+    r.y = q1.x * q2.y + q1.y * q2.x + q1.z * q2.w - q1.w * q2.z;
+    // a1c2 - b1d2 + c1a2 + d1b2
+    r.z = q1.x * q2.z - q1.y * q2.w + q1.z * q2.x + q1.w * q2.y;
+    // a1d2 + b1c2 - c1b2 + d1a2
+    r.w = q1.x * q2.w + q1.y * q2.z - q1.z * q2.y + q1.w * q2.x;
+
+    return r;
+}
+
+static float4 QuatSqr(const float4 q) {
+    float4 r;
+
+    r.x = q.x * q.x - q.y * q.y - q.z * q.z - q.w * q.w;
+    r.y = 2.f * q.x * q.y;
+    r.z = 2.f * q.x * q.z;
+    r.w = 2.f * q.x * q.w;
+
+    return r;
+}
+
+static void IterateIntersect(float4 *q, float4 *qp, const float4 c, const uint maxIterations) {
+    float4 q0 = *q;
+    float4 qp0 = *qp;
+
+    for (uint i = 0; i < maxIterations; ++i) {
+        qp0 = 2.f * QuatMult(q0, qp0);
+        q0 = QuatSqr(q0) + c;
+
+        if (dot(q0, q0) > ESCAPE_THRESHOLD)
+            break;
+    }
+
+    *q = q0;
+    *qp = qp0;
+}
+
+static float IntersectJulia(const float4 eyeRayOrig, const float4 eyeRayDir,
+        const float4 c, const uint maxIterations, const float epsilon,
+        float4 *hitPoint, uint *steps) {
+    float dist;
+    float4 r0 = eyeRayOrig;
+
+    uint s = 0;
+    do {
+        float4 z = r0;
+        float4 zp = (float4) (1.f, 0.f, 0.f, 0.f);
+
+        IterateIntersect(&z, &zp, c, maxIterations);
+
+        const float normZP = length(zp);
+
+        // We are inside
+        if (normZP == 0.f)
+            break;
+
+        const float normZ = length(z);
+        dist = 0.5f * normZ * log(normZ) / normZP;
+
+        r0 += eyeRayDir * dist;
+        s++;
+    } while ((dist > epsilon) && (dot(r0, r0) < BOUNDING_RADIUS_2));
+
+    *hitPoint = r0;
+    *steps = s;
+    return dist;
+}
+
+#define WORLD_RADIUS 1000.f
+#define WORLD_CENTER ((float4)(0.f, -WORLD_RADIUS - 2.f, 0.f, 0.f))
+
+float IntersectFloorSphere(const float4 eyeRayOrig, const float4 eyeRayDir) {
+    const float4 op = WORLD_CENTER - eyeRayOrig;
+    const float b = dot(op, eyeRayDir);
+    float det = b * b - dot(op, op) + WORLD_RADIUS * WORLD_RADIUS;
+
+    if (det < 0.f)
+        return -1.f;
+    else
+        det = sqrt(det);
+
+    float t = b - det;
+    if (t > 0.f)
+        return t;
+    else {
+        // We are inside, avoid the hit
+        return -1.f;
+    }
+}
+
+float IntersectBoundingSphere(const float4 eyeRayOrig, const float4 eyeRayDir) {
+    const float4 op = -eyeRayOrig;
+    const float b = dot(op, eyeRayDir);
+    float det = b * b - dot(op, op) + BOUNDING_RADIUS_2;
+
+    if (det < 0.f)
+        return -1.f;
+    else
+        det = sqrt(det);
+
+    float t = b - det;
+    if (t > 0.f)
+        return t;
+    else {
+        t = b + det;
+
+        if (t > 0.f) {
+            // We are inside, start from the ray origin
+            return 0.0f;
+        } else
+            return -1.f;
+    }
+}
+
+static float4 NormEstimate(const float4 p, const float4 c,
+        const float delta, const uint maxIterations) {
+    float4 N;
+    float4 qP = p;
+    float gradX, gradY, gradZ;
+
+    float4 gx1 = qP - (float4) (DELTA, 0.f, 0.f, 0.f);
+    float4 gx2 = qP + (float4) (DELTA, 0.f, 0.f, 0.f);
+    float4 gy1 = qP - (float4) (0.f, DELTA, 0.f, 0.f);
+    float4 gy2 = qP + (float4) (0.f, DELTA, 0.f, 0.f);
+    float4 gz1 = qP - (float4) (0.f, 0.f, DELTA, 0.f);
+    float4 gz2 = qP + (float4) (0.f, 0.f, DELTA, 0.f);
+
+    for (uint i = 0; i < maxIterations; ++i) {
+        gx1 = QuatSqr(gx1) + c;
+        gx2 = QuatSqr(gx2) + c;
+        gy1 = QuatSqr(gy1) + c;
+        gy2 = QuatSqr(gy2) + c;
+        gz1 = QuatSqr(gz1) + c;
+        gz2 = QuatSqr(gz2) + c;
+    }
+
+    gradX = length(gx2) - length(gx1);
+    gradY = length(gy2) - length(gy1);
+    gradZ = length(gz2) - length(gz1);
+
+    N = normalize((float4) (gradX, gradY, gradZ, 0.f));
+
+    return N;
+}
+
+static float4 Phong(const float4 light, const float4 eye, const float4 pt, const float4 N, const float4 diffuse) {
+
+    const float4 ambient = (float4) (0.05f, 0.05f, 0.05f, 0.f);
+    float4 L = normalize(light - pt);
+    float NdotL = dot(N, L);
+    if (NdotL < 0.f)
+        return diffuse * ambient;
+
+    const float specularExponent = 30.f;
+    const float specularity = 0.65f;
+
+    float4 E = normalize(eye - pt);
+    float4 H = (L + E) * (float) 0.5f;
+
+    return diffuse * NdotL +
+           specularity * pow(dot(N, H), specularExponent) +
+           diffuse * ambient;
+}
+
+kernel void JuliaGPU(   global float *pixels,
+                        const global RenderingConfig *config,
+                        int enableAccumulation,
+                        float sampleX,
+                        float sampleY ) {
+
+    const int gid = get_global_id(0);
+    unsigned width = config->width;
+    unsigned height = config->height;
+
+    const unsigned int x = gid % width;
+    const int y = gid / width;
+
+    // Check if we have to do something
+    if (y >= height)
+        return;
+
+    const float epsilon = config->actvateFastRendering ? (config->epsilon * (1.f / 0.75f)) : config->epsilon;
+    const uint maxIterations = max(1u, config->actvateFastRendering ? (config->maxIterations - 1) : config->maxIterations);
+
+    const float4 mu = (float4)(config->mu[0], config->mu[1], config->mu[2], config->mu[3]);
+    const float4 light = (float4) (config->light[0], config->light[1], config->light[2], 0.f);
+    const global Camera *camera = &config->camera;
+
+    //--------------------------------------------------------------------------
+    // Calculate eye ray
+    //--------------------------------------------------------------------------
+
+    const float invWidth = 1.f / width;
+    const float invHeight = 1.f / height;
+    const float kcx = (x + sampleX) * invWidth - .5f;
+    const float4 kcx4 = (float4) kcx;
+    const float kcy = (y + sampleY) * invHeight - .5f;
+    const float4 kcy4 = (float4) kcy;
+
+    const float4 cameraX = (float4) (camera->x.x, camera->x.y, camera->x.z, 0.f);
+    const float4 cameraY = (float4) (camera->y.x, camera->y.y, camera->y.z, 0.f);
+    const float4 cameraDir = (float4) (camera->dir.x, camera->dir.y, camera->dir.z, 0.f);
+    const float4 cameraOrig = (float4) (camera->orig.x, camera->orig.y, camera->orig.z, 0.f);
+
+    const float4 eyeRayDir = normalize(cameraX * kcx4 + cameraY * kcy4 + cameraDir);
+    const float4 eyeRayOrig = eyeRayDir * (float4) 0.1f + cameraOrig;
+
+    //--------------------------------------------------------------------------
+    // Check if we hit the bounding sphere
+    //--------------------------------------------------------------------------
+
+    float distSet = IntersectBoundingSphere(eyeRayOrig, eyeRayDir);
+    float4 hitPoint;
+    if (distSet >= 0.f) {
+        //--------------------------------------------------------------------------
+        // Find the intersection with the set
+        //--------------------------------------------------------------------------
+
+        uint steps;
+        float4 rayOrig = eyeRayOrig + eyeRayDir * (float4) distSet;
+        distSet = IntersectJulia(rayOrig, eyeRayDir, mu, maxIterations,
+                epsilon, &hitPoint, &steps);
+        if (distSet > epsilon)
+            distSet = -1.f;
+    }
+
+    //--------------------------------------------------------------------------
+    // Check if we hit the floor
+    //--------------------------------------------------------------------------
+
+    float distFloor = IntersectFloorSphere(eyeRayOrig, eyeRayDir);
+
+    //--------------------------------------------------------------------------
+    // Select the hit point
+    //--------------------------------------------------------------------------
+
+    int doShade = 0;
+    int useAO = 1;
+    float4 diffuse, n, color;
+    if ((distSet < 0.f) && (distFloor < 0.f)) {
+        // Sky hit
+        color = (float4) (0.f, 0.1f, 0.3f, 0.f);
+    } else if ((distSet >= 0.f) && ((distFloor < 0.f) || (distSet <= distFloor))) {
+        // Set hit
+        diffuse = (float4) (1.f, 0.35f, 0.15f, 0.f);
+        n = NormEstimate(hitPoint, mu, distSet, maxIterations);
+        doShade = 1;
+    } else if ((distFloor >= 0.f) && ((distSet < 0.f) || (distFloor <= distSet))) {
+        // Floor hit
+        hitPoint = eyeRayOrig + eyeRayDir * (float4) distFloor;
+        n = hitPoint - WORLD_CENTER;
+        n = normalize(n);
+        // The most important feature in a ray tracer: a checker texture !
+        const int ix = (hitPoint.x > 0.f) ? hitPoint.x : (1.f - hitPoint.x);
+        const int iz = (hitPoint.z > 0.f) ? hitPoint.z : (1.f - hitPoint.z);
+        if ((ix + iz) % 2)
+            diffuse = (float4) (0.75f, 0.75f, 0.75f, 0.f);
+        else
+            diffuse = (float4) (0.75f, 0.f, 0.f, 0.f);
+        doShade = 1;
+        useAO = 0;
+    }
+
+    //--------------------------------------------------------------------------
+    // Select the shadow pass
+    //--------------------------------------------------------------------------
+
+    if (doShade) {
+        float shadowFactor = 1.f;
+        if (config->enableShadow) {
+            float4 L = normalize(light - hitPoint);
+            float4 rO = hitPoint + n * 1e-2f;
+            float4 shadowHitPoint;
+
+            // Check bounding sphere
+            float shadowDistSet = IntersectBoundingSphere(rO, L);
+            if (shadowDistSet >= 0.f) {
+                uint steps;
+
+                rO = rO + L * (float4) shadowDistSet;
+                shadowDistSet = IntersectJulia(rO, L, mu, maxIterations, epsilon,
+                        &shadowHitPoint, &steps);
+                if (shadowDistSet < epsilon) {
+                    if (useAO) {
+                        // Use steps count to simulate ambient occlusion
+                        shadowFactor = 0.6f - min(steps / 255.f, 0.5f);
+                    } else
+                        shadowFactor = 0.6f;
+                }
+            } else
+                shadowDistSet = -1.f;
+        }
+
+        //--------------------------------------------------------------------------
+        // Direct lighting of hit point
+        //--------------------------------------------------------------------------
+
+        color = Phong(light, eyeRayOrig, hitPoint, n, diffuse) * shadowFactor;
+    }
+
+    //--------------------------------------------------------------------------
+    // Write pixel
+    //--------------------------------------------------------------------------
+
+    int offset = 3 * (x + y * width);
+    color = clamp(color, (float4) (0.f, 0.f, 0.f, 0.f), (float4) (1.f, 1.f, 1.f, 0.f));
+    if (enableAccumulation) {
+        pixels[offset++] += color.s0;
+        pixels[offset++] += color.s1;
+        pixels[offset  ] += color.s2;
+    } else {
+        pixels[offset++] = color.s0;
+        pixels[offset++] = color.s1;
+        pixels[offset  ] = color.s2;
+    }
+}
+
+kernel void multiply(global float *array, const int numElements, const float s) {
+    const int gid = get_global_id(0);
+    if (gid >= numElements)  {
+        return;
+    }
+    array[gid] *= s;
+}
diff --git a/src/com/jogamp/opencl/demos/julia3d/structs/Camera.java b/src/com/jogamp/opencl/demos/julia3d/structs/Camera.java
new file mode 100644
index 0000000..68c567c
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/julia3d/structs/Camera.java
@@ -0,0 +1,50 @@
+/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
+
+
+package com.jogamp.opencl.demos.julia3d.structs;
+
+import com.jogamp.common.nio.*;
+
+
+public abstract class Camera {
+
+  StructAccessor accessor;
+
+  public static int size() {
+//    if (CPU.is32Bit()) {
+//      return Camera32.size();
+//    } else {
+      return Camera64.size();
+//    }
+  }
+
+  public static Camera create() {
+    return create(Buffers.newDirectByteBuffer(size()));
+  }
+
+  public static Camera create(java.nio.ByteBuffer buf) {
+//    if (CPU.is32Bit()) {
+//      return new Camera32(buf);
+//    } else {
+      return new Camera64(buf);
+//    }
+  }
+
+  Camera(java.nio.ByteBuffer buf) {
+    accessor = new StructAccessor(buf);
+  }
+
+  public java.nio.ByteBuffer getBuffer() {
+    return accessor.getBuffer();
+  }
+
+  public abstract Vec getOrig();
+
+  public abstract Vec getTarget();
+
+  public abstract Vec getDir();
+
+  public abstract Vec getX();
+
+  public abstract Vec getY();
+}
diff --git a/src/com/jogamp/opencl/demos/julia3d/structs/Camera32.java b/src/com/jogamp/opencl/demos/julia3d/structs/Camera32.java
new file mode 100644
index 0000000..1811583
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/julia3d/structs/Camera32.java
@@ -0,0 +1,37 @@
+/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
+
+
+package com.jogamp.opencl.demos.julia3d.structs;
+
+
+class Camera32 extends Camera {
+
+  public static int size() {
+    return 76;
+  }
+
+  Camera32(java.nio.ByteBuffer buf) {
+    super(buf);
+  }
+
+
+  public Vec getOrig() {
+    return Vec.create(accessor.slice(0, 12));
+  }
+
+  public Vec getTarget() {
+    return Vec.create(accessor.slice(16, 12));
+  }
+
+  public Vec getDir() {
+    return Vec.create(accessor.slice(32, 12));
+  }
+
+  public Vec getX() {
+    return Vec.create(accessor.slice(48, 12));
+  }
+
+  public Vec getY() {
+    return Vec.create(accessor.slice(64, 12));
+  }
+}
diff --git a/src/com/jogamp/opencl/demos/julia3d/structs/Camera64.java b/src/com/jogamp/opencl/demos/julia3d/structs/Camera64.java
new file mode 100644
index 0000000..f82d3b3
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/julia3d/structs/Camera64.java
@@ -0,0 +1,48 @@
+/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
+
+
+package com.jogamp.opencl.demos.julia3d.structs;
+
+
+class Camera64 extends Camera {
+
+  private final Vec orig;
+  private final Vec target;
+  private final Vec dir;
+  private final Vec x;
+  private final Vec y;
+
+  public static int size() {
+    return 60;
+  }
+
+  Camera64(java.nio.ByteBuffer buf) {
+    super(buf);
+    orig = Vec.create(accessor.slice(0, 12));
+    target = Vec.create(accessor.slice(12, 12));
+    dir = Vec.create(accessor.slice(24, 12));
+    x = Vec.create(accessor.slice(36, 12));
+    y = Vec.create(accessor.slice(48, 12));
+  }
+
+
+  public Vec getOrig() {
+    return orig;
+  }
+
+  public Vec getTarget() {
+    return target;
+  }
+
+  public Vec getDir() {
+    return dir;
+  }
+
+  public Vec getX() {
+    return x;
+  }
+
+  public Vec getY() {
+    return y;
+  }
+}
diff --git a/src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig.java b/src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig.java
new file mode 100644
index 0000000..4b14f1a
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig.java
@@ -0,0 +1,78 @@
+/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
+
+
+package com.jogamp.opencl.demos.julia3d.structs;
+
+import com.jogamp.common.nio.*;
+
+
+public abstract class RenderingConfig {
+
+  StructAccessor accessor;
+
+  public static int size() {
+//    if (CPU.is32Bit()) {
+//      return RenderingConfig32.size();
+//    } else {
+      return RenderingConfig64.size();
+//    }
+  }
+
+  public static RenderingConfig create() {
+    return create(Buffers.newDirectByteBuffer(size()));
+  }
+
+  public static RenderingConfig create(java.nio.ByteBuffer buf) {
+//    if (CPU.is32Bit()) {
+//      return new RenderingConfig32(buf);
+//    } else {
+      return new RenderingConfig64(buf);
+//    }
+  }
+
+  RenderingConfig(java.nio.ByteBuffer buf) {
+    accessor = new StructAccessor(buf);
+  }
+
+  public java.nio.ByteBuffer getBuffer() {
+    return accessor.getBuffer();
+  }
+
+  public abstract RenderingConfig setWidth(int val);
+
+  public abstract int getWidth();
+
+  public abstract RenderingConfig setHeight(int val);
+
+  public abstract int getHeight();
+
+  public abstract RenderingConfig setSuperSamplingSize(int val);
+
+  public abstract int getSuperSamplingSize();
+
+  public abstract RenderingConfig setActvateFastRendering(int val);
+
+  public abstract int getActvateFastRendering();
+
+  public abstract RenderingConfig setEnableShadow(int val);
+
+  public abstract int getEnableShadow();
+
+  public abstract RenderingConfig setMaxIterations(int val);
+
+  public abstract int getMaxIterations();
+
+  public abstract RenderingConfig setEpsilon(float val);
+
+  public abstract float getEpsilon();
+
+  public abstract RenderingConfig setMu(float[] val);
+
+  public abstract float[] getMu();
+
+  public abstract RenderingConfig setLight(float[] val);
+
+  public abstract float[] getLight();
+
+  public abstract Camera getCamera();
+}
diff --git a/src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig32.java b/src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig32.java
new file mode 100644
index 0000000..27f40e6
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig32.java
@@ -0,0 +1,102 @@
+/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
+
+
+package com.jogamp.opencl.demos.julia3d.structs;
+
+
+class RenderingConfig32 extends RenderingConfig {
+
+  public static int size() {
+    return 140;
+  }
+
+  RenderingConfig32(java.nio.ByteBuffer buf) {
+    super(buf);
+  }
+
+
+  public RenderingConfig setWidth(int val) {
+    accessor.setIntAt(0, val);
+    return this;
+  }
+
+  public int getWidth() {
+    return accessor.getIntAt(0);
+  }
+
+  public RenderingConfig setHeight(int val) {
+    accessor.setIntAt(1, val);
+    return this;
+  }
+
+  public int getHeight() {
+    return accessor.getIntAt(1);
+  }
+
+  public RenderingConfig setSuperSamplingSize(int val) {
+    accessor.setIntAt(2, val);
+    return this;
+  }
+
+  public int getSuperSamplingSize() {
+    return accessor.getIntAt(2);
+  }
+
+  public RenderingConfig setActvateFastRendering(int val) {
+    accessor.setIntAt(3, val);
+    return this;
+  }
+
+  public int getActvateFastRendering() {
+    return accessor.getIntAt(3);
+  }
+
+  public RenderingConfig setEnableShadow(int val) {
+    accessor.setIntAt(4, val);
+    return this;
+  }
+
+  public int getEnableShadow() {
+    return accessor.getIntAt(4);
+  }
+
+  public RenderingConfig setMaxIterations(int val) {
+    accessor.setIntAt(5, val);
+    return this;
+  }
+
+  public int getMaxIterations() {
+    return accessor.getIntAt(5);
+  }
+
+  public RenderingConfig setEpsilon(float val) {
+    accessor.setFloatAt(6, val);
+    return this;
+  }
+
+  public float getEpsilon() {
+    return accessor.getFloatAt(6);
+  }
+
+  public RenderingConfig setMu(float[] val) {
+    accessor.setFloatsAt(8, val);
+    return this;
+  }
+
+  public float[] getMu() {
+    return accessor.getFloatsAt(8, new float[4]);
+  }
+
+  public RenderingConfig setLight(float[] val) {
+    accessor.setFloatsAt(12, val);
+    return this;
+  }
+
+  public float[] getLight() {
+    return accessor.getFloatsAt(12, new float[3]);
+  }
+
+  public Camera getCamera() {
+    return Camera.create(accessor.slice(64, 76));
+  }
+}
diff --git a/src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig64.java b/src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig64.java
new file mode 100644
index 0000000..e60987e
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/julia3d/structs/RenderingConfig64.java
@@ -0,0 +1,105 @@
+/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
+
+
+package com.jogamp.opencl.demos.julia3d.structs;
+
+
+class RenderingConfig64 extends RenderingConfig {
+
+  private final Camera camera;
+
+  public static int size() {
+    return 116;
+  }
+
+  RenderingConfig64(java.nio.ByteBuffer buf) {
+    super(buf);
+    camera = Camera.create(accessor.slice(56, 60));
+  }
+
+
+  public RenderingConfig setWidth(int val) {
+    accessor.setIntAt(0, val);
+    return this;
+  }
+
+  public int getWidth() {
+    return accessor.getIntAt(0);
+  }
+
+  public RenderingConfig setHeight(int val) {
+    accessor.setIntAt(1, val);
+    return this;
+  }
+
+  public int getHeight() {
+    return accessor.getIntAt(1);
+  }
+
+  public RenderingConfig setSuperSamplingSize(int val) {
+    accessor.setIntAt(2, val);
+    return this;
+  }
+
+  public int getSuperSamplingSize() {
+    return accessor.getIntAt(2);
+  }
+
+  public RenderingConfig setActvateFastRendering(int val) {
+    accessor.setIntAt(3, val);
+    return this;
+  }
+
+  public int getActvateFastRendering() {
+    return accessor.getIntAt(3);
+  }
+
+  public RenderingConfig setEnableShadow(int val) {
+    accessor.setIntAt(4, val);
+    return this;
+  }
+
+  public int getEnableShadow() {
+    return accessor.getIntAt(4);
+  }
+
+  public RenderingConfig setMaxIterations(int val) {
+    accessor.setIntAt(5, val);
+    return this;
+  }
+
+  public int getMaxIterations() {
+    return accessor.getIntAt(5);
+  }
+
+  public RenderingConfig setEpsilon(float val) {
+    accessor.setFloatAt(6, val);
+    return this;
+  }
+
+  public float getEpsilon() {
+    return accessor.getFloatAt(6);
+  }
+
+  public RenderingConfig setMu(float[] val) {
+    accessor.setFloatsAt(7, val);
+    return this;
+  }
+
+  public float[] getMu() {
+    return accessor.getFloatsAt(7, new float[4]);
+  }
+
+  public RenderingConfig setLight(float[] val) {
+    accessor.setFloatsAt(11, val);
+    return this;
+  }
+
+  public float[] getLight() {
+    return accessor.getFloatsAt(11, new float[3]);
+  }
+
+  public Camera getCamera() {
+    return camera;
+  }
+}
diff --git a/src/com/jogamp/opencl/demos/julia3d/structs/Vec.java b/src/com/jogamp/opencl/demos/julia3d/structs/Vec.java
new file mode 100644
index 0000000..d4b2d48
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/julia3d/structs/Vec.java
@@ -0,0 +1,53 @@
+/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
+
+
+package com.jogamp.opencl.demos.julia3d.structs;
+
+
+import com.jogamp.common.nio.*;
+
+
+public abstract class Vec {
+
+  StructAccessor accessor;
+
+  public static int size() {
+//    if (CPU.is32Bit()) {
+//      return Vec32.size();
+//    } else {
+      return Vec64.size();
+//    }
+  }
+
+  public static Vec create() {
+    return create(Buffers.newDirectByteBuffer(size()));
+  }
+
+  public static Vec create(java.nio.ByteBuffer buf) {
+//    if (CPU.is32Bit()) {
+//      return new Vec32(buf);
+//    } else {
+      return new Vec64(buf);
+//    }
+  }
+
+  Vec(java.nio.ByteBuffer buf) {
+    accessor = new StructAccessor(buf);
+  }
+
+  public java.nio.ByteBuffer getBuffer() {
+    return accessor.getBuffer();
+  }
+
+  public abstract Vec setX(float val);
+
+  public abstract float getX();
+
+  public abstract Vec setY(float val);
+
+  public abstract float getY();
+
+  public abstract Vec setZ(float val);
+
+  public abstract float getZ();
+}
diff --git a/src/com/jogamp/opencl/demos/julia3d/structs/Vec32.java b/src/com/jogamp/opencl/demos/julia3d/structs/Vec32.java
new file mode 100644
index 0000000..e7668ac
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/julia3d/structs/Vec32.java
@@ -0,0 +1,44 @@
+/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
+
+
+package com.jogamp.opencl.demos.julia3d.structs;
+
+
+class Vec32 extends Vec {
+
+  public static int size() {
+    return 12;
+  }
+
+  Vec32(java.nio.ByteBuffer buf) {
+    super(buf);
+  }
+
+
+  public Vec setX(float val) {
+    accessor.setFloatAt(0, val);
+    return this;
+  }
+
+  public float getX() {
+    return accessor.getFloatAt(0);
+  }
+
+  public Vec setY(float val) {
+    accessor.setFloatAt(1, val);
+    return this;
+  }
+
+  public float getY() {
+    return accessor.getFloatAt(1);
+  }
+
+  public Vec setZ(float val) {
+    accessor.setFloatAt(2, val);
+    return this;
+  }
+
+  public float getZ() {
+    return accessor.getFloatAt(2);
+  }
+}
diff --git a/src/com/jogamp/opencl/demos/julia3d/structs/Vec64.java b/src/com/jogamp/opencl/demos/julia3d/structs/Vec64.java
new file mode 100644
index 0000000..60750a4
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/julia3d/structs/Vec64.java
@@ -0,0 +1,44 @@
+/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
+
+
+package com.jogamp.opencl.demos.julia3d.structs;
+
+
+class Vec64 extends Vec {
+
+  public static int size() {
+    return 12;
+  }
+
+  Vec64(java.nio.ByteBuffer buf) {
+    super(buf);
+  }
+
+
+  public Vec setX(float val) {
+    accessor.setFloatAt(0, val);
+    return this;
+  }
+
+  public float getX() {
+    return accessor.getFloatAt(0);
+  }
+
+  public Vec setY(float val) {
+    accessor.setFloatAt(1, val);
+    return this;
+  }
+
+  public float getY() {
+    return accessor.getFloatAt(1);
+  }
+
+  public Vec setZ(float val) {
+    accessor.setFloatAt(2, val);
+    return this;
+  }
+
+  public float getZ() {
+    return accessor.getFloatAt(2);
+  }
+}
diff --git a/src/com/jogamp/opencl/demos/radixsort/RadixSort.cl b/src/com/jogamp/opencl/demos/radixsort/RadixSort.cl
new file mode 100644
index 0000000..d014692
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/radixsort/RadixSort.cl
@@ -0,0 +1,358 @@
+/*
+* Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+*
+* NVIDIA Corporation and its licensors retain all intellectual property and 
+* proprietary rights in and to this software and related documentation.
+* Any use, reproduction, disclosure, or distribution of this software
+* and related documentation without an express license agreement from 
+* NVIDIA Corporation is strictly prohibited.
+* 
+* Please refer to the applicable NVIDIA end user license agreement (EULA) 
+* associated with this source code for terms and conditions that govern 
+* your use of this NVIDIA software.
+* 
+*/
+
+//----------------------------------------------------------------------------
+// Scans each warp in parallel ("warp-scan"), one element per thread.
+// uses 2 numElements of shared memory per thread (64 = elements per warp)
+//----------------------------------------------------------------------------
+//#define WARP_SIZE 32
+uint scanwarp(uint val, __local uint* sData, int maxlevel)
+{
+    // The following is the same as 2 * RadixSort::WARP_SIZE * warpId + threadInWarp = 
+    // 64*(threadIdx.x >> 5) + (threadIdx.x & (RadixSort::WARP_SIZE - 1))
+    int localId = get_local_id(0);
+    int idx = 2 * localId - (localId & (WARP_SIZE - 1));
+    sData[idx] = 0;
+    idx += WARP_SIZE;
+    sData[idx] = val;     
+
+    if (0 <= maxlevel) { sData[idx] += sData[idx - 1]; }
+    if (1 <= maxlevel) { sData[idx] += sData[idx - 2]; }
+    if (2 <= maxlevel) { sData[idx] += sData[idx - 4]; }
+    if (3 <= maxlevel) { sData[idx] += sData[idx - 8]; }
+    if (4 <= maxlevel) { sData[idx] += sData[idx -16]; }
+
+    return sData[idx] - val;  // convert inclusive -> exclusive
+}
+
+//----------------------------------------------------------------------------
+// scan4 scans 4*RadixSort::CTA_SIZE numElements in a block (4 per thread), using 
+// a warp-scan algorithm
+//----------------------------------------------------------------------------
+uint4 scan4(uint4 idata, __local uint* ptr)
+{    
+    
+    uint idx = get_local_id(0);
+
+    uint4 val4 = idata;
+    uint sum[3];
+    sum[0] = val4.x;
+    sum[1] = val4.y + sum[0];
+    sum[2] = val4.z + sum[1];
+    
+    uint val = val4.w + sum[2];
+    
+    val = scanwarp(val, ptr, 4);
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if ((idx & (WARP_SIZE - 1)) == WARP_SIZE - 1)
+    {
+        ptr[idx >> 5] = val + val4.w + sum[2];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+	if (idx < WARP_SIZE)
+		ptr[idx] = scanwarp(ptr[idx], ptr, 2);
+    
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    val += ptr[idx >> 5];
+
+    val4.x = val;
+    val4.y = val + sum[0];
+    val4.z = val + sum[1];
+    val4.w = val + sum[2];
+
+    return val4;
+}
+
+#ifdef MAC
+__kernel uint4 rank4(uint4 preds, __local uint* sMem)
+#else
+uint4 rank4(uint4 preds, __local uint* sMem)
+#endif
+{
+	int localId = get_local_id(0);
+	int localSize = get_local_size(0);
+
+	uint4 address = scan4(preds, sMem);
+	
+	__local uint numtrue;
+	if (localId == localSize - 1) 
+	{
+		numtrue = address.w + preds.w;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	uint4 rank;
+	int idx = localId*4;
+	rank.x = (preds.x) ? address.x : numtrue + idx - address.x;
+	rank.y = (preds.y) ? address.y : numtrue + idx + 1 - address.y;
+	rank.z = (preds.z) ? address.z : numtrue + idx + 2 - address.z;
+	rank.w = (preds.w) ? address.w : numtrue + idx + 3 - address.w;
+	
+	return rank;
+}
+
+void radixSortBlockKeysOnly(uint4 *key, uint nbits, uint startbit, __local uint* sMem)
+{
+	int localId = get_local_id(0);
+    int localSize = get_local_size(0);
+	
+	for(uint shift = startbit; shift < (startbit + nbits); ++shift)
+	{
+		uint4 lsb;
+		lsb.x = !(((*key).x >> shift) & 0x1);
+		lsb.y = !(((*key).y >> shift) & 0x1);
+        lsb.z = !(((*key).z >> shift) & 0x1);
+        lsb.w = !(((*key).w >> shift) & 0x1);
+        
+		uint4 r;
+		
+		r = rank4(lsb, sMem);
+
+        // This arithmetic strides the ranks across 4 CTA_SIZE regions
+        sMem[(r.x & 3) * localSize + (r.x >> 2)] = (*key).x;
+        sMem[(r.y & 3) * localSize + (r.y >> 2)] = (*key).y;
+        sMem[(r.z & 3) * localSize + (r.z >> 2)] = (*key).z;
+        sMem[(r.w & 3) * localSize + (r.w >> 2)] = (*key).w;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // The above allows us to read without 4-way bank conflicts:
+        (*key).x = sMem[localId];
+        (*key).y = sMem[localId +     localSize];
+        (*key).z = sMem[localId + 2 * localSize];
+        (*key).w = sMem[localId + 3 * localSize];
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+	}
+}
+
+__kernel void radixSortBlocksKeysOnly(__global uint4* keysIn, 
+                                      __global uint4* keysOut,
+                                      uint nbits,
+                                      uint startbit,
+                                      uint numElements,
+                                      uint totalBlocks,
+                                      __local uint* sMem)
+{
+	int globalId = get_global_id(0);
+	
+	uint4 key;
+	key = keysIn[globalId];
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	radixSortBlockKeysOnly(&key, nbits, startbit, sMem);
+	
+	keysOut[globalId] = key;
+}
+
+//----------------------------------------------------------------------------
+// Given an array with blocks sorted according to a 4-bit radix group, each 
+// block counts the number of keys that fall into each radix in the group, and 
+// finds the starting offset of each radix in the block.  It then writes the radix 
+// counts to the counters array, and the starting offsets to the blockOffsets array.
+//
+// Template parameters are used to generate efficient code for various special cases
+// For example, we have to handle arrays that are a multiple of the block size 
+// (fullBlocks) differently than arrays that are not. "loop" is used when persistent 
+// CTAs are used. 
+//
+// By persistent CTAs we mean that we launch only as many thread blocks as can 
+// be resident in the GPU and no more, rather than launching as many threads as
+// we have elements. Persistent CTAs loop over blocks of elements until all work
+// is complete.  This can be faster in some cases.  In our tests it is faster
+// for large sorts (and the threshold is higher on compute version 1.1 and earlier
+// GPUs than it is on compute version 1.2 GPUs.
+//                                
+//----------------------------------------------------------------------------
+__kernel void findRadixOffsets(__global uint2* keys,
+                               __global uint* counters,
+                               __global uint* blockOffsets,
+                               uint startbit,
+                               uint numElements,
+                               uint totalBlocks,
+                               __local uint* sRadix1)
+{
+	__local uint  sStartPointers[16];
+
+    uint groupId = get_group_id(0);
+    uint localId = get_local_id(0);
+    uint groupSize = get_local_size(0);
+
+    uint2 radix2;
+
+    radix2 = keys[get_global_id(0)];
+        
+
+    sRadix1[2 * localId]     = (radix2.x >> startbit) & 0xF;
+    sRadix1[2 * localId + 1] = (radix2.y >> startbit) & 0xF;
+
+    // Finds the position where the sRadix1 entries differ and stores start 
+    // index for each radix.
+    if(localId < 16) 
+    {
+        sStartPointers[localId] = 0; 
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) ) 
+    {
+        sStartPointers[sRadix1[localId]] = localId;
+    }
+    if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1]) 
+    {
+        sStartPointers[sRadix1[localId + groupSize]] = localId + groupSize;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(localId < 16) 
+    {
+        blockOffsets[groupId*16 + localId] = sStartPointers[localId];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Compute the sizes of each block.
+    if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) ) 
+    {
+        sStartPointers[sRadix1[localId - 1]] = 
+            localId - sStartPointers[sRadix1[localId - 1]];
+    }
+    if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1] ) 
+    {
+        sStartPointers[sRadix1[localId + groupSize - 1]] = 
+            localId + groupSize - sStartPointers[sRadix1[localId + groupSize - 1]];
+    }
+        
+
+    if(localId == groupSize - 1) 
+    {
+        sStartPointers[sRadix1[2 * groupSize - 1]] = 
+            2 * groupSize - sStartPointers[sRadix1[2 * groupSize - 1]];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(localId < 16) 
+    {
+        counters[localId * totalBlocks + groupId] = sStartPointers[localId];
+    }
+}
+
+// a naive scan routine that works only for array that
+// can fit into a single block, just for debugging purpose,
+// not used in the sort now
+__kernel void scanNaive(__global uint *g_odata, 
+                        __global uint *g_idata, 
+                        uint n,
+                        __local uint* temp)
+{
+
+    int localId = get_local_id(0);
+
+    int pout = 0;
+    int pin = 1;
+
+    // Cache the computational window in shared memory
+    temp[pout*n + localId] = (localId > 0) ? g_idata[localId-1] : 0;
+
+    for (int offset = 1; offset < n; offset *= 2)
+    {
+        pout = 1 - pout;
+        pin  = 1 - pout;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        temp[pout*n+localId] = temp[pin*n+localId];
+
+        if (localId >= offset)
+            temp[pout*n+localId] += temp[pin*n+localId - offset];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    g_odata[localId] = temp[pout*n+localId];
+}
+
+//----------------------------------------------------------------------------
+// reorderData shuffles data in the array globally after the radix offsets 
+// have been found. On compute version 1.1 and earlier GPUs, this code depends 
+// on RadixSort::CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits).
+// 
+// On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures
+// that all writes are coalesced using extra work in the kernel.  On later
+// GPUs coalescing rules have been relaxed, so this extra overhead hurts 
+// performance.  On these GPUs we set manualCoalesce=false and directly store
+// the results.
+//
+// Template parameters are used to generate efficient code for various special cases
+// For example, we have to handle arrays that are a multiple of the block size 
+// (fullBlocks) differently than arrays that are not.  "loop" is used when persistent 
+// CTAs are used. 
+//
+// By persistent CTAs we mean that we launch only as many thread blocks as can 
+// be resident in the GPU and no more, rather than launching as many threads as
+// we have elements. Persistent CTAs loop over blocks of elements until all work
+// is complete.  This can be faster in some cases.  In our tests it is faster
+// for large sorts (and the threshold is higher on compute version 1.1 and earlier
+// GPUs than it is on compute version 1.2 GPUs.
+//----------------------------------------------------------------------------
+__kernel void reorderDataKeysOnly(__global uint  *outKeys, 
+                                  __global uint2  *keys, 
+                                  __global uint  *blockOffsets, 
+                                  __global uint  *offsets, 
+                                  __global uint  *sizes, 
+                                  uint startbit,
+                                  uint numElements,
+                                  uint totalBlocks,
+                                  __local uint2* sKeys2)
+{
+    __local uint sOffsets[16];
+    __local uint sBlockOffsets[16];
+
+    __local uint *sKeys1 = (__local uint*)sKeys2; 
+
+    uint groupId = get_group_id(0);
+
+	uint globalId = get_global_id(0);
+    uint localId = get_local_id(0);
+    uint groupSize = get_local_size(0);
+
+    sKeys2[localId]   = keys[globalId];
+
+    if(localId < 16)  
+    {
+        sOffsets[localId]      = offsets[localId * totalBlocks + groupId];
+        sBlockOffsets[localId] = blockOffsets[groupId * 16 + localId];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    uint radix = (sKeys1[localId] >> startbit) & 0xF;
+    uint globalOffset = sOffsets[radix] + localId - sBlockOffsets[radix];
+
+    if (globalOffset < numElements)
+    {
+        outKeys[globalOffset]   = sKeys1[localId];
+    }
+
+    radix = (sKeys1[localId + groupSize] >> startbit) & 0xF;
+    globalOffset = sOffsets[radix] + localId + groupSize - sBlockOffsets[radix];
+
+    if (globalOffset < numElements)
+    {
+        outKeys[globalOffset]   = sKeys1[localId + groupSize];
+    }
+ 
+
+}
diff --git a/src/com/jogamp/opencl/demos/radixsort/RadixSort.java b/src/com/jogamp/opencl/demos/radixsort/RadixSort.java
new file mode 100644
index 0000000..e2a7b46
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/radixsort/RadixSort.java
@@ -0,0 +1,182 @@
+/*
+ * 20:38 Sunday, February 28 2010
+ */
+
+package com.jogamp.opencl.demos.radixsort;
+
+import com.jogamp.opencl.CLBuffer;
+import com.jogamp.opencl.CLCommandQueue;
+import com.jogamp.opencl.CLContext;
+import com.jogamp.opencl.CLKernel;
+import com.jogamp.opencl.CLProgram;
+import com.jogamp.opencl.CLResource;
+import java.io.IOException;
+import java.nio.IntBuffer;
+
+import static com.jogamp.opencl.CLMemory.Mem.*;
+import static com.jogamp.opencl.CLProgram.*;
+import static com.jogamp.opencl.CLProgram.CompilerOptions.*;
+
+/**
+ *
+ * @author Michael Bien
+ */
+public class RadixSort implements CLResource {
+
+    private static final int NUM_BANKS = 16;
+    private static final int WARP_SIZE = 32;
+    private static final int bitStep   = 4;
+
+    private final int CTA_SIZE;
+
+    private final CLKernel ckRadixSortBlocksKeysOnly;
+    private final CLKernel ckFindRadixOffsets;
+    private final CLKernel ckScanNaive;
+    private final CLKernel ckReorderDataKeysOnly;
+
+    private final CLBuffer<?> tempKeys;
+    private final CLBuffer<?> mCounters;
+    private final CLBuffer<?> mCountersSum;
+    private final CLBuffer<?> mBlockOffsets;
+
+    private final CLCommandQueue queue;
+    private final Scan scan;
+    private final CLProgram program;
+
+    public RadixSort(CLCommandQueue queue, int maxElements, int CTA_SIZE) throws IOException {
+
+        this.CTA_SIZE = CTA_SIZE;
+        scan = new Scan(queue, maxElements / 2 / CTA_SIZE * 16);
+
+        int numBlocks = ((maxElements % (CTA_SIZE * 4)) == 0)
+                ? (maxElements / (CTA_SIZE * 4)) : (maxElements / (CTA_SIZE * 4) + 1);
+
+        this.queue = queue;
+
+        CLContext context  = queue.getContext();
+        this.tempKeys      = context.createBuffer(4 * maxElements,           READ_WRITE);
+        this.mCounters     = context.createBuffer(4 * WARP_SIZE * numBlocks, READ_WRITE);
+        this.mCountersSum  = context.createBuffer(4 * WARP_SIZE * numBlocks, READ_WRITE);
+        this.mBlockOffsets = context.createBuffer(4 * WARP_SIZE * numBlocks, READ_WRITE);
+
+        program = context.createProgram(getClass().getResourceAsStream("RadixSort.cl"))
+                         .build(ENABLE_MAD, define("WARP_SIZE", WARP_SIZE));
+
+//        out.println(program.getBuildLog());
+
+        ckRadixSortBlocksKeysOnly  = program.createCLKernel("radixSortBlocksKeysOnly");
+        ckFindRadixOffsets         = program.createCLKernel("findRadixOffsets");
+        ckScanNaive                = program.createCLKernel("scanNaive");
+        ckReorderDataKeysOnly      = program.createCLKernel("reorderDataKeysOnly");
+
+    }
+
+    void sort(CLBuffer<IntBuffer> d_keys, int numElements, int keyBits) {
+        radixSortKeysOnly(d_keys, numElements, keyBits);
+    }
+
+    //----------------------------------------------------------------------------
+    // Main key-only radix sort function.  Sorts in place in the keys and values
+    // arrays, but uses the other device arrays as temporary storage.  All pointer
+    // parameters are device pointers.  Uses cudppScan() for the prefix sum of
+    // radix counters.
+    //----------------------------------------------------------------------------
+    void radixSortKeysOnly(CLBuffer<IntBuffer> keys, int numElements, int keyBits) {
+        int i = 0;
+        while (keyBits > i * bitStep) {
+            radixSortStepKeysOnly(keys, bitStep, i * bitStep, numElements);
+            i++;
+        }
+    }
+
+    //----------------------------------------------------------------------------
+    // Perform one step of the radix sort.  Sorts by nbits key bits per step,
+    // starting at startbit.
+    //----------------------------------------------------------------------------
+    void radixSortStepKeysOnly(CLBuffer<IntBuffer> keys, int nbits, int startbit, int numElements) {
+
+        // Four step algorithms from Satish, Harris & Garland
+        radixSortBlocksKeysOnlyOCL(keys, nbits, startbit, numElements);
+
+        findRadixOffsetsOCL(startbit, numElements);
+
+        scan.scanExclusiveLarge(mCountersSum, mCounters, 1, numElements / 2 / CTA_SIZE * 16);
+
+        reorderDataKeysOnlyOCL(keys, startbit, numElements);
+    }
+
+    //----------------------------------------------------------------------------
+    // Wrapper for the kernels of the four steps
+    //----------------------------------------------------------------------------
+    void radixSortBlocksKeysOnlyOCL(CLBuffer<IntBuffer> keys, int nbits, int startbit, int numElements) {
+
+        int totalBlocks = numElements / 4 / CTA_SIZE;
+        int globalWorkSize = CTA_SIZE * totalBlocks;
+        int localWorkSize = CTA_SIZE;
+
+        ckRadixSortBlocksKeysOnly.putArg(keys).putArg(tempKeys).putArg(nbits).putArg(startbit)
+                                 .putArg(numElements).putArg(totalBlocks).putNullArg(4 * CTA_SIZE * 4)
+                                 .rewind();
+
+        queue.put1DRangeKernel(ckRadixSortBlocksKeysOnly, 0, globalWorkSize, localWorkSize);
+    }
+
+    void findRadixOffsetsOCL(int startbit, int numElements) {
+
+        int totalBlocks = numElements / 2 / CTA_SIZE;
+        int globalWorkSize = CTA_SIZE * totalBlocks;
+        int localWorkSize = CTA_SIZE;
+
+        ckFindRadixOffsets.putArg(tempKeys).putArg(mCounters).putArg(mBlockOffsets)
+                          .putArg(startbit).putArg(numElements).putArg(totalBlocks).putNullArg(2 * CTA_SIZE * 4)
+                          .rewind();
+
+        queue.put1DRangeKernel(ckFindRadixOffsets, 0, globalWorkSize, localWorkSize);
+    }
+
+    void scanNaiveOCL(int numElements) {
+        
+        int nHist = numElements / 2 / CTA_SIZE * 16;
+        int globalWorkSize = nHist;
+        int localWorkSize = nHist;
+        int extra_space = nHist / NUM_BANKS;
+        int shared_mem_size = 4 * (nHist + extra_space);
+
+        ckScanNaive.putArg(mCountersSum).putArg(mCounters).putArg(nHist).putNullArg(2 * shared_mem_size).rewind();
+
+        queue.put1DRangeKernel(ckScanNaive, 0, globalWorkSize, localWorkSize);
+    }
+
+    void reorderDataKeysOnlyOCL(CLBuffer<IntBuffer> keys, int startbit, int numElements) {
+
+        int totalBlocks = numElements / 2 / CTA_SIZE;
+        int globalWorkSize = CTA_SIZE * totalBlocks;
+        int localWorkSize = CTA_SIZE;
+
+        ckReorderDataKeysOnly.putArg(keys).putArg(tempKeys).putArg(mBlockOffsets).putArg(mCountersSum).putArg(mCounters)
+                             .putArg(startbit).putArg(numElements).putArg(totalBlocks).putNullArg(2 * CTA_SIZE * 4).rewind();
+
+        queue.put1DRangeKernel(ckReorderDataKeysOnly, 0, globalWorkSize, localWorkSize);
+    }
+
+    public void release() {
+
+        scan.release();
+
+        //program & kernels
+        program.release();
+
+        //buffers
+        tempKeys.release();
+        mCounters.release();
+        mCountersSum.release();
+        mBlockOffsets.release();
+    }
+
+    public void close() {
+        release();
+    }
+
+
+
+}
diff --git a/src/com/jogamp/opencl/demos/radixsort/RadixSortDemo.java b/src/com/jogamp/opencl/demos/radixsort/RadixSortDemo.java
new file mode 100644
index 0000000..2ce429a
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/radixsort/RadixSortDemo.java
@@ -0,0 +1,129 @@
+/*
+ * 20:48 Sunday, February 28 2010
+ */
+
+package com.jogamp.opencl.demos.radixsort;
+
+import com.jogamp.opencl.CLBuffer;
+import com.jogamp.opencl.CLCommandQueue;
+import com.jogamp.opencl.CLContext;
+import com.jogamp.opencl.CLPlatform;
+import java.io.IOException;
+import java.nio.IntBuffer;
+import java.util.Random;
+
+import static com.jogamp.opencl.CLMemory.Mem.*;
+import static java.lang.System.*;
+import static com.jogamp.opencl.CLDevice.Type.*;
+
+/**
+ * GPU radix sort demo.
+ * @author Michael Bien
+ */
+public class RadixSortDemo {
+
+    public RadixSortDemo() throws IOException {
+
+        CLContext context = null;
+        try{
+            //single GPU setup
+            context = CLContext.create(CLPlatform.getDefault().getMaxFlopsDevice(GPU));
+            CLCommandQueue queue = context.getDevices()[0].createCommandQueue();
+
+            int maxValue = Integer.MAX_VALUE;
+            int samples  = 10;
+
+            int[] workgroupSizes = new int[] {128, 256};
+
+            int[] runs = new int[] {   32768,
+                                       65536,
+                                      131072,
+                                      262144,
+                                      524288,
+                                     1048576,
+                                     2097152,
+                                     4194304,
+                                     8388608 };
+
+            for (int i = 0; i < workgroupSizes.length; i++) {
+
+                int workgroupSize = workgroupSizes[i];
+
+                out.println("\n = = = workgroup size: "+workgroupSize+" = = = ");
+
+                for(int run = 0; run < runs.length; run++) {
+
+                    if(  workgroupSize==128 && runs[run] >= 8388608
+                      || workgroupSize==256 && runs[run] <= 32768) {
+                        continue; // we can only sort up to 4MB with wg size of 128
+                    }
+
+                    int numElements = runs[run];
+
+                    CLBuffer<IntBuffer> array = context.createIntBuffer(numElements, READ_WRITE);
+                    out.print("array size: " + array.getCLSize()/1000000.0f+"MB; ");
+                    out.println("elements: " + array.getCapacity()/1000+"K");
+
+                    fillBuffer(array, maxValue);
+
+                    RadixSort radixSort = new RadixSort(queue, numElements, workgroupSize);
+                    for(int a = 0; a < samples; a++) {
+
+                        queue.finish();
+
+                        long time = nanoTime();
+
+                        queue.putWriteBuffer(array, false);
+                        radixSort.sort(array, numElements, 32);
+                        queue.putReadBuffer(array, true);
+
+                        out.println("time: " + (nanoTime() - time)/1000000.0f+"ms");
+                    }
+
+                    out.print("snapshot: ");
+                    printSnapshot(array.getBuffer(), 20);
+
+                    out.println("validating...");
+                    checkIfSorted(array.getBuffer());
+                    out.println("values sorted");
+
+                    array.release();
+                    radixSort.release();
+                }
+            }
+
+        }finally{
+            if(context != null) {
+                context.release();
+            }
+        }
+
+    }
+
+    private void fillBuffer(CLBuffer<IntBuffer> array, int maxValue) {
+        Random random = new Random(42);
+        for (int n = 0; n < array.getBuffer().capacity(); n++) {
+            int rnd = random.nextInt(maxValue);
+            array.getBuffer().put(n, rnd);
+        }
+    }
+
+    private void printSnapshot(IntBuffer buffer, int snapshot) {
+        for(int i = 0; i < snapshot; i++)
+            out.print(buffer.get() + ", ");
+        out.println("...; " + buffer.remaining() + " more");
+        buffer.rewind();
+    }
+
+    private void checkIfSorted(IntBuffer keys) {
+        for (int i = 1; i < keys.capacity(); i++) {
+            if (keys.get(i - 1) > keys.get(i)) {
+                throw new RuntimeException("not sorted "+ keys.get(i - 1) +" !> "+ keys.get(i));
+            }
+        }
+    }
+
+    public static void main(String[] args) throws IOException {
+        new RadixSortDemo();
+    }
+}
diff --git a/src/com/jogamp/opencl/demos/radixsort/Scan.java b/src/com/jogamp/opencl/demos/radixsort/Scan.java
new file mode 100644
index 0000000..3d364ed
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/radixsort/Scan.java
@@ -0,0 +1,131 @@
+/*
+ * 22:12 Sunday, February 28 2010
+ */
+package com.jogamp.opencl.demos.radixsort;
+
+import com.jogamp.opencl.CLBuffer;
+import com.jogamp.opencl.CLCommandQueue;
+import com.jogamp.opencl.CLContext;
+import com.jogamp.opencl.CLKernel;
+import com.jogamp.opencl.CLProgram;
+import com.jogamp.opencl.CLResource;
+import java.io.IOException;
+
+import static com.jogamp.opencl.CLMemory.Mem.*;
+import static com.jogamp.opencl.CLProgram.CompilerOptions.*;
+
+/**
+ *
+ * @author Michael Bien
+ */
+public class Scan implements CLResource {
+
+    private final static int MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE = 1024;
+    private final static int MAX_LOCAL_GROUP_SIZE = 256;
+    private final static int WORKGROUP_SIZE = 256;
+    private final static int MAX_BATCH_ELEMENTS = 64 * 1048576;
+    private final static int MIN_SHORT_ARRAY_SIZE = 4;
+    private final static int MAX_SHORT_ARRAY_SIZE = 4 * WORKGROUP_SIZE;
+    private final static int MIN_LARGE_ARRAY_SIZE = 8 * WORKGROUP_SIZE;
+    private final static int MAX_LARGE_ARRAY_SIZE = 4 * WORKGROUP_SIZE * WORKGROUP_SIZE;
+
+    private final CLKernel ckScanExclusiveLocal1;
+    private final CLKernel ckScanExclusiveLocal2;
+    private final CLKernel ckUniformUpdate;
+
+    private final CLCommandQueue queue;
+    private final CLProgram program;
+    private CLBuffer<?> buffer;
+
+    public Scan(CLCommandQueue queue, int numElements) throws IOException {
+
+        this.queue = queue;
+
+        CLContext context = queue.getContext();
+        if (numElements > MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE) {
+            buffer = context.createBuffer(numElements / MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE * 4, READ_WRITE);
+        }
+        program = context.createProgram(getClass().getResourceAsStream("Scan_b.cl"))
+                         .build(ENABLE_MAD);
+
+        ckScanExclusiveLocal1 = program.createCLKernel("scanExclusiveLocal1");
+        ckScanExclusiveLocal2 = program.createCLKernel("scanExclusiveLocal2");
+        ckUniformUpdate       = program.createCLKernel("uniformUpdate");
+    }
+
+    // main exclusive scan routine
+    void scanExclusiveLarge(CLBuffer<?> dst, CLBuffer<?> src, int batchSize, int arrayLength) {
+
+        //Check power-of-two factorization
+        if(!isPowerOf2(arrayLength)) {
+            throw new RuntimeException();
+        }
+
+        //Check supported size range
+        if (!((arrayLength >= MIN_LARGE_ARRAY_SIZE) && (arrayLength <= MAX_LARGE_ARRAY_SIZE))) {
+            throw new RuntimeException();
+        }
+
+        //Check total batch size limit
+        if (!((batchSize * arrayLength) <= MAX_BATCH_ELEMENTS)) {
+            throw new RuntimeException();
+        }
+
+        scanExclusiveLocal1(dst, src, (batchSize * arrayLength) / (4 * WORKGROUP_SIZE), 4 * WORKGROUP_SIZE);
+        scanExclusiveLocal2(buffer, dst, src, batchSize, arrayLength / (4 * WORKGROUP_SIZE));
+        uniformUpdate(dst, buffer, (batchSize * arrayLength) / (4 * WORKGROUP_SIZE));
+    }
+
+    void scanExclusiveLocal1(CLBuffer<?> dst, CLBuffer<?> src, int n, int size) {
+
+        ckScanExclusiveLocal1.putArg(dst).putArg(src).putNullArg(2 * WORKGROUP_SIZE * 4).putArg(size)
+                             .rewind();
+
+        int localWorkSize = WORKGROUP_SIZE;
+        int globalWorkSize = (n * size) / 4;
+
+        queue.put1DRangeKernel(ckScanExclusiveLocal1, 0, globalWorkSize, localWorkSize);
+    }
+
+    void scanExclusiveLocal2(CLBuffer<?> buffer, CLBuffer<?> dst, CLBuffer<?> src, int n, int size) {
+
+        int elements = n * size;
+        ckScanExclusiveLocal2.putArg(buffer).putArg(dst).putArg(src).putNullArg(2 * WORKGROUP_SIZE * 4)
+                             .putArg(elements).putArg(size).rewind();
+
+        int localWorkSize = WORKGROUP_SIZE;
+        int globalWorkSize = iSnapUp(elements, WORKGROUP_SIZE);
+
+        queue.put1DRangeKernel(ckScanExclusiveLocal2, 0, globalWorkSize, localWorkSize);
+    }
+
+    void uniformUpdate(CLBuffer<?> dst, CLBuffer<?> buffer, int n) {
+
+        ckUniformUpdate.setArgs(dst, buffer);
+
+        int localWorkSize  = WORKGROUP_SIZE;
+        int globalWorkSize = n * WORKGROUP_SIZE;
+
+        queue.put1DRangeKernel(ckUniformUpdate, 0, globalWorkSize, localWorkSize);
+    }
+
+    private int iSnapUp(int dividend, int divisor) {
+        return ((dividend % divisor) == 0) ? dividend : (dividend - dividend % divisor + divisor);
+    }
+
+    public static boolean isPowerOf2(int x) {
+        return ((x - 1) & x) == 0;
+    }
+
+    public void release() {
+        program.release();
+
+        if(buffer!=null) {
+            buffer.release();
+        }
+    }
+
+    public void close() {
+        release();
+    }
+}
diff --git a/src/com/jogamp/opencl/demos/radixsort/Scan_b.cl b/src/com/jogamp/opencl/demos/radixsort/Scan_b.cl
new file mode 100644
index 0000000..32fd4dd
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/radixsort/Scan_b.cl
@@ -0,0 +1,190 @@
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation. 
+ * Any use, reproduction, disclosure, or distribution of this software 
+ * and related documentation without an express license agreement from
+ * NVIDIA Corporation is strictly prohibited.
+ *
+ * Please refer to the applicable NVIDIA end user license agreement (EULA) 
+ * associated with this source code for terms and conditions that govern 
+ * your use of this NVIDIA software.
+ * 
+ */
+
+
+
+//All three kernels run 512 threads per workgroup
+//Must be a power of two
+#define WORKGROUP_SIZE 256
+
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Scan codelets
+////////////////////////////////////////////////////////////////////////////////
+#if(1)
+    //Naive inclusive scan: O(N * log2(N)) operations
+    //Allocate 2 * 'size' local memory, initialize the first half
+    //with 'size' zeros avoiding if(pos >= offset) condition evaluation
+    //and saving instructions
+    inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){
+        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1));
+        l_Data[pos] = 0;
+        pos += size;
+        l_Data[pos] = idata;
+
+        for(uint offset = 1; offset < size; offset <<= 1){
+            barrier(CLK_LOCAL_MEM_FENCE);
+            uint t = l_Data[pos] + l_Data[pos - offset];
+            barrier(CLK_LOCAL_MEM_FENCE);
+            l_Data[pos] = t;
+        }
+
+        return l_Data[pos];
+    }
+
+    inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){
+        return scan1Inclusive(idata, l_Data, size) - idata;
+    }
+
+#else
+    #define LOG2_WARP_SIZE 5U
+    #define      WARP_SIZE (1U << LOG2_WARP_SIZE)
+
+    //Almost the same as naiveScan1 but doesn't need barriers
+    //assuming size <= WARP_SIZE
+    inline uint warpScanInclusive(uint idata, __local uint *l_Data, uint size){
+        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1));
+        l_Data[pos] = 0;
+        pos += size;
+        l_Data[pos] = idata;
+
+        for(uint offset = 1; offset < size; offset <<= 1)
+            l_Data[pos] += l_Data[pos - offset];
+
+        return l_Data[pos];
+    }
+
+    inline uint warpScanExclusive(uint idata, __local uint *l_Data, uint size){
+        return warpScanInclusive(idata, l_Data, size) - idata;
+    }
+
+    inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){
+        if(size > WARP_SIZE){
+            //Bottom-level inclusive warp scan
+            uint warpResult = warpScanInclusive(idata, l_Data, WARP_SIZE);
+
+            //Save top elements of each warp for exclusive warp scan
+            //sync to wait for warp scans to complete (because l_Data is being overwritten)
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if( (get_local_id(0) & (WARP_SIZE - 1)) == (WARP_SIZE - 1) )
+                l_Data[get_local_id(0) >> LOG2_WARP_SIZE] = warpResult;
+
+            //wait for warp scans to complete
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if( get_local_id(0) < (WORKGROUP_SIZE / WARP_SIZE) ){
+                //grab top warp elements
+                uint val = l_Data[get_local_id(0)];
+                //calculate exclsive scan and write back to shared memory
+                l_Data[get_local_id(0)] = warpScanExclusive(val, l_Data, size >> LOG2_WARP_SIZE);
+            }
+
+            //return updated warp scans with exclusive scan results
+            barrier(CLK_LOCAL_MEM_FENCE);
+            return warpResult + l_Data[get_local_id(0) >> LOG2_WARP_SIZE];
+        }else{
+            return warpScanInclusive(idata, l_Data, size);
+        }
+    }
+
+    inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){
+        return scan1Inclusive(idata, l_Data, size) - idata;
+    }
+#endif
+
+
+//Vector scan: the array to be scanned is stored
+//in work-item private memory as uint4
+inline uint4 scan4Inclusive(uint4 data4, __local uint *l_Data, uint size){
+    //Level-0 inclusive scan
+    data4.y += data4.x;
+    data4.z += data4.y;
+    data4.w += data4.z;
+
+    //Level-1 exclusive scan
+    uint val = scan1Inclusive(data4.w, l_Data, size / 4) - data4.w;
+
+    return (data4 + (uint4)val);
+}
+
+inline uint4 scan4Exclusive(uint4 data4, __local uint *l_Data, uint size){
+    return scan4Inclusive(data4, l_Data, size) - data4;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Scan kernels
+////////////////////////////////////////////////////////////////////////////////
+__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
+void scanExclusiveLocal1(
+    __global uint4 *d_Dst,
+    __global uint4 *d_Src,
+    __local uint* l_Data,
+    uint size
+){
+    //Load data
+    uint4 idata4 = d_Src[get_global_id(0)];
+
+    //Calculate exclusive scan
+    uint4 odata4  = scan4Exclusive(idata4, l_Data, size);
+
+    //Write back
+    d_Dst[get_global_id(0)] = odata4;
+}
+
+//Exclusive scan of top elements of bottom-level scans (4 * THREADBLOCK_SIZE)
+__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
+void scanExclusiveLocal2(
+    __global uint *d_Buf,
+    __global uint *d_Dst,
+    __global uint *d_Src,
+    __local uint* l_Data,
+    uint N,
+    uint arrayLength
+){
+    //Load top elements
+    //Convert results of bottom-level scan back to inclusive
+    //Skip loads and stores for inactive work-items of the work-group with highest index(pos >= N)
+    uint data = 0;
+    if(get_global_id(0) < N)
+    data =
+        d_Dst[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)] + 
+        d_Src[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)];
+
+    //Compute
+    uint odata = scan1Exclusive(data, l_Data, arrayLength);
+
+    //Avoid out-of-bound access
+    if(get_global_id(0) < N)
+        d_Buf[get_global_id(0)] = odata;
+}
+
+//Final step of large-array scan: combine basic inclusive scan with exclusive scan of top elements of input arrays
+__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
+void uniformUpdate(
+    __global uint4 *d_Data,
+    __global uint *d_Buf
+){
+    __local uint buf[1];
+
+    uint4 data4 = d_Data[get_global_id(0)];
+
+    if(get_local_id(0) == 0)
+        buf[0] = d_Buf[get_group_id(0)];
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    data4 += (uint4)buf[0];
+    d_Data[get_global_id(0)] = data4;
+}
diff --git a/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.cl b/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.cl
deleted file mode 100644
index a8d0e1d..0000000
--- a/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.cl
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
- *
- * NVIDIA Corporation and its licensors retain all intellectual property and 
- * proprietary rights in and to this software and related documentation. 
- * Any use, reproduction, disclosure, or distribution of this software 
- * and related documentation without an express license agreement from
- * NVIDIA Corporation is strictly prohibited.
- *
- * Please refer to the applicable NVIDIA end user license agreement (EULA) 
- * associated with this source code for terms and conditions that govern 
- * your use of this NVIDIA software.
- * 
- */
-
-
-
-//Passed down by clBuildProgram
-//#define LOCAL_SIZE_LIMIT 1024
-
-
-
-inline void ComparatorPrivate(
-    uint *keyA,
-    uint *keyB,
-    uint arrowDir
-){
-    if( (*keyA > *keyB) == arrowDir ){
-        uint t;
-        t = *keyA; *keyA = *keyB; *keyB = t;
-    }
-}
-
-inline void ComparatorLocal(
-    __local uint *keyA,
-    __local uint *keyB,
-    uint arrowDir
-){
-    if( (*keyA > *keyB) == arrowDir ){
-        uint t;
-        t = *keyA; *keyA = *keyB; *keyB = t;
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Monolithic bitonic sort kernel for short arrays fitting into local memory
-////////////////////////////////////////////////////////////////////////////////
-__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
-void bitonicSortLocal(
-    __global uint *d_DstKey,
-    __global uint *d_SrcKey,
-    uint arrayLength,
-    uint sortDir
-){
-    __local  uint l_key[LOCAL_SIZE_LIMIT];
-
-    //Offset to the beginning of subbatch and load data
-    d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
-    d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
-    l_key[get_local_id(0) +                      0] = d_SrcKey[                     0];
-    l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
-
-    for(uint size = 2; size < arrayLength; size <<= 1){
-        //Bitonic merge
-        uint dir = ( (get_local_id(0) & (size / 2)) != 0 );
-        for(uint stride = size / 2; stride > 0; stride >>= 1){
-            barrier(CLK_LOCAL_MEM_FENCE);
-            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
-            ComparatorLocal(
-                &l_key[pos +      0],
-                &l_key[pos + stride],
-                dir
-            );
-        }
-    }
-
-    //dir == sortDir for the last bitonic merge step
-    {
-        for(uint stride = arrayLength / 2; stride > 0; stride >>= 1){
-            barrier(CLK_LOCAL_MEM_FENCE);
-            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
-            ComparatorLocal(
-                &l_key[pos +      0],
-                &l_key[pos + stride],
-                sortDir
-            );
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-    d_DstKey[                     0] = l_key[get_local_id(0) +                      0];
-    d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Bitonic sort kernel for large arrays (not fitting into local memory)
-////////////////////////////////////////////////////////////////////////////////
-//Bottom-level bitonic sort
-//Almost the same as bitonicSortLocal with the only exception
-//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being
-//sorted in opposite directions
-__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
-void bitonicSortLocal1(
-    __global uint *d_DstKey,
-    __global uint *d_SrcKey
-){
-    __local uint l_key[LOCAL_SIZE_LIMIT];
-
-    //Offset to the beginning of subarray and load data
-    d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
-    d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
-    l_key[get_local_id(0) +                      0] = d_SrcKey[                     0];
-    l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
-
-    uint comparatorI = get_global_id(0) & ((LOCAL_SIZE_LIMIT / 2) - 1);
-
-    for(uint size = 2; size < LOCAL_SIZE_LIMIT; size <<= 1){
-        //Bitonic merge
-        uint dir = (comparatorI & (size / 2)) != 0;
-        for(uint stride = size / 2; stride > 0; stride >>= 1){
-            barrier(CLK_LOCAL_MEM_FENCE);
-            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
-            ComparatorLocal(
-                &l_key[pos +      0],
-                &l_key[pos + stride],
-                dir
-            );
-        }
-    }
-
-    //Odd / even arrays of LOCAL_SIZE_LIMIT elements
-    //sorted in opposite directions
-    {
-        uint dir = (get_group_id(0) & 1);
-        for(uint stride = LOCAL_SIZE_LIMIT / 2; stride > 0; stride >>= 1){
-            barrier(CLK_LOCAL_MEM_FENCE);
-            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
-            ComparatorLocal(
-                &l_key[pos +      0],
-                &l_key[pos + stride],
-               dir
-            );
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-    d_DstKey[                     0] = l_key[get_local_id(0) +                      0];
-    d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
-}
-
-//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
-__kernel void bitonicMergeGlobal(
-    __global uint *d_DstKey,
-    __global uint *d_SrcKey,
-    uint arrayLength,
-    uint size,
-    uint stride,
-    uint sortDir
-){
-    uint global_comparatorI = get_global_id(0);
-    uint        comparatorI = global_comparatorI & (arrayLength / 2 - 1);
-
-    //Bitonic merge
-    uint dir = sortDir ^ ( (comparatorI & (size / 2)) != 0 );
-    uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
-
-    uint keyA = d_SrcKey[pos +      0];
-    uint keyB = d_SrcKey[pos + stride];
-
-    ComparatorPrivate(
-        &keyA, 
-        &keyB, 
-        dir
-    );
-
-    d_DstKey[pos +      0] = keyA;
-    d_DstKey[pos + stride] = keyB;
-}
-
-//Combined bitonic merge steps for
-//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2]
-__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_LIMIT / 2, 1, 1)))
-void bitonicMergeLocal(
-    __global uint *d_DstKey,
-    __global uint *d_SrcKey,
-    uint arrayLength,
-    uint stride,
-    uint size,
-    uint sortDir
-){
-    __local uint l_key[LOCAL_SIZE_LIMIT];
-
-    d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
-    d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0);
-    l_key[get_local_id(0) +                      0] = d_SrcKey[                     0];
-    l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)];
-
-    //Bitonic merge
-    uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1);
-    uint         dir = sortDir ^ ( (comparatorI & (size / 2)) != 0 );
-    for(; stride > 0; stride >>= 1){
-        barrier(CLK_LOCAL_MEM_FENCE);
-        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
-        ComparatorLocal(
-            &l_key[pos +      0],
-            &l_key[pos + stride],
-            dir
-        );
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-    d_DstKey[                     0] = l_key[get_local_id(0) +                      0];
-    d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)];
-}
diff --git a/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.java b/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.java
deleted file mode 100644
index ed5b32e..0000000
--- a/src/com/mbien/opencl/demos/bitonicsort/BitonicSort.java
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * 18:42 Saturday, February 27 2010
- */
-package com.mbien.opencl.demos.bitonicsort;
-
-import com.mbien.opencl.CLBuffer;
-import com.mbien.opencl.CLCommandQueue;
-import com.mbien.opencl.CLContext;
-import com.mbien.opencl.CLDevice;
-import com.mbien.opencl.CLKernel;
-import com.mbien.opencl.CLProgram;
-import java.io.IOException;
-import java.nio.IntBuffer;
-import java.util.Map;
-import java.util.Random;
-
-import static java.lang.System.*;
-import static com.mbien.opencl.CLMemory.Mem.*;
-import static com.mbien.opencl.CLProgram.*;
-
-/**
- * Bitonic sort optimized for GPUs.
- * Uses NVIDIA's bitonic merge sort kernel.
- * @author Michael Bien
- */
-public class BitonicSort {
-
-    private static final String BITONIC_MERGE_GLOBAL = "bitonicMergeGlobal";
-    private static final String BITONIC_MERGE_LOCAL  = "bitonicMergeLocal";
-    private static final String BITONIC_SORT_LOCAL   = "bitonicSortLocal";
-    private static final String BITONIC_SORT_LOCAL1  = "bitonicSortLocal1";
-
-    private final static int LOCAL_SIZE_LIMIT = 1024;
-    private final Map<String, CLKernel> kernels;
-
-    public BitonicSort() throws IOException {
-
-        final int sortDir  = 1;
-        final int elements = 1048576;
-        final int maxvalue = 1000000;
-
-        out.println("Initializing OpenCL...");
-
-        //Create the context
-        CLContext context = null;
-
-        try{
-
-            context = CLContext.create();
-            CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue();
-
-            out.println("Initializing OpenCL bitonic sorter...");
-            kernels = initBitonicSort(queue);
-
-            out.println("Creating OpenCL memory objects...");
-            CLBuffer<IntBuffer> keyBuffer = context.createIntBuffer(elements, READ_ONLY, USE_BUFFER);
-            System.out.println(keyBuffer.getCLSize()/1000000.0f);
-
-            out.println("Initializing data...\n");
-            Random random = new Random();
-            for (int i = 0; i < elements; i++) {
-                int rnd = random.nextInt(maxvalue);
-                keyBuffer.getBuffer().put(i, rnd);
-            }
-
-            int arrayLength = elements;
-            int batch = elements / arrayLength;
-
-            out.printf("Test array length %d (%d arrays in the batch)...\n", arrayLength, batch);
-
-            long time = currentTimeMillis();
-
-            bitonicSort(queue, keyBuffer, keyBuffer, batch, arrayLength, sortDir);
-            queue.putReadBuffer(keyBuffer, true);
-
-            out.println(currentTimeMillis() - time+"ms");
-
-            IntBuffer keys = keyBuffer.getBuffer();
-            printSnapshot(keys, 20);
-            checkIfSorted(keys);
-
-            out.println("\nTEST PASSED");
-        
-        }finally{
-            if(context!=null) {
-                context.release();
-            }
-        }
-
-    }
-    
-    private Map<String, CLKernel> initBitonicSort(CLCommandQueue queue) throws IOException {
-
-        out.println("    creating bitonic sort program");
-
-        CLContext context = queue.getContext();
-
-        CLProgram program = context.createProgram(getClass().getResourceAsStream("BitonicSort.cl"))
-                                   .build(define("LOCAL_SIZE_LIMIT", LOCAL_SIZE_LIMIT));
-
-        Map<String, CLKernel> kernelMap = program.createCLKernels();
-
-        out.println("    checking minimum supported workgroup size");
-        //Check for work group size
-        CLDevice device = queue.getDevice();
-        long szBitonicSortLocal  = kernelMap.get(BITONIC_SORT_LOCAL).getWorkGroupSize(device);
-        long szBitonicSortLocal1 = kernelMap.get(BITONIC_SORT_LOCAL1).getWorkGroupSize(device);
-        long szBitonicMergeLocal = kernelMap.get(BITONIC_MERGE_LOCAL).getWorkGroupSize(device);
-
-        if (    (szBitonicSortLocal < (LOCAL_SIZE_LIMIT / 2))
-             || (szBitonicSortLocal1 < (LOCAL_SIZE_LIMIT / 2))
-             || (szBitonicMergeLocal < (LOCAL_SIZE_LIMIT / 2))  ) {
-            throw new RuntimeException("Minimum work-group size "+LOCAL_SIZE_LIMIT/2
-                    +" required by this application is not supported on this device.");
-        }
-
-        return kernelMap;
-
-    }
-
-    public void bitonicSort(CLCommandQueue queue, CLBuffer<?> dstKey, CLBuffer<?> srcKey, int batch, int arrayLength, int dir) {
-
-        if (arrayLength < 2) {
-            throw new IllegalArgumentException("arrayLength was "+arrayLength);
-        }
-
-        // TODO Only power-of-two array lengths are supported so far
-
-        dir = (dir != 0) ? 1 : 0;
-
-        CLKernel sortlocal1  = kernels.get(BITONIC_SORT_LOCAL1);
-        CLKernel sortlocal   = kernels.get(BITONIC_SORT_LOCAL);
-        CLKernel mergeGlobal = kernels.get(BITONIC_MERGE_GLOBAL);
-        CLKernel mergeLocal  = kernels.get(BITONIC_MERGE_LOCAL);
-
-        if (arrayLength <= LOCAL_SIZE_LIMIT) {
-
-            //        oclCheckError( (batch * arrayLength) % LOCAL_SIZE_LIMIT == 0, shrTRUE );
-
-            //Launch bitonicSortLocal
-            sortlocal.putArgs(dstKey, srcKey)
-                     .putArg(arrayLength).putArg(dir).rewind();
-
-            int localWorkSize = LOCAL_SIZE_LIMIT / 2;
-            int globalWorkSize = batch * arrayLength / 2;
-            queue.put1DRangeKernel(sortlocal, 0, globalWorkSize, localWorkSize);
-
-        } else {
-
-            //Launch bitonicSortLocal1
-            sortlocal1.setArgs(dstKey, srcKey);
-
-            int localWorkSize = LOCAL_SIZE_LIMIT / 2;
-            int globalWorkSize = batch * arrayLength / 2;
-
-            queue.put1DRangeKernel(sortlocal1, 0, globalWorkSize, localWorkSize);
-
-            for (int size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1) {
-                for (int stride = size / 2; stride > 0; stride >>= 1) {
-                    if (stride >= LOCAL_SIZE_LIMIT) {
-                        //Launch bitonicMergeGlobal
-                        mergeGlobal.putArgs(dstKey, dstKey)
-                                   .putArg(arrayLength).putArg(size).putArg(stride).putArg(dir).rewind();
-
-                        globalWorkSize = batch * arrayLength / 2;
-                        queue.put1DRangeKernel(mergeGlobal, 0, globalWorkSize, 0);
-                    } else {
-                        //Launch bitonicMergeLocal
-                        mergeLocal.putArgs(dstKey, dstKey)
-                                  .putArg(arrayLength).putArg(stride).putArg(size).putArg(dir).rewind();
-
-                        localWorkSize = LOCAL_SIZE_LIMIT / 2;
-                        globalWorkSize = batch * arrayLength / 2;
-
-                        queue.put1DRangeKernel(mergeLocal, 0, globalWorkSize, localWorkSize);
-                        break;
-                    }
-                }
-            }
-        }
-    }
-
-    private void printSnapshot(IntBuffer buffer, int snapshot) {
-        for(int i = 0; i < snapshot; i++)
-            out.print(buffer.get() + ", ");
-        out.println("...; " + buffer.remaining() + " more");
-        buffer.rewind();
-    }
-
-    private void checkIfSorted(IntBuffer keys) {
-        for (int i = 1; i < keys.capacity(); i++) {
-            if (keys.get(i - 1) > keys.get(i)) {
-                throw new RuntimeException("not sorted "+ keys.get(i - 1) +"!> "+ keys.get(i));
-            }
-        }
-    }
-
-    public static void main(String[] args) throws IOException {
-        new BitonicSort();
-    }
-}
diff --git a/src/com/mbien/opencl/demos/fractal/Mandelbrot.cl b/src/com/mbien/opencl/demos/fractal/Mandelbrot.cl
deleted file mode 100644
index 640c775..0000000
--- a/src/com/mbien/opencl/demos/fractal/Mandelbrot.cl
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifdef DOUBLE_FP
-    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
-    typedef double varfloat;
-#else
-    typedef float varfloat;
-#endif
-
-/**
- * For a description of this algorithm please refer to
- * http://en.wikipedia.org/wiki/Mandelbrot_set
- * @author Michael Bien
- */
-kernel void mandelbrot(
-        const int width,        const int height,
-        const varfloat x0,      const varfloat y0,
-        const varfloat rangeX,  const varfloat rangeY,
-        global uint *output,    global uint *colorMap,
-        const int colorMapSize, const int maxIterations) {
-
-    unsigned int ix = get_global_id(0);
-    unsigned int iy = get_global_id(1);
-
-    varfloat r = x0 + ix * rangeX / width;
-    varfloat i = y0 + iy * rangeY / height;
-
-    varfloat x = 0;
-    varfloat y = 0;
-
-    varfloat magnitudeSquared = 0;
-    int iteration = 0;
-
-    while (magnitudeSquared < 4 && iteration < maxIterations) {
-        varfloat x2 = x*x;
-        varfloat y2 = y*y;
-        y = 2 * x * y + i;
-        x = x2 - y2 + r;
-        magnitudeSquared = x2+y2;
-        iteration++;
-    }
-
-    if (iteration == maxIterations)  {
-        output[iy * width + ix] = 0;
-    }else {
-        varfloat alpha = (varfloat)iteration / maxIterations;
-        int colorIndex = (int)(alpha * colorMapSize);
-        output[iy * width + ix] = colorMap[colorIndex];
-      // monochrom
-      //  output[iy * width + ix] = 255*iteration/maxIterations;
-    }
-
-}
\ No newline at end of file
diff --git a/src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java b/src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java
deleted file mode 100644
index 265add8..0000000
--- a/src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java
+++ /dev/null
@@ -1,486 +0,0 @@
-package com.mbien.opencl.demos.fractal;
-
-import com.mbien.opencl.CLBuffer;
-import com.mbien.opencl.CLCommandQueue;
-import com.mbien.opencl.CLDevice;
-import com.mbien.opencl.CLEvent;
-import com.mbien.opencl.CLEventList;
-import com.mbien.opencl.CLException;
-import com.mbien.opencl.gl.CLGLBuffer;
-import com.mbien.opencl.gl.CLGLContext;
-import com.mbien.opencl.CLKernel;
-import com.mbien.opencl.CLPlatform;
-import com.mbien.opencl.CLProgram;
-import com.mbien.opencl.CLProgram.CompilerOptions;
-import com.jogamp.opengl.util.awt.TextRenderer;
-import java.awt.Color;
-import java.awt.Dimension;
-import java.awt.Font;
-import java.awt.Point;
-import java.awt.event.KeyAdapter;
-import java.awt.event.KeyEvent;
-import java.awt.event.MouseAdapter;
-import java.awt.event.MouseEvent;
-import java.awt.event.MouseWheelEvent;
-import java.io.IOException;
-import java.nio.IntBuffer;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import javax.media.opengl.DebugGL2;
-import javax.media.opengl.GL;
-import javax.media.opengl.GL2;
-import javax.media.opengl.GLAutoDrawable;
-import javax.media.opengl.GLCapabilities;
-import javax.media.opengl.GLContext;
-import javax.media.opengl.GLEventListener;
-import javax.media.opengl.GLProfile;
-import javax.media.opengl.awt.GLCanvas;
-import javax.swing.JFrame;
-import javax.swing.SwingUtilities;
-
-import static com.jogamp.common.nio.Buffers.*;
-import static javax.media.opengl.GL2.*;
-import static com.mbien.opencl.CLMemory.Mem.*;
-import static com.mbien.opencl.CLEvent.ProfilingCommand.*;
-import static com.mbien.opencl.CLCommandQueue.Mode.*;
-import static com.mbien.opencl.CLDevice.Type.*;
-import static java.lang.Math.*;
-
-/**
- * Computes the Mandelbrot set with OpenCL using multiple GPUs and renders the result with OpenGL.
- * A shared PBO is used as storage for the fractal image.<br/>
- * http://en.wikipedia.org/wiki/Mandelbrot_set
- * <p>
- * controls:<br/>
- * keys 1-9 control parallelism level<br/>
- * space enables/disables slice seperator<br/>
- * 'd' toggles between 32/64bit floatingpoint precision<br/>
- * mouse/mousewheel to drag and zoom<br/>
- * </p>
- * @author Michael Bien
- */
-public class MultiDeviceFractal implements GLEventListener {
-
-    // max number of used GPUs
-    private static final int MAX_PARRALLELISM_LEVEL = 8;
-
-    // max per pixel iterations to compute the fractal
-    private static final int MAX_ITERATIONS         = 500;
-
-    private GLCanvas canvas;
-
-    private CLGLContext clContext;
-    private CLCommandQueue[] queues;
-    private CLKernel[] kernels;
-    private CLProgram program;
-    private CLEventList probes;
-    private CLGLBuffer<?>[] pboBuffers;
-    private CLBuffer<IntBuffer>[] colorMap;
-
-    private int width  = 0;
-    private int height = 0;
-
-    private double minX = -2f;
-    private double minY = -1.2f;
-    private double maxX  = 0.6f;
-    private double maxY  = 1.3f;
-
-    private int slices;
-
-    private boolean drawSeperator;
-    private boolean doublePrecision;
-    private boolean buffersInitialized;
-    private boolean rebuild;
-
-    private final TextRenderer textRenderer;
-
-    public MultiDeviceFractal(int width, int height) {
-
-        this.width = width;
-        this.height = height;
-
-        canvas = new GLCanvas(new GLCapabilities(GLProfile.get(GLProfile.GL2)));
-        canvas.addGLEventListener(this);
-        initSceneInteraction();
-
-        JFrame frame = new JFrame("JOCL Multi GPU Mandelbrot Set");
-        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
-        canvas.setPreferredSize(new Dimension(width, height));
-        frame.add(canvas);
-        frame.pack();
-
-        frame.setVisible(true);
-
-        textRenderer = new TextRenderer(frame.getFont().deriveFont(Font.BOLD, 14), true, true, null, false);
-    }
-
-    public void init(GLAutoDrawable drawable) {
-
-        if(clContext == null) {
-            // enable GL error checking using the composable pipeline
-            drawable.setGL(new DebugGL2(drawable.getGL().getGL2()));
-
-            drawable.getGL().glFinish();
-            initCL(drawable.getContext());
-
-            GL2 gl = drawable.getGL().getGL2();
-
-            gl.setSwapInterval(0);
-            gl.glDisable(GL_DEPTH_TEST);
-            gl.glClearColor(0.0f, 0.0f, 0.0f, 1.0f);
-
-            initView(gl, drawable.getWidth(), drawable.getHeight());
-
-            initPBO(gl);
-            drawable.getGL().glFinish();
-
-            setKernelConstants();
-        }
-    }
-
-    private void initCL(GLContext glCtx){
-        try {
-            // create context managing all available GPUs
-//            clContext = CLGLContext.create(glCtx, GPU);
-            clContext = CLGLContext.create(glCtx, CLPlatform.getDefault().listCLDevices()[0]);
-
-
-            CLDevice[] devices = clContext.getDevices();
-
-            slices = min(devices.length, MAX_PARRALLELISM_LEVEL);
-
-            // create command queues for every GPU, setup colormap and init kernels
-            queues = new CLCommandQueue[slices];
-            kernels = new CLKernel[slices];
-            probes = new CLEventList(slices);
-            colorMap = new CLBuffer[slices];
-
-            for (int i = 0; i < slices; i++) {
-
-                colorMap[i] = clContext.createIntBuffer(32*2, READ_ONLY);
-                initColorMap(colorMap[i].getBuffer(), 32, Color.BLUE, Color.GREEN, Color.RED);
-
-                // create command queue and upload color map buffer on each used device
-                queues[i] = devices[i].createCommandQueue(PROFILING_MODE).putWriteBuffer(colorMap[i], true); // blocking upload
-
-            }
-
-            // load and build program
-            program = clContext.createProgram(getClass().getResourceAsStream("Mandelbrot.cl"));
-            buildProgram();
-
-        } catch (IOException ex) {
-            Logger.getLogger(getClass().getName()).log(Level.SEVERE, "can not find 'Mandelbrot.cl' in classpath.", ex);
-        } catch (CLException ex) {
-            Logger.getLogger(getClass().getName()).log(Level.SEVERE, "something went wrong, hopefully no one got hurt", ex);
-        }
-
-    }
-
-    private void initColorMap(IntBuffer colorMap, int stepSize, Color... colors) {
-        
-        for (int n = 0; n < colors.length - 1; n++) {
-
-            Color color = colors[n];
-            int r0 = color.getRed();
-            int g0 = color.getGreen();
-            int b0 = color.getBlue();
-
-            color = colors[n + 1];
-            int r1 = color.getRed();
-            int g1 = color.getGreen();
-            int b1 = color.getBlue();
-
-            int deltaR = r1 - r0;
-            int deltaG = g1 - g0;
-            int deltaB = b1 - b0;
-
-            for (int step = 0; step < stepSize; step++) {
-                float alpha = (float) step / (stepSize - 1);
-                int r = (int) (r0 + alpha * deltaR);
-                int g = (int) (g0 + alpha * deltaG);
-                int b = (int) (b0 + alpha * deltaB);
-                colorMap.put((r << 16) | (g << 8) | (b << 0));
-            }
-        }
-        colorMap.rewind();
-
-    }
-
-    private void initView(GL2 gl, int width, int height) {
-
-        gl.glViewport(0, 0, width, height);
-
-        gl.glMatrixMode(GL_MODELVIEW);
-        gl.glLoadIdentity();
-
-        gl.glMatrixMode(GL_PROJECTION);
-        gl.glLoadIdentity();
-        gl.glOrtho(0.0, width, 0.0, height, 0.0, 1.0);
-    }
-
-    @SuppressWarnings("unchecked")
-    private void initPBO(GL gl) {
-
-        if(pboBuffers != null) {
-            int[] oldPbos = new int[pboBuffers.length];
-            for (int i = 0; i < pboBuffers.length; i++) {
-                CLGLBuffer<?> buffer = pboBuffers[i];
-                oldPbos[i] = buffer.GLID;
-                buffer.release();
-            }
-            gl.glDeleteBuffers(oldPbos.length, oldPbos, 0);
-        }
-
-        pboBuffers = new CLGLBuffer[slices];
-
-        int[] pbo = new int[slices];
-        gl.glGenBuffers(slices, pbo, 0);
-
-        // setup one empty PBO per slice
-        for (int i = 0; i < slices; i++) {
-
-            gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo[i]);
-            gl.glBufferData(GL_PIXEL_UNPACK_BUFFER, width*height * SIZEOF_INT / slices, null, GL_STREAM_DRAW);
-            gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
-            pboBuffers[i] = clContext.createFromGLBuffer(pbo[i], WRITE_ONLY);
-
-        }
-
-        buffersInitialized = true;
-    }
-
-    private void buildProgram() {
-
-        /*
-         * workaround: The driver keeps using the old binaries for some reason.
-         * to solve this we simple create a new program and release the old.
-         * however rebuilding programs should be possible -> remove when drivers are fixed.
-         */
-        if(program != null && rebuild) {
-            String source = program.getSource();
-            program.release();
-            program = clContext.createProgram(source);
-        }
-
-        // disable 64bit floating point math if not available
-        if(doublePrecision) {
-            for (CLDevice device : program.getCLDevices()) {
-                if(!device.isDoubleFPAvailable()) {
-                    doublePrecision = false;
-                    break;
-                }
-            }
-        }
-
-        if(doublePrecision) {
-            program.build(CompilerOptions.FAST_RELAXED_MATH, "-D DOUBLE_FP");
-        }else{
-            program.build(CompilerOptions.FAST_RELAXED_MATH);
-        }
-        rebuild = false;
-
-        for (int i = 0; i < kernels.length; i++) {
-            // init kernel with constants
-            kernels[i] = program.createCLKernel("mandelbrot");
-        }
-
-    }
-
-    // init kernels with constants
-    private void setKernelConstants() {
-        for (int i = 0; i < slices; i++) {
-            kernels[i].setForce32BitArgs(!doublePrecision)
-                      .setArg(6, pboBuffers[i])
-                      .setArg(7, colorMap[i])
-                      .setArg(8, colorMap[i].getBuffer().capacity())
-                      .setArg(9, MAX_ITERATIONS);
-        }
-    }
-
-    // rendering cycle
-    public void display(GLAutoDrawable drawable) {
-        GL gl = drawable.getGL();
-
-        // make sure GL does not use our objects before we start computeing
-        gl.glFinish();
-        if(!buffersInitialized) {
-            initPBO(gl);
-            setKernelConstants();
-        }
-        if(rebuild) {
-            buildProgram();
-            setKernelConstants();
-        }
-        compute();
-
-        render(gl.getGL2());
-    }
-
-    // OpenCL
-    private void compute() {
-
-        int sliceWidth = width / slices;
-        double rangeX   = (maxX - minX) / slices;
-        double rangeY   = (maxY - minY);
-
-        // release all old events, you can't reuse events in OpenCL
-        probes.release();
-
-        // start computation
-        for (int i = 0; i < slices; i++) {
-
-            kernels[i].putArg(     sliceWidth).putArg(height)
-                      .putArg(minX + rangeX*i).putArg(  minY)
-                      .putArg(       rangeX  ).putArg(rangeY)
-                      .rewind();
-
-            // aquire GL objects, and enqueue a kernel with a probe from the list
-            queues[i].putAcquireGLObject(pboBuffers[i].ID)
-                     .put2DRangeKernel(kernels[i], 0, 0, sliceWidth, height, 0, 0, probes)
-                     .putReleaseGLObject(pboBuffers[i].ID);
-
-        }
-
-        // block until done (important: finish before doing further gl work)
-        for (int i = 0; i < slices; i++) {
-            queues[i].finish();
-        }
-
-    }
-
-    // OpenGL
-    private void render(GL2 gl) {
-
-        gl.glClear(GL_COLOR_BUFFER_BIT);
-
-        //draw slices
-        int sliceWidth = width / slices;
-
-        for (int i = 0; i < slices; i++) {
-
-            int seperatorOffset = drawSeperator?i:0;
-
-            gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pboBuffers[i].GLID);
-            gl.glRasterPos2i(sliceWidth*i + seperatorOffset, 0);
-
-            gl.glDrawPixels(sliceWidth, height, GL_BGRA, GL_UNSIGNED_BYTE, 0);
-
-        }
-        gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
-        //draw info text
-        textRenderer.beginRendering(width, height, false);
-
-            textRenderer.draw("precision: "+ (doublePrecision?"64bit":"32bit"), 10, height-15);
-
-            for (int i = 0; i < slices; i++) {
-                CLEvent event = probes.getEvent(i);
-                long start = event.getProfilingInfo(START);
-                long end = event.getProfilingInfo(END);
-                textRenderer.draw("GPU"+i +" "+(int)((end-start)/1000000.0f)+"ms", 10, height-(20+16*(slices-i)));
-            }
-
-        textRenderer.endRendering();
-    }
-
-    public void reshape(GLAutoDrawable drawable, int x, int y, int width, int height) {
-
-        if(this.width == width && this.height == height)
-            return;
-
-        this.width = width;
-        this.height = height;
-
-        initPBO(drawable.getGL());
-
-        initView(drawable.getGL().getGL2(), drawable.getWidth(), drawable.getHeight());
-    }
-
-    private void initSceneInteraction() {
-
-        MouseAdapter mouseAdapter = new MouseAdapter() {
-
-            Point lastpos = new Point();
-
-            @Override
-            public void mouseDragged(MouseEvent e) {
-                
-                double offsetX = (lastpos.x - e.getX()) * (maxX - minX) / width;
-                double offsetY = (lastpos.y - e.getY()) * (maxY - minY) / height;
-
-                minX += offsetX;
-                minY -= offsetY;
-
-                maxX += offsetX;
-                maxY -= offsetY;
-
-                lastpos = e.getPoint();
-
-                canvas.display();
-
-            }
-
-            @Override
-            public void mouseMoved(MouseEvent e) {
-                lastpos = e.getPoint();
-            }
-            
-            @Override
-            public void mouseWheelMoved(MouseWheelEvent e) {
-                float rotation = e.getWheelRotation() / 25.0f;
-
-                double deltaX = rotation * (maxX - minX);
-                double deltaY = rotation * (maxY - minY);
-
-                // offset for "zoom to cursor"
-                double offsetX = (e.getX() / (float)width - 0.5f) * deltaX * 2;
-                double offsetY = (e.getY() / (float)height- 0.5f) * deltaY * 2;
-
-                minX += deltaX+offsetX;
-                minY += deltaY-offsetY;
-
-                maxX +=-deltaX+offsetX;
-                maxY +=-deltaY-offsetY;
-
-                canvas.display();
-            }
-        };
-
-        KeyAdapter keyAdapter = new KeyAdapter() {
-
-            @Override
-            public void keyPressed(KeyEvent e) {
-                if(e.getKeyCode() == KeyEvent.VK_SPACE) {
-                    drawSeperator = !drawSeperator;
-                }else if(e.getKeyChar() > '0' && e.getKeyChar() < '9') {
-                    int number = e.getKeyChar()-'0';
-                    slices = min(number, min(queues.length, MAX_PARRALLELISM_LEVEL));
-                    buffersInitialized = false;
-                }else if(e.getKeyCode() == KeyEvent.VK_D) {
-                    doublePrecision = !doublePrecision;
-                    rebuild = true;
-                }
-                canvas.display();
-            }
-
-        };
-
-        canvas.addMouseMotionListener(mouseAdapter);
-        canvas.addMouseWheelListener(mouseAdapter);
-        canvas.addKeyListener(keyAdapter);
-    }
-
-    public void dispose(GLAutoDrawable drawable) {
-    }
-
-    public static void main(String args[]) {
-        SwingUtilities.invokeLater(new Runnable() {
-            public void run() {
-                new MultiDeviceFractal(512, 512);
-            }
-        });
-    }
-
-}
diff --git a/src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java.orig b/src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java.orig
deleted file mode 100644
index fc09d19..0000000
--- a/src/com/mbien/opencl/demos/fractal/MultiDeviceFractal.java.orig
+++ /dev/null
@@ -1,484 +0,0 @@
-package com.mbien.opencl.demos.fractal;
-
-import com.mbien.opencl.CLBuffer;
-import com.mbien.opencl.CLCommandQueue;
-import com.mbien.opencl.CLContext;
-import com.mbien.opencl.CLDevice;
-import com.mbien.opencl.CLEvent;
-import com.mbien.opencl.CLEventList;
-import com.mbien.opencl.CLException;
-import com.mbien.opencl.CLGLBuffer;
-import com.mbien.opencl.CLGLContext;
-import com.mbien.opencl.CLKernel;
-import com.mbien.opencl.CLProgram;
-import com.mbien.opencl.CLProgram.CompilerOptions;
-import com.sun.opengl.util.awt.TextRenderer;
-import java.awt.Color;
-import java.awt.Dimension;
-import java.awt.Font;
-import java.awt.Point;
-import java.awt.event.KeyAdapter;
-import java.awt.event.KeyEvent;
-import java.awt.event.MouseAdapter;
-import java.awt.event.MouseEvent;
-import java.awt.event.MouseWheelEvent;
-import java.io.IOException;
-import java.nio.IntBuffer;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import javax.media.opengl.DebugGL2;
-import javax.media.opengl.GL;
-import javax.media.opengl.GL2;
-import javax.media.opengl.GLAutoDrawable;
-import javax.media.opengl.GLCapabilities;
-import javax.media.opengl.GLContext;
-import javax.media.opengl.GLEventListener;
-import javax.media.opengl.GLProfile;
-import javax.media.opengl.awt.GLCanvas;
-import javax.swing.JFrame;
-import javax.swing.SwingUtilities;
-
-import static com.sun.gluegen.runtime.BufferFactory.*;
-import static javax.media.opengl.GL2.*;
-import static com.mbien.opencl.CLMemory.Mem.*;
-import static com.mbien.opencl.CLEvent.ProfilingCommand.*;
-import static com.mbien.opencl.CLCommandQueue.Mode.*;
-import static com.mbien.opencl.CLDevice.Type.*;
-import static java.lang.Math.*;
-
-/**
- * Computes the Mandelbrot set with OpenCL using multiple GPUs and renders the result with OpenGL.
- * A shared PBO is used as storage for the fractal image.<br/>
- * http://en.wikipedia.org/wiki/Mandelbrot_set
- * <p>
- * controls:<br/>
- * keys 1-9 control parallelism level<br/>
- * space enables/disables slice seperator<br/>
- * 'd' toggles between 32/64bit floatingpoint precision<br/>
- * mouse/mousewheel to drag and zoom<br/>
- * </p>
- * @author Michael Bien
- */
-public class MultiDeviceFractal implements GLEventListener {
-
-    // max number of used GPUs
-    private static final int MAX_PARRALLELISM_LEVEL = 8;
-
-    // max per pixel iterations to compute the fractal
-    private static final int MAX_ITERATIONS         = 1000;
-
-    private GLCanvas canvas;
-
-    private CLContext clContext;
-    private CLCommandQueue[] queues;
-    private CLKernel[] kernels;
-    private CLProgram program;
-    private CLEventList probes;
-    private CLBuffer<?>[] pboBuffers;
-    private CLBuffer<IntBuffer>[] colorMap;
-
-    private int width  = 0;
-    private int height = 0;
-
-    private double minX = -2f;
-    private double minY = -1.2f;
-    private double maxX  = 0.6f;
-    private double maxY  = 1.3f;
-
-    private int slices;
-
-    private boolean drawSeperator;
-    private boolean doublePrecision;
-    private boolean buffersInitialized;
-    private boolean rebuild;
-
-    private final TextRenderer textRenderer;
-
-    public MultiDeviceFractal(int width, int height) {
-
-        this.width = width;
-        this.height = height;
-
-        canvas = new GLCanvas(new GLCapabilities(GLProfile.get(GLProfile.GL2)));
-        canvas.addGLEventListener(this);
-        initSceneInteraction();
-
-        JFrame frame = new JFrame("JOCL Multi GPU Mandelbrot Set");
-        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
-        canvas.setPreferredSize(new Dimension(width, height));
-        frame.add(canvas);
-        frame.pack();
-
-        frame.setVisible(true);
-
-        textRenderer = new TextRenderer(frame.getFont().deriveFont(Font.BOLD, 14), true, true, null, false);
-    }
-
-    public void init(GLAutoDrawable drawable) {
-
-        // enable GL error checking using the composable pipeline
-        drawable.setGL(new DebugGL2(drawable.getGL().getGL2()));
-
-        initCL(drawable.getContext());
-
-        GL2 gl = drawable.getGL().getGL2();
-
-        gl.setSwapInterval(0);
-        gl.glDisable(GL_DEPTH_TEST);
-        gl.glClearColor(0.0f, 0.0f, 0.0f, 1.0f);
-
-        initView(gl, drawable.getWidth(), drawable.getHeight());
-
-        initPBO(gl);
-        setKernelConstants();
-    }
-
-    private void initCL(GLContext glCtx){
-        try {
-            // create context managing all available GPUs
-            clContext = CLContext.create(GPU);
-
-            CLDevice[] devices = clContext.getCLDevices();
-
-            slices = min(devices.length, MAX_PARRALLELISM_LEVEL);
-
-            // create command queues for every GPU, setup colormap and init kernels
-            queues = new CLCommandQueue[slices];
-            kernels = new CLKernel[slices];
-            probes = new CLEventList(slices);
-            colorMap = new CLBuffer[slices];
-
-            for (int i = 0; i < slices; i++) {
-
-                colorMap[i] = clContext.createIntBuffer(32*2, READ_ONLY);
-                initColorMap(colorMap[i].getBuffer(), 32, Color.BLUE, Color.GREEN, Color.RED);
-
-                // create command queue and upload color map buffer on each used device
-                queues[i] = devices[i].createCommandQueue(PROFILING_MODE).putWriteBuffer(colorMap[i], true); // blocking upload
-
-            }
-
-            // load and build program
-            program = clContext.createProgram(getClass().getResourceAsStream("Mandelbrot.cl"));
-            buildProgram();
-
-        } catch (IOException ex) {
-            Logger.getLogger(getClass().getName()).log(Level.SEVERE, "can not find 'Mandelbrot.cl' in classpath.", ex);
-        } catch (CLException ex) {
-            Logger.getLogger(getClass().getName()).log(Level.SEVERE, "something went wrong, hopefully no one got hurt", ex);
-        }
-
-    }
-
-    private void initColorMap(IntBuffer colorMap, int stepSize, Color... colors) {
-        
-        for (int n = 0; n < colors.length - 1; n++) {
-
-            Color color = colors[n];
-            int r0 = color.getRed();
-            int g0 = color.getGreen();
-            int b0 = color.getBlue();
-
-            color = colors[n + 1];
-            int r1 = color.getRed();
-            int g1 = color.getGreen();
-            int b1 = color.getBlue();
-
-            int deltaR = r1 - r0;
-            int deltaG = g1 - g0;
-            int deltaB = b1 - b0;
-
-            for (int step = 0; step < stepSize; step++) {
-                float alpha = (float) step / (stepSize - 1);
-                int r = (int) (r0 + alpha * deltaR);
-                int g = (int) (g0 + alpha * deltaG);
-                int b = (int) (b0 + alpha * deltaB);
-                colorMap.put((r << 16) | (g << 8) | (b << 0));
-            }
-        }
-        colorMap.rewind();
-
-    }
-
-    private void initView(GL2 gl, int width, int height) {
-
-        gl.glViewport(0, 0, width, height);
-
-        gl.glMatrixMode(GL_MODELVIEW);
-        gl.glLoadIdentity();
-
-        gl.glMatrixMode(GL_PROJECTION);
-        gl.glLoadIdentity();
-        gl.glOrtho(0.0, width, 0.0, height, 0.0, 1.0);
-    }
-
-    @SuppressWarnings("unchecked")
-    private void initPBO(GL gl) {
-
-        if(pboBuffers != null) {
-            int[] oldPbos = new int[pboBuffers.length];
-            for (int i = 0; i < pboBuffers.length; i++) {
-                CLBuffer<?> buffer = pboBuffers[i];
-//                oldPbos[i] = buffer.GLID;
-                buffer.release();
-            }
-//            gl.glDeleteBuffers(oldPbos.length, oldPbos, 0);
-        }
-
-        pboBuffers = new CLBuffer[slices];
-
-//        int[] pbo = new int[slices];
-//        gl.glGenBuffers(slices, pbo, 0);
-
-        // setup one empty PBO per slice
-        for (int i = 0; i < slices; i++) {
-
-//            gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo[i]);
-//            gl.glBufferData(GL_PIXEL_UNPACK_BUFFER, width*height * SIZEOF_INT / slices, null, GL_STREAM_DRAW);
-//            gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
-            pboBuffers[i] = clContext.createByteBuffer(width*height * SIZEOF_INT / slices, WRITE_ONLY);
-//            pboBuffers[i] = clContext.createFromGLBuffer(null, pbo[i], WRITE_ONLY);
-
-        }
-
-        buffersInitialized = true;
-    }
-
-    private void buildProgram() {
-
-        /*
-         * workaround: The driver keeps using the old binaries for some reason.
-         * to solve this we simple create a new program and release the old.
-         * however rebuilding programs should be possible -> remove when drivers are fixed.
-         */
-        if(program != null && rebuild) {
-            String source = program.getSource();
-            program.release();
-            program = clContext.createProgram(source);
-        }
-
-        // disable 64bit floating point math if not available
-        if(doublePrecision) {
-            for (CLDevice device : program.getCLDevices()) {
-                if(!device.isDoubleFPAvailable()) {
-                    doublePrecision = false;
-                    break;
-                }
-            }
-        }
-
-        if(doublePrecision) {
-            program.build(CompilerOptions.FAST_RELAXED_MATH, "-D DOUBLE_FP");
-        }else{
-            program.build(CompilerOptions.FAST_RELAXED_MATH);
-        }
-        rebuild = false;
-
-        for (int i = 0; i < kernels.length; i++) {
-            // init kernel with constants
-            kernels[i] = program.createCLKernel("mandelbrot");
-        }
-
-    }
-
-    // init kernels with constants
-    private void setKernelConstants() {
-        for (int i = 0; i < slices; i++) {
-            kernels[i].setForce32BitArgs(!doublePrecision)
-                      .setArg(6, pboBuffers[i])
-                      .setArg(7, colorMap[i])
-                      .setArg(8, colorMap[i].getBuffer().capacity())
-                      .setArg(9, MAX_ITERATIONS);
-        }
-    }
-
-    // rendering cycle
-    public void display(GLAutoDrawable drawable) {
-        GL gl = drawable.getGL();
-
-        if(!buffersInitialized) {
-            initPBO(gl);
-            setKernelConstants();
-        }
-        if(rebuild) {
-            buildProgram();
-            setKernelConstants();
-        }
-        // make sure GL does not use our objects before we start computeing
-        gl.glFinish();
-        compute();
-
-        render(gl.getGL2());
-    }
-
-    // OpenCL
-    private void compute() {
-
-        int sliceWidth = width / slices;
-        double rangeX   = (maxX - minX) / slices;
-        double rangeY   = (maxY - minY);
-
-        // release all old events, you can't reuse events in OpenCL
-        probes.release();
-
-        long time = System.currentTimeMillis();
-        // start computation
-        for (int i = 0; i < slices; i++) {
-
-            kernels[i].putArg(     sliceWidth).putArg(height)
-                      .putArg(minX + rangeX*i).putArg(  minY)
-                      .putArg(       rangeX  ).putArg(rangeY)
-                      .rewind();
-
-            // aquire GL objects, and enqueue a kernel with a probe from the list
-            queues[i]
-//                    .putAcquireGLObject(pboBuffers[i].ID)
-                     .put2DRangeKernel(kernels[i], 0, 0, sliceWidth, height, 0, 0, probes)
-//                     .putReleaseGLObject(pboBuffers[i].ID)
-                     ;
-
-        }
-
-        // block until done
-        for (int i = 0; i < slices; i++) {
-            queues[i].finish();
-        }
-        System.out.println((System.currentTimeMillis()-time)/1000.0f);
-
-    }
-
-    // OpenGL
-    private void render(GL2 gl) {
-
-        gl.glClear(GL_COLOR_BUFFER_BIT);
-
-        //draw slices
-        int sliceWidth = width / slices;
-
-//        for (int i = 0; i < slices; i++) {
-//
-//            int seperatorOffset = drawSeperator?i:0;
-//
-//            gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pboBuffers[i].GLID);
-//            gl.glRasterPos2i(sliceWidth*i + seperatorOffset, 0);
-//
-//            gl.glDrawPixels(sliceWidth, height, GL_BGRA, GL_UNSIGNED_BYTE, 0);
-//
-//        }
-//        gl.glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
-        //draw info text
-        textRenderer.beginRendering(width, height, false);
-
-            textRenderer.draw("precision: "+ (doublePrecision?"64bit":"32bit"), 10, height-15);
-
-            for (int i = 0; i < slices; i++) {
-                CLEvent event = probes.getEvent(i);
-                long start = event.getProfilingInfo(START);
-                long end = event.getProfilingInfo(END);
-                textRenderer.draw("GPU"+i +" "+((end-start)/1000000000.0f)+"s", 10, height-(20+16*(slices-i)));
-            }
-
-        textRenderer.endRendering();
-    }
-
-    public void reshape(GLAutoDrawable drawable, int x, int y, int width, int height) {
-
-        if(this.width == width && this.height == height)
-            return;
-
-        this.width = width;
-        this.height = height;
-
-        initPBO(drawable.getGL());
-
-        initView(drawable.getGL().getGL2(), drawable.getWidth(), drawable.getHeight());
-    }
-
-    private void initSceneInteraction() {
-
-        MouseAdapter mouseAdapter = new MouseAdapter() {
-
-            Point lastpos = new Point();
-
-            @Override
-            public void mouseDragged(MouseEvent e) {
-                
-                double offsetX = (lastpos.x - e.getX()) * (maxX - minX) / width;
-                double offsetY = (lastpos.y - e.getY()) * (maxY - minY) / height;
-
-                minX += offsetX;
-                minY -= offsetY;
-
-                maxX += offsetX;
-                maxY -= offsetY;
-
-                lastpos = e.getPoint();
-
-                canvas.display();
-
-            }
-
-            @Override
-            public void mouseMoved(MouseEvent e) {
-                lastpos = e.getPoint();
-            }
-            
-            @Override
-            public void mouseWheelMoved(MouseWheelEvent e) {
-                float rotation = e.getWheelRotation() / 25.0f;
-
-                double deltaX = rotation * (maxX - minX);
-                double deltaY = rotation * (maxY - minY);
-
-                // offset for "zoom to cursor"
-                double offsetX = (e.getX() / (float)width - 0.5f) * deltaX * 2;
-                double offsetY = (e.getY() / (float)height- 0.5f) * deltaY * 2;
-
-                minX += deltaX+offsetX;
-                minY += deltaY-offsetY;
-
-                maxX +=-deltaX+offsetX;
-                maxY +=-deltaY-offsetY;
-
-                canvas.display();
-            }
-        };
-
-        KeyAdapter keyAdapter = new KeyAdapter() {
-
-            @Override
-            public void keyPressed(KeyEvent e) {
-                if(e.getKeyCode() == KeyEvent.VK_SPACE) {
-                    drawSeperator = !drawSeperator;
-                }else if(e.getKeyChar() > '0' && e.getKeyChar() < '9') {
-                    int number = e.getKeyChar()-'0';
-                    slices = min(number, min(queues.length, MAX_PARRALLELISM_LEVEL));
-                    buffersInitialized = false;
-                }else if(e.getKeyCode() == KeyEvent.VK_D) {
-                    doublePrecision = !doublePrecision;
-                    rebuild = true;
-                }
-                canvas.display();
-            }
-
-        };
-
-        canvas.addMouseMotionListener(mouseAdapter);
-        canvas.addMouseWheelListener(mouseAdapter);
-        canvas.addKeyListener(keyAdapter);
-    }
-
-    public void dispose(GLAutoDrawable drawable) {
-    }
-
-    public static void main(String args[]) {
-        SwingUtilities.invokeLater(new Runnable() {
-            public void run() {
-                new MultiDeviceFractal(512, 512);
-            }
-        });
-    }
-
-}
diff --git a/src/com/mbien/opencl/demos/hellojocl/HelloJOCL.java b/src/com/mbien/opencl/demos/hellojocl/HelloJOCL.java
deleted file mode 100644
index acbd84c..0000000
--- a/src/com/mbien/opencl/demos/hellojocl/HelloJOCL.java
+++ /dev/null
@@ -1,91 +0,0 @@
-package com.mbien.opencl.demos.hellojocl;
-
-import com.mbien.opencl.CLBuffer;
-import com.mbien.opencl.CLCommandQueue;
-import com.mbien.opencl.CLContext;
-import com.mbien.opencl.CLKernel;
-import com.mbien.opencl.CLProgram;
-import java.io.IOException;
-import java.nio.FloatBuffer;
-import java.util.Random;
-
-import static java.lang.System.*;
-import static com.mbien.opencl.CLMemory.Mem.*;
-
-/**
- * Hello Java OpenCL example. Adds all elements of buffer A to buffer B
- * and stores the result in buffer C.<br/>
- * Sample was inspired by the Nvidia VectorAdd example written in C/C++
- * which is bundled in the Nvidia OpenCL SDK.
- * @author Michael Bien
- */
-public class HelloJOCL {
-
-    public static void main(String[] args) throws IOException {
-
-        int elementCount = 11444777;                                // Length of arrays to process
-        int localWorkSize = 256;                                    // Local work size dimensions
-        int globalWorkSize = roundUp(localWorkSize, elementCount);  // rounded up to the nearest multiple of the localWorkSize
-
-        // set up
-        CLContext context = CLContext.create();
-
-        CLProgram program = context.createProgram(HelloJOCL.class.getResourceAsStream("VectorAdd.cl")).build();
-
-        CLBuffer<FloatBuffer> clBufferA = context.createFloatBuffer(globalWorkSize, READ_ONLY);
-        CLBuffer<FloatBuffer> clBufferB = context.createFloatBuffer(globalWorkSize, READ_ONLY);
-        CLBuffer<FloatBuffer> clBufferC = context.createFloatBuffer(globalWorkSize, WRITE_ONLY);
-
-        out.println("used device memory: "
-            + (clBufferA.getSize()+clBufferB.getSize()+clBufferC.getSize())/1000000 +"MB");
-
-        // fill read buffers with random numbers (just to have test data; seed is fixed -> results will not change between runs).
-        fillBuffer(clBufferA.getBuffer(), 12345);
-        fillBuffer(clBufferB.getBuffer(), 67890);
-
-        // get a reference to the kernel functon with the name 'VectorAdd'
-        // and map the buffers to its input parameters.
-        CLKernel kernel = program.createCLKernel("VectorAdd");
-        kernel.putArgs(clBufferA, clBufferB, clBufferC).putArg(elementCount);
-
-        // create command queue on fastest device.
-        CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue();
-
-        // asynchronous write of data to GPU device, blocking read later to get the computed results back.
-        long time = nanoTime();
-        queue.putWriteBuffer(clBufferA, false)
-             .putWriteBuffer(clBufferB, false)
-             .put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize)
-             .putReadBuffer(clBufferC, true);
-        time = nanoTime() - time;
-
-        // cleanup all resources associated with this context.
-        context.release();
-
-        // print first few elements of the resulting buffer to the console.
-        out.println("a+b=c results snapshot: ");
-        for(int i = 0; i < 10; i++)
-            out.print(clBufferC.getBuffer().get() + ", ");
-        out.println("...; " + clBufferC.getBuffer().remaining() + " more");
-
-        out.println("computation took: "+(time/1000000)+"ms");
-
-    }
-
-    private static final void fillBuffer(FloatBuffer buffer, int seed) {
-        Random rnd = new Random(seed);
-        while(buffer.remaining() != 0)
-            buffer.put(rnd.nextFloat()*100);
-        buffer.rewind();
-    }
-
-    private static final int roundUp(int groupSize, int globalSize) {
-        int r = globalSize % groupSize;
-        if (r == 0) {
-            return globalSize;
-        } else {
-            return globalSize + groupSize - r;
-        }
-    }
-
-}
\ No newline at end of file
diff --git a/src/com/mbien/opencl/demos/hellojocl/VectorAdd.cl b/src/com/mbien/opencl/demos/hellojocl/VectorAdd.cl
deleted file mode 100644
index ac9dde2..0000000
--- a/src/com/mbien/opencl/demos/hellojocl/VectorAdd.cl
+++ /dev/null
@@ -1,15 +0,0 @@
-
-    // OpenCL Kernel Function for element by element vector addition
-    kernel void VectorAdd(global const float* a, global const float* b, global float* c, int numElements) {
-
-        // get index into global data array
-        int iGID = get_global_id(0);
-
-        // bound check (equivalent to the limit on a 'for' loop for standard/serial C code
-        if (iGID >= numElements)  {
-            return;
-        }
-
-        // add the vector elements
-        c[iGID] = a[iGID] + b[iGID];
-    }
\ No newline at end of file
diff --git a/src/com/mbien/opencl/demos/joglinterop/GLCLInteroperabilityDemo.java b/src/com/mbien/opencl/demos/joglinterop/GLCLInteroperabilityDemo.java
deleted file mode 100644
index 74783f3..0000000
--- a/src/com/mbien/opencl/demos/joglinterop/GLCLInteroperabilityDemo.java
+++ /dev/null
@@ -1,277 +0,0 @@
-package com.mbien.opencl.demos.joglinterop;
-
-import com.mbien.opencl.CLCommandQueue;
-import com.mbien.opencl.CLDevice;
-import com.mbien.opencl.gl.CLGLBuffer;
-import com.mbien.opencl.gl.CLGLContext;
-import com.mbien.opencl.CLKernel;
-import com.mbien.opencl.CLPlatform;
-import com.mbien.opencl.CLProgram;
-import com.jogamp.opengl.util.Animator;
-import java.io.IOException;
-import javax.media.opengl.DebugGL2;
-import javax.media.opengl.GL2;
-import javax.media.opengl.GLAutoDrawable;
-import javax.media.opengl.GLCapabilities;
-import javax.media.opengl.GLEventListener;
-import javax.media.opengl.GLProfile;
-import javax.media.opengl.awt.GLCanvas;
-import javax.media.opengl.glu.gl2.GLUgl2;
-import javax.swing.JFrame;
-import javax.swing.SwingUtilities;
-
-import static com.jogamp.common.nio.Buffers.*;
-
-/**
- * JOCL - JOGL interoperability example.
- * @author Michael Bien
- */
-public class GLCLInteroperabilityDemo implements GLEventListener {
-
-    private final GLUgl2 glu = new GLUgl2();
-
-    private final int MESH_SIZE = 256;
-
-    private int width;
-    private int height;
-
-//    private final FloatBuffer vb;
-//    private final IntBuffer ib;
-
-    private final int[] glObjects = new int[2];
-    private final int VERTICES = 0;
-//    private final int INDICES  = 1;
-
-    private final UserSceneInteraction usi;
-
-    private CLGLContext clContext;
-    private CLKernel kernel;
-    private CLCommandQueue commandQueue;
-    private CLGLBuffer<?> clBuffer;
-
-    private float step = 0;
-
-    public GLCLInteroperabilityDemo() {
-
-        this.usi = new UserSceneInteraction();
-
-        // create direct memory buffers
-//        vb = newFloatBuffer(MESH_SIZE * MESH_SIZE * 4);
-//        ib = newIntBuffer((MESH_SIZE - 1) * (MESH_SIZE - 1) * 2 * 3);
-//
-//        // build indices
-//        //    0---3
-//        //    | \ |
-//        //    1---2
-//        for (int h = 0; h < MESH_SIZE - 1; h++) {
-//            for (int w = 0; w < MESH_SIZE - 1; w++) {
-//
-//                // 0 - 3 - 2
-//                ib.put(w * 6 + h * (MESH_SIZE - 1) * 6,      w + (h    ) * (MESH_SIZE)    );
-//                ib.put(w * 6 + h * (MESH_SIZE - 1) * 6 + 1,  w + (h    ) * (MESH_SIZE) + 1);
-//                ib.put(w * 6 + h * (MESH_SIZE - 1) * 6 + 2,  w + (h + 1) * (MESH_SIZE) + 1);
-//
-//                // 0 - 2 - 1
-//                ib.put(w * 6 + h * (MESH_SIZE - 1) * 6 + 3,  w + (h    ) * (MESH_SIZE)    );
-//                ib.put(w * 6 + h * (MESH_SIZE - 1) * 6 + 4,  w + (h + 1) * (MESH_SIZE) + 1);
-//                ib.put(w * 6 + h * (MESH_SIZE - 1) * 6 + 5,  w + (h + 1) * (MESH_SIZE)    );
-//
-//            }
-//        }
-//        ib.rewind();
-
-        SwingUtilities.invokeLater(new Runnable() {
-            public void run() {
-                initUI();
-            }
-        });
-
-    }
-
-    private void initUI() {
-
-        this.width  = 600;
-        this.height = 400;
-
-        GLCapabilities config = new GLCapabilities(GLProfile.get(GLProfile.GL2));
-        config.setSampleBuffers(true);
-        config.setNumSamples(4);
-
-        GLCanvas canvas = new GLCanvas(config);
-        canvas.addGLEventListener(this);
-        usi.init(canvas);
-
-        JFrame frame = new JFrame("JOGL-JOCL Interoperability Example");
-        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
-        frame.add(canvas);
-        frame.setSize(width, height);
-
-        frame.setVisible(true);
-
-    }
-
-
-    public void init(GLAutoDrawable drawable) {
-
-        if(clContext == null) {
-
-            // find gl compatible device
-            CLDevice[] devices = CLPlatform.getDefault().listCLDevices();
-            CLDevice device = null;
-            for (CLDevice d : devices) {
-                if(d.isGLMemorySharingSupported()) {
-                    device = d;
-                    break;
-                }
-            }
-            // create OpenCL context before creating any OpenGL objects
-            // you want to share with OpenCL (AMD driver requirement)
-            clContext = CLGLContext.create(drawable.getContext(), device);
-
-            // enable GL error checking using the composable pipeline
-            drawable.setGL(new DebugGL2(drawable.getGL().getGL2()));
-
-            // OpenGL initialization
-            GL2 gl = drawable.getGL().getGL2();
-
-            gl.setSwapInterval(1);
-
-            gl.glPolygonMode(GL2.GL_FRONT_AND_BACK, GL2.GL_LINE);
-
-            gl.glGenBuffers(glObjects.length, glObjects, 0);
-
-    //        gl.glBindBuffer(GL2.GL_ELEMENT_ARRAY_BUFFER, glObjects[INDICES]);
-    //        gl.glBufferData(GL2.GL_ELEMENT_ARRAY_BUFFER, ib.capacity() * SIZEOF_INT, ib, GL2.GL_STATIC_DRAW);
-    //        gl.glBindBuffer(GL2.GL_ELEMENT_ARRAY_BUFFER, 0);
-
-            gl.glEnableClientState(GL2.GL_VERTEX_ARRAY);
-                gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, glObjects[VERTICES]);
-                gl.glBufferData(GL2.GL_ARRAY_BUFFER, MESH_SIZE * MESH_SIZE * 4 * SIZEOF_FLOAT, null, GL2.GL_DYNAMIC_DRAW);
-                gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, 0);
-            gl.glDisableClientState(GL2.GL_VERTEX_ARRAY);
-
-            pushPerspectiveView(gl);
-            gl.glFinish();
-
-            // init OpenCL
-            initCL();
-
-            // start rendering thread
-            Animator animator = new Animator(drawable);
-            animator.start();
-
-        }
-    }
-
-    private void initCL() {
-
-        CLProgram program;
-        try {
-            program = clContext.createProgram(getClass().getResourceAsStream("JoglInterop.cl"));
-            program.build();
-            System.out.println(program.getBuildStatus());
-            System.out.println(program.isExecutable());
-            System.out.println(program.getBuildLog());
-        } catch (IOException ex) {
-            throw new RuntimeException("can not handle exception", ex);
-        }
-
-        commandQueue = clContext.getMaxFlopsDevice().createCommandQueue();
-
-        clBuffer = clContext.createFromGLBuffer(glObjects[VERTICES], CLGLBuffer.Mem.WRITE_ONLY);
-
-        System.out.println("cl buffer type: " + clBuffer.getGLObjectType());
-        System.out.println("shared with gl buffer: " + clBuffer.getGLObjectID());
-
-        kernel = program.createCLKernel("sineWave")
-                        .putArg(clBuffer)
-                        .putArg(MESH_SIZE)
-                        .rewind();
-
-        System.out.println("cl initialised");
-    }
-
-
-    public void display(GLAutoDrawable drawable) {
-
-        GL2 gl = drawable.getGL().getGL2();
-
-        // ensure pipeline is clean before doing cl work
-        gl.glFinish();
-
-        computeHeightfield();
-
-        gl.glClear(GL2.GL_COLOR_BUFFER_BIT | GL2.GL_DEPTH_BUFFER_BIT);
-        gl.glLoadIdentity();
-
-        usi.interact(gl);
-
-        gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, glObjects[VERTICES]);
-        gl.glVertexPointer(4, GL2.GL_FLOAT, 0, 0);
-
-//            gl.glBindBuffer(GL2.GL_ELEMENT_ARRAY_BUFFER, glObjects[INDICES]);
-
-        gl.glEnableClientState(GL2.GL_VERTEX_ARRAY);
-        gl.glDrawArrays(GL2.GL_POINTS, 0, MESH_SIZE * MESH_SIZE);
-//            gl.glDrawElements(GL2.GL_TRIANGLES, ib.capacity(), GL2.GL_UNSIGNED_INT, 0);
-        gl.glDisableClientState(GL2.GL_VERTEX_ARRAY);
-
-//            gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, 0);
-
-    }
-
-    /*
-     * Computes a heightfield using a OpenCL kernel.
-     */
-    private void computeHeightfield() {
-
-        kernel.setArg(2, step += 0.05f);
-
-        commandQueue.putAcquireGLObject(clBuffer.ID)
-                    .put2DRangeKernel(kernel, 0, 0, MESH_SIZE, MESH_SIZE, 0, 0)
-                    .putReleaseGLObject(clBuffer.ID)
-                    .finish();
-
-    }
-
-    private void pushPerspectiveView(GL2 gl) {
-
-        gl.glMatrixMode(GL2.GL_PROJECTION);
-        gl.glPushMatrix();
-
-            gl.glLoadIdentity();
-
-            glu.gluPerspective(60, width / (float)height, 1, 1000);
-            gl.glMatrixMode(GL2.GL_MODELVIEW);
-
-            gl.glPushMatrix();
-                gl.glLoadIdentity();
-
-    }
-
-    private void popView(GL2 gl) {
-
-                gl.glMatrixMode(GL2.GL_PROJECTION);
-            gl.glPopMatrix();
-
-            gl.glMatrixMode(GL2.GL_MODELVIEW);
-        gl.glPopMatrix();
-
-    }
-
-
-    public void reshape(GLAutoDrawable drawable, int arg1, int arg2, int width, int height) {
-        this.width = width;
-        this.height = height;
-        GL2 gl = drawable.getGL().getGL2();
-        popView(gl);
-        pushPerspectiveView(gl);
-    }
-
-    public void dispose(GLAutoDrawable drawable) {  }
-
-    public static void main(String[] args) {
-        new GLCLInteroperabilityDemo();
-    }
-
-}
\ No newline at end of file
diff --git a/src/com/mbien/opencl/demos/joglinterop/JoglInterop.cl b/src/com/mbien/opencl/demos/joglinterop/JoglInterop.cl
deleted file mode 100644
index 0f0bcfc..0000000
--- a/src/com/mbien/opencl/demos/joglinterop/JoglInterop.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-
-/**
-* animated 2D sine pattern.
-*/
-kernel void sineWave(global float4 * vertex, int size, float time) {
-
-    unsigned int x = get_global_id(0);
-    unsigned int y = get_global_id(1);
-
-    // calculate uv coordinates
-    float u = x / (float) size;
-    float v = y / (float) size;
-
-    u = u*2.0f - 1.0f;
-    v = v*2.0f - 1.0f;
-
-    // calculate simple sine wave pattern
-    float freq = 4.0f;
-    float w = sin(u*freq + time) * cos(v*freq + time) * 0.5f;
-
-    // write output vertex
-    vertex[y*size + x] = (float4)(u*10.0f, w*10.0f, v*10.0f, 1.0f);
-}
diff --git a/src/com/mbien/opencl/demos/joglinterop/UserSceneInteraction.java b/src/com/mbien/opencl/demos/joglinterop/UserSceneInteraction.java
deleted file mode 100644
index bec2d28..0000000
--- a/src/com/mbien/opencl/demos/joglinterop/UserSceneInteraction.java
+++ /dev/null
@@ -1,103 +0,0 @@
-package com.mbien.opencl.demos.joglinterop;
-
-import java.awt.Component;
-import java.awt.Point;
-import java.awt.event.MouseAdapter;
-import java.awt.event.MouseEvent;
-import java.awt.event.MouseMotionAdapter;
-import java.awt.event.MouseWheelEvent;
-import java.awt.event.MouseWheelListener;
-import javax.media.opengl.GL2;
-
-/**
- * Utility class for interacting with a scene. Supports rotation and zoom around origin.
- * @author Michael Bien
- */
-public class UserSceneInteraction {
-
-    private float z = -20;
-    private float rotx = 45;
-    private float roty = 30;
-
-    private Point dragstart;
-    private enum MOUSE_MODE { DRAG_ROTATE, DRAG_ZOOM }
-    private MOUSE_MODE dragmode = MOUSE_MODE.DRAG_ROTATE;
-
-
-    public void init(Component component) {
-        initMouseListeners(component);
-    }
-
-    private void initMouseListeners(Component component) {
-        component.addMouseMotionListener(new MouseMotionAdapter() {
-
-            @Override
-            public void mouseDragged(MouseEvent e) {
-
-                if (dragstart != null) {
-                    switch (dragmode) {
-                        case DRAG_ROTATE:
-                            rotx += e.getY() - dragstart.getY();
-                            roty += e.getX() - dragstart.getX();
-                            break;
-                        case DRAG_ZOOM:
-                            z += (e.getY() - dragstart.getY()) / 5.0f;
-                            break;
-                    }
-                }
-
-                dragstart = e.getPoint();
-            }
-        });
-        component.addMouseWheelListener(new MouseWheelListener() {
-
-            public void mouseWheelMoved(MouseWheelEvent e) {
-                z += e.getWheelRotation()*5;
-            }
-
-        });
-        component.addMouseListener(new MouseAdapter() {
-
-            @Override
-            public void mousePressed(MouseEvent e) {
-                switch (e.getButton()) {
-                    case (MouseEvent.BUTTON1):
-                        dragmode = MOUSE_MODE.DRAG_ROTATE;
-                        break;
-                    case (MouseEvent.BUTTON2):
-                        dragmode = MOUSE_MODE.DRAG_ZOOM;
-                        break;
-                    case (MouseEvent.BUTTON3):
-                        dragmode = MOUSE_MODE.DRAG_ZOOM;
-                        break;
-                }
-            }
-
-            @Override
-            public void mouseReleased(MouseEvent e) {
-                switch (e.getButton()) {
-                    case (MouseEvent.BUTTON1):
-                        dragmode = MOUSE_MODE.DRAG_ZOOM;
-                        break;
-                    case (MouseEvent.BUTTON2):
-                        dragmode = MOUSE_MODE.DRAG_ROTATE;
-                        break;
-                    case (MouseEvent.BUTTON3):
-                        dragmode = MOUSE_MODE.DRAG_ROTATE;
-                        break;
-                }
-
-                dragstart = null;
-            }
-        });
-    }
-
-
-    public void interact(GL2 gl) {
-        gl.glTranslatef(0, 0, z);
-        gl.glRotatef(rotx, 1f, 0f, 0f);
-        gl.glRotatef(roty, 0f, 1.0f, 0f);
-    }
-
-
-}
\ No newline at end of file
diff --git a/src/com/mbien/opencl/demos/julia3d/Julia3d.java b/src/com/mbien/opencl/demos/julia3d/Julia3d.java
deleted file mode 100644
index 5d5378f..0000000
--- a/src/com/mbien/opencl/demos/julia3d/Julia3d.java
+++ /dev/null
@@ -1,212 +0,0 @@
-package com.mbien.opencl.demos.julia3d;
-
-import com.mbien.opencl.CLBuffer;
-import com.mbien.opencl.CLCommandQueue;
-import com.mbien.opencl.CLContext;
-import com.mbien.opencl.CLDevice;
-import com.mbien.opencl.CLKernel;
-import com.mbien.opencl.CLPlatform;
-import com.mbien.opencl.CLProgram;
-import com.mbien.opencl.demos.julia3d.structs.Camera;
-import com.mbien.opencl.demos.julia3d.structs.RenderingConfig;
-import com.mbien.opencl.demos.julia3d.structs.Vec;
-import java.io.IOException;
-import java.nio.Buffer;
-import java.nio.ByteBuffer;
-import java.nio.FloatBuffer;
-import javax.swing.SwingUtilities;
-
-import static com.mbien.opencl.CLMemory.Mem.*;
-import static com.mbien.opencl.CLProgram.CompilerOptions.*;
-import static com.mbien.opencl.demos.julia3d.UserSceneController.*;
-
-/**
- * This sample has been ported from David Buciarelli's juliaGPU v1.2 written in C.
- * @author Michael Bien
- */
-public class Julia3d {
-
-    private final CLContext context;
-    private CLBuffer<FloatBuffer> pixelBuffer;
-    private final CLBuffer<ByteBuffer> configBuffer;
-    private final CLCommandQueue commandQueue;
-    private final CLProgram program;
-    private final CLKernel julia;
-    private final CLKernel multiply;
-
-    private final int workGroupSize;
-    private final String kernelFileName = "rendering_kernel.cl";
-
-    final RenderingConfig config;
-
-    private Julia3d(RenderingConfig renderConfig) {
-        this.config = renderConfig;
-        updateCamera();
-
-        //setup
-        CLDevice gpu = CLPlatform.getDefault().getMaxFlopsDevice();
-        context = CLContext.create(gpu);
-
-        workGroupSize = 256;
-
-        //allocate buffers
-        configBuffer = context.createBuffer(config.getBuffer(), READ_ONLY);
-        commandQueue = gpu.createCommandQueue();
-//        update(true);
-
-        try {
-            program = context.createProgram(Julia3d.class.getResourceAsStream(kernelFileName))
-                             .build(FAST_RELAXED_MATH);
-        } catch (IOException ex) {
-            throw new RuntimeException("unable to load program from source", ex);
-        }
-
-        julia = program.createCLKernel("JuliaGPU");
-        multiply = program.createCLKernel("multiply");
-        System.out.println(program.getBuildStatus(gpu));
-        System.out.println(program.getBuildLog());
-
-    }
-
-    void update(boolean reallocate) {
-
-        updateCamera();
-
-        int bufferSize = config.getWidth() * config.getHeight() * 3;
-        if(reallocate) {
-            if(pixelBuffer != null) {
-                pixelBuffer.release();
-            }
-
-            pixelBuffer = context.createFloatBuffer(bufferSize, READ_WRITE, USE_BUFFER);
-        }
-
-        commandQueue.putWriteBuffer(configBuffer, true);
-        
-        julia.putArg(pixelBuffer)
-             .putArg(configBuffer)
-             .rewind();
-
-        multiply.putArg(pixelBuffer)
-                .putArg(bufferSize)
-                .rewind();
-    }
-
-
-    void compute(boolean fastRendering) {
-
-        // calculate workgroup size
-        int globalThreads = config.getWidth() * config.getHeight();
-        if(globalThreads % workGroupSize != 0)
-            globalThreads = (globalThreads / workGroupSize + 1) * workGroupSize;
-
-        int localThreads = workGroupSize;
-        int superSamplingSize = config.getSuperSamplingSize();
-
-        if (!fastRendering && superSamplingSize > 1) {
-            
-            for (int y = 0; y < superSamplingSize; ++y) {
-                for (int x = 0; x < superSamplingSize; ++x) {
-                    
-                    float sampleX = (x + 0.5f) / superSamplingSize;
-                    float sampleY = (y + 0.5f) / superSamplingSize;
-                    
-                    if (x == 0 && y == 0) {
-                        // First pass
-                        julia.setArg(2, 0)
-                             .setArg(3, sampleX)
-                             .setArg(4, sampleY);
-
-                        commandQueue.put1DRangeKernel(julia, 0, globalThreads, localThreads);
-                            
-                    } else if (x == (superSamplingSize - 1) && y == (superSamplingSize - 1)) {
-                        // Last pass
-                        julia.setArg(2, 1)
-                             .setArg(3, sampleX)
-                             .setArg(4, sampleY);
-
-                        // normalize the values we accumulated
-                        multiply.setArg(2, 1.0f/(superSamplingSize*superSamplingSize));
-                        
-                        commandQueue.put1DRangeKernel(julia,    0, globalThreads,   localThreads)
-                                    .put1DRangeKernel(multiply, 0, globalThreads*3, localThreads);
-                    } else {
-                        julia.setArg(2, 1)
-                             .setArg(3, sampleX)
-                             .setArg(4, sampleY);
-                        
-                        commandQueue.put1DRangeKernel(julia, 0, globalThreads, localThreads);
-                        
-                    }
-                }
-            }
-            
-        }else{
-            
-            //fast rendering
-            julia.setArg(2, 0)
-                 .setArg(3, 0.0f)
-                 .setArg(4, 0.0f);
-
-            commandQueue.put1DRangeKernel(julia, 0, globalThreads, localThreads);
-        }
-        
-        commandQueue.putBarrier()
-                    .putReadBuffer(pixelBuffer, true);
-
-    }
-
-    private void updateCamera() {
-
-        Camera camera = config.getCamera();
-
-        Vec dir    = camera.getDir();
-        Vec target = camera.getTarget();
-        Vec camX   = camera.getX();
-        Vec camY   = camera.getY();
-        Vec orig   = camera.getOrig();
-
-        vsub(dir, target, orig);
-        vnorm(dir);
-
-        Vec up = Vec.create().setX(0).setY(1).setZ(0);
-        vxcross(camX, dir, up);
-        vnorm(camX);
-        vmul(camX, config.getWidth() * .5135f / config.getHeight(), camX);
-
-        vxcross(camY, camX, dir);
-        vnorm(camY);
-        vmul(camY, .5135f, camY);
-    }
-
-
-    public static void main(String[] args) {
-
-        RenderingConfig config = RenderingConfig.create()
-            .setWidth(640).setHeight(480)
-            .setEnableShadow(1)
-            .setSuperSamplingSize(2)
-            .setActvateFastRendering(1)
-            .setMaxIterations(9)
-            .setEpsilon(0.003f * 0.75f)
-            .setLight(new float[] {5, 10, 15})
-            .setMu(new float[] {-0.2f, 0.4f, -0.4f, -0.4f});
-
-        config.getCamera().getOrig()  .setX(1).setY(2).setZ(8);
-        config.getCamera().getTarget().setX(0).setY(0).setZ(0);
-
-        final Julia3d julia3d = new Julia3d(config);
-
-        SwingUtilities.invokeLater(new Runnable() {
-            public void run() {
-                new Renderer(julia3d);
-            }
-        });
-    }
-
-    Buffer getPixelBuffer() {
-        return pixelBuffer.getBuffer();
-    }
-
-
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/Renderer.java b/src/com/mbien/opencl/demos/julia3d/Renderer.java
deleted file mode 100644
index 9266aba..0000000
--- a/src/com/mbien/opencl/demos/julia3d/Renderer.java
+++ /dev/null
@@ -1,203 +0,0 @@
-package com.mbien.opencl.demos.julia3d;
-
-import com.mbien.opencl.demos.julia3d.structs.RenderingConfig;
-import com.jogamp.opengl.util.awt.TextRenderer;
-import java.awt.Dimension;
-import java.awt.Font;
-import java.nio.FloatBuffer;
-import java.util.Timer;
-import java.util.TimerTask;
-import javax.media.opengl.GL2;
-import javax.media.opengl.GLAutoDrawable;
-import javax.media.opengl.GLCapabilities;
-import javax.media.opengl.GLEventListener;
-import javax.media.opengl.GLProfile;
-import javax.media.opengl.awt.GLCanvas;
-import javax.swing.JFrame;
-
-import static com.jogamp.common.nio.Buffers.*;
-import static javax.media.opengl.GL2.*;
-import static java.lang.String.*;
-
-/**
- * JOGL renderer for displaying the julia set.
- * @author Michael Bien
- */
-public class Renderer implements GLEventListener {
-
-    public final static int MU_RECT_SIZE = 80;
-
-    private final Julia3d julia3d;
-    private final GLCanvas canvas;
-    private final RenderingConfig config;
-    private final FloatBuffer juliaSlice;
-    private final UserSceneController usi;
-    private final TextRenderer textRenderer;
-
-    private TimerTask task;
-    private final Timer timer;
-
-    public Renderer(Julia3d julia3d) {
-        this.julia3d = julia3d;
-        this.config = julia3d.config;
-
-        timer = new Timer();
-
-        juliaSlice = newDirectFloatBuffer(MU_RECT_SIZE * MU_RECT_SIZE * 4);
-
-        canvas = new GLCanvas(new GLCapabilities(GLProfile.get(GLProfile.GL2)));
-        canvas.addGLEventListener(this);
-
-        usi = new UserSceneController();
-        usi.init(this, canvas, config);
-
-        JFrame frame = new JFrame("Java OpenCL - Julia3D GPU");
-        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
-        canvas.setPreferredSize(new Dimension(config.getWidth(), config.getHeight()));
-        frame.add(canvas);
-        frame.pack();
-
-        textRenderer = new TextRenderer(frame.getFont().deriveFont(Font.BOLD, 14), true, true, null, false);
-
-        frame.setVisible(true);
-    }
-
-    public void init(GLAutoDrawable drawable) {
-        drawable.getGL().getGL2().glMatrixMode(GL_PROJECTION);
-    }
-
-    void update() {
-        julia3d.update(false);
-        canvas.display();
-    }
-
-    public void display(GLAutoDrawable drawable) {
-
-        //compute
-        julia3d.compute(config.getActvateFastRendering() == 1);
-
-        GL2 gl = drawable.getGL().getGL2();
-        gl.glClear(GL_COLOR_BUFFER_BIT);
-
-        // draw julia set
-	gl.glRasterPos2i(0, 0);
-	gl.glDrawPixels(config.getWidth(), config.getHeight(), GL_RGB, GL_FLOAT, julia3d.getPixelBuffer());
-
-
-        // Draw Mu constant
-        int width = config.getWidth();
-        int height = config.getHeight();
-        float[] mu = config.getMu();
-
-	gl.glEnable(GL_BLEND);
-            gl.glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
-            int baseMu1 = width - MU_RECT_SIZE - 2;
-            int baseMu2 = 1;
-            drawJuliaSlice(gl, baseMu1, baseMu2, mu[0], mu[1]);
-            int baseMu3 = width - MU_RECT_SIZE - 2;
-            int baseMu4 = MU_RECT_SIZE + 2;
-            drawJuliaSlice(gl, baseMu3, baseMu4, mu[2], mu[3]);
-	gl.glDisable(GL_BLEND);
-
-	gl.glColor3f(1, 1, 1);
-	int mu1 = (int) (baseMu1 + MU_RECT_SIZE * (mu[0] + 1.5f) / 3.f);
-	int mu2 = (int) (baseMu2 + MU_RECT_SIZE * (mu[1] + 1.5f) / 3.f);
-	gl.glBegin(GL_LINES);
-            gl.glVertex2i(mu1 - 4, mu2);
-            gl.glVertex2i(mu1 + 4, mu2);
-            gl.glVertex2i(mu1, mu2 - 4);
-            gl.glVertex2i(mu1, mu2 + 4);
-	gl.glEnd();
-
-	int mu3 = (int) (baseMu3 + MU_RECT_SIZE * (mu[2] + 1.5f) / 3.f);
-	int mu4 = (int) (baseMu4 + MU_RECT_SIZE * (mu[3] + 1.5f) / 3.f);
-	gl.glBegin(GL_LINES);
-            gl.glVertex2i(mu3 - 4, mu4);
-            gl.glVertex2i(mu3 + 4, mu4);
-            gl.glVertex2i(mu3, mu4 - 4);
-            gl.glVertex2i(mu3, mu4 + 4);
-	gl.glEnd();
-
-        // info text
-        textRenderer.beginRendering(width, height);
-        textRenderer.draw(format("Epsilon %.5f - Max. Iter. %d", config.getEpsilon(), config.getMaxIterations()), 8, 10);
-        textRenderer.draw(format("Mu = (%.3f, %.3f, %.3f, %.3f)", mu[0], mu[1], mu[2], mu[3]), 8, 25);
-        textRenderer.draw(format("Shadow %s - SuperSampling %dx%d - Fast rendering %s",
-			config.getEnableShadow() == 1 ? "on" : "off",
-                        config.getSuperSamplingSize(), config.getSuperSamplingSize(),
-			config.getActvateFastRendering() == 1 ? "on" : "off"), 8, 40);
-        textRenderer.endRendering();
-
-        // timer task scheduling, delay gpu intensive high quality rendering
-        if(task != null) {
-            task.cancel();
-        }
-        if(config.getActvateFastRendering() == 1) {
-            task = new TimerTask() {
-                @Override
-                public void run() {
-                    config.setActvateFastRendering(0);
-                    update();
-                    config.setActvateFastRendering(1);
-                }
-            };
-            timer.schedule(task, 2000);
-        }
-    }
-
-    private void drawJuliaSlice(GL2 gl, int origX, int origY, float cR, float cI) {
-
-        int index = 0;
-        float invSize = 3.0f / MU_RECT_SIZE;
-        for (int i = 0; i < MU_RECT_SIZE; ++i) {
-            for (int j = 0; j < MU_RECT_SIZE; ++j) {
-
-                float x = i * invSize - 1.5f;
-                float y = j * invSize - 1.5f;
-
-                int iter;
-                for (iter = 0; iter < 64; ++iter) {
-                    float x2 = x * x;
-                    float y2 = y * y;
-                    if (x2 + y2 > 4.0f) {
-                        break;
-                    }
-
-                    float newx = x2 - y2 + cR;
-                    float newy = 2.f * x * y + cI;
-                    x = newx;
-                    y = newy;
-                }
-                
-                juliaSlice.put(index++, iter / 64.0f);
-                juliaSlice.put(index++, 0.0f);
-                juliaSlice.put(index++, 0.0f);
-                juliaSlice.put(index++, 0.5f);
-            }
-        }
-
-	gl.glRasterPos2i(origX, origY);
-	gl.glDrawPixels(MU_RECT_SIZE, MU_RECT_SIZE, GL_RGBA, GL_FLOAT, juliaSlice);
-    }
-
-
-    public void reshape(GLAutoDrawable drawable, int x, int y, int newWidth, int newHeight) {
-
-        config.setWidth(newWidth);
-	config.setHeight(newHeight);
-
-        GL2 gl = drawable.getGL().getGL2();
-
-	gl.glViewport(0, 0, newWidth, newHeight);
-	gl.glLoadIdentity();
-	gl.glOrtho(-0.5f, newWidth - 0.5f, -0.5f, newHeight - 0.5f, -1.0f, 1.0f);
-
-        julia3d.update(true);
-
-    }
-
-    public void dispose(GLAutoDrawable drawable) {
-    }
-
-
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/UserSceneController.java b/src/com/mbien/opencl/demos/julia3d/UserSceneController.java
deleted file mode 100644
index 849de3a..0000000
--- a/src/com/mbien/opencl/demos/julia3d/UserSceneController.java
+++ /dev/null
@@ -1,249 +0,0 @@
-package com.mbien.opencl.demos.julia3d;
-
-import com.mbien.opencl.demos.julia3d.structs.RenderingConfig;
-import com.mbien.opencl.demos.julia3d.structs.Vec;
-import java.awt.Component;
-import java.awt.Point;
-import java.awt.event.KeyAdapter;
-import java.awt.event.KeyEvent;
-import java.awt.event.MouseAdapter;
-import java.awt.event.MouseEvent;
-import java.awt.event.MouseWheelEvent;
-
-import static java.lang.Math.*;
-import static com.mbien.opencl.demos.julia3d.Renderer.*;
-
-/**
- * Utility class for interacting with a scene. Supports rotation and zoom around origin.
- * @author Michael Bien
- */
-public class UserSceneController {
-
-    private Point dragstart;
-    private RenderingConfig model;
-    private Renderer view;
-
-    private enum MOUSE_MODE { DRAG_ROTATE, DRAG_ZOOM }
-    private MOUSE_MODE dragmode = MOUSE_MODE.DRAG_ROTATE;
-
-
-    public void init(Renderer view, Component component, RenderingConfig model) {
-        initMouseListeners(component);
-        this.view = view;
-        this.model = model;
-    }
-
-    private void initMouseListeners(Component component) {
-
-        MouseAdapter mouseAdapter = new MouseAdapter() {
-            @Override
-            public void mouseDragged(MouseEvent e) {
-
-                int x = e.getX();
-                int y = e.getY();
-
-                switch (dragmode) {
-                    case DRAG_ROTATE:
-                        if (dragstart != null) {
-                            int height = model.getHeight();
-                            int width = model.getWidth();
-
-                            int ry = height - y - 1;
-                            int baseMu1 = width - MU_RECT_SIZE - 2;
-                            int baseMu2 = 1;
-                            int baseMu3 = width - MU_RECT_SIZE - 2;
-                            int baseMu4 = MU_RECT_SIZE + 2;
-
-                            if ((x >= baseMu1 && x <= baseMu1 + MU_RECT_SIZE) && (ry >= baseMu2 && ry <= baseMu2 + MU_RECT_SIZE)) {
-                                float[] mu = model.getMu();
-                                mu[0] = 3.f * ( x - baseMu1) / (float)MU_RECT_SIZE - 1.5f;
-                                mu[1] = 3.f * (ry - baseMu2) / (float)MU_RECT_SIZE - 1.5f;
-                                model.setMu(mu);
-                            } else if ((x >= baseMu3 && x <= baseMu3 + MU_RECT_SIZE) && (ry >= baseMu4 && ry <= baseMu4 + MU_RECT_SIZE)) {
-                                float[] mu = model.getMu();
-                                mu[2] = 3.f * ( x - baseMu3) / (float)MU_RECT_SIZE - 1.5f;
-                                mu[3] = 3.f * (ry - baseMu4) / (float)MU_RECT_SIZE - 1.5f;
-                                model.setMu(mu);
-                            } else {
-                                rotateCameraYbyOrig(0.01f * (x - dragstart.getX()));
-                                rotateCameraXbyOrig(0.01f * (y - dragstart.getY()));
-                            }
-                        }
-                        dragstart = e.getPoint();
-                        view.update();
-                        break;
-                    case DRAG_ZOOM:
-                        if (dragstart != null) {
-                            float zoom = (float) ((y - dragstart.getY()) / 10.0f);
-                            zoom(zoom);
-                        }
-                        dragstart = e.getPoint();
-                        view.update();
-                        break;
-                }
-
-            }
-
-            @Override
-            public void mousePressed(MouseEvent e) {
-                switch (e.getButton()) {
-                    case (MouseEvent.BUTTON1):
-                        dragmode = MOUSE_MODE.DRAG_ROTATE;
-                        break;
-                    case (MouseEvent.BUTTON2):
-                        dragmode = MOUSE_MODE.DRAG_ZOOM;
-                        break;
-                    case (MouseEvent.BUTTON3):
-                        dragmode = MOUSE_MODE.DRAG_ZOOM;
-                        break;
-                }
-            }
-
-            @Override
-            public void mouseReleased(MouseEvent e) {
-                switch (e.getButton()) {
-                    case (MouseEvent.BUTTON1):
-                        dragmode = MOUSE_MODE.DRAG_ZOOM;
-                        break;
-                    case (MouseEvent.BUTTON2):
-                        dragmode = MOUSE_MODE.DRAG_ROTATE;
-                        break;
-                    case (MouseEvent.BUTTON3):
-                        dragmode = MOUSE_MODE.DRAG_ROTATE;
-                        break;
-                }
-
-                dragstart = null;
-            }
-
-            @Override
-            public void mouseWheelMoved(MouseWheelEvent e) {
-                float zoom = e.getWheelRotation() * 0.1f;
-                zoom(zoom);
-                view.update();
-            }
-
-        };
-
-        KeyAdapter keyAdapter = new KeyAdapter() {
-
-            @Override
-            public void keyPressed(KeyEvent e) {
-
-                switch (e.getKeyChar()) {
-                    case 'l':
-                        model.setEnableShadow(model.getEnableShadow()==0 ? 1 : 0);
-                        break;
-                    case '1':
-                        model.setEpsilon(model.getEpsilon() * 0.75f);
-                        break;
-                    case '2':
-                        model.setEpsilon(model.getEpsilon() * 1.f / 0.75f);
-                        break;
-                    case '3':
-                        model.setMaxIterations(max(1, model.getMaxIterations() -1));
-                        break;
-                    case '4':
-                        model.setMaxIterations(min(12, model.getMaxIterations()+1));
-                        break;
-                    case '5':
-                        model.setSuperSamplingSize(max(1, model.getSuperSamplingSize() -1));
-                        break;
-                    case '6':
-                        model.setSuperSamplingSize(min(5, model.getSuperSamplingSize() +1));
-                        break;
-                    default:
-                        break;
-                }
-                view.update();
-
-            }
-
-        };
-
-        component.addKeyListener(keyAdapter);
-
-        component.addMouseListener(mouseAdapter);
-        component.addMouseMotionListener(mouseAdapter);
-        component.addMouseWheelListener(mouseAdapter);
-
-    }
-    private void zoom(float zoom) {
-        Vec orig = model.getCamera().getOrig();
-        orig.setX(orig.getX()+zoom)
-            .setY(orig.getY()+zoom)
-            .setZ(orig.getZ()+zoom);
-    }
-
-    private void rotateLightX(float k) {
-        float[] light = model.getLight();
-        float y = light[1];
-        float z = light[2];
-        light[1] = (float) ( y * cos(k) + z * sin(k));
-        light[2] = (float) (-y * sin(k) + z * cos(k));
-        model.setLight(light);
-    }
-
-    private void rotateLightY(float k) {
-        float[] light = model.getLight();
-        float x = light[0];
-        float z = light[2];
-        light[0] = (float) (x * cos(k) - z * sin(k));
-        light[2] = (float) (x * sin(k) + z * cos(k));
-        model.setLight(light);
-    }
-
-    private void rotateCameraXbyOrig(double k) {
-        Vec orig = model.getCamera().getOrig();
-        float y = orig.getY();
-        float z = orig.getZ();
-        orig.setY((float) ( y * cos(k) + z * sin(k)));
-        orig.setZ((float) (-y * sin(k) + z * cos(k)));
-    }
-
-    private void rotateCameraYbyOrig(double k) {
-        Vec orig = model.getCamera().getOrig();
-        float x = orig.getX();
-        float z = orig.getZ();
-        orig.setX((float) (x * cos(k) - z * sin(k)));
-        orig.setZ((float) (x * sin(k) + z * cos(k)));
-    }
-
-
-    public final static void vadd(Vec v, Vec a, Vec b) {
-        v.setX(a.getX() + b.getX());
-        v.setY(a.getY() + b.getY());
-        v.setZ(a.getZ() + b.getZ());
-    }
-
-    public final static void vsub(Vec v, Vec a, Vec b) {
-        v.setX(a.getX() - b.getX());
-        v.setY(a.getY() - b.getY());
-        v.setZ(a.getZ() - b.getZ());
-    }
-
-    public final static void vmul(Vec v, float s, Vec b) {
-        v.setX(s * b.getX());
-        v.setY(s * b.getY());
-        v.setZ(s * b.getZ());
-    }
-
-    public final static float vdot(Vec a, Vec b) {
-        return a.getX() * b.getX()
-             + a.getY() * b.getY()
-             + a.getZ() * b.getZ();
-    }
-
-    public final static void vnorm(Vec v) {
-        float s = (float) (1.0f / sqrt(vdot(v, v)));
-        vmul(v, s, v);
-    }
-
-    public final static void vxcross(Vec v, Vec a, Vec b) {
-        v.setX(a.getY() * b.getZ() - a.getZ() * b.getY());
-        v.setY(a.getZ() * b.getX() - a.getX() * b.getZ());
-        v.setZ(a.getX() * b.getY() - a.getY() * b.getX());
-    }
-
-
-}
\ No newline at end of file
diff --git a/src/com/mbien/opencl/demos/julia3d/config.h b/src/com/mbien/opencl/demos/julia3d/config.h
deleted file mode 100644
index 72df3ff..0000000
--- a/src/com/mbien/opencl/demos/julia3d/config.h
+++ /dev/null
@@ -1,24 +0,0 @@
-
-typedef struct {
-	float x, y, z; // position, also color (r,g,b)
-} Vec;
-
-typedef struct {
-	/* User defined values */
-	Vec orig, target;
-	/* Calculated values */
-	Vec dir, x, y;
-} Camera;
-
-typedef struct {
-	unsigned int width, height;
-	int superSamplingSize;
-	int actvateFastRendering;
-	int enableShadow;
-
-	unsigned int maxIterations;
-	float epsilon;
-	float mu[4];
-	float light[3];
-	Camera camera;
-} RenderingConfig;
diff --git a/src/com/mbien/opencl/demos/julia3d/mandelbrot_kernel.cl b/src/com/mbien/opencl/demos/julia3d/mandelbrot_kernel.cl
deleted file mode 100644
index d5acd02..0000000
--- a/src/com/mbien/opencl/demos/julia3d/mandelbrot_kernel.cl
+++ /dev/null
@@ -1,357 +0,0 @@
-/*
-Copyright (c) 2009 David Bucciarelli (davibu@interfree.it)
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice shall be included
-in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#define GPU_KERNEL
-
-
-typedef struct {
-	float x, y, z; // position, also color (r,g,b)
-} Vec;
-
-typedef struct {
-	/* User defined values */
-	Vec orig, target;
-	/* Calculated values */
-	Vec dir, x, y;
-} Camera;
-
-typedef struct {
-	unsigned int width, height;
-	int superSamplingSize;
-	int actvateFastRendering;
-	int enableShadow;
-
-	unsigned int maxIterations;
-	float epsilon;
-	float mu[4];
-	float light[3];
-	Camera camera;
-} RenderingConfig;
-
-#define BOUNDING_RADIUS_2 4.f
-
-// Scalar derivative approach by Enforcer:
-// http://www.fractalforums.com/mandelbulb-implementation/realtime-renderingoptimisations/
-static float IterateIntersect(const float4 z0, const float4 c0, const uint maxIterations) {
-	float4 z = z0;
-	float4 c = c0;
-
-	float dr = 1.0f;
-	float r2 = dot(z, z);
-	float r = sqrt(r2);
-	for (int n = 0; (n < maxIterations) && (r < 2.f); ++n) {
-		const float zo0 = asin(z.z / r);
-		const float zi0 = atan2(z.y, z.x);
-		float zr = r2 * r2 * r2 * r;
-		const float zo = zo0 * 7.f;
-		const float zi = zi0 * 7.f;
-		const float czo = cos(zo);
-
-		dr = zr * dr * 7.f + 1.f;
-		zr *= r;
-
-		z = zr * (float4)(czo * cos(zi), czo * sin(zi), sin(zo), 0.f);
-		z += c;
-
-		r2 = dot(z, z);
-		r = sqrt(r2);
-	}
-
-	return 0.5f * log(r) * r / dr;
-}
-
-static float IntersectBulb(const float4 eyeRayOrig, const float4 eyeRayDir,
-		const float4 c, const uint maxIterations, const float epsilon,
-		const float maxDist, float4 *hitPoint, uint *steps) {
-	float dist;
-	float4 r0 = eyeRayOrig;
-	float distDone = 0.f;
-
-	uint s = 0;
-	do {
-		dist = IterateIntersect(r0, c, maxIterations);
-		distDone += dist;
-		// We are inside
-		if (dist <= 0.f)
-			break;
-
-		r0 += eyeRayDir * dist;
-		s++;
-	} while ((dist > epsilon) && (distDone < maxDist));
-
-	*hitPoint = r0;
-	*steps = s;
-	return dist;
-}
-
-#define WORLD_RADIUS 1000.f
-#define WORLD_CENTER ((float4)(0.f, -WORLD_RADIUS - 2.f, 0.f, 0.f))
-float IntersectFloorSphere(const float4 eyeRayOrig, const float4 eyeRayDir) {
-	const float4 op = WORLD_CENTER - eyeRayOrig;
-	const float b = dot(op, eyeRayDir);
-	float det = b * b - dot(op, op) + WORLD_RADIUS * WORLD_RADIUS;
-
-	if (det < 0.f)
-		return -1.f;
-	else
-		det = sqrt(det);
-
-	float t = b - det;
-	if (t > 0.f)
-		return t;
-	else {
-		// We are inside, avoid the hit
-		return -1.f;
-	}
-}
-
-int IntersectBoundingSphere(const float4 eyeRayOrig, const float4 eyeRayDir,
-		float *tmin, float*tmax) {
-	const float4 op = -eyeRayOrig;
-	const float b = dot(op, eyeRayDir);
-	float det = b * b - dot(op, op) + BOUNDING_RADIUS_2;
-
-	if (det < 0.f)
-		return 0;
-	else
-		det = sqrt(det);
-
-	float t1 = b - det;
-	float t2 = b + det;
-	if (t1 > 0.f) {
-		*tmin = t1;
-		*tmax = t2;
-		return 1;
-	} else {
-		if (t2 > 0.f) {
-			// We are inside, start from the ray origin
-			*tmin = 0.f;
-			*tmax = t2;
-
-			return 1;
-		} else
-			return 0;
-	}
-}
-
-static float4 NormEstimate(const float4 p, const float4 c,
-		const float delta, const uint maxIterations) {
-	const float4 qP = p;
-	const float4 gx1 = qP - (float4)(delta, 0.f, 0.f, 0.f);
-	const float4 gx2 = qP + (float4)(delta, 0.f, 0.f, 0.f);
-	const float4 gy1 = qP - (float4)(0.f, delta, 0.f, 0.f);
-	const float4 gy2 = qP + (float4)(0.f, delta, 0.f, 0.f);
-	const float4 gz1 = qP - (float4)(0.f, 0.f, delta, 0.f);
-	const float4 gz2 = qP + (float4)(0.f, 0.f, delta, 0.f);
-
-	const float gradX = length(IterateIntersect(gx2, c, maxIterations)) -
-		length(IterateIntersect(gx1, c, maxIterations));
-	const float gradY = length(IterateIntersect(gy2, c, maxIterations)) -
-		length(IterateIntersect(gy1, c, maxIterations));
-	const float gradZ = length(IterateIntersect(gz2, c, maxIterations)) -
-		length(IterateIntersect(gz1, c, maxIterations));
-
-	const float4 N = normalize((float4)(gradX, gradY, gradZ, 0.f));
-
-	return N;
-}
-
-static float4 Phong(const float4 light, const float4 eye, const float4 pt,
-		const float4 N, const float4 diffuse) {
-	const float4 ambient = (float4) (0.05f, 0.05f, 0.05f, 0.f);
-	float4 L = normalize(light - pt);
-	float NdotL = dot(N, L);
-	if (NdotL < 0.f)
-		return diffuse * ambient;
-
-	const float specularExponent = 30.f;
-	const float specularity = 0.65f;
-
-	float4 E = normalize(eye - pt);
-	float4 H = (L + E) * (float)0.5f;
-
-	return diffuse * NdotL +
-			specularity * pow(dot(N, H), specularExponent) +
-			diffuse * ambient;
-}
-
-__kernel void MandelbulbGPU(
-	__global float *pixels,
-	const __global RenderingConfig *config,
-	const int enableAccumulation,
-	const float sampleX,
-	const float sampleY) {
-    const int gid = get_global_id(0);
-	const unsigned width = config->width;
-	const unsigned height = config->height;
-
-	const unsigned int x = gid % width;
-	const int y = gid / width;
-
-	// Check if we have to do something
-	if (y >= height)
-		return;
-
-	const float epsilon = config->actvateFastRendering ? (config->epsilon * (1.5f / 0.75f)) : config->epsilon;
-	const uint maxIterations = config->actvateFastRendering ? (max(3u, config->maxIterations) - 2u) : config->maxIterations;
-
-	const float4 mu = (float4)(config->mu[0], config->mu[1], config->mu[2], config->mu[3]);
-	const float4 light = (float4)(config->light[0], config->light[1], config->light[2], 0.f);
-	const __global Camera *camera = &config->camera;
-
-	//--------------------------------------------------------------------------
-	// Calculate eye ray
-	//--------------------------------------------------------------------------
-
-	const float invWidth = 1.f / width;
-	const float invHeight = 1.f / height;
-	const float kcx = (x + sampleX) * invWidth - .5f;
-	const float4 kcx4 = (float4)kcx;
-	const float kcy = (y + sampleY) * invHeight - .5f;
-	const float4 kcy4 = (float4)kcy;
-
-	const float4 cameraX = (float4)(camera->x.x, camera->x.y, camera->x.z, 0.f);
-	const float4 cameraY = (float4)(camera->y.x, camera->y.y, camera->y.z, 0.f);
-	const float4 cameraDir = (float4)(camera->dir.x, camera->dir.y, camera->dir.z, 0.f);
-	const float4 cameraOrig = (float4)(camera->orig.x, camera->orig.y, camera->orig.z, 0.f);
-
-	const float4 eyeRayDir =  normalize(cameraX * kcx4 + cameraY * kcy4 + cameraDir);
-	const float4 eyeRayOrig = eyeRayDir * (float4)0.1f + cameraOrig;
-
-	//--------------------------------------------------------------------------
-	// Check if we hit the bounding sphere
-	//--------------------------------------------------------------------------
-
-	int useAO = 1;
-	float4 diffuse, n, color;
-
-	float4 hitPoint;
-	float dist, tmin, tmax;
-	if (IntersectBoundingSphere(eyeRayOrig, eyeRayDir, &tmin, &tmax)) {
-		//--------------------------------------------------------------------------
-		// Find the intersection with the set
-		//--------------------------------------------------------------------------
-
-		uint steps;
-		float4 rayOrig = eyeRayOrig + eyeRayDir * (float4)tmin;
-		dist = IntersectBulb(rayOrig, eyeRayDir, mu, maxIterations,
-				epsilon, tmax - tmin, &hitPoint, &steps);
-
-		if (dist <= epsilon) {
-			// Set hit
-			diffuse = (float4) (1.f, 0.35f, 0.15f, 0.f);
-			n = NormEstimate(hitPoint, mu, dist, maxIterations);
-		} else
-			dist = -1.f;
-	} else
-		dist = -1.f;
-
-	//--------------------------------------------------------------------------
-	// Check if we hit the floor
-	//--------------------------------------------------------------------------
-
-	if (dist < 0.f) {
-		dist = IntersectFloorSphere(eyeRayOrig, eyeRayDir);
-
-		if (dist >= 0.f) {
-			// Floor hit
-			hitPoint = eyeRayOrig + eyeRayDir * (float4)dist;
-			n = hitPoint - WORLD_CENTER;
-			n = normalize(n);
-			// The most important feature in a ray tracer: a checker texture !
-			const int ix = (hitPoint.x > 0.f) ? hitPoint.x : (1.f - hitPoint.x);
-			const int iz = (hitPoint.z > 0.f) ? hitPoint.z : (1.f - hitPoint.z);
-			if ((ix + iz) % 2)
-				diffuse = (float4) (0.75f, 0.75f, 0.75f, 0.f);
-			else
-				diffuse = (float4) (0.75f, 0.f, 0.f, 0.f);
-			useAO = 0;
-		} else {
-			// Sky hit
-			color = (float4)(0.f, 0.1f, 0.3f, 0.f);
-		}
-	} else {
-		// Sky hit
-		color = (float4)(0.f, 0.1f, 0.3f, 0.f);
-	}
-
-	//--------------------------------------------------------------------------
-	// Select the shadow pass
-	//--------------------------------------------------------------------------
-
-	if (dist >= 0.f) {
-		float shadowFactor = 1.f;
-		if (config->enableShadow) {
-			float4 L = normalize(light -  hitPoint);
-			float4 rO = hitPoint + n * 1e-2f;
-			float4 shadowHitPoint;
-
-			// Check bounding sphere
-			if (IntersectBoundingSphere(rO, L, &tmin, &tmax)) {
-				float shadowDistSet = tmin;
-				uint steps;
-
-				rO = rO + L * (float4)shadowDistSet;
-				shadowDistSet = IntersectBulb(rO, L, mu, maxIterations, epsilon,
-						tmax - tmin, &shadowHitPoint, &steps);
-				if (shadowDistSet < epsilon) {
-					if (useAO) {
-						// Use steps count to simulate ambient occlusion
-						shadowFactor = 0.6f - min(steps / 255.f, 0.5f);
-					} else
-						shadowFactor = 0.6f;
-				}
-			}
-		}
-
-		//--------------------------------------------------------------------------
-		// Direct lighting of hit point
-		//--------------------------------------------------------------------------
-
-		color = Phong(light, eyeRayOrig, hitPoint, n, diffuse) * shadowFactor;
-	}
-
-	//--------------------------------------------------------------------------
-	// Write pixel
-	//--------------------------------------------------------------------------
-
-	int offset = 3 * (x + y * width);
-	color = clamp(color, (float4)(0.f, 0.f ,0.f, 0.f), (float4)(1.f, 1.f ,1.f, 0.f));
-	if (enableAccumulation) {
-		pixels[offset++] += color.s0;
-		pixels[offset++] += color.s1;
-		pixels[offset] += color.s2;
-	} else {
-		pixels[offset++] = color.s0;
-		pixels[offset++] = color.s1;
-		pixels[offset] = color.s2;
-	}
-}
-
-kernel void multiply(global float *array, const int numElements, const float s) {
-    const int gid = get_global_id(0);
-    if (gid >= numElements)  {
-        return;
-    }
-    array[gid] *= s;
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/rendering_kernel.cl b/src/com/mbien/opencl/demos/julia3d/rendering_kernel.cl
deleted file mode 100644
index 9c25c1b..0000000
--- a/src/com/mbien/opencl/demos/julia3d/rendering_kernel.cl
+++ /dev/null
@@ -1,382 +0,0 @@
-/*
-Copyright (c) 2009 David Bucciarelli (davibu@interfree.it)
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice shall be included
-in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#define GPU_KERNEL
-
-
-#define BOUNDING_RADIUS_2 4.f
-#define ESCAPE_THRESHOLD 1e1f
-#define DELTA 1e-4f
-
-typedef struct {
-    float x, y, z; // position, also color (r,g,b)
-} Vec;
-
-typedef struct {
-    Vec orig, target;
-    Vec dir, x, y;
-} Camera;
-
-typedef struct {
-    unsigned int width, height;
-    int superSamplingSize;
-    int actvateFastRendering;
-    int enableShadow;
-
-    unsigned int maxIterations;
-    float epsilon;
-    float mu[4];
-    float light[3];
-    Camera camera;
-} RenderingConfig;
-
-
-static float4 QuatMult(const float4 q1, const float4 q2) {
-    float4 r;
-
-    // a1a2 - b1b2 - c1c2 - d1d2
-    r.x = q1.x * q2.x - q1.y * q2.y - q1.z * q2.z - q1.w * q2.w;
-    // a1b2 + b1a2 + c1d2 - d1c2
-    r.y = q1.x * q2.y + q1.y * q2.x + q1.z * q2.w - q1.w * q2.z;
-    // a1c2 - b1d2 + c1a2 + d1b2
-    r.z = q1.x * q2.z - q1.y * q2.w + q1.z * q2.x + q1.w * q2.y;
-    // a1d2 + b1c2 - c1b2 + d1a2
-    r.w = q1.x * q2.w + q1.y * q2.z - q1.z * q2.y + q1.w * q2.x;
-
-    return r;
-}
-
-static float4 QuatSqr(const float4 q) {
-    float4 r;
-
-    r.x = q.x * q.x - q.y * q.y - q.z * q.z - q.w * q.w;
-    r.y = 2.f * q.x * q.y;
-    r.z = 2.f * q.x * q.z;
-    r.w = 2.f * q.x * q.w;
-
-    return r;
-}
-
-static void IterateIntersect(float4 *q, float4 *qp, const float4 c, const uint maxIterations) {
-    float4 q0 = *q;
-    float4 qp0 = *qp;
-
-    for (uint i = 0; i < maxIterations; ++i) {
-        qp0 = 2.f * QuatMult(q0, qp0);
-        q0 = QuatSqr(q0) + c;
-
-        if (dot(q0, q0) > ESCAPE_THRESHOLD)
-            break;
-    }
-
-    *q = q0;
-    *qp = qp0;
-}
-
-static float IntersectJulia(const float4 eyeRayOrig, const float4 eyeRayDir,
-        const float4 c, const uint maxIterations, const float epsilon,
-        float4 *hitPoint, uint *steps) {
-    float dist;
-    float4 r0 = eyeRayOrig;
-
-    uint s = 0;
-    do {
-        float4 z = r0;
-        float4 zp = (float4) (1.f, 0.f, 0.f, 0.f);
-
-        IterateIntersect(&z, &zp, c, maxIterations);
-
-        const float normZP = length(zp);
-
-        // We are inside
-        if (normZP == 0.f)
-            break;
-
-        const float normZ = length(z);
-        dist = 0.5f * normZ * log(normZ) / normZP;
-
-        r0 += eyeRayDir * dist;
-        s++;
-    } while ((dist > epsilon) && (dot(r0, r0) < BOUNDING_RADIUS_2));
-
-    *hitPoint = r0;
-    *steps = s;
-    return dist;
-}
-
-#define WORLD_RADIUS 1000.f
-#define WORLD_CENTER ((float4)(0.f, -WORLD_RADIUS - 2.f, 0.f, 0.f))
-
-float IntersectFloorSphere(const float4 eyeRayOrig, const float4 eyeRayDir) {
-    const float4 op = WORLD_CENTER - eyeRayOrig;
-    const float b = dot(op, eyeRayDir);
-    float det = b * b - dot(op, op) + WORLD_RADIUS * WORLD_RADIUS;
-
-    if (det < 0.f)
-        return -1.f;
-    else
-        det = sqrt(det);
-
-    float t = b - det;
-    if (t > 0.f)
-        return t;
-    else {
-        // We are inside, avoid the hit
-        return -1.f;
-    }
-}
-
-float IntersectBoundingSphere(const float4 eyeRayOrig, const float4 eyeRayDir) {
-    const float4 op = -eyeRayOrig;
-    const float b = dot(op, eyeRayDir);
-    float det = b * b - dot(op, op) + BOUNDING_RADIUS_2;
-
-    if (det < 0.f)
-        return -1.f;
-    else
-        det = sqrt(det);
-
-    float t = b - det;
-    if (t > 0.f)
-        return t;
-    else {
-        t = b + det;
-
-        if (t > 0.f) {
-            // We are inside, start from the ray origin
-            return 0.0f;
-        } else
-            return -1.f;
-    }
-}
-
-static float4 NormEstimate(const float4 p, const float4 c,
-        const float delta, const uint maxIterations) {
-    float4 N;
-    float4 qP = p;
-    float gradX, gradY, gradZ;
-
-    float4 gx1 = qP - (float4) (DELTA, 0.f, 0.f, 0.f);
-    float4 gx2 = qP + (float4) (DELTA, 0.f, 0.f, 0.f);
-    float4 gy1 = qP - (float4) (0.f, DELTA, 0.f, 0.f);
-    float4 gy2 = qP + (float4) (0.f, DELTA, 0.f, 0.f);
-    float4 gz1 = qP - (float4) (0.f, 0.f, DELTA, 0.f);
-    float4 gz2 = qP + (float4) (0.f, 0.f, DELTA, 0.f);
-
-    for (uint i = 0; i < maxIterations; ++i) {
-        gx1 = QuatSqr(gx1) + c;
-        gx2 = QuatSqr(gx2) + c;
-        gy1 = QuatSqr(gy1) + c;
-        gy2 = QuatSqr(gy2) + c;
-        gz1 = QuatSqr(gz1) + c;
-        gz2 = QuatSqr(gz2) + c;
-    }
-
-    gradX = length(gx2) - length(gx1);
-    gradY = length(gy2) - length(gy1);
-    gradZ = length(gz2) - length(gz1);
-
-    N = normalize((float4) (gradX, gradY, gradZ, 0.f));
-
-    return N;
-}
-
-static float4 Phong(const float4 light, const float4 eye, const float4 pt, const float4 N, const float4 diffuse) {
-
-    const float4 ambient = (float4) (0.05f, 0.05f, 0.05f, 0.f);
-    float4 L = normalize(light - pt);
-    float NdotL = dot(N, L);
-    if (NdotL < 0.f)
-        return diffuse * ambient;
-
-    const float specularExponent = 30.f;
-    const float specularity = 0.65f;
-
-    float4 E = normalize(eye - pt);
-    float4 H = (L + E) * (float) 0.5f;
-
-    return diffuse * NdotL +
-           specularity * pow(dot(N, H), specularExponent) +
-           diffuse * ambient;
-}
-
-kernel void JuliaGPU(   global float *pixels,
-                        const global RenderingConfig *config,
-                        int enableAccumulation,
-                        float sampleX,
-                        float sampleY ) {
-
-    const int gid = get_global_id(0);
-    unsigned width = config->width;
-    unsigned height = config->height;
-
-    const unsigned int x = gid % width;
-    const int y = gid / width;
-
-    // Check if we have to do something
-    if (y >= height)
-        return;
-
-    const float epsilon = config->actvateFastRendering ? (config->epsilon * (1.f / 0.75f)) : config->epsilon;
-    const uint maxIterations = max(1u, config->actvateFastRendering ? (config->maxIterations - 1) : config->maxIterations);
-
-    const float4 mu = (float4)(config->mu[0], config->mu[1], config->mu[2], config->mu[3]);
-    const float4 light = (float4) (config->light[0], config->light[1], config->light[2], 0.f);
-    const global Camera *camera = &config->camera;
-
-    //--------------------------------------------------------------------------
-    // Calculate eye ray
-    //--------------------------------------------------------------------------
-
-    const float invWidth = 1.f / width;
-    const float invHeight = 1.f / height;
-    const float kcx = (x + sampleX) * invWidth - .5f;
-    const float4 kcx4 = (float4) kcx;
-    const float kcy = (y + sampleY) * invHeight - .5f;
-    const float4 kcy4 = (float4) kcy;
-
-    const float4 cameraX = (float4) (camera->x.x, camera->x.y, camera->x.z, 0.f);
-    const float4 cameraY = (float4) (camera->y.x, camera->y.y, camera->y.z, 0.f);
-    const float4 cameraDir = (float4) (camera->dir.x, camera->dir.y, camera->dir.z, 0.f);
-    const float4 cameraOrig = (float4) (camera->orig.x, camera->orig.y, camera->orig.z, 0.f);
-
-    const float4 eyeRayDir = normalize(cameraX * kcx4 + cameraY * kcy4 + cameraDir);
-    const float4 eyeRayOrig = eyeRayDir * (float4) 0.1f + cameraOrig;
-
-    //--------------------------------------------------------------------------
-    // Check if we hit the bounding sphere
-    //--------------------------------------------------------------------------
-
-    float distSet = IntersectBoundingSphere(eyeRayOrig, eyeRayDir);
-    float4 hitPoint;
-    if (distSet >= 0.f) {
-        //--------------------------------------------------------------------------
-        // Find the intersection with the set
-        //--------------------------------------------------------------------------
-
-        uint steps;
-        float4 rayOrig = eyeRayOrig + eyeRayDir * (float4) distSet;
-        distSet = IntersectJulia(rayOrig, eyeRayDir, mu, maxIterations,
-                epsilon, &hitPoint, &steps);
-        if (distSet > epsilon)
-            distSet = -1.f;
-    }
-
-    //--------------------------------------------------------------------------
-    // Check if we hit the floor
-    //--------------------------------------------------------------------------
-
-    float distFloor = IntersectFloorSphere(eyeRayOrig, eyeRayDir);
-
-    //--------------------------------------------------------------------------
-    // Select the hit point
-    //--------------------------------------------------------------------------
-
-    int doShade = 0;
-    int useAO = 1;
-    float4 diffuse, n, color;
-    if ((distSet < 0.f) && (distFloor < 0.f)) {
-        // Sky hit
-        color = (float4) (0.f, 0.1f, 0.3f, 0.f);
-    } else if ((distSet >= 0.f) && ((distFloor < 0.f) || (distSet <= distFloor))) {
-        // Set hit
-        diffuse = (float4) (1.f, 0.35f, 0.15f, 0.f);
-        n = NormEstimate(hitPoint, mu, distSet, maxIterations);
-        doShade = 1;
-    } else if ((distFloor >= 0.f) && ((distSet < 0.f) || (distFloor <= distSet))) {
-        // Floor hit
-        hitPoint = eyeRayOrig + eyeRayDir * (float4) distFloor;
-        n = hitPoint - WORLD_CENTER;
-        n = normalize(n);
-        // The most important feature in a ray tracer: a checker texture !
-        const int ix = (hitPoint.x > 0.f) ? hitPoint.x : (1.f - hitPoint.x);
-        const int iz = (hitPoint.z > 0.f) ? hitPoint.z : (1.f - hitPoint.z);
-        if ((ix + iz) % 2)
-            diffuse = (float4) (0.75f, 0.75f, 0.75f, 0.f);
-        else
-            diffuse = (float4) (0.75f, 0.f, 0.f, 0.f);
-        doShade = 1;
-        useAO = 0;
-    }
-
-    //--------------------------------------------------------------------------
-    // Select the shadow pass
-    //--------------------------------------------------------------------------
-
-    if (doShade) {
-        float shadowFactor = 1.f;
-        if (config->enableShadow) {
-            float4 L = normalize(light - hitPoint);
-            float4 rO = hitPoint + n * 1e-2f;
-            float4 shadowHitPoint;
-
-            // Check bounding sphere
-            float shadowDistSet = IntersectBoundingSphere(rO, L);
-            if (shadowDistSet >= 0.f) {
-                uint steps;
-
-                rO = rO + L * (float4) shadowDistSet;
-                shadowDistSet = IntersectJulia(rO, L, mu, maxIterations, epsilon,
-                        &shadowHitPoint, &steps);
-                if (shadowDistSet < epsilon) {
-                    if (useAO) {
-                        // Use steps count to simulate ambient occlusion
-                        shadowFactor = 0.6f - min(steps / 255.f, 0.5f);
-                    } else
-                        shadowFactor = 0.6f;
-                }
-            } else
-                shadowDistSet = -1.f;
-        }
-
-        //--------------------------------------------------------------------------
-        // Direct lighting of hit point
-        //--------------------------------------------------------------------------
-
-        color = Phong(light, eyeRayOrig, hitPoint, n, diffuse) * shadowFactor;
-    }
-
-    //--------------------------------------------------------------------------
-    // Write pixel
-    //--------------------------------------------------------------------------
-
-    int offset = 3 * (x + y * width);
-    color = clamp(color, (float4) (0.f, 0.f, 0.f, 0.f), (float4) (1.f, 1.f, 1.f, 0.f));
-    if (enableAccumulation) {
-        pixels[offset++] += color.s0;
-        pixels[offset++] += color.s1;
-        pixels[offset  ] += color.s2;
-    } else {
-        pixels[offset++] = color.s0;
-        pixels[offset++] = color.s1;
-        pixels[offset  ] = color.s2;
-    }
-}
-
-kernel void multiply(global float *array, const int numElements, const float s) {
-    const int gid = get_global_id(0);
-    if (gid >= numElements)  {
-        return;
-    }
-    array[gid] *= s;
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/structs/Camera.java b/src/com/mbien/opencl/demos/julia3d/structs/Camera.java
deleted file mode 100644
index 50d5868..0000000
--- a/src/com/mbien/opencl/demos/julia3d/structs/Camera.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
-
-
-package com.mbien.opencl.demos.julia3d.structs;
-
-import java.nio.*;
-
-import com.jogamp.common.nio.*;
-
-
-public abstract class Camera {
-
-  StructAccessor accessor;
-
-  public static int size() {
-//    if (CPU.is32Bit()) {
-//      return Camera32.size();
-//    } else {
-      return Camera64.size();
-//    }
-  }
-
-  public static Camera create() {
-    return create(Buffers.newDirectByteBuffer(size()));
-  }
-
-  public static Camera create(java.nio.ByteBuffer buf) {
-//    if (CPU.is32Bit()) {
-//      return new Camera32(buf);
-//    } else {
-      return new Camera64(buf);
-//    }
-  }
-
-  Camera(java.nio.ByteBuffer buf) {
-    accessor = new StructAccessor(buf);
-  }
-
-  public java.nio.ByteBuffer getBuffer() {
-    return accessor.getBuffer();
-  }
-
-  public abstract Vec getOrig();
-
-  public abstract Vec getTarget();
-
-  public abstract Vec getDir();
-
-  public abstract Vec getX();
-
-  public abstract Vec getY();
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/structs/Camera32.java b/src/com/mbien/opencl/demos/julia3d/structs/Camera32.java
deleted file mode 100644
index 85dea51..0000000
--- a/src/com/mbien/opencl/demos/julia3d/structs/Camera32.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
-
-
-package com.mbien.opencl.demos.julia3d.structs;
-
-import java.nio.*;
-
-import com.jogamp.gluegen.runtime.*;
-
-
-class Camera32 extends Camera {
-
-  public static int size() {
-    return 76;
-  }
-
-  Camera32(java.nio.ByteBuffer buf) {
-    super(buf);
-  }
-
-
-  public Vec getOrig() {
-    return Vec.create(accessor.slice(0, 12));
-  }
-
-  public Vec getTarget() {
-    return Vec.create(accessor.slice(16, 12));
-  }
-
-  public Vec getDir() {
-    return Vec.create(accessor.slice(32, 12));
-  }
-
-  public Vec getX() {
-    return Vec.create(accessor.slice(48, 12));
-  }
-
-  public Vec getY() {
-    return Vec.create(accessor.slice(64, 12));
-  }
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/structs/Camera64.java b/src/com/mbien/opencl/demos/julia3d/structs/Camera64.java
deleted file mode 100644
index afe4acd..0000000
--- a/src/com/mbien/opencl/demos/julia3d/structs/Camera64.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
-
-
-package com.mbien.opencl.demos.julia3d.structs;
-
-import java.nio.*;
-
-import com.jogamp.gluegen.runtime.*;
-
-
-class Camera64 extends Camera {
-
-  private final Vec orig;
-  private final Vec target;
-  private final Vec dir;
-  private final Vec x;
-  private final Vec y;
-
-  public static int size() {
-    return 60;
-  }
-
-  Camera64(java.nio.ByteBuffer buf) {
-    super(buf);
-    orig = Vec.create(accessor.slice(0, 12));
-    target = Vec.create(accessor.slice(12, 12));
-    dir = Vec.create(accessor.slice(24, 12));
-    x = Vec.create(accessor.slice(36, 12));
-    y = Vec.create(accessor.slice(48, 12));
-  }
-
-
-  public Vec getOrig() {
-    return orig;
-  }
-
-  public Vec getTarget() {
-    return target;
-  }
-
-  public Vec getDir() {
-    return dir;
-  }
-
-  public Vec getX() {
-    return x;
-  }
-
-  public Vec getY() {
-    return y;
-  }
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig.java b/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig.java
deleted file mode 100644
index 5bc65b5..0000000
--- a/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
-
-
-package com.mbien.opencl.demos.julia3d.structs;
-
-import java.nio.*;
-
-import com.jogamp.common.nio.*;
-
-
-public abstract class RenderingConfig {
-
-  StructAccessor accessor;
-
-  public static int size() {
-//    if (CPU.is32Bit()) {
-//      return RenderingConfig32.size();
-//    } else {
-      return RenderingConfig64.size();
-//    }
-  }
-
-  public static RenderingConfig create() {
-    return create(Buffers.newDirectByteBuffer(size()));
-  }
-
-  public static RenderingConfig create(java.nio.ByteBuffer buf) {
-//    if (CPU.is32Bit()) {
-//      return new RenderingConfig32(buf);
-//    } else {
-      return new RenderingConfig64(buf);
-//    }
-  }
-
-  RenderingConfig(java.nio.ByteBuffer buf) {
-    accessor = new StructAccessor(buf);
-  }
-
-  public java.nio.ByteBuffer getBuffer() {
-    return accessor.getBuffer();
-  }
-
-  public abstract RenderingConfig setWidth(int val);
-
-  public abstract int getWidth();
-
-  public abstract RenderingConfig setHeight(int val);
-
-  public abstract int getHeight();
-
-  public abstract RenderingConfig setSuperSamplingSize(int val);
-
-  public abstract int getSuperSamplingSize();
-
-  public abstract RenderingConfig setActvateFastRendering(int val);
-
-  public abstract int getActvateFastRendering();
-
-  public abstract RenderingConfig setEnableShadow(int val);
-
-  public abstract int getEnableShadow();
-
-  public abstract RenderingConfig setMaxIterations(int val);
-
-  public abstract int getMaxIterations();
-
-  public abstract RenderingConfig setEpsilon(float val);
-
-  public abstract float getEpsilon();
-
-  public abstract RenderingConfig setMu(float[] val);
-
-  public abstract float[] getMu();
-
-  public abstract RenderingConfig setLight(float[] val);
-
-  public abstract float[] getLight();
-
-  public abstract Camera getCamera();
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig32.java b/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig32.java
deleted file mode 100644
index 3898d60..0000000
--- a/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig32.java
+++ /dev/null
@@ -1,106 +0,0 @@
-/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
-
-
-package com.mbien.opencl.demos.julia3d.structs;
-
-import java.nio.*;
-
-import com.jogamp.gluegen.runtime.*;
-
-
-class RenderingConfig32 extends RenderingConfig {
-
-  public static int size() {
-    return 140;
-  }
-
-  RenderingConfig32(java.nio.ByteBuffer buf) {
-    super(buf);
-  }
-
-
-  public RenderingConfig setWidth(int val) {
-    accessor.setIntAt(0, val);
-    return this;
-  }
-
-  public int getWidth() {
-    return accessor.getIntAt(0);
-  }
-
-  public RenderingConfig setHeight(int val) {
-    accessor.setIntAt(1, val);
-    return this;
-  }
-
-  public int getHeight() {
-    return accessor.getIntAt(1);
-  }
-
-  public RenderingConfig setSuperSamplingSize(int val) {
-    accessor.setIntAt(2, val);
-    return this;
-  }
-
-  public int getSuperSamplingSize() {
-    return accessor.getIntAt(2);
-  }
-
-  public RenderingConfig setActvateFastRendering(int val) {
-    accessor.setIntAt(3, val);
-    return this;
-  }
-
-  public int getActvateFastRendering() {
-    return accessor.getIntAt(3);
-  }
-
-  public RenderingConfig setEnableShadow(int val) {
-    accessor.setIntAt(4, val);
-    return this;
-  }
-
-  public int getEnableShadow() {
-    return accessor.getIntAt(4);
-  }
-
-  public RenderingConfig setMaxIterations(int val) {
-    accessor.setIntAt(5, val);
-    return this;
-  }
-
-  public int getMaxIterations() {
-    return accessor.getIntAt(5);
-  }
-
-  public RenderingConfig setEpsilon(float val) {
-    accessor.setFloatAt(6, val);
-    return this;
-  }
-
-  public float getEpsilon() {
-    return accessor.getFloatAt(6);
-  }
-
-  public RenderingConfig setMu(float[] val) {
-    accessor.setFloatsAt(8, val);
-    return this;
-  }
-
-  public float[] getMu() {
-    return accessor.getFloatsAt(8, new float[4]);
-  }
-
-  public RenderingConfig setLight(float[] val) {
-    accessor.setFloatsAt(12, val);
-    return this;
-  }
-
-  public float[] getLight() {
-    return accessor.getFloatsAt(12, new float[3]);
-  }
-
-  public Camera getCamera() {
-    return Camera.create(accessor.slice(64, 76));
-  }
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig64.java b/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig64.java
deleted file mode 100644
index e3a0267..0000000
--- a/src/com/mbien/opencl/demos/julia3d/structs/RenderingConfig64.java
+++ /dev/null
@@ -1,109 +0,0 @@
-/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
-
-
-package com.mbien.opencl.demos.julia3d.structs;
-
-import java.nio.*;
-
-import com.jogamp.gluegen.runtime.*;
-
-
-class RenderingConfig64 extends RenderingConfig {
-
-  private final Camera camera;
-
-  public static int size() {
-    return 116;
-  }
-
-  RenderingConfig64(java.nio.ByteBuffer buf) {
-    super(buf);
-    camera = Camera.create(accessor.slice(56, 60));
-  }
-
-
-  public RenderingConfig setWidth(int val) {
-    accessor.setIntAt(0, val);
-    return this;
-  }
-
-  public int getWidth() {
-    return accessor.getIntAt(0);
-  }
-
-  public RenderingConfig setHeight(int val) {
-    accessor.setIntAt(1, val);
-    return this;
-  }
-
-  public int getHeight() {
-    return accessor.getIntAt(1);
-  }
-
-  public RenderingConfig setSuperSamplingSize(int val) {
-    accessor.setIntAt(2, val);
-    return this;
-  }
-
-  public int getSuperSamplingSize() {
-    return accessor.getIntAt(2);
-  }
-
-  public RenderingConfig setActvateFastRendering(int val) {
-    accessor.setIntAt(3, val);
-    return this;
-  }
-
-  public int getActvateFastRendering() {
-    return accessor.getIntAt(3);
-  }
-
-  public RenderingConfig setEnableShadow(int val) {
-    accessor.setIntAt(4, val);
-    return this;
-  }
-
-  public int getEnableShadow() {
-    return accessor.getIntAt(4);
-  }
-
-  public RenderingConfig setMaxIterations(int val) {
-    accessor.setIntAt(5, val);
-    return this;
-  }
-
-  public int getMaxIterations() {
-    return accessor.getIntAt(5);
-  }
-
-  public RenderingConfig setEpsilon(float val) {
-    accessor.setFloatAt(6, val);
-    return this;
-  }
-
-  public float getEpsilon() {
-    return accessor.getFloatAt(6);
-  }
-
-  public RenderingConfig setMu(float[] val) {
-    accessor.setFloatsAt(7, val);
-    return this;
-  }
-
-  public float[] getMu() {
-    return accessor.getFloatsAt(7, new float[4]);
-  }
-
-  public RenderingConfig setLight(float[] val) {
-    accessor.setFloatsAt(11, val);
-    return this;
-  }
-
-  public float[] getLight() {
-    return accessor.getFloatsAt(11, new float[3]);
-  }
-
-  public Camera getCamera() {
-    return camera;
-  }
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/structs/Vec.java b/src/com/mbien/opencl/demos/julia3d/structs/Vec.java
deleted file mode 100644
index 3bd6b18..0000000
--- a/src/com/mbien/opencl/demos/julia3d/structs/Vec.java
+++ /dev/null
@@ -1,53 +0,0 @@
-/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
-
-
-package com.mbien.opencl.demos.julia3d.structs;
-
-
-import com.jogamp.common.nio.*;
-
-
-public abstract class Vec {
-
-  StructAccessor accessor;
-
-  public static int size() {
-//    if (CPU.is32Bit()) {
-//      return Vec32.size();
-//    } else {
-      return Vec64.size();
-//    }
-  }
-
-  public static Vec create() {
-    return create(Buffers.newDirectByteBuffer(size()));
-  }
-
-  public static Vec create(java.nio.ByteBuffer buf) {
-//    if (CPU.is32Bit()) {
-//      return new Vec32(buf);
-//    } else {
-      return new Vec64(buf);
-//    }
-  }
-
-  Vec(java.nio.ByteBuffer buf) {
-    accessor = new StructAccessor(buf);
-  }
-
-  public java.nio.ByteBuffer getBuffer() {
-    return accessor.getBuffer();
-  }
-
-  public abstract Vec setX(float val);
-
-  public abstract float getX();
-
-  public abstract Vec setY(float val);
-
-  public abstract float getY();
-
-  public abstract Vec setZ(float val);
-
-  public abstract float getZ();
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/structs/Vec32.java b/src/com/mbien/opencl/demos/julia3d/structs/Vec32.java
deleted file mode 100644
index 3c0ee5d..0000000
--- a/src/com/mbien/opencl/demos/julia3d/structs/Vec32.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
-
-
-package com.mbien.opencl.demos.julia3d.structs;
-
-import java.nio.*;
-
-import com.jogamp.gluegen.runtime.*;
-
-
-class Vec32 extends Vec {
-
-  public static int size() {
-    return 12;
-  }
-
-  Vec32(java.nio.ByteBuffer buf) {
-    super(buf);
-  }
-
-
-  public Vec setX(float val) {
-    accessor.setFloatAt(0, val);
-    return this;
-  }
-
-  public float getX() {
-    return accessor.getFloatAt(0);
-  }
-
-  public Vec setY(float val) {
-    accessor.setFloatAt(1, val);
-    return this;
-  }
-
-  public float getY() {
-    return accessor.getFloatAt(1);
-  }
-
-  public Vec setZ(float val) {
-    accessor.setFloatAt(2, val);
-    return this;
-  }
-
-  public float getZ() {
-    return accessor.getFloatAt(2);
-  }
-}
diff --git a/src/com/mbien/opencl/demos/julia3d/structs/Vec64.java b/src/com/mbien/opencl/demos/julia3d/structs/Vec64.java
deleted file mode 100644
index 5952021..0000000
--- a/src/com/mbien/opencl/demos/julia3d/structs/Vec64.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/* !---- DO NOT EDIT: This file autogenerated by com/sun/gluegen/JavaEmitter.java on Tue Feb 09 18:20:26 CET 2010 ----! */
-
-
-package com.mbien.opencl.demos.julia3d.structs;
-
-import java.nio.*;
-
-import com.jogamp.gluegen.runtime.*;
-
-
-class Vec64 extends Vec {
-
-  public static int size() {
-    return 12;
-  }
-
-  Vec64(java.nio.ByteBuffer buf) {
-    super(buf);
-  }
-
-
-  public Vec setX(float val) {
-    accessor.setFloatAt(0, val);
-    return this;
-  }
-
-  public float getX() {
-    return accessor.getFloatAt(0);
-  }
-
-  public Vec setY(float val) {
-    accessor.setFloatAt(1, val);
-    return this;
-  }
-
-  public float getY() {
-    return accessor.getFloatAt(1);
-  }
-
-  public Vec setZ(float val) {
-    accessor.setFloatAt(2, val);
-    return this;
-  }
-
-  public float getZ() {
-    return accessor.getFloatAt(2);
-  }
-}
diff --git a/src/com/mbien/opencl/demos/radixsort/RadixSort.cl b/src/com/mbien/opencl/demos/radixsort/RadixSort.cl
deleted file mode 100644
index d014692..0000000
--- a/src/com/mbien/opencl/demos/radixsort/RadixSort.cl
+++ /dev/null
@@ -1,358 +0,0 @@
-/*
-* Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
-*
-* NVIDIA Corporation and its licensors retain all intellectual property and 
-* proprietary rights in and to this software and related documentation.
-* Any use, reproduction, disclosure, or distribution of this software
-* and related documentation without an express license agreement from 
-* NVIDIA Corporation is strictly prohibited.
-* 
-* Please refer to the applicable NVIDIA end user license agreement (EULA) 
-* associated with this source code for terms and conditions that govern 
-* your use of this NVIDIA software.
-* 
-*/
-
-//----------------------------------------------------------------------------
-// Scans each warp in parallel ("warp-scan"), one element per thread.
-// uses 2 numElements of shared memory per thread (64 = elements per warp)
-//----------------------------------------------------------------------------
-//#define WARP_SIZE 32
-uint scanwarp(uint val, __local uint* sData, int maxlevel)
-{
-    // The following is the same as 2 * RadixSort::WARP_SIZE * warpId + threadInWarp = 
-    // 64*(threadIdx.x >> 5) + (threadIdx.x & (RadixSort::WARP_SIZE - 1))
-    int localId = get_local_id(0);
-    int idx = 2 * localId - (localId & (WARP_SIZE - 1));
-    sData[idx] = 0;
-    idx += WARP_SIZE;
-    sData[idx] = val;     
-
-    if (0 <= maxlevel) { sData[idx] += sData[idx - 1]; }
-    if (1 <= maxlevel) { sData[idx] += sData[idx - 2]; }
-    if (2 <= maxlevel) { sData[idx] += sData[idx - 4]; }
-    if (3 <= maxlevel) { sData[idx] += sData[idx - 8]; }
-    if (4 <= maxlevel) { sData[idx] += sData[idx -16]; }
-
-    return sData[idx] - val;  // convert inclusive -> exclusive
-}
-
-//----------------------------------------------------------------------------
-// scan4 scans 4*RadixSort::CTA_SIZE numElements in a block (4 per thread), using 
-// a warp-scan algorithm
-//----------------------------------------------------------------------------
-uint4 scan4(uint4 idata, __local uint* ptr)
-{    
-    
-    uint idx = get_local_id(0);
-
-    uint4 val4 = idata;
-    uint sum[3];
-    sum[0] = val4.x;
-    sum[1] = val4.y + sum[0];
-    sum[2] = val4.z + sum[1];
-    
-    uint val = val4.w + sum[2];
-    
-    val = scanwarp(val, ptr, 4);
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if ((idx & (WARP_SIZE - 1)) == WARP_SIZE - 1)
-    {
-        ptr[idx >> 5] = val + val4.w + sum[2];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-	if (idx < WARP_SIZE)
-		ptr[idx] = scanwarp(ptr[idx], ptr, 2);
-    
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    val += ptr[idx >> 5];
-
-    val4.x = val;
-    val4.y = val + sum[0];
-    val4.z = val + sum[1];
-    val4.w = val + sum[2];
-
-    return val4;
-}
-
-#ifdef MAC
-__kernel uint4 rank4(uint4 preds, __local uint* sMem)
-#else
-uint4 rank4(uint4 preds, __local uint* sMem)
-#endif
-{
-	int localId = get_local_id(0);
-	int localSize = get_local_size(0);
-
-	uint4 address = scan4(preds, sMem);
-	
-	__local uint numtrue;
-	if (localId == localSize - 1) 
-	{
-		numtrue = address.w + preds.w;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-	
-	uint4 rank;
-	int idx = localId*4;
-	rank.x = (preds.x) ? address.x : numtrue + idx - address.x;
-	rank.y = (preds.y) ? address.y : numtrue + idx + 1 - address.y;
-	rank.z = (preds.z) ? address.z : numtrue + idx + 2 - address.z;
-	rank.w = (preds.w) ? address.w : numtrue + idx + 3 - address.w;
-	
-	return rank;
-}
-
-void radixSortBlockKeysOnly(uint4 *key, uint nbits, uint startbit, __local uint* sMem)
-{
-	int localId = get_local_id(0);
-    int localSize = get_local_size(0);
-	
-	for(uint shift = startbit; shift < (startbit + nbits); ++shift)
-	{
-		uint4 lsb;
-		lsb.x = !(((*key).x >> shift) & 0x1);
-		lsb.y = !(((*key).y >> shift) & 0x1);
-        lsb.z = !(((*key).z >> shift) & 0x1);
-        lsb.w = !(((*key).w >> shift) & 0x1);
-        
-		uint4 r;
-		
-		r = rank4(lsb, sMem);
-
-        // This arithmetic strides the ranks across 4 CTA_SIZE regions
-        sMem[(r.x & 3) * localSize + (r.x >> 2)] = (*key).x;
-        sMem[(r.y & 3) * localSize + (r.y >> 2)] = (*key).y;
-        sMem[(r.z & 3) * localSize + (r.z >> 2)] = (*key).z;
-        sMem[(r.w & 3) * localSize + (r.w >> 2)] = (*key).w;
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        // The above allows us to read without 4-way bank conflicts:
-        (*key).x = sMem[localId];
-        (*key).y = sMem[localId +     localSize];
-        (*key).z = sMem[localId + 2 * localSize];
-        (*key).w = sMem[localId + 3 * localSize];
-
-		barrier(CLK_LOCAL_MEM_FENCE);
-	}
-}
-
-__kernel void radixSortBlocksKeysOnly(__global uint4* keysIn, 
-                                      __global uint4* keysOut,
-                                      uint nbits,
-                                      uint startbit,
-                                      uint numElements,
-                                      uint totalBlocks,
-                                      __local uint* sMem)
-{
-	int globalId = get_global_id(0);
-	
-	uint4 key;
-	key = keysIn[globalId];
-	
-	barrier(CLK_LOCAL_MEM_FENCE);
-	
-	radixSortBlockKeysOnly(&key, nbits, startbit, sMem);
-	
-	keysOut[globalId] = key;
-}
-
-//----------------------------------------------------------------------------
-// Given an array with blocks sorted according to a 4-bit radix group, each 
-// block counts the number of keys that fall into each radix in the group, and 
-// finds the starting offset of each radix in the block.  It then writes the radix 
-// counts to the counters array, and the starting offsets to the blockOffsets array.
-//
-// Template parameters are used to generate efficient code for various special cases
-// For example, we have to handle arrays that are a multiple of the block size 
-// (fullBlocks) differently than arrays that are not. "loop" is used when persistent 
-// CTAs are used. 
-//
-// By persistent CTAs we mean that we launch only as many thread blocks as can 
-// be resident in the GPU and no more, rather than launching as many threads as
-// we have elements. Persistent CTAs loop over blocks of elements until all work
-// is complete.  This can be faster in some cases.  In our tests it is faster
-// for large sorts (and the threshold is higher on compute version 1.1 and earlier
-// GPUs than it is on compute version 1.2 GPUs.
-//                                
-//----------------------------------------------------------------------------
-__kernel void findRadixOffsets(__global uint2* keys,
-                               __global uint* counters,
-                               __global uint* blockOffsets,
-                               uint startbit,
-                               uint numElements,
-                               uint totalBlocks,
-                               __local uint* sRadix1)
-{
-	__local uint  sStartPointers[16];
-
-    uint groupId = get_group_id(0);
-    uint localId = get_local_id(0);
-    uint groupSize = get_local_size(0);
-
-    uint2 radix2;
-
-    radix2 = keys[get_global_id(0)];
-        
-
-    sRadix1[2 * localId]     = (radix2.x >> startbit) & 0xF;
-    sRadix1[2 * localId + 1] = (radix2.y >> startbit) & 0xF;
-
-    // Finds the position where the sRadix1 entries differ and stores start 
-    // index for each radix.
-    if(localId < 16) 
-    {
-        sStartPointers[localId] = 0; 
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) ) 
-    {
-        sStartPointers[sRadix1[localId]] = localId;
-    }
-    if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1]) 
-    {
-        sStartPointers[sRadix1[localId + groupSize]] = localId + groupSize;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(localId < 16) 
-    {
-        blockOffsets[groupId*16 + localId] = sStartPointers[localId];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Compute the sizes of each block.
-    if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) ) 
-    {
-        sStartPointers[sRadix1[localId - 1]] = 
-            localId - sStartPointers[sRadix1[localId - 1]];
-    }
-    if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1] ) 
-    {
-        sStartPointers[sRadix1[localId + groupSize - 1]] = 
-            localId + groupSize - sStartPointers[sRadix1[localId + groupSize - 1]];
-    }
-        
-
-    if(localId == groupSize - 1) 
-    {
-        sStartPointers[sRadix1[2 * groupSize - 1]] = 
-            2 * groupSize - sStartPointers[sRadix1[2 * groupSize - 1]];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(localId < 16) 
-    {
-        counters[localId * totalBlocks + groupId] = sStartPointers[localId];
-    }
-}
-
-// a naive scan routine that works only for array that
-// can fit into a single block, just for debugging purpose,
-// not used in the sort now
-__kernel void scanNaive(__global uint *g_odata, 
-                        __global uint *g_idata, 
-                        uint n,
-                        __local uint* temp)
-{
-
-    int localId = get_local_id(0);
-
-    int pout = 0;
-    int pin = 1;
-
-    // Cache the computational window in shared memory
-    temp[pout*n + localId] = (localId > 0) ? g_idata[localId-1] : 0;
-
-    for (int offset = 1; offset < n; offset *= 2)
-    {
-        pout = 1 - pout;
-        pin  = 1 - pout;
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        temp[pout*n+localId] = temp[pin*n+localId];
-
-        if (localId >= offset)
-            temp[pout*n+localId] += temp[pin*n+localId - offset];
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    g_odata[localId] = temp[pout*n+localId];
-}
-
-//----------------------------------------------------------------------------
-// reorderData shuffles data in the array globally after the radix offsets 
-// have been found. On compute version 1.1 and earlier GPUs, this code depends 
-// on RadixSort::CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits).
-// 
-// On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures
-// that all writes are coalesced using extra work in the kernel.  On later
-// GPUs coalescing rules have been relaxed, so this extra overhead hurts 
-// performance.  On these GPUs we set manualCoalesce=false and directly store
-// the results.
-//
-// Template parameters are used to generate efficient code for various special cases
-// For example, we have to handle arrays that are a multiple of the block size 
-// (fullBlocks) differently than arrays that are not.  "loop" is used when persistent 
-// CTAs are used. 
-//
-// By persistent CTAs we mean that we launch only as many thread blocks as can 
-// be resident in the GPU and no more, rather than launching as many threads as
-// we have elements. Persistent CTAs loop over blocks of elements until all work
-// is complete.  This can be faster in some cases.  In our tests it is faster
-// for large sorts (and the threshold is higher on compute version 1.1 and earlier
-// GPUs than it is on compute version 1.2 GPUs.
-//----------------------------------------------------------------------------
-__kernel void reorderDataKeysOnly(__global uint  *outKeys, 
-                                  __global uint2  *keys, 
-                                  __global uint  *blockOffsets, 
-                                  __global uint  *offsets, 
-                                  __global uint  *sizes, 
-                                  uint startbit,
-                                  uint numElements,
-                                  uint totalBlocks,
-                                  __local uint2* sKeys2)
-{
-    __local uint sOffsets[16];
-    __local uint sBlockOffsets[16];
-
-    __local uint *sKeys1 = (__local uint*)sKeys2; 
-
-    uint groupId = get_group_id(0);
-
-	uint globalId = get_global_id(0);
-    uint localId = get_local_id(0);
-    uint groupSize = get_local_size(0);
-
-    sKeys2[localId]   = keys[globalId];
-
-    if(localId < 16)  
-    {
-        sOffsets[localId]      = offsets[localId * totalBlocks + groupId];
-        sBlockOffsets[localId] = blockOffsets[groupId * 16 + localId];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    uint radix = (sKeys1[localId] >> startbit) & 0xF;
-    uint globalOffset = sOffsets[radix] + localId - sBlockOffsets[radix];
-
-    if (globalOffset < numElements)
-    {
-        outKeys[globalOffset]   = sKeys1[localId];
-    }
-
-    radix = (sKeys1[localId + groupSize] >> startbit) & 0xF;
-    globalOffset = sOffsets[radix] + localId + groupSize - sBlockOffsets[radix];
-
-    if (globalOffset < numElements)
-    {
-        outKeys[globalOffset]   = sKeys1[localId + groupSize];
-    }
- 
-
-}
diff --git a/src/com/mbien/opencl/demos/radixsort/RadixSort.java b/src/com/mbien/opencl/demos/radixsort/RadixSort.java
deleted file mode 100644
index 783c094..0000000
--- a/src/com/mbien/opencl/demos/radixsort/RadixSort.java
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * 20:38 Sunday, February 28 2010
- */
-
-package com.mbien.opencl.demos.radixsort;
-
-import com.mbien.opencl.CLBuffer;
-import com.mbien.opencl.CLCommandQueue;
-import com.mbien.opencl.CLContext;
-import com.mbien.opencl.CLKernel;
-import com.mbien.opencl.CLProgram;
-import com.mbien.opencl.CLResource;
-import java.io.IOException;
-import java.nio.IntBuffer;
-
-import static com.mbien.opencl.CLMemory.Mem.*;
-import static com.mbien.opencl.CLProgram.*;
-import static com.mbien.opencl.CLProgram.CompilerOptions.*;
-import static java.lang.System.*;
-
-/**
- *
- * @author Michael Bien
- */
-public class RadixSort implements CLResource {
-
-    private static final int NUM_BANKS = 16;
-    private static final int WARP_SIZE = 32;
-    private static final int bitStep   = 4;
-
-    private final int CTA_SIZE;
-
-    private final CLKernel ckRadixSortBlocksKeysOnly;
-    private final CLKernel ckFindRadixOffsets;
-    private final CLKernel ckScanNaive;
-    private final CLKernel ckReorderDataKeysOnly;
-
-    private final CLBuffer<?> tempKeys;
-    private final CLBuffer<?> mCounters;
-    private final CLBuffer<?> mCountersSum;
-    private final CLBuffer<?> mBlockOffsets;
-
-    private final CLCommandQueue queue;
-    private final Scan scan;
-    private final CLProgram program;
-
-    public RadixSort(CLCommandQueue queue, int maxElements, int CTA_SIZE) throws IOException {
-
-        this.CTA_SIZE = CTA_SIZE;
-        scan = new Scan(queue, maxElements / 2 / CTA_SIZE * 16);
-
-        int numBlocks = ((maxElements % (CTA_SIZE * 4)) == 0)
-                ? (maxElements / (CTA_SIZE * 4)) : (maxElements / (CTA_SIZE * 4) + 1);
-
-        this.queue = queue;
-
-        CLContext context  = queue.getContext();
-        this.tempKeys      = context.createBuffer(4 * maxElements,           READ_WRITE);
-        this.mCounters     = context.createBuffer(4 * WARP_SIZE * numBlocks, READ_WRITE);
-        this.mCountersSum  = context.createBuffer(4 * WARP_SIZE * numBlocks, READ_WRITE);
-        this.mBlockOffsets = context.createBuffer(4 * WARP_SIZE * numBlocks, READ_WRITE);
-
-        program = context.createProgram(getClass().getResourceAsStream("RadixSort.cl"))
-                         .build(ENABLE_MAD, define("WARP_SIZE", WARP_SIZE));
-
-//        out.println(program.getBuildLog());
-
-        ckRadixSortBlocksKeysOnly  = program.createCLKernel("radixSortBlocksKeysOnly");
-        ckFindRadixOffsets         = program.createCLKernel("findRadixOffsets");
-        ckScanNaive                = program.createCLKernel("scanNaive");
-        ckReorderDataKeysOnly      = program.createCLKernel("reorderDataKeysOnly");
-
-    }
-
-    void sort(CLBuffer<IntBuffer> d_keys, int numElements, int keyBits) {
-        radixSortKeysOnly(d_keys, numElements, keyBits);
-    }
-
-    //----------------------------------------------------------------------------
-    // Main key-only radix sort function.  Sorts in place in the keys and values
-    // arrays, but uses the other device arrays as temporary storage.  All pointer
-    // parameters are device pointers.  Uses cudppScan() for the prefix sum of
-    // radix counters.
-    //----------------------------------------------------------------------------
-    void radixSortKeysOnly(CLBuffer<IntBuffer> keys, int numElements, int keyBits) {
-        int i = 0;
-        while (keyBits > i * bitStep) {
-            radixSortStepKeysOnly(keys, bitStep, i * bitStep, numElements);
-            i++;
-        }
-    }
-
-    //----------------------------------------------------------------------------
-    // Perform one step of the radix sort.  Sorts by nbits key bits per step,
-    // starting at startbit.
-    //----------------------------------------------------------------------------
-    void radixSortStepKeysOnly(CLBuffer<IntBuffer> keys, int nbits, int startbit, int numElements) {
-
-        // Four step algorithms from Satish, Harris & Garland
-        radixSortBlocksKeysOnlyOCL(keys, nbits, startbit, numElements);
-
-        findRadixOffsetsOCL(startbit, numElements);
-
-        scan.scanExclusiveLarge(mCountersSum, mCounters, 1, numElements / 2 / CTA_SIZE * 16);
-
-        reorderDataKeysOnlyOCL(keys, startbit, numElements);
-    }
-
-    //----------------------------------------------------------------------------
-    // Wrapper for the kernels of the four steps
-    //----------------------------------------------------------------------------
-    void radixSortBlocksKeysOnlyOCL(CLBuffer<IntBuffer> keys, int nbits, int startbit, int numElements) {
-
-        int totalBlocks = numElements / 4 / CTA_SIZE;
-        int globalWorkSize = CTA_SIZE * totalBlocks;
-        int localWorkSize = CTA_SIZE;
-
-        ckRadixSortBlocksKeysOnly.putArg(keys).putArg(tempKeys).putArg(nbits).putArg(startbit)
-                                 .putArg(numElements).putArg(totalBlocks).putNullArg(4 * CTA_SIZE * 4)
-                                 .rewind();
-
-        queue.put1DRangeKernel(ckRadixSortBlocksKeysOnly, 0, globalWorkSize, localWorkSize);
-    }
-
-    void findRadixOffsetsOCL(int startbit, int numElements) {
-
-        int totalBlocks = numElements / 2 / CTA_SIZE;
-        int globalWorkSize = CTA_SIZE * totalBlocks;
-        int localWorkSize = CTA_SIZE;
-
-        ckFindRadixOffsets.putArg(tempKeys).putArg(mCounters).putArg(mBlockOffsets)
-                          .putArg(startbit).putArg(numElements).putArg(totalBlocks).putNullArg(2 * CTA_SIZE * 4)
-                          .rewind();
-
-        queue.put1DRangeKernel(ckFindRadixOffsets, 0, globalWorkSize, localWorkSize);
-    }
-
-    void scanNaiveOCL(int numElements) {
-        
-        int nHist = numElements / 2 / CTA_SIZE * 16;
-        int globalWorkSize = nHist;
-        int localWorkSize = nHist;
-        int extra_space = nHist / NUM_BANKS;
-        int shared_mem_size = 4 * (nHist + extra_space);
-
-        ckScanNaive.putArg(mCountersSum).putArg(mCounters).putArg(nHist).putNullArg(2 * shared_mem_size).rewind();
-
-        queue.put1DRangeKernel(ckScanNaive, 0, globalWorkSize, localWorkSize);
-    }
-
-    void reorderDataKeysOnlyOCL(CLBuffer<IntBuffer> keys, int startbit, int numElements) {
-
-        int totalBlocks = numElements / 2 / CTA_SIZE;
-        int globalWorkSize = CTA_SIZE * totalBlocks;
-        int localWorkSize = CTA_SIZE;
-
-        ckReorderDataKeysOnly.putArg(keys).putArg(tempKeys).putArg(mBlockOffsets).putArg(mCountersSum).putArg(mCounters)
-                             .putArg(startbit).putArg(numElements).putArg(totalBlocks).putNullArg(2 * CTA_SIZE * 4).rewind();
-
-        queue.put1DRangeKernel(ckReorderDataKeysOnly, 0, globalWorkSize, localWorkSize);
-    }
-
-    public void release() {
-
-        scan.release();
-
-        //program & kernels
-        program.release();
-
-        //buffers
-        tempKeys.release();
-        mCounters.release();
-        mCountersSum.release();
-        mBlockOffsets.release();
-    }
-
-    public void close() {
-        release();
-    }
-
-
-
-}
diff --git a/src/com/mbien/opencl/demos/radixsort/RadixSortDemo.java b/src/com/mbien/opencl/demos/radixsort/RadixSortDemo.java
deleted file mode 100644
index 93bc071..0000000
--- a/src/com/mbien/opencl/demos/radixsort/RadixSortDemo.java
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * 20:48 Sunday, February 28 2010
- */
-
-package com.mbien.opencl.demos.radixsort;
-
-import com.mbien.opencl.CLBuffer;
-import com.mbien.opencl.CLCommandQueue;
-import com.mbien.opencl.CLContext;
-import com.mbien.opencl.CLPlatform;
-import java.io.IOException;
-import java.nio.IntBuffer;
-import java.util.Random;
-
-import static com.mbien.opencl.CLMemory.Mem.*;
-import static java.lang.System.*;
-import static com.mbien.opencl.CLDevice.Type.*;
-
-/**
- * GPU radix sort demo.
- * @author Michael Bien
- */
-public class RadixSortDemo {
-
-    public RadixSortDemo() throws IOException {
-
-        CLContext context = null;
-        try{
-            //single GPU setup
-            context = CLContext.create(CLPlatform.getDefault().getMaxFlopsDevice(GPU));
-            CLCommandQueue queue = context.getDevices()[0].createCommandQueue();
-
-            int maxValue = Integer.MAX_VALUE;
-            int samples  = 10;
-
-            int[] workgroupSizes = new int[] {128, 256};
-
-            int[] runs = new int[] {   32768,
-                                       65536,
-                                      131072,
-                                      262144,
-                                      524288,
-                                     1048576,
-                                     2097152,
-                                     4194304,
-                                     8388608 };
-
-            for (int i = 0; i < workgroupSizes.length; i++) {
-
-                int workgroupSize = workgroupSizes[i];
-
-                out.println("\n = = = workgroup size: "+workgroupSize+" = = = ");
-
-                for(int run = 0; run < runs.length; run++) {
-
-                    if(  workgroupSize==128 && runs[run] >= 8388608
-                      || workgroupSize==256 && runs[run] <= 32768) {
-                        continue; // we can only sort up to 4MB with wg size of 128
-                    }
-
-                    int numElements = runs[run];
-
-                    CLBuffer<IntBuffer> array = context.createIntBuffer(numElements, READ_WRITE);
-                    out.print("array size: " + array.getCLSize()/1000000.0f+"MB; ");
-                    out.println("elements: " + array.getCapacity()/1000+"K");
-
-                    fillBuffer(array, maxValue);
-
-                    RadixSort radixSort = new RadixSort(queue, numElements, workgroupSize);
-                    for(int a = 0; a < samples; a++) {
-
-                        queue.finish();
-
-                        long time = nanoTime();
-
-                        queue.putWriteBuffer(array, false);
-                        radixSort.sort(array, numElements, 32);
-                        queue.putReadBuffer(array, true);
-
-                        out.println("time: " + (nanoTime() - time)/1000000.0f+"ms");
-                    }
-
-                    out.print("snapshot: ");
-                    printSnapshot(array.getBuffer(), 20);
-
-                    out.println("validating...");
-                    checkIfSorted(array.getBuffer());
-                    out.println("values sorted");
-
-                    array.release();
-                    radixSort.release();
-                }
-            }
-
-        }finally{
-            if(context != null) {
-                context.release();
-            }
-        }
-
-    }
-
-    private void fillBuffer(CLBuffer<IntBuffer> array, int maxValue) {
-        Random random = new Random(42);
-        for (int n = 0; n < array.getBuffer().capacity(); n++) {
-            int rnd = random.nextInt(maxValue);
-            array.getBuffer().put(n, rnd);
-        }
-    }
-
-    private void printSnapshot(IntBuffer buffer, int snapshot) {
-        for(int i = 0; i < snapshot; i++)
-            out.print(buffer.get() + ", ");
-        out.println("...; " + buffer.remaining() + " more");
-        buffer.rewind();
-    }
-
-    private void checkIfSorted(IntBuffer keys) {
-        for (int i = 1; i < keys.capacity(); i++) {
-            if (keys.get(i - 1) > keys.get(i)) {
-                throw new RuntimeException("not sorted "+ keys.get(i - 1) +" !> "+ keys.get(i));
-            }
-        }
-    }
-
-    public static void main(String[] args) throws IOException {
-        new RadixSortDemo();
-    }
-}
diff --git a/src/com/mbien/opencl/demos/radixsort/Scan.java b/src/com/mbien/opencl/demos/radixsort/Scan.java
deleted file mode 100644
index 1426e9b..0000000
--- a/src/com/mbien/opencl/demos/radixsort/Scan.java
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * 22:12 Sunday, February 28 2010
- */
-package com.mbien.opencl.demos.radixsort;
-
-import com.mbien.opencl.CLBuffer;
-import com.mbien.opencl.CLCommandQueue;
-import com.mbien.opencl.CLContext;
-import com.mbien.opencl.CLKernel;
-import com.mbien.opencl.CLProgram;
-import com.mbien.opencl.CLResource;
-import java.io.IOException;
-
-import static com.mbien.opencl.CLMemory.Mem.*;
-import static com.mbien.opencl.CLProgram.CompilerOptions.*;
-
-/**
- *
- * @author Michael Bien
- */
-public class Scan implements CLResource {
-
-    private final static int MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE = 1024;
-    private final static int MAX_LOCAL_GROUP_SIZE = 256;
-    private final static int WORKGROUP_SIZE = 256;
-    private final static int MAX_BATCH_ELEMENTS = 64 * 1048576;
-    private final static int MIN_SHORT_ARRAY_SIZE = 4;
-    private final static int MAX_SHORT_ARRAY_SIZE = 4 * WORKGROUP_SIZE;
-    private final static int MIN_LARGE_ARRAY_SIZE = 8 * WORKGROUP_SIZE;
-    private final static int MAX_LARGE_ARRAY_SIZE = 4 * WORKGROUP_SIZE * WORKGROUP_SIZE;
-
-    private final CLKernel ckScanExclusiveLocal1;
-    private final CLKernel ckScanExclusiveLocal2;
-    private final CLKernel ckUniformUpdate;
-
-    private final CLCommandQueue queue;
-    private final CLProgram program;
-    private CLBuffer<?> buffer;
-
-    public Scan(CLCommandQueue queue, int numElements) throws IOException {
-
-        this.queue = queue;
-
-        CLContext context = queue.getContext();
-        if (numElements > MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE) {
-            buffer = context.createBuffer(numElements / MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE * 4, READ_WRITE);
-        }
-        program = context.createProgram(getClass().getResourceAsStream("Scan_b.cl"))
-                         .build(ENABLE_MAD);
-
-        ckScanExclusiveLocal1 = program.createCLKernel("scanExclusiveLocal1");
-        ckScanExclusiveLocal2 = program.createCLKernel("scanExclusiveLocal2");
-        ckUniformUpdate       = program.createCLKernel("uniformUpdate");
-    }
-
-    // main exclusive scan routine
-    void scanExclusiveLarge(CLBuffer<?> dst, CLBuffer<?> src, int batchSize, int arrayLength) {
-
-        //Check power-of-two factorization
-        if(!isPowerOf2(arrayLength)) {
-            throw new RuntimeException();
-        }
-
-        //Check supported size range
-        if (!((arrayLength >= MIN_LARGE_ARRAY_SIZE) && (arrayLength <= MAX_LARGE_ARRAY_SIZE))) {
-            throw new RuntimeException();
-        }
-
-        //Check total batch size limit
-        if (!((batchSize * arrayLength) <= MAX_BATCH_ELEMENTS)) {
-            throw new RuntimeException();
-        }
-
-        scanExclusiveLocal1(dst, src, (batchSize * arrayLength) / (4 * WORKGROUP_SIZE), 4 * WORKGROUP_SIZE);
-        scanExclusiveLocal2(buffer, dst, src, batchSize, arrayLength / (4 * WORKGROUP_SIZE));
-        uniformUpdate(dst, buffer, (batchSize * arrayLength) / (4 * WORKGROUP_SIZE));
-    }
-
-    void scanExclusiveLocal1(CLBuffer<?> dst, CLBuffer<?> src, int n, int size) {
-
-        ckScanExclusiveLocal1.putArg(dst).putArg(src).putNullArg(2 * WORKGROUP_SIZE * 4).putArg(size)
-                             .rewind();
-
-        int localWorkSize = WORKGROUP_SIZE;
-        int globalWorkSize = (n * size) / 4;
-
-        queue.put1DRangeKernel(ckScanExclusiveLocal1, 0, globalWorkSize, localWorkSize);
-    }
-
-    void scanExclusiveLocal2(CLBuffer<?> buffer, CLBuffer<?> dst, CLBuffer<?> src, int n, int size) {
-
-        int elements = n * size;
-        ckScanExclusiveLocal2.putArg(buffer).putArg(dst).putArg(src).putNullArg(2 * WORKGROUP_SIZE * 4)
-                             .putArg(elements).putArg(size).rewind();
-
-        int localWorkSize = WORKGROUP_SIZE;
-        int globalWorkSize = iSnapUp(elements, WORKGROUP_SIZE);
-
-        queue.put1DRangeKernel(ckScanExclusiveLocal2, 0, globalWorkSize, localWorkSize);
-    }
-
-    void uniformUpdate(CLBuffer<?> dst, CLBuffer<?> buffer, int n) {
-
-        ckUniformUpdate.setArgs(dst, buffer);
-
-        int localWorkSize  = WORKGROUP_SIZE;
-        int globalWorkSize = n * WORKGROUP_SIZE;
-
-        queue.put1DRangeKernel(ckUniformUpdate, 0, globalWorkSize, localWorkSize);
-    }
-
-    private int iSnapUp(int dividend, int divisor) {
-        return ((dividend % divisor) == 0) ? dividend : (dividend - dividend % divisor + divisor);
-    }
-
-    public static boolean isPowerOf2(int x) {
-        return ((x - 1) & x) == 0;
-    }
-
-    public void release() {
-        program.release();
-
-        if(buffer!=null) {
-            buffer.release();
-        }
-    }
-
-    public void close() {
-        release();
-    }
-}
diff --git a/src/com/mbien/opencl/demos/radixsort/Scan_b.cl b/src/com/mbien/opencl/demos/radixsort/Scan_b.cl
deleted file mode 100644
index 32fd4dd..0000000
--- a/src/com/mbien/opencl/demos/radixsort/Scan_b.cl
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
- *
- * NVIDIA Corporation and its licensors retain all intellectual property and 
- * proprietary rights in and to this software and related documentation. 
- * Any use, reproduction, disclosure, or distribution of this software 
- * and related documentation without an express license agreement from
- * NVIDIA Corporation is strictly prohibited.
- *
- * Please refer to the applicable NVIDIA end user license agreement (EULA) 
- * associated with this source code for terms and conditions that govern 
- * your use of this NVIDIA software.
- * 
- */
-
-
-
-//All three kernels run 512 threads per workgroup
-//Must be a power of two
-#define WORKGROUP_SIZE 256
-
-
-
-////////////////////////////////////////////////////////////////////////////////
-// Scan codelets
-////////////////////////////////////////////////////////////////////////////////
-#if(1)
-    //Naive inclusive scan: O(N * log2(N)) operations
-    //Allocate 2 * 'size' local memory, initialize the first half
-    //with 'size' zeros avoiding if(pos >= offset) condition evaluation
-    //and saving instructions
-    inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){
-        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1));
-        l_Data[pos] = 0;
-        pos += size;
-        l_Data[pos] = idata;
-
-        for(uint offset = 1; offset < size; offset <<= 1){
-            barrier(CLK_LOCAL_MEM_FENCE);
-            uint t = l_Data[pos] + l_Data[pos - offset];
-            barrier(CLK_LOCAL_MEM_FENCE);
-            l_Data[pos] = t;
-        }
-
-        return l_Data[pos];
-    }
-
-    inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){
-        return scan1Inclusive(idata, l_Data, size) - idata;
-    }
-
-#else
-    #define LOG2_WARP_SIZE 5U
-    #define      WARP_SIZE (1U << LOG2_WARP_SIZE)
-
-    //Almost the same as naiveScan1 but doesn't need barriers
-    //assuming size <= WARP_SIZE
-    inline uint warpScanInclusive(uint idata, __local uint *l_Data, uint size){
-        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1));
-        l_Data[pos] = 0;
-        pos += size;
-        l_Data[pos] = idata;
-
-        for(uint offset = 1; offset < size; offset <<= 1)
-            l_Data[pos] += l_Data[pos - offset];
-
-        return l_Data[pos];
-    }
-
-    inline uint warpScanExclusive(uint idata, __local uint *l_Data, uint size){
-        return warpScanInclusive(idata, l_Data, size) - idata;
-    }
-
-    inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){
-        if(size > WARP_SIZE){
-            //Bottom-level inclusive warp scan
-            uint warpResult = warpScanInclusive(idata, l_Data, WARP_SIZE);
-
-            //Save top elements of each warp for exclusive warp scan
-            //sync to wait for warp scans to complete (because l_Data is being overwritten)
-            barrier(CLK_LOCAL_MEM_FENCE);
-            if( (get_local_id(0) & (WARP_SIZE - 1)) == (WARP_SIZE - 1) )
-                l_Data[get_local_id(0) >> LOG2_WARP_SIZE] = warpResult;
-
-            //wait for warp scans to complete
-            barrier(CLK_LOCAL_MEM_FENCE);
-            if( get_local_id(0) < (WORKGROUP_SIZE / WARP_SIZE) ){
-                //grab top warp elements
-                uint val = l_Data[get_local_id(0)];
-                //calculate exclsive scan and write back to shared memory
-                l_Data[get_local_id(0)] = warpScanExclusive(val, l_Data, size >> LOG2_WARP_SIZE);
-            }
-
-            //return updated warp scans with exclusive scan results
-            barrier(CLK_LOCAL_MEM_FENCE);
-            return warpResult + l_Data[get_local_id(0) >> LOG2_WARP_SIZE];
-        }else{
-            return warpScanInclusive(idata, l_Data, size);
-        }
-    }
-
-    inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){
-        return scan1Inclusive(idata, l_Data, size) - idata;
-    }
-#endif
-
-
-//Vector scan: the array to be scanned is stored
-//in work-item private memory as uint4
-inline uint4 scan4Inclusive(uint4 data4, __local uint *l_Data, uint size){
-    //Level-0 inclusive scan
-    data4.y += data4.x;
-    data4.z += data4.y;
-    data4.w += data4.z;
-
-    //Level-1 exclusive scan
-    uint val = scan1Inclusive(data4.w, l_Data, size / 4) - data4.w;
-
-    return (data4 + (uint4)val);
-}
-
-inline uint4 scan4Exclusive(uint4 data4, __local uint *l_Data, uint size){
-    return scan4Inclusive(data4, l_Data, size) - data4;
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-// Scan kernels
-////////////////////////////////////////////////////////////////////////////////
-__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
-void scanExclusiveLocal1(
-    __global uint4 *d_Dst,
-    __global uint4 *d_Src,
-    __local uint* l_Data,
-    uint size
-){
-    //Load data
-    uint4 idata4 = d_Src[get_global_id(0)];
-
-    //Calculate exclusive scan
-    uint4 odata4  = scan4Exclusive(idata4, l_Data, size);
-
-    //Write back
-    d_Dst[get_global_id(0)] = odata4;
-}
-
-//Exclusive scan of top elements of bottom-level scans (4 * THREADBLOCK_SIZE)
-__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
-void scanExclusiveLocal2(
-    __global uint *d_Buf,
-    __global uint *d_Dst,
-    __global uint *d_Src,
-    __local uint* l_Data,
-    uint N,
-    uint arrayLength
-){
-    //Load top elements
-    //Convert results of bottom-level scan back to inclusive
-    //Skip loads and stores for inactive work-items of the work-group with highest index(pos >= N)
-    uint data = 0;
-    if(get_global_id(0) < N)
-    data =
-        d_Dst[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)] + 
-        d_Src[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)];
-
-    //Compute
-    uint odata = scan1Exclusive(data, l_Data, arrayLength);
-
-    //Avoid out-of-bound access
-    if(get_global_id(0) < N)
-        d_Buf[get_global_id(0)] = odata;
-}
-
-//Final step of large-array scan: combine basic inclusive scan with exclusive scan of top elements of input arrays
-__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
-void uniformUpdate(
-    __global uint4 *d_Data,
-    __global uint *d_Buf
-){
-    __local uint buf[1];
-
-    uint4 data4 = d_Data[get_global_id(0)];
-
-    if(get_local_id(0) == 0)
-        buf[0] = d_Buf[get_group_id(0)];
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-    data4 += (uint4)buf[0];
-    d_Data[get_global_id(0)] = data4;
-}
-- 
cgit v1.2.3