From 056e8808a5ab3076c5a48c75b1ec7f811122caf9 Mon Sep 17 00:00:00 2001 From: Wade Walker Date: Mon, 6 Jul 2015 14:50:58 -0500 Subject: Fix radix sort on Mac Fixes a __local variable which must be declared at __kernel function scope (only the Mac complains about this, but it seems to be that way in the spec too). Has the nice effect of removing a Mac-specific ifdef from the kernel code which wasn't being used correctly. Also added some code to check allowed work group sizes. --- src/com/jogamp/opencl/demos/radixsort/RadixSort.cl | 28 ++++++++++------------ .../opencl/demos/radixsort/RadixSortDemo.java | 11 ++++++++- 2 files changed, 23 insertions(+), 16 deletions(-) (limited to 'src/com/jogamp') diff --git a/src/com/jogamp/opencl/demos/radixsort/RadixSort.cl b/src/com/jogamp/opencl/demos/radixsort/RadixSort.cl index d014692..83a0388 100644 --- a/src/com/jogamp/opencl/demos/radixsort/RadixSort.cl +++ b/src/com/jogamp/opencl/demos/radixsort/RadixSort.cl @@ -78,35 +78,30 @@ uint4 scan4(uint4 idata, __local uint* ptr) return val4; } -#ifdef MAC -__kernel uint4 rank4(uint4 preds, __local uint* sMem) -#else -uint4 rank4(uint4 preds, __local uint* sMem) -#endif +uint4 rank4(uint4 preds, __local uint* sMem, __local uint* pnumtrue) { int localId = get_local_id(0); int localSize = get_local_size(0); uint4 address = scan4(preds, sMem); - - __local uint numtrue; + if (localId == localSize - 1) { - numtrue = address.w + preds.w; + *pnumtrue = address.w + preds.w; } barrier(CLK_LOCAL_MEM_FENCE); uint4 rank; int idx = localId*4; - rank.x = (preds.x) ? address.x : numtrue + idx - address.x; - rank.y = (preds.y) ? address.y : numtrue + idx + 1 - address.y; - rank.z = (preds.z) ? address.z : numtrue + idx + 2 - address.z; - rank.w = (preds.w) ? address.w : numtrue + idx + 3 - address.w; + rank.x = (preds.x) ? address.x : *pnumtrue + idx - address.x; + rank.y = (preds.y) ? address.y : *pnumtrue + idx + 1 - address.y; + rank.z = (preds.z) ? address.z : *pnumtrue + idx + 2 - address.z; + rank.w = (preds.w) ? address.w : *pnumtrue + idx + 3 - address.w; return rank; } -void radixSortBlockKeysOnly(uint4 *key, uint nbits, uint startbit, __local uint* sMem) +void radixSortBlockKeysOnly(uint4 *key, uint nbits, uint startbit, __local uint* sMem, __local uint* pnumtrue) { int localId = get_local_id(0); int localSize = get_local_size(0); @@ -121,7 +116,7 @@ void radixSortBlockKeysOnly(uint4 *key, uint nbits, uint startbit, __local uint* uint4 r; - r = rank4(lsb, sMem); + r = rank4(lsb, sMem, pnumtrue); // This arithmetic strides the ranks across 4 CTA_SIZE regions sMem[(r.x & 3) * localSize + (r.x >> 2)] = (*key).x; @@ -152,10 +147,13 @@ __kernel void radixSortBlocksKeysOnly(__global uint4* keysIn, uint4 key; key = keysIn[globalId]; + // must be declared at kernel function scope on Mac, was previously declared + // down inside the rank4() function + __local uint numtrue; barrier(CLK_LOCAL_MEM_FENCE); - radixSortBlockKeysOnly(&key, nbits, startbit, sMem); + radixSortBlockKeysOnly(&key, nbits, startbit, sMem, &numtrue); keysOut[globalId] = key; } diff --git a/src/com/jogamp/opencl/demos/radixsort/RadixSortDemo.java b/src/com/jogamp/opencl/demos/radixsort/RadixSortDemo.java index 8650be2..a974ebf 100644 --- a/src/com/jogamp/opencl/demos/radixsort/RadixSortDemo.java +++ b/src/com/jogamp/opencl/demos/radixsort/RadixSortDemo.java @@ -7,6 +7,7 @@ package com.jogamp.opencl.demos.radixsort; import com.jogamp.opencl.CLBuffer; import com.jogamp.opencl.CLCommandQueue; import com.jogamp.opencl.CLContext; +import com.jogamp.opencl.CLDevice; import com.jogamp.opencl.CLPlatform; import java.io.IOException; import java.nio.IntBuffer; @@ -28,13 +29,21 @@ public class RadixSortDemo { try{ //single GPU setup context = CLContext.create(CLPlatform.getDefault().getMaxFlopsDevice(GPU)); - CLCommandQueue queue = context.getDevices()[0].createCommandQueue(); + CLDevice device = context.getDevices()[0]; + CLCommandQueue queue = device.createCommandQueue(); int maxValue = Integer.MAX_VALUE; int samples = 10; int[] workgroupSizes = new int[] {128, 256}; + // make sure workgroup sizes don't exceed device maximum + int maxWorkgroupSize = device.getMaxWorkGroupSize(); + for( int i = 0; i < workgroupSizes.length; ++i ) { + if( workgroupSizes[i] > maxWorkgroupSize ) + throw new RuntimeException("Workgroup size " + workgroupSizes[i] + " greater than device max of "+ maxWorkgroupSize); + } + int[] runs = new int[] { 32768, 65536, 131072, -- cgit v1.2.3