blob: 5f124d62d0e61f7ddbffaffe1126e99c511aa772 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
|
// created on Sep 8, 2011
//@author mbien
#ifdef DOUBLE_FP
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
#error "Double precision floating point not supported."
#endif
#endif
#ifndef TYPE
typedef int4 TYPE;
#warning "TYPE was not set"
#endif
inline TYPE op(TYPE a, TYPE b) {
#if OP_ADD
return a+b;
#elif OP_MUL
return a*b;
#elif OP_MIN
return min(a, b);
#elif OP_MAX
return max(a, b);
#else
return 0;
#warning "operation was not set"
#endif
}
kernel void reduce(const global TYPE* input, global TYPE* output, local TYPE* shared, const uint length) {
uint localID = get_local_id(0);
uint groupID = get_group_id(0);
uint globalID = get_global_id(0);
uint groupSize = get_local_size(0);
uint stride = globalID * 2;
// store results of first round in cache
if(globalID < length) {
shared[localID] = op(input[stride], input[stride+1]);
}else{
// no-ops if out of bounds
#if OP_ADD
shared[localID] = 0;
#elif OP_MUL
shared[localID] = 1;
#elif OP_MIN
shared[localID] = input[0];
#elif OP_MAX
shared[localID] = input[0];
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
// recursive reduction
for(uint i = groupSize >> 1; i > 0; i >>= 1) {
if(localID < i) {
shared[localID] = op(shared[localID], shared[localID + i]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// return first element as result
if(localID == 0) {
output[groupID] = shared[0];
}
}
|