aboutsummaryrefslogtreecommitdiffstats
path: root/src/com/jogamp/opencl/demos/radixsort/Scan.java
blob: 7fa1f09599aabf6fa85dbc350f4b509bcb8541cf (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
/*
 * 22:12 Sunday, February 28 2010
 */
package com.jogamp.opencl.demos.radixsort;

import com.jogamp.opencl.CLBuffer;
import com.jogamp.opencl.CLCommandQueue;
import com.jogamp.opencl.CLContext;
import com.jogamp.opencl.CLKernel;
import com.jogamp.opencl.CLProgram;
import com.jogamp.opencl.CLResource;
import java.io.IOException;

import static com.jogamp.opencl.CLMemory.Mem.*;
import static com.jogamp.opencl.CLProgram.CompilerOptions.*;

/**
 * Host-side driver for the exclusive-scan kernels contained in {@code Scan_b.cl}.
 *
 * <p>The constructor builds the program and creates the three kernels
 * ({@code scanExclusiveLocal1}, {@code scanExclusiveLocal2}, {@code uniformUpdate});
 * {@link #scanExclusiveLarge} enqueues them, in that order, on the command queue
 * passed to the constructor. When the instance is created for more than
 * {@value #MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE} elements an additional intermediate
 * buffer is allocated, which {@link #scanExclusiveLarge} requires.
 *
 * <p>Call {@link #release()} to free the program and the intermediate buffer.
 *
 * @author Michael Bien
 */
public class Scan implements CLResource {

    private static final int MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE = 1024;
//    private final static int MAX_LOCAL_GROUP_SIZE = 256;
    private static final int WORKGROUP_SIZE = 256;
    private static final int MAX_BATCH_ELEMENTS = 64 * 1048576;
//    private final static int MIN_SHORT_ARRAY_SIZE = 4;
//    private final static int MAX_SHORT_ARRAY_SIZE = 4 * WORKGROUP_SIZE;
    private static final int MIN_LARGE_ARRAY_SIZE = 8 * WORKGROUP_SIZE;
    private static final int MAX_LARGE_ARRAY_SIZE = 4 * WORKGROUP_SIZE * WORKGROUP_SIZE;

    private final CLKernel ckScanExclusiveLocal1;
    private final CLKernel ckScanExclusiveLocal2;
    private final CLKernel ckUniformUpdate;

    private final CLCommandQueue queue;
    private final CLProgram program;

    /** Intermediate buffer; {@code null} when the instance was created for a small element count. */
    private final CLBuffer<?> buffer;

    /**
     * Builds the scan program and creates its kernels in the context of the given queue.
     *
     * @param queue       command queue on which all kernels of this instance are enqueued
     * @param numElements element count this instance is created for; an intermediate buffer of
     *                    {@code numElements / 1024 * 4} is allocated only when this exceeds
     *                    {@value #MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE}
     * @throws IOException if the kernel source {@code Scan_b.cl} cannot be read
     */
    public Scan(CLCommandQueue queue, int numElements) throws IOException {

        this.queue = queue;

        CLContext context = queue.getContext();
        if (numElements > MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE) {
            buffer = context.createBuffer(numElements / MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE * 4, READ_WRITE);
        } else {
            buffer = null;
        }
        program = context.createProgram(getClass().getResourceAsStream("Scan_b.cl"))
                         .build(ENABLE_MAD);

        ckScanExclusiveLocal1 = program.createCLKernel("scanExclusiveLocal1");
        ckScanExclusiveLocal2 = program.createCLKernel("scanExclusiveLocal2");
        ckUniformUpdate       = program.createCLKernel("uniformUpdate");
    }

    /**
     * Main exclusive scan routine: validates the arguments and then enqueues
     * {@code scanExclusiveLocal1}, {@code scanExclusiveLocal2} and {@code uniformUpdate}.
     *
     * <p>All validation happens before anything is enqueued, so a rejected call leaves
     * the command queue untouched.
     *
     * @param dst         destination buffer handed to the kernels
     * @param src         source buffer handed to the kernels
     * @param batchSize   number of arrays in the batch; must not be negative
     * @param arrayLength length of each array; must be a power of two in the range
     *                    [{@value #MIN_LARGE_ARRAY_SIZE}, {@value #MAX_LARGE_ARRAY_SIZE}]
     * @throws IllegalArgumentException if {@code arrayLength} is not a power of two or is out of
     *                                  range, if {@code batchSize} is negative, or if
     *                                  {@code batchSize * arrayLength} exceeds
     *                                  {@value #MAX_BATCH_ELEMENTS}
     * @throws IllegalStateException    if this instance was created without the intermediate buffer
     */
    void scanExclusiveLarge(CLBuffer<?> dst, CLBuffer<?> src, int batchSize, int arrayLength) {

        //Check power-of-two factorization
        if (!isPowerOf2(arrayLength)) {
            throw new IllegalArgumentException("arrayLength must be a power of two: " + arrayLength);
        }

        //Check supported size range
        if (arrayLength < MIN_LARGE_ARRAY_SIZE || arrayLength > MAX_LARGE_ARRAY_SIZE) {
            throw new IllegalArgumentException("arrayLength " + arrayLength + " is outside the supported range ["
                    + MIN_LARGE_ARRAY_SIZE + ", " + MAX_LARGE_ARRAY_SIZE + "]");
        }

        //Check total batch size limit; the product is formed in 64 bit so that a large
        //batchSize cannot wrap around and slip past the limit
        if (batchSize < 0) {
            throw new IllegalArgumentException("batchSize must not be negative: " + batchSize);
        }
        long totalElements = (long) batchSize * arrayLength;
        if (totalElements > MAX_BATCH_ELEMENTS) {
            throw new IllegalArgumentException("batchSize * arrayLength = " + totalElements
                    + " exceeds the limit of " + MAX_BATCH_ELEMENTS + " elements");
        }

        //The intermediate buffer only exists if the constructor was given more than
        //MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE elements
        if (buffer == null) {
            throw new IllegalStateException("no intermediate buffer allocated; this Scan was created for at most "
                    + MAX_WORKGROUP_INCLUSIVE_SCAN_SIZE + " elements");
        }

        int chunkSize  = 4 * WORKGROUP_SIZE;
        int chunkCount = (int) (totalElements / chunkSize);

        scanExclusiveLocal1(dst, src, chunkCount, chunkSize);
        scanExclusiveLocal2(buffer, dst, src, batchSize, arrayLength / chunkSize);
        uniformUpdate(dst, buffer, chunkCount);
    }

    /**
     * Sets the arguments of the {@code scanExclusiveLocal1} kernel and enqueues it with a
     * global work size of {@code (n * size) / 4} and a local work size of {@value #WORKGROUP_SIZE}.
     *
     * @param dst  destination buffer (kernel argument 0)
     * @param src  source buffer (kernel argument 1)
     * @param n    multiplied with {@code size} to obtain the total element count
     * @param size passed to the kernel as its last argument
     */
    void scanExclusiveLocal1(CLBuffer<?> dst, CLBuffer<?> src, int n, int size) {

        // third argument is a null argument of 2 * WORKGROUP_SIZE * 4 bytes
        // (presumably kernel-local scratch space for 4-byte elements - confirm against Scan_b.cl)
        ckScanExclusiveLocal1.putArg(dst).putArg(src).putNullArg(2 * WORKGROUP_SIZE * 4).putArg(size)
                             .rewind();

        int localWorkSize = WORKGROUP_SIZE;
        int globalWorkSize = (n * size) / 4;

        queue.put1DRangeKernel(ckScanExclusiveLocal1, 0, globalWorkSize, localWorkSize);
    }

    /**
     * Sets the arguments of the {@code scanExclusiveLocal2} kernel and enqueues it with a
     * global work size of {@code n * size} rounded up to a multiple of {@value #WORKGROUP_SIZE}.
     *
     * @param buffer intermediate buffer (kernel argument 0)
     * @param dst    destination buffer (kernel argument 1)
     * @param src    source buffer (kernel argument 2)
     * @param n      multiplied with {@code size} to obtain the element count passed to the kernel
     * @param size   passed to the kernel as its last argument
     */
    void scanExclusiveLocal2(CLBuffer<?> buffer, CLBuffer<?> dst, CLBuffer<?> src, int n, int size) {

        int elements = n * size;
        ckScanExclusiveLocal2.putArg(buffer).putArg(dst).putArg(src).putNullArg(2 * WORKGROUP_SIZE * 4)
                             .putArg(elements).putArg(size).rewind();

        int localWorkSize = WORKGROUP_SIZE;
        // the global work size has to be a multiple of the local work size
        int globalWorkSize = iSnapUp(elements, WORKGROUP_SIZE);

        queue.put1DRangeKernel(ckScanExclusiveLocal2, 0, globalWorkSize, localWorkSize);
    }

    /**
     * Sets the arguments of the {@code uniformUpdate} kernel and enqueues it with
     * {@code n} work groups of {@value #WORKGROUP_SIZE} work items each.
     *
     * @param dst    destination buffer (kernel argument 0)
     * @param buffer intermediate buffer (kernel argument 1)
     * @param n      number of work groups to launch
     */
    void uniformUpdate(CLBuffer<?> dst, CLBuffer<?> buffer, int n) {

        ckUniformUpdate.setArgs(dst, buffer);

        int localWorkSize  = WORKGROUP_SIZE;
        int globalWorkSize = n * WORKGROUP_SIZE;

        queue.put1DRangeKernel(ckUniformUpdate, 0, globalWorkSize, localWorkSize);
    }

    /**
     * Rounds {@code dividend} up to the next multiple of {@code divisor}; a value that already
     * is a multiple is returned unchanged.
     */
    private static int iSnapUp(int dividend, int divisor) {
        int remainder = dividend % divisor;
        return (remainder == 0) ? dividend : (dividend - remainder + divisor);
    }

    /**
     * Tests whether {@code x} is a positive power of two.
     *
     * @param x value to test
     * @return {@code true} if exactly one bit is set and {@code x} is positive;
     *         {@code false} for zero and for all negative values
     */
    public static boolean isPowerOf2(int x) {
        // the sign test excludes 0 and Integer.MIN_VALUE, which the bit trick alone accepts
        return x > 0 && ((x - 1) & x) == 0;
    }

    /**
     * Releases the program and, if one was allocated, the intermediate buffer.
     */
    @Override
    public void release() {
        program.release();

        if (buffer != null) {
            buffer.release();
        }
    }

    /**
     * Reports the release state of the underlying program.
     *
     * @return the result of {@code program.isReleased()}
     */
    @Override
    public boolean isReleased() {
        return program.isReleased();
    }
}