1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
|
package com.mbien.opencl.demos.hellojocl;
import com.mbien.opencl.CL;
import com.mbien.opencl.CLBuffer;
import com.mbien.opencl.CLCommandQueue;
import com.mbien.opencl.CLContext;
import com.mbien.opencl.CLKernel;
import com.mbien.opencl.CLProgram;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Random;
import static java.lang.System.*;
import static com.sun.gluegen.runtime.BufferFactory.*;
/**
 * Hello Java OpenCL example. Adds all elements of buffer A to buffer B
 * and stores the result in buffer C.<br/>
 * Sample was inspired by the Nvidia VectorAdd example written in C/C++
 * which is bundled in the Nvidia OpenCL SDK.
 * @author Michael Bien
 */
public class HelloJOCL {

    /**
     * Runs the vector-add demo: builds the program from the {@code VectorAdd.cl}
     * resource, fills two input buffers with seeded random ints, enqueues the
     * {@code VectorAdd} kernel on the first device of the context and prints
     * the first ten results together with the elapsed time.
     *
     * @param args ignored
     * @throws IOException declared for reading the kernel source resource
     */
    public static void main(String[] args) throws IOException {

        int elementCount = 11444777;                                // Length of arrays to process
        int localWorkSize = 256;                                    // Local work size dimensions
        int globalWorkSize = roundUp(localWorkSize, elementCount);  // rounded up to the nearest multiple of the localWorkSize

        // set up
        CLContext context = CLContext.create();
        try {
            CLProgram program = context.createProgram(HelloJOCL.class.getResourceAsStream("VectorAdd.cl")).build();

            // Buffers are sized for globalWorkSize (not elementCount) elements.
            CLBuffer clBufferA = context.createBuffer(CL.CL_MEM_READ_ONLY, globalWorkSize*SIZEOF_INT);
            CLBuffer clBufferB = context.createBuffer(CL.CL_MEM_READ_ONLY, globalWorkSize*SIZEOF_INT);
            CLBuffer clBufferC = context.createBuffer(CL.CL_MEM_WRITE_ONLY, globalWorkSize*SIZEOF_INT);

            out.println("used device memory: "
                + (clBufferA.buffer.capacity()+clBufferB.buffer.capacity()+clBufferC.buffer.capacity())/1000000 +"MB");

            // fill read buffers with random numbers (just to have test data;
            // the seeds are fixed, so the results do not change between runs).
            fillBuffer(clBufferA.buffer, 12345);
            fillBuffer(clBufferB.buffer, 67890);

            // get a reference to the kernel function with the name 'VectorAdd'
            // and map the buffers to its input parameters.
            CLKernel kernel = program.getCLKernels().get("VectorAdd");
            kernel.setArg(0, SIZEOF_LONG, clBufferA)
                  .setArg(1, SIZEOF_LONG, clBufferB)
                  .setArg(2, SIZEOF_LONG, clBufferC)
                  .setArg(3, SIZEOF_INT, elementCount);

            // create command queue on first device.
            CLCommandQueue queue = context.getCLDevices()[0].createCommandQueue();

            // asynchronous write of data to GPU device,
            // blocking read later to get the computed results back.
            long elapsed = nanoTime();
            queue.putWriteBuffer(clBufferA, false)
                 .putWriteBuffer(clBufferB, false)
                 .putNDRangeKernel(kernel, 1, null, new long[]{ globalWorkSize }, new long[]{ localWorkSize })
                 .putReadBuffer(clBufferC, true);
            elapsed = nanoTime() - elapsed;

            // print first few elements of the resulting buffer to the console.
            out.println("a+b=c results snapshot: ");
            for (int i = 0; i < 10; i++) {
                out.print(clBufferC.buffer.getInt() + ", ");
            }
            out.println("...; " + clBufferC.buffer.remaining()/SIZEOF_INT + " more");

            out.println("computation took: "+(elapsed/1000000)+"ms");
        } finally {
            // cleanup all resources associated with this context, even when
            // building the program or enqueueing commands failed.
            context.release();
        }
    }

    /**
     * Fills the buffer, from its current position up to its limit, with
     * pseudo-random ints generated from the given seed and rewinds it afterwards.
     * Only whole ints are written: if the remaining byte count is not a multiple
     * of the int size, the trailing bytes are left untouched instead of raising
     * a {@link java.nio.BufferOverflowException}.
     *
     * @param buffer the buffer to fill; its position is 0 when this method returns
     * @param seed   seed of the random generator; equal seeds give equal content
     */
    public static final void fillBuffer(ByteBuffer buffer, int seed) {
        Random rnd = new Random(seed);

        while (buffer.remaining() >= SIZEOF_INT) {
            buffer.putInt(rnd.nextInt());
        }

        buffer.rewind();
    }

    /**
     * Rounds {@code globalSize} up to the nearest multiple of {@code groupSize}.
     * A value that is already a multiple is returned unchanged.
     *
     * @param groupSize  the work-group size to align to; must not be 0
     * @param globalSize the value to round up
     * @return the smallest multiple of {@code groupSize} that is not less than
     *         {@code globalSize} (for positive arguments)
     * @throws ArithmeticException if {@code groupSize} is 0 or the rounded
     *         value does not fit into an {@code int}
     */
    public static final int roundUp(int groupSize, int globalSize) {
        int r = globalSize % groupSize;
        if (r == 0) {
            return globalSize;
        }

        // Compute in long so that an overflow is detected rather than silently
        // wrapping around to a bogus (negative) work size.
        long rounded = (long) globalSize + groupSize - r;
        if (rounded != (int) rounded) {
            throw new ArithmeticException("rounded size overflows int: " + rounded);
        }
        return (int) rounded;
    }

}
|