JOCL FAQ: Difference between revisions

From JogampWiki
Jump to navigation Jump to search
(Created page with '== Get the Source Code == Create a local copy/branch of the git repository, either anonymous: * git clone git://github.com/mbien/gluegen.git gluegen * git clone git://github.co…')
 
(added rudimentary getting started page)
Line 6: Line 6:
* git clone git://github.com/mbien/gluegen.git gluegen
* git clone git://github.com/mbien/gluegen.git gluegen
* git clone git://github.com/mbien/jocl.git jocl
* git clone git://github.com/mbien/jocl.git jocl
* git clone git://github.com/mbien/jocl-demos.git jocl-demos
* git clone git://github.com/mbien/jogl.git jogl
* git clone git://github.com/mbien/jogl.git jogl


Line 13: Line 14:
* git clone git@github.com:username/gluegen.git gluegen
* git clone git@github.com:username/gluegen.git gluegen
* git clone git@github.com:username/jocl.git jocl
* git clone git@github.com:username/jocl.git jocl
* git clone git@github.com:username/jocl-demos.git jocl-demos
* git clone git@github.com:username/jogl.git jogl
* git clone git@github.com:username/jogl.git jogl
== Getting Started ==
Hello JOCL host program:
<pre>
import com.mbien.opencl.*;
import java.io.IOException;
import java.nio.FloatBuffer;
import java.util.Random;
import static java.lang.System.*;
import static com.mbien.opencl.CLMemory.Mem.*;
/**
* Hello Java OpenCL example. Adds all elements of buffer A to buffer B
* and stores the result in buffer C.<br/>
* Sample was inspired by the Nvidia VectorAdd example written in C/C++
* which is bundled in the Nvidia OpenCL SDK.
* @author Michael Bien
*/
public class HelloJOCL {
    public static void main(String[] args) throws IOException {
        int elementCount = 11444777;    // Length of arrays to process
        int localWorkSize = 256;        // Local work size       
        int globalWorkSize = roundUp(localWorkSize, elementCount);  // rounded up to the nearest multiple of the localWorkSize
        // set up
        CLContext context = CLContext.create();
        CLProgram program = context.createProgram(HelloJOCL.class.getResourceAsStream("VectorAdd.cl")).build();
        CLBuffer<FloatBuffer> clBufferA = context.createFloatBuffer(globalWorkSize, READ_ONLY);
        CLBuffer<FloatBuffer> clBufferB = context.createFloatBuffer(globalWorkSize, READ_ONLY);
        CLBuffer<FloatBuffer> clBufferC = context.createFloatBuffer(globalWorkSize, WRITE_ONLY);
        // fill read buffers with random numbers.
        fillBuffer(clBufferA.getBuffer(), 12345);
        fillBuffer(clBufferB.getBuffer(), 67890);
        // get a reference to the kernel functon with the name 'VectorAdd'
        // and map the buffers to its input parameters.
        CLKernel kernel = program.createCLKernel("VectorAdd");
        kernel.putArgs(clBufferA, clBufferB, clBufferC).putArg(elementCount);
        // create command queue on fastest device.
        CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue();
        // asynchronous write of data to GPU device,
        // blocking read later to get the computed results back.
        long time = nanoTime();
        queue.putWriteBuffer(clBufferA, false)
            .putWriteBuffer(clBufferB, false)
            .put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize)
            .putReadBuffer(clBufferC, true);
        time = nanoTime() - time;
        // cleanup all resources associated with this context.
        context.release();
        // print first few elements of the resulting buffer to the console.
        out.println("a+b=c results snapshot: ");
        for(int i = 0; i < 10; i++)
            out.print(clBufferC.getBuffer().get() + ", ");
        out.println("...; " + clBufferC.getBuffer().remaining() + " more");
        out.println("computation took: "+(time/1000000)+"ms");
    }
    /* utilities */
    private static void fillBuffer(FloatBuffer buffer, int seed) {
        Random rnd = new Random(seed);
        while(buffer.remaining() != 0)
            buffer.put(rnd.nextFloat()*100);
        buffer.rewind();
    }
    private static int roundUp(int groupSize, int globalSize) {
        int r = globalSize % groupSize;
        if (r == 0) return globalSize;
        else        return globalSize + groupSize - r;
    }
}
</pre>
Hello JOCL Kernel
<pre>
    // OpenCL Kernel Function for element by element vector addition
    kernel void VectorAdd(global const float* a, global const float* b, global float* c, int numElements) {
        // get index into global data array
        int iGID = get_global_id(0);
        // bound check, equivalent to the limit on a 'for' loop
        if (iGID >= numElements)  {
            return;
        }
        // add the vector elements
        c[iGID] = a[iGID] + b[iGID];
    }
</pre>

Revision as of 19:46, 18 March 2010

Get the Source Code

Create a local copy/branch of the git repository, either anonymously:

or via SSH and your user credentials, so you can easily push back your changes to the github server:


Getting Started

Hello JOCL host program:

import com.mbien.opencl.*;
import java.io.IOException;
import java.nio.FloatBuffer;
import java.util.Random;

import static java.lang.System.*;
import static com.mbien.opencl.CLMemory.Mem.*;

/**
 * Hello Java OpenCL example. Adds all elements of buffer A to buffer B
 * and stores the result in buffer C.<br/>
 * Sample was inspired by the Nvidia VectorAdd example written in C/C++
 * which is bundled in the Nvidia OpenCL SDK.
 * @author Michael Bien
 */
public class HelloJOCL {

    public static void main(String[] args) throws IOException {

        int elementCount = 11444777;    // Length of arrays to process
        int localWorkSize = 256;        // Local work size        
        int globalWorkSize = roundUp(localWorkSize, elementCount);  // rounded up to the nearest multiple of the localWorkSize

        // set up
        CLContext context = CLContext.create();
        CLProgram program = context.createProgram(HelloJOCL.class.getResourceAsStream("VectorAdd.cl")).build();

        CLBuffer<FloatBuffer> clBufferA = context.createFloatBuffer(globalWorkSize, READ_ONLY);
        CLBuffer<FloatBuffer> clBufferB = context.createFloatBuffer(globalWorkSize, READ_ONLY);
        CLBuffer<FloatBuffer> clBufferC = context.createFloatBuffer(globalWorkSize, WRITE_ONLY);

        // fill read buffers with random numbers.
        fillBuffer(clBufferA.getBuffer(), 12345);
        fillBuffer(clBufferB.getBuffer(), 67890);

        // get a reference to the kernel functon with the name 'VectorAdd'
        // and map the buffers to its input parameters.
        CLKernel kernel = program.createCLKernel("VectorAdd");
        kernel.putArgs(clBufferA, clBufferB, clBufferC).putArg(elementCount);

        // create command queue on fastest device.
        CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue();

        // asynchronous write of data to GPU device,
        // blocking read later to get the computed results back.
        long time = nanoTime();
        queue.putWriteBuffer(clBufferA, false)
             .putWriteBuffer(clBufferB, false)
             .put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize)
             .putReadBuffer(clBufferC, true);
        time = nanoTime() - time;

        // cleanup all resources associated with this context.
        context.release();

        // print first few elements of the resulting buffer to the console.
        out.println("a+b=c results snapshot: ");
        for(int i = 0; i < 10; i++)
            out.print(clBufferC.getBuffer().get() + ", ");
        out.println("...; " + clBufferC.getBuffer().remaining() + " more");

        out.println("computation took: "+(time/1000000)+"ms");
    }

    /* utilities */
    private static void fillBuffer(FloatBuffer buffer, int seed) {
        Random rnd = new Random(seed);
        while(buffer.remaining() != 0)
            buffer.put(rnd.nextFloat()*100);
        buffer.rewind();
    }

    private static int roundUp(int groupSize, int globalSize) {
        int r = globalSize % groupSize;
        if (r == 0) return globalSize;
        else        return globalSize + groupSize - r;
    }
}

Hello JOCL Kernel

    // Element-wise vector addition: each work item computes one c[i] = a[i] + b[i].
    kernel void VectorAdd(global const float* a, global const float* b, global float* c, int numElements) {

        // this work item's position in the global data arrays
        int gid = get_global_id(0);

        // the global work size may be padded beyond numElements,
        // so only work items inside the valid range write a result
        if (gid < numElements) {
            c[gid] = a[gid] + b[gid];
        }
    }