aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMichael Bien <[email protected]>2010-09-16 16:06:28 +0200
committerMichael Bien <[email protected]>2010-09-16 16:06:28 +0200
commitb680c4d4b5091f7f48bb8fd9812725111496ff99 (patch)
tree7b25523b3e78b81127e40519b31e19e0ce93cfd7
parent9cdd142d797518e1aa2834675259523a9a3533a0 (diff)
added bandwidth benchmark.
-rw-r--r--nbproject/configs/BandwidthBenchmark.properties1
-rw-r--r--src/com/jogamp/opencl/demos/bandwidth/BandwidthBenchmark.java456
2 files changed, 457 insertions, 0 deletions
diff --git a/nbproject/configs/BandwidthBenchmark.properties b/nbproject/configs/BandwidthBenchmark.properties
new file mode 100644
index 0000000..280e757
--- /dev/null
+++ b/nbproject/configs/BandwidthBenchmark.properties
@@ -0,0 +1 @@
+main.class=com.jogamp.opencl.demos.bandwidth.BandwidthTest
diff --git a/src/com/jogamp/opencl/demos/bandwidth/BandwidthBenchmark.java b/src/com/jogamp/opencl/demos/bandwidth/BandwidthBenchmark.java
new file mode 100644
index 0000000..2a9acb0
--- /dev/null
+++ b/src/com/jogamp/opencl/demos/bandwidth/BandwidthBenchmark.java
@@ -0,0 +1,456 @@
+/*
+ * Created on Tuesday, September 14 2010 17:19
+ */
+
+package com.jogamp.opencl.demos.bandwidth;
+
+import com.jogamp.common.nio.Buffers;
+import com.jogamp.opencl.CLBuffer;
+import com.jogamp.opencl.CLCommandQueue;
+import com.jogamp.opencl.CLContext;
+import com.jogamp.opencl.CLDevice;
+import com.jogamp.opencl.CLPlatform;
+
+import static com.jogamp.opencl.CLMemory.Map.*;
+import com.jogamp.opencl.CLMemory.Mem;
+import static com.jogamp.opencl.CLMemory.Mem.*;
+
+import java.nio.ByteBuffer;
+
+/**
+ * Port of Nvidia's BandwidthTest to JOCL HLB.
+ * @author Michael Bien
+ */
+public class BandwidthBenchmark {
+
+ // defines, project
+ private static int MEMCOPY_ITERATIONS = 100;
+ private static int DEFAULT_SIZE = (32 * (1 << 20)); //32 M
+ private static int DEFAULT_INCREMENT = (1 << 22); //4 M
+ private static int CACHE_CLEAR_SIZE = (1 << 24); //16 M
+
+ //shmoo mode defines
+ private static int SHMOO_MEMSIZE_MAX = (1 << 26); //64 M
+ private static int SHMOO_MEMSIZE_START = (1 << 10); //1 KB
+ private static int SHMOO_INCREMENT_1KB = (1 << 10); //1 KB
+ private static int SHMOO_INCREMENT_2KB = (1 << 11); //2 KB
+ private static int SHMOO_INCREMENT_10KB = (10 * (1 << 10)); //10KB
+ private static int SHMOO_INCREMENT_100KB = (100 * (1 << 10)); //100 KB
+ private static int SHMOO_INCREMENT_1MB = (1 << 20); //1 MB
+ private static int SHMOO_INCREMENT_2MB = (1 << 21); //2 MB
+ private static int SHMOO_INCREMENT_4MB = (1 << 22); //4 MB
+ private static int SHMOO_LIMIT_20KB = (20 * (1 << 10)); //20 KB
+ private static int SHMOO_LIMIT_50KB = (50 * (1 << 10)); //50 KB
+ private static int SHMOO_LIMIT_100KB = (100 * (1 << 10)); //100 KB
+ private static int SHMOO_LIMIT_1MB = (1 << 20); //1 MB
+ private static int SHMOO_LIMIT_16MB = (1 << 24); //16 MB
+ private static int SHMOO_LIMIT_32MB = (1 << 25); //32 MB
+
+ private enum TEST_MODE { QUICK_MODE, RANGE_MODE, SHMOO_MODE };
+ private enum COPY { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE };
+ private enum MODE { PAGEABLE, PINNED };
+ private enum ACCESS { MAPPED, DIRECT };
+
+
+ /**
+  * Runs the host-to-device, device-to-host and device-to-device bandwidth
+  * tests in quick mode, using pageable host memory and direct access, on a
+  * context created for the max-flops GPU of the selected platform.
+  * The platform with ICD suffix "NV" is preferred; otherwise the first
+  * listed platform is used.
+  *
+  * @param args ignored
+  */
+ public static void main(String[] args) {
+
+     int start = DEFAULT_SIZE;
+     int end = DEFAULT_SIZE;
+     int increment = DEFAULT_INCREMENT;
+
+     TEST_MODE mode = TEST_MODE.QUICK_MODE;
+     MODE memMode = MODE.PAGEABLE;
+     ACCESS accMode = ACCESS.DIRECT;
+
+     CLPlatform[] platforms = CLPlatform.listCLPlatforms();
+     if (platforms == null || platforms.length == 0) {
+         // nothing to benchmark; avoid indexing into an empty array
+         System.err.println("no OpenCL platform found");
+         return;
+     }
+     CLPlatform platform = platforms[0];
+
+     // prefer NV; constant-first equals() also copes with a null suffix
+     for (CLPlatform p : platforms) {
+         if ("NV".equals(p.getICDSuffix())) {
+             platform = p;
+             break;
+         }
+     }
+
+     CLContext context = CLContext.create(platform.getMaxFlopsDevice(CLDevice.Type.GPU));
+     try {
+         System.out.println();
+         System.out.println(platform);
+         System.out.println(context);
+         System.out.println();
+
+         // Run tests
+         testBandwidth(context, start, end, increment, mode, COPY.HOST_TO_DEVICE, accMode, memMode);
+         testBandwidth(context, start, end, increment, mode, COPY.DEVICE_TO_HOST, accMode, memMode);
+         testBandwidth(context, start, end, increment, mode, COPY.DEVICE_TO_DEVICE, accMode, memMode);
+     } finally {
+         // release the context even when a test throws
+         context.release();
+     }
+ }
+
+ /**
+  * Dispatches to the test routine that matches {@code mode}.
+  * Quick mode always measures {@code DEFAULT_SIZE} and shmoo mode uses its
+  * own size table, so {@code start}, {@code end} and {@code increment}
+  * only matter in range mode.
+  */
+ private static void testBandwidth(CLContext context, int start, int end, int increment, TEST_MODE mode, COPY kind, ACCESS accMode, MODE memMode) {
+     switch (mode) {
+         case SHMOO_MODE:
+             testBandwidthShmoo(context, kind, accMode, memMode);
+             return;
+         case RANGE_MODE:
+             testBandwidthRange(context, start, end, increment, kind, accMode, memMode);
+             return;
+         case QUICK_MODE:
+             testBandwidthQuick(context, DEFAULT_SIZE, kind, accMode, memMode);
+             return;
+         default:
+             // unknown mode: nothing to run
+             return;
+     }
+ }
+
+ /**
+ * Run a quick mode bandwidth test
+ */
+ private static void testBandwidthQuick(CLContext context, int size, COPY kind, ACCESS accMode, MODE memMode) {
+ testBandwidthRange(context, size, size, DEFAULT_INCREMENT, kind, accMode, memMode);
+ }
+
+ /**
+  * Run a range mode bandwidth test: measures every transfer size from
+  * {@code start} up to {@code end} in steps of {@code increment} on each
+  * device of the context and prints the results.
+  * The bandwidths of all devices are summed per transfer size.
+  */
+ private static void testBandwidthRange(CLContext context, int start, int end, int increment, COPY kind, ACCESS accMode, MODE memMode) {
+     //count the number of copies we're going to run
+     int count = 1 + ((end - start) / increment);
+
+     int[] memSizes = new int[count];
+     double[] bandwidths = new double[count];
+
+     // run the test on every device of the context
+     CLDevice[] devices = context.getDevices();
+     for (CLDevice device : devices) {
+         CLCommandQueue queue = device.createCommandQueue();
+
+         //run each of the copies
+         for (int i = 0; i < count; i++) {
+             memSizes[i] = start + i * increment;
+             switch (kind) {
+                 case DEVICE_TO_HOST:
+                     bandwidths[i] += testDeviceToHostTransfer(queue, memSizes[i], accMode, memMode);
+                     break;
+                 case HOST_TO_DEVICE:
+                     bandwidths[i] += testHostToDeviceTransfer(queue, memSizes[i], accMode, memMode);
+                     break;
+                 case DEVICE_TO_DEVICE:
+                     bandwidths[i] += testDeviceToDeviceTransfer(queue, memSizes[i]);
+                     break;
+             }
+         }
+         queue.release();
+     }
+
+     //print results; the last argument is the number of devices measured,
+     //not the number of transfer sizes
+     printResultsReadable(memSizes, bandwidths, count, kind, accMode, memMode, devices.length);
+ }
+
+ /**
+  * Intense shmoo mode - covers a large range of values with varying increments.
+  * The transfer size starts at 1 KB and grows with a step that widens at each
+  * SHMOO_LIMIT_* threshold until it exceeds SHMOO_MEMSIZE_MAX. Every size is
+  * measured on each device of the context; bandwidths are summed per size.
+  */
+ private static void testBandwidthShmoo(CLContext context, COPY kind, ACCESS accMode, MODE memMode) {
+
+     //count the number of copies to make
+     int count = 1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB)
+             + ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB)
+             + ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB)
+             + ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB)
+             + ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB)
+             + ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB)
+             + ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);
+
+     int[] memSizes = new int[count];
+     double[] bandwidths = new double[count];
+
+     // run the shmoo on every device of the context
+     CLDevice[] devices = context.getDevices();
+     for (CLDevice device : devices) {
+         // Allocate command queue for the device
+         CLCommandQueue queue = device.createCommandQueue();
+
+         //Run the shmoo
+         int iteration = 0;
+         int memSize = 0;
+         while (memSize <= SHMOO_MEMSIZE_MAX) {
+             // the size is advanced before it is measured, so the first
+             // measured size is one 1KB step above zero
+             if (memSize < SHMOO_LIMIT_20KB) {
+                 memSize += SHMOO_INCREMENT_1KB;
+             } else if (memSize < SHMOO_LIMIT_50KB) {
+                 memSize += SHMOO_INCREMENT_2KB;
+             } else if (memSize < SHMOO_LIMIT_100KB) {
+                 memSize += SHMOO_INCREMENT_10KB;
+             } else if (memSize < SHMOO_LIMIT_1MB) {
+                 memSize += SHMOO_INCREMENT_100KB;
+             } else if (memSize < SHMOO_LIMIT_16MB) {
+                 memSize += SHMOO_INCREMENT_1MB;
+             } else if (memSize < SHMOO_LIMIT_32MB) {
+                 memSize += SHMOO_INCREMENT_2MB;
+             } else {
+                 memSize += SHMOO_INCREMENT_4MB;
+             }
+
+             memSizes[iteration] = memSize;
+             switch (kind) {
+                 case DEVICE_TO_HOST:
+                     bandwidths[iteration] += testDeviceToHostTransfer(queue, memSizes[iteration], accMode, memMode);
+                     break;
+                 case HOST_TO_DEVICE:
+                     bandwidths[iteration] += testHostToDeviceTransfer(queue, memSizes[iteration], accMode, memMode);
+                     break;
+                 case DEVICE_TO_DEVICE:
+                     bandwidths[iteration] += testDeviceToDeviceTransfer(queue, memSizes[iteration]);
+                     break;
+             }
+             iteration++;
+         }
+         queue.release();
+     }
+
+     //print results; the last argument is the number of devices measured,
+     //not the number of transfer sizes
+     printResultsReadable(memSizes, bandwidths, count, kind, accMode, memMode, devices.length);
+
+ }
+
+ /**
+  * Test the bandwidth of a device to host memcopy of a specific size.
+  * The copy is repeated MEMCOPY_ITERATIONS times and timed with
+  * {@link System#nanoTime()}.
+  *
+  * @param queue   command queue of the device under test
+  * @param memSize number of bytes per copy
+  * @param accMode DIRECT reads the device buffer with read commands,
+  *                MAPPED copies out of a mapped pointer to the device buffer
+  * @param memMode PINNED uses a mapped ALLOCATE_BUFFER buffer as host memory,
+  *                PAGEABLE a plain direct NIO buffer
+  * @return the measured bandwidth in MB/s (1 MB = 2^20 bytes)
+  */
+ private static double testDeviceToHostTransfer(CLCommandQueue queue, int memSize, ACCESS accMode, MODE memMode) {
+
+     ByteBuffer h_data;
+     CLBuffer<?> cmPinnedData = null;
+     CLBuffer<?> cmDevData;
+
+     CLContext context = queue.getContext();
+
+     //allocate and init host memory, pinned or conventional
+     if (memMode == MODE.PINNED) {
+         // Create a host buffer
+         cmPinnedData = context.createBuffer(memSize, Mem.READ_WRITE, Mem.ALLOCATE_BUFFER);
+
+         // Get a mapped pointer
+         h_data = queue.putMapBuffer(cmPinnedData, WRITE, true);
+         h_data.clear();
+
+         // unmap and make data in the host buffer valid
+         cmPinnedData = cmPinnedData.cloneWith(h_data);
+         queue.putUnmapMemory(cmPinnedData);
+     } else {
+         // standard host alloc
+         h_data = Buffers.newDirectByteBuffer(memSize);
+     }
+
+     // allocate device memory
+     cmDevData = context.createBuffer(memSize, Mem.READ_WRITE);
+
+     // initialize device memory; pinned host memory has to be mapped again first
+     if (memMode == MODE.PINNED) {
+         h_data = queue.putMapBuffer(cmPinnedData, WRITE, true);
+     }
+     cmDevData = cmDevData.cloneWith(h_data);
+     queue.putWriteBuffer(cmDevData, false);
+
+     // Sync queue to host, start timer 0, and copy data from GPU to Host
+     queue.finish();
+
+     long delta = System.nanoTime();
+
+     if (accMode == ACCESS.DIRECT) {
+         // DIRECT: API access to device buffer
+         cmDevData = cmDevData.cloneWith(h_data);
+         for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {
+             queue.putReadBuffer(cmDevData, false);
+         }
+         queue.finish();
+     } else {
+         // MAPPED: mapped pointers to device buffer for conventional pointer access.
+         // The device buffer is the source of this test, so it is mapped for
+         // reading and its content is copied into host memory.
+         ByteBuffer dm_idata = queue.putMapBuffer(cmDevData, READ, true);
+         for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {
+             h_data.put(dm_idata).rewind();
+             dm_idata.rewind();
+         }
+         cmDevData = cmDevData.cloneWith(dm_idata);
+         queue.putUnmapMemory(cmDevData);
+     }
+
+     //get the elapsed time in nanoseconds
+     delta = System.nanoTime() - delta;
+
+     //clean up memory
+     cmDevData.release();
+
+     if (cmPinnedData != null) {
+         cmPinnedData = cmPinnedData.cloneWith(h_data);
+         queue.putUnmapMemory(cmPinnedData);
+         cmPinnedData.release();
+     }
+
+     //calculate bandwidth in MB/s
+     double elapsedTime = delta/1000000000.0;
+     return ((double) memSize * (double) MEMCOPY_ITERATIONS) / (elapsedTime*(double)(1 << 20));
+ }
+
+ /**
+  * Test the bandwidth of a host to device memcopy of a specific size.
+  * The copy is repeated MEMCOPY_ITERATIONS times and timed with
+  * {@link System#nanoTime()}.
+  *
+  * @param queue   command queue of the device under test
+  * @param memSize number of bytes per copy
+  * @param accMode DIRECT writes the device buffer with write commands,
+  *                MAPPED copies into a mapped pointer to the device buffer
+  * @param memMode PINNED uses a mapped ALLOCATE_BUFFER buffer as host memory,
+  *                PAGEABLE a plain direct NIO buffer
+  * @return the measured bandwidth in MB/s (1 MB = 2^20 bytes)
+  */
+ private static double testHostToDeviceTransfer(CLCommandQueue queue, int memSize, ACCESS accMode, MODE memMode) {
+
+     ByteBuffer h_data;
+     CLBuffer<?> cmPinnedData = null;
+     CLBuffer<?> cmDevData;
+
+     CLContext context = queue.getContext();
+
+     // Allocate and init host memory, pinned or conventional
+     if (memMode == MODE.PINNED) {
+         // Create a host buffer
+         cmPinnedData = context.createBuffer(memSize, Mem.READ_WRITE, Mem.ALLOCATE_BUFFER);
+
+         // Get a mapped pointer
+         h_data = queue.putMapBuffer(cmPinnedData, WRITE, true);
+
+         //initialize
+         h_data.clear();
+
+         // unmap and make data in the host buffer valid
+         cmPinnedData = cmPinnedData.cloneWith(h_data);
+         queue.putUnmapMemory(cmPinnedData);
+     } else {
+         // standard host alloc
+         h_data = Buffers.newDirectByteBuffer(memSize);
+     }
+
+     // allocate device memory
+     cmDevData = context.createBuffer(memSize, Mem.READ_WRITE);
+
+     // Sync queue to host, start timer 0, and copy data from Host to GPU
+     queue.finish();
+
+     long delta = System.nanoTime();
+
+     if (memMode == MODE.PINNED) {
+         // Get a mapped pointer again: the pinned buffer was unmapped after
+         // initialization, and both access modes read from h_data below
+         h_data = queue.putMapBuffer(cmPinnedData, WRITE, true);
+     }
+
+     if (accMode == ACCESS.DIRECT) {
+         // DIRECT: API access to device buffer
+         cmDevData = cmDevData.cloneWith(h_data);
+         for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {
+             queue.putWriteBuffer(cmDevData, false);
+         }
+         queue.finish();
+     } else {
+         // MAPPED: mapped pointers to device buffer and conventional pointer access.
+         // The device buffer is the destination of this test, so it is mapped
+         // for writing and host memory is copied into it.
+         ByteBuffer dm_idata = queue.putMapBuffer(cmDevData, WRITE, true);
+         for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {
+             dm_idata.put(h_data).rewind();
+             h_data.rewind();
+         }
+         cmDevData = cmDevData.cloneWith(dm_idata);
+         queue.putUnmapMemory(cmDevData);
+     }
+
+     //get the elapsed time in nanoseconds
+     delta = System.nanoTime() - delta;
+
+     //clean up memory
+     cmDevData.release();
+
+     if (cmPinnedData != null) {
+         cmPinnedData = cmPinnedData.cloneWith(h_data);
+         queue.putUnmapMemory(cmPinnedData);
+         cmPinnedData.release();
+     }
+
+     //calculate bandwidth in MB/s
+     double elapsedTime = delta/1000000000.0;
+     return ((double) memSize * (double) MEMCOPY_ITERATIONS) / (elapsedTime*(double)(1 << 20));
+ }
+
+ /**
+ * test the bandwidth of a device to host memcopy of a specific size
+ */
+ private static double testDeviceToDeviceTransfer(CLCommandQueue queue, int memSize) {
+
+ CLContext context = queue.getContext();
+
+ //allocate host memory
+ ByteBuffer h_idata = Buffers.newDirectByteBuffer(memSize);
+ h_idata.clear();
+
+ // allocate device input and output memory and initialize the device input memory
+ CLBuffer<?> d_idata = context.createBuffer(memSize, READ_ONLY);
+ CLBuffer<?> d_odata = context.createBuffer(memSize, WRITE_ONLY);
+
+ d_idata = d_idata.cloneWith(h_idata);
+ queue.putWriteBuffer(d_idata, true);
+
+ // Sync queue to host, start timer 0, and copy data from one GPU buffer to another GPU bufffer
+ queue.finish();
+
+ long delta = System.nanoTime();
+
+ for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {
+ queue.putCopyBuffer(d_idata, d_odata);
+ }
+
+ // Sync with GPU
+ queue.finish();
+
+ //get the the elapsed time in ms
+ delta = System.nanoTime() - delta;
+
+ //clean up memory on host and device
+ d_idata.release();
+ d_odata.release();
+
+ // Calculate bandwidth in MB/s
+ // This is for kernels that read and write GMEM simultaneously
+ // Obtained Throughput for unidirectional block copies will be 1/2 of this #
+ double elapsedTime = delta/1000000000.0;
+ return 2.0 * ((double) memSize * (double) MEMCOPY_ITERATIONS) / (elapsedTime*(double)(1 << 20));
+ }
+
+ /**
+  * Print results in an easily read format: a header line describing the
+  * test configuration followed by one "size / bandwidth" row per measurement.
+  *
+  * @param memSizes   transfer sizes in bytes
+  * @param bandwidths measured bandwidth for each entry of {@code memSizes}
+  * @param count      number of leading entries of both arrays to print;
+  *                   zero prints the header only
+  * @param kind       copy direction named in the header
+  * @param accMode    access mode named in the header (not printed for device to device)
+  * @param memMode    memory mode named in the header (not printed for device to device)
+  * @param iNumDevs   number of devices reported in the header
+  */
+ private static void printResultsReadable(int[] memSizes, double[] bandwidths, int count, COPY kind, ACCESS accMode, MODE memMode, int iNumDevs) {
+     // log config information
+     if (kind == COPY.DEVICE_TO_DEVICE) {
+         System.out.print("Device to Device Bandwidth, "+iNumDevs+" Device(s), ");
+     } else {
+         if (kind == COPY.DEVICE_TO_HOST) {
+             System.out.print("Device to Host Bandwidth, "+iNumDevs+" Device(s), ");
+         } else if (kind == COPY.HOST_TO_DEVICE) {
+             System.out.print("Host to Device Bandwidth, "+iNumDevs+" Device(s), ");
+         }
+         if (memMode == MODE.PAGEABLE) {
+             System.out.print("Paged memory");
+         } else if (memMode == MODE.PINNED) {
+             System.out.print("Pinned memory");
+         }
+         if (accMode == ACCESS.DIRECT) {
+             System.out.println(", direct access");
+         } else if (accMode == ACCESS.MAPPED) {
+             System.out.println(", mapped access");
+         }
+     }
+     System.out.println();
+
+     System.out.println(" Transfer Size (Bytes)\tBandwidth(MB/s)\n");
+     for (int i = 0; i < count; i++) {
+         // sizes with fewer than five digits get an extra tab to keep the columns aligned
+         System.out.printf(" %s\t\t\t%s%.1f\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
+     }
+     // blank line after the table; a literal "\n" keeps the emitted bytes
+     // identical to the former "\n\n" ending of the last row
+     System.out.print("\n");
+ }
+
+}