root / Pi / C / OpenCL / PiOpenCL.c @ 249
Historique | Voir | Annoter | Télécharger (15,06 ko)
1 |
// Pi Dart Dash in OpenCL in C, illustrative example
|
---|---|
2 |
//
|
3 |
// Emmanuel Quemener <emmanuel.quemener@gmail.com>
|
4 |
//
|
5 |
// CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
|
6 |
// Copyleft Cecill v2
|
7 |
//
|
8 |
// -h : print the documentation and detect devices as (platform,device)
|
9 |
//
|
10 |
// classical use:
|
11 |
// #1 OpenCL Plateform ID: get this information with -h option
|
12 |
// #2 OpenCL Device ID: get this information with -h option
|
13 |
// #3 Minimal number of iterations:
|
14 |
// #4 Parallel Rate: scattering global work in parts executed //ly
|
15 |
// #5 Loops (to improve statistics)
|
16 |
// #6 Type of variables INT32, INT64, FP32, FP64
|
17 |
//
|
18 |
// To compile : gcc -o PiOpenCL PiOpenCL.c -lOpenCL -lm
|
19 |
|
20 |
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
|
21 |
#include <stdio.h> |
22 |
#include <stdlib.h> |
23 |
#include <string.h> |
24 |
#include <CL/cl.h> |
25 |
#include <stdint.h> |
26 |
#include <math.h> |
27 |
#include <sys/time.h> |
28 |
|
29 |
#define TINT32 0 |
30 |
#define TINT64 1 |
31 |
#define TFP32 2 |
32 |
#define TFP64 3 |
33 |
|
34 |
int DetectOpenCLDevices(void) |
35 |
{ |
36 |
int i, j;
|
37 |
char* value;
|
38 |
size_t valueSize; |
39 |
cl_uint platformCount; |
40 |
cl_platform_id* platforms; |
41 |
cl_uint deviceCount; |
42 |
cl_device_id* devices; |
43 |
cl_uint maxComputeUnits; |
44 |
cl_int maxWorkGroupSize; |
45 |
cl_int maxWorkItemSizes; |
46 |
cl_device_type dev_type; |
47 |
|
48 |
// get all platforms
|
49 |
clGetPlatformIDs(0, NULL, &platformCount); |
50 |
platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
|
51 |
clGetPlatformIDs(platformCount, platforms, NULL);
|
52 |
|
53 |
printf("OpenCL statistics: %d platform(s) detected\n\n",platformCount);
|
54 |
|
55 |
for (i = 0; i < platformCount; i++) { |
56 |
|
57 |
// get all devices
|
58 |
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); |
59 |
devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
|
60 |
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
|
61 |
|
62 |
// for each device print critical attributes
|
63 |
for (j = 0; j < deviceCount; j++) { |
64 |
|
65 |
// print device name
|
66 |
clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize); |
67 |
value = (char*) malloc(valueSize);
|
68 |
clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL);
|
69 |
printf("Device (%d,%d): %s\n",i, j, value);
|
70 |
free(value); |
71 |
|
72 |
// print type device CPU/GPU/ACCELERATOR
|
73 |
clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL); |
74 |
printf("\tDevice Type: ");
|
75 |
if(dev_type & CL_DEVICE_TYPE_GPU)
|
76 |
printf("CL_DEVICE_TYPE_GPU ");
|
77 |
if(dev_type & CL_DEVICE_TYPE_CPU)
|
78 |
printf("CL_DEVICE_TYPE_CPU ");
|
79 |
if(dev_type & CL_DEVICE_TYPE_ACCELERATOR)
|
80 |
printf("CL_DEVICE_TYPE_ACCELERATOR ");
|
81 |
if(dev_type & CL_DEVICE_TYPE_DEFAULT)
|
82 |
printf("CL_DEVICE_TYPE_DEFAULT ");
|
83 |
printf("\n");
|
84 |
|
85 |
// print device vendor
|
86 |
clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, 0, NULL, &valueSize); |
87 |
value = (char*) malloc(valueSize);
|
88 |
clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, valueSize, value, NULL);
|
89 |
printf("\tDevice vendor: %s\n", value);
|
90 |
free(value); |
91 |
|
92 |
// print hardware device version
|
93 |
clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, 0, NULL, &valueSize); |
94 |
value = (char*) malloc(valueSize);
|
95 |
clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, valueSize, value, NULL);
|
96 |
printf("\tHardware version: %s\n", value);
|
97 |
free(value); |
98 |
|
99 |
// print software driver version
|
100 |
clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, 0, NULL, &valueSize); |
101 |
value = (char*) malloc(valueSize);
|
102 |
clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, valueSize, value, NULL);
|
103 |
printf("\tSoftware version: %s\n", value);
|
104 |
free(value); |
105 |
|
106 |
// print c version supported by compiler for device
|
107 |
clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &valueSize); |
108 |
value = (char*) malloc(valueSize);
|
109 |
clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, valueSize, value, NULL);
|
110 |
printf("\tOpenCL C version: %s\n", value);
|
111 |
free(value); |
112 |
|
113 |
// print parallel compute units
|
114 |
clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, |
115 |
sizeof(maxComputeUnits), &maxComputeUnits, NULL); |
116 |
printf("\tParallel compute units: %d\n", maxComputeUnits);
|
117 |
|
118 |
// print max work group size
|
119 |
clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, |
120 |
sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL); |
121 |
printf("\tMaximum Work Group Size: %d\n", maxWorkGroupSize);
|
122 |
|
123 |
// print max work items size
|
124 |
clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, |
125 |
sizeof(maxWorkItemSizes), &maxWorkItemSizes, NULL); |
126 |
printf("\tMaximum Work Item Sizes: %d\n", maxWorkItemSizes);
|
127 |
|
128 |
} |
129 |
printf("\n");
|
130 |
free(devices); |
131 |
} |
132 |
|
133 |
free(platforms); |
134 |
return 0; |
135 |
|
136 |
} |
137 |
|
138 |
const char* OpenCLSource[] = { |
139 |
"#pragma OPENCL EXTENSION cl_khr_fp64: enable \n",
|
140 |
"// Marsaglia RNG very simple implementation \n",
|
141 |
"#define znew ((z=36969*(z&65535)+(z>>16))<<16) \n",
|
142 |
"#define wnew ((w=18000*(w&65535)+(w>>16))&65535) \n",
|
143 |
"#define MWC (znew+wnew) \n",
|
144 |
"#define SHR3 (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5)) \n",
|
145 |
"#define CONG (jcong=69069*jcong+1234567) \n",
|
146 |
"#define KISS ((MWC^CONG)+SHR3) \n",
|
147 |
"#define MWCfp MWC * 2.328306435454494e-10f \n",
|
148 |
"#define KISSfp KISS * 2.328306435454494e-10f \n",
|
149 |
"#define CONGfp CONG * 2.328306435454494e-10f \n",
|
150 |
"#define SHR3fp SHR3 * 2.328306435454494e-10f \n",
|
151 |
"#define TINT32 0 \n",
|
152 |
"#define TINT64 1 \n",
|
153 |
"#define TFP32 2 \n",
|
154 |
"#define TFP64 3 \n",
|
155 |
"#define THEONE32I 1073741824 \n",
|
156 |
"#define THEONE32F 1.e0f \n",
|
157 |
"#define THEONE64I 4611686018427387904 \n",
|
158 |
"#define THEONE64F (double)1.e0f \n",
|
159 |
"ulong MainLoop32I(ulong iterations,uint seed_z,uint seed_w,size_t work)",
|
160 |
"{",
|
161 |
" uint z=seed_z+work;",
|
162 |
" uint w=seed_w+work;",
|
163 |
" ulong total=0;",
|
164 |
" for (ulong i=0;i<iterations;i++)",
|
165 |
" {",
|
166 |
" uint x= MWC>>17;",
|
167 |
" uint y= MWC>>17;",
|
168 |
" ulong inside=((x*x+y*y) <= THEONE32I) ? 1:0;",
|
169 |
" total+=inside;",
|
170 |
" }",
|
171 |
" return(total);",
|
172 |
"}",
|
173 |
"ulong MainLoop32F(ulong iterations,uint seed_z,uint seed_w,size_t work)",
|
174 |
"{",
|
175 |
" uint z=seed_z+work;",
|
176 |
" uint w=seed_w+work;",
|
177 |
" ulong total=0;",
|
178 |
" for (ulong i=0;i<iterations;i++)",
|
179 |
" {",
|
180 |
" float x=(float)MWCfp ;",
|
181 |
" float y=(float)MWCfp ;",
|
182 |
" ulong inside=((x*x+y*y) <= THEONE32F) ? 1:0;",
|
183 |
" total+=inside;",
|
184 |
" }",
|
185 |
" return(total);",
|
186 |
"}",
|
187 |
"ulong MainLoop64I(ulong iterations,uint seed_z,uint seed_w,size_t work)",
|
188 |
"{",
|
189 |
" uint z=seed_z+work;",
|
190 |
" uint w=seed_w+work;",
|
191 |
" ulong total=0;",
|
192 |
" for (ulong i=0;i<iterations;i++)",
|
193 |
" {",
|
194 |
" ulong x=(ulong)(MWC>>1);",
|
195 |
" ulong y=(ulong)(MWC>>1);",
|
196 |
" ulong inside=((x*x+y*y) <= THEONE64I) ? 1:0;",
|
197 |
" total+=inside;",
|
198 |
" }",
|
199 |
" return(total);",
|
200 |
"}",
|
201 |
"ulong MainLoop64F(ulong iterations,uint seed_z,uint seed_w,size_t work)",
|
202 |
"{",
|
203 |
" uint z=seed_z+work;",
|
204 |
" uint w=seed_w+work;",
|
205 |
" ulong total=0;",
|
206 |
" for (ulong i=0;i<iterations;i++)",
|
207 |
"{",
|
208 |
" double x=(double)MWCfp ;",
|
209 |
" double y=(double)MWCfp ;",
|
210 |
" ulong inside=((x*x+y*y) <= THEONE64F) ? 1:0;",
|
211 |
" total+=inside;",
|
212 |
"}",
|
213 |
" return(total);",
|
214 |
"}",
|
215 |
"__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
|
216 |
"{",
|
217 |
" ulong total;",
|
218 |
" if (MyType==TFP32) {",
|
219 |
" total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_global_id(0));",
|
220 |
" }",
|
221 |
" else if (MyType==TFP64) {",
|
222 |
" total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_global_id(0));",
|
223 |
" }",
|
224 |
" else if (MyType==TINT32) {",
|
225 |
" total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_global_id(0));",
|
226 |
" }",
|
227 |
" else if (MyType==TINT64) {",
|
228 |
" total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_global_id(0));",
|
229 |
" }",
|
230 |
" barrier(CLK_GLOBAL_MEM_FENCE);",
|
231 |
" s[get_global_id(0)]=(ulong)total;",
|
232 |
"}",
|
233 |
"__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
|
234 |
"{",
|
235 |
" ulong total;",
|
236 |
" if (MyType==TFP32) {",
|
237 |
" total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_local_id(0));",
|
238 |
" }",
|
239 |
" else if (MyType==TFP64) {",
|
240 |
" total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_local_id(0));",
|
241 |
" }",
|
242 |
" else if (MyType==TINT32) {",
|
243 |
" total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_local_id(0));",
|
244 |
" }",
|
245 |
" else if (MyType==TINT64) {",
|
246 |
" total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_local_id(0));",
|
247 |
" }",
|
248 |
" barrier(CLK_LOCAL_MEM_FENCE);",
|
249 |
" s[get_local_id(0)]=(ulong)total;",
|
250 |
"}",
|
251 |
"__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
|
252 |
"{",
|
253 |
" ulong total;",
|
254 |
" if (MyType==TFP32) {",
|
255 |
" total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_global_id(0));",
|
256 |
" }",
|
257 |
" else if (MyType==TFP64) {",
|
258 |
" total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_global_id(0));",
|
259 |
" }",
|
260 |
" else if (MyType==TINT32) {",
|
261 |
" total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_global_id(0));",
|
262 |
" }",
|
263 |
" else if (MyType==TINT64) {",
|
264 |
" total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_global_id(0));",
|
265 |
" }",
|
266 |
" barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE);",
|
267 |
" s[get_global_id(0)]=total;",
|
268 |
"}"
|
269 |
}; |
270 |
|
271 |
int main(int argc, char **argv) |
272 |
{ |
273 |
if ((argc==1)|| |
274 |
(strcmp(argv[1],"-h")==0)|| |
275 |
(strcmp(argv[1],"--help")==0)) |
276 |
{ |
277 |
printf("\nPerforms a Pi estimation by Dart Dash:\n\n"
|
278 |
"\t#1 OpenCL Plateform ID (default 0)\n"
|
279 |
"\t#2 OpenCL Device ID (default 0)\n"
|
280 |
"\t#3 Minimal number of iterations (default 1000000)\n"
|
281 |
"\t#4 Parallel Rate (default 1024)\n"
|
282 |
"\t#5 Loops (default 1)\n"
|
283 |
"\t#6 Type of variable: INT32, INT64, FP32, FP64 (default FP32)\n\n");
|
284 |
DetectOpenCLDevices(); |
285 |
} |
286 |
else
|
287 |
{ |
288 |
|
289 |
int MyPlatform=atoi(argv[1]); |
290 |
int MyDevice=atoi(argv[2]); |
291 |
|
292 |
struct timeval tv1,tv2;
|
293 |
struct timezone tz;
|
294 |
|
295 |
uint64_t Iterations=1000000;
|
296 |
if (argc>3) { |
297 |
Iterations=(uint64_t)atoll(argv[3]);
|
298 |
} |
299 |
|
300 |
uint32_t ParallelRate=1024;
|
301 |
if (argc>4) { |
302 |
ParallelRate=(uint32_t)atoi(argv[4]);
|
303 |
} |
304 |
|
305 |
uint32_t Loops=1;
|
306 |
if (argc>5) { |
307 |
Loops=(uint32_t)atoi(argv[5]);
|
308 |
} |
309 |
|
310 |
uint32_t MyType=TFP32; |
311 |
if (argc>6) { |
312 |
if (strcmp(argv[6],"INT32")==0) { |
313 |
MyType=(uint32_t)TINT32; |
314 |
} |
315 |
else if (strcmp(argv[6],"INT64")==0) { |
316 |
MyType=(uint32_t)TINT64; |
317 |
} |
318 |
else if (strcmp(argv[6],"FP32")==0) { |
319 |
MyType=(uint32_t)TFP32; |
320 |
} |
321 |
else if (strcmp(argv[6],"FP64")==0) { |
322 |
MyType=(uint32_t)TFP64; |
323 |
} |
324 |
} |
325 |
|
326 |
printf("MyType %d\n",MyType);
|
327 |
|
328 |
cl_int err; |
329 |
cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; |
330 |
|
331 |
// Detect, scan, get & initialize platform and device
|
332 |
cl_uint platformCount; |
333 |
cl_platform_id* platforms; |
334 |
cl_uint deviceCount; |
335 |
cl_device_id* devices; |
336 |
size_t valueSize; |
337 |
|
338 |
/* Setup OpenCL environment. */
|
339 |
|
340 |
// Get all platforms
|
341 |
err = clGetPlatformIDs(0, NULL, &platformCount); |
342 |
platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
|
343 |
err = clGetPlatformIDs(platformCount, platforms, NULL);
|
344 |
|
345 |
// Get Device defined
|
346 |
err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); |
347 |
devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
|
348 |
err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
|
349 |
|
350 |
// print device name
|
351 |
err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, 0, NULL, &valueSize); |
352 |
char* deviceName=(char*)malloc(valueSize); |
353 |
err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, valueSize, deviceName, NULL);
|
354 |
err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_VENDOR, 0, NULL, &valueSize); |
355 |
char* vendorName=(char*)malloc(valueSize); |
356 |
err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_VENDOR, valueSize, vendorName, NULL);
|
357 |
printf("\nDevice (%d,%d):\n\t- vendor: %s\n\t- device: %s\n\n",MyPlatform,MyDevice, vendorName,deviceName);
|
358 |
free(deviceName); |
359 |
free(vendorName); |
360 |
|
361 |
props[1] = (cl_context_properties)platforms[MyPlatform];
|
362 |
|
363 |
cl_context GPUContext = clCreateContext(props, 1, &devices[MyDevice], NULL, NULL, &err); |
364 |
cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext,devices[MyDevice], 0, &err);
|
365 |
|
366 |
cl_mem GPUInside = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, |
367 |
sizeof(uint64_t) * ParallelRate, NULL, NULL); |
368 |
|
369 |
// 51 is the number of line for OpenCL code
|
370 |
// 66, sans test
|
371 |
cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 130 ,OpenCLSource,NULL,NULL); |
372 |
clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL); |
373 |
cl_kernel OpenCLMainLoopGlobal = clCreateKernel(OpenCLProgram, "MainLoopGlobal", NULL); |
374 |
cl_kernel OpenCLMainLoopLocal = clCreateKernel(OpenCLProgram, "MainLoopLocal", NULL); |
375 |
cl_kernel OpenCLMainLoopHybrid = clCreateKernel(OpenCLProgram, "MainLoopHybrid", NULL); |
376 |
|
377 |
// Divide the total number of iterations by the parallel rate
|
378 |
// Add +1 to the number of per work iterations if division not integer
|
379 |
uint64_t IterationsEach=((Iterations%ParallelRate)==0)?Iterations/ParallelRate:Iterations/ParallelRate+1; |
380 |
// Initialize seeds for MWC RNG generator from Marsaglia
|
381 |
uint32_t seed_w=110271;
|
382 |
uint32_t seed_z=101008;
|
383 |
|
384 |
// Set the values of arguments for OpenCL function call
|
385 |
clSetKernelArg(OpenCLMainLoopGlobal, 0, sizeof(cl_mem),&GPUInside); |
386 |
clSetKernelArg(OpenCLMainLoopGlobal, 1, sizeof(uint64_t),&IterationsEach); |
387 |
clSetKernelArg(OpenCLMainLoopGlobal, 2, sizeof(uint32_t),&seed_w); |
388 |
clSetKernelArg(OpenCLMainLoopGlobal, 3, sizeof(uint32_t),&seed_z); |
389 |
clSetKernelArg(OpenCLMainLoopGlobal, 4, sizeof(uint32_t),&MyType); |
390 |
|
391 |
size_t WorkSize[1] = {ParallelRate}; // one dimensional Range |
392 |
|
393 |
uint64_t HostInside[ParallelRate]; |
394 |
|
395 |
for (uint32_t loop=0;loop<Loops;loop++) { |
396 |
// Set start timer
|
397 |
gettimeofday(&tv1, &tz); |
398 |
|
399 |
// Execute the OpenCL kernel with datas
|
400 |
clEnqueueNDRangeKernel(cqCommandQueue, OpenCLMainLoopGlobal, 1, NULL, |
401 |
WorkSize, NULL, 0, NULL, NULL); |
402 |
// Copy each result for each PR from Device to Host
|
403 |
clEnqueueReadBuffer(cqCommandQueue, GPUInside, CL_TRUE, 0,
|
404 |
ParallelRate * sizeof(uint64_t), HostInside, 0, NULL, NULL); |
405 |
uint64_t inside=0;
|
406 |
|
407 |
for (int i= 0; i < ParallelRate; i++) { |
408 |
inside+=HostInside[i]; |
409 |
} |
410 |
|
411 |
// Set stop timer
|
412 |
gettimeofday(&tv2, &tz); |
413 |
|
414 |
double elapsed=(double)((tv2.tv_sec-tv1.tv_sec) * 1000000L + |
415 |
(tv2.tv_usec-tv1.tv_usec))/1000000;
|
416 |
|
417 |
double itops=(double)(ParallelRate*IterationsEach)/elapsed; |
418 |
|
419 |
printf("Inside/Total %ld %ld\nParallelRate %i\nElapsed Time %.2f\nItops %.0f\nLogItops %.2f\nPi estimation %f\n\n",inside,ParallelRate*IterationsEach,ParallelRate,elapsed,itops,log10(itops),(4.*(float)inside/((float)(ParallelRate)*(float)(IterationsEach)))); |
420 |
} |
421 |
printf("\n\n");
|
422 |
|
423 |
clReleaseKernel(OpenCLMainLoopGlobal); |
424 |
clReleaseProgram(OpenCLProgram); |
425 |
clReleaseCommandQueue(cqCommandQueue); |
426 |
clReleaseContext(GPUContext); |
427 |
clReleaseMemObject(GPUInside); |
428 |
|
429 |
|
430 |
return 0; |
431 |
} |
432 |
} |
433 |
|