root / Pi / OpenCL / PiOpenCL.c @ 156
Historique | Voir | Annoter | Télécharger (14,84 ko)
1 |
// Pi Dart Dash in OpenCL in C, illustrative example
|
---|---|
2 |
//
|
3 |
// Emmanuel Quemener <emmanuel.quemener@ens-lyon.fr
|
4 |
// Copyleft Cecill v2
|
5 |
//
|
6 |
// -h : print the documentation and detect devices as (platform,device)
|
7 |
//
|
8 |
// classical use:
|
9 |
// #1 OpenCL Plateform ID: get this information with -h option
|
10 |
// #2 OpenCL Device ID: get this information with -h option
|
11 |
// #3 Minimal number of iterations:
|
12 |
// #4 Parallel Rate: scattering global work in parts executed //ly
|
13 |
// #5 Loops (to improve statistics)
|
14 |
//
|
15 |
|
16 |
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
|
17 |
#include <stdio.h> |
18 |
#include <stdlib.h> |
19 |
#include <string.h> |
20 |
#include <CL/cl.h> |
21 |
#include <stdint.h> |
22 |
#include <sys/time.h> |
23 |
|
24 |
#define TINT32 0 |
25 |
#define TINT64 1 |
26 |
#define TFP32 2 |
27 |
#define TFP64 3 |
28 |
|
29 |
int DetectOpenCLDevices(void) |
30 |
{ |
31 |
int i, j;
|
32 |
char* value;
|
33 |
size_t valueSize; |
34 |
cl_uint platformCount; |
35 |
cl_platform_id* platforms; |
36 |
cl_uint deviceCount; |
37 |
cl_device_id* devices; |
38 |
cl_uint maxComputeUnits; |
39 |
cl_int maxWorkGroupSize; |
40 |
cl_int maxWorkItemSizes; |
41 |
cl_device_type dev_type; |
42 |
|
43 |
// get all platforms
|
44 |
clGetPlatformIDs(0, NULL, &platformCount); |
45 |
platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
|
46 |
clGetPlatformIDs(platformCount, platforms, NULL);
|
47 |
|
48 |
printf("OpenCL statistics: %d platform(s) detected\n\n",platformCount);
|
49 |
|
50 |
for (i = 0; i < platformCount; i++) { |
51 |
|
52 |
// get all devices
|
53 |
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); |
54 |
devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
|
55 |
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
|
56 |
|
57 |
// for each device print critical attributes
|
58 |
for (j = 0; j < deviceCount; j++) { |
59 |
|
60 |
// print device name
|
61 |
clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize); |
62 |
value = (char*) malloc(valueSize);
|
63 |
clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL);
|
64 |
printf("Device (%d,%d): %s\n",i, j, value);
|
65 |
free(value); |
66 |
|
67 |
// print type device CPU/GPU/ACCELERATOR
|
68 |
clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL); |
69 |
printf("\tDevice Type: ");
|
70 |
if(dev_type & CL_DEVICE_TYPE_GPU)
|
71 |
printf("CL_DEVICE_TYPE_GPU ");
|
72 |
if(dev_type & CL_DEVICE_TYPE_CPU)
|
73 |
printf("CL_DEVICE_TYPE_CPU ");
|
74 |
if(dev_type & CL_DEVICE_TYPE_ACCELERATOR)
|
75 |
printf("CL_DEVICE_TYPE_ACCELERATOR ");
|
76 |
if(dev_type & CL_DEVICE_TYPE_DEFAULT)
|
77 |
printf("CL_DEVICE_TYPE_DEFAULT ");
|
78 |
printf("\n");
|
79 |
|
80 |
// print device vendor
|
81 |
clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, 0, NULL, &valueSize); |
82 |
value = (char*) malloc(valueSize);
|
83 |
clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, valueSize, value, NULL);
|
84 |
printf("\tDevice vendor: %s\n", value);
|
85 |
free(value); |
86 |
|
87 |
// print hardware device version
|
88 |
clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, 0, NULL, &valueSize); |
89 |
value = (char*) malloc(valueSize);
|
90 |
clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, valueSize, value, NULL);
|
91 |
printf("\tHardware version: %s\n", value);
|
92 |
free(value); |
93 |
|
94 |
// print software driver version
|
95 |
clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, 0, NULL, &valueSize); |
96 |
value = (char*) malloc(valueSize);
|
97 |
clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, valueSize, value, NULL);
|
98 |
printf("\tSoftware version: %s\n", value);
|
99 |
free(value); |
100 |
|
101 |
// print c version supported by compiler for device
|
102 |
clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &valueSize); |
103 |
value = (char*) malloc(valueSize);
|
104 |
clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, valueSize, value, NULL);
|
105 |
printf("\tOpenCL C version: %s\n", value);
|
106 |
free(value); |
107 |
|
108 |
// print parallel compute units
|
109 |
clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, |
110 |
sizeof(maxComputeUnits), &maxComputeUnits, NULL); |
111 |
printf("\tParallel compute units: %d\n", maxComputeUnits);
|
112 |
|
113 |
// print max work group size
|
114 |
clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, |
115 |
sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL); |
116 |
printf("\tMaximum Work Group Size: %d\n", maxWorkGroupSize);
|
117 |
|
118 |
// print max work items size
|
119 |
clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, |
120 |
sizeof(maxWorkItemSizes), &maxWorkItemSizes, NULL); |
121 |
printf("\tMaximum Work Item Sizes: %d\n", maxWorkItemSizes);
|
122 |
|
123 |
} |
124 |
printf("\n");
|
125 |
free(devices); |
126 |
} |
127 |
|
128 |
free(platforms); |
129 |
return 0; |
130 |
|
131 |
} |
132 |
|
133 |
const char* OpenCLSource[] = { |
134 |
"#pragma OPENCL EXTENSION cl_khr_fp64: enable \n",
|
135 |
"// Marsaglia RNG very simple implementation \n",
|
136 |
"#define znew ((z=36969*(z&65535)+(z>>16))<<16) \n",
|
137 |
"#define wnew ((w=18000*(w&65535)+(w>>16))&65535) \n",
|
138 |
"#define MWC (znew+wnew) \n",
|
139 |
"#define SHR3 (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5)) \n",
|
140 |
"#define CONG (jcong=69069*jcong+1234567) \n",
|
141 |
"#define KISS ((MWC^CONG)+SHR3) \n",
|
142 |
"#define MWCfp MWC * 2.328306435454494e-10f \n",
|
143 |
"#define KISSfp KISS * 2.328306435454494e-10f \n",
|
144 |
"#define CONGfp CONG * 2.328306435454494e-10f \n",
|
145 |
"#define SHR3fp SHR3 * 2.328306435454494e-10f \n",
|
146 |
"#define TINT32 0 \n",
|
147 |
"#define TINT64 1 \n",
|
148 |
"#define TFP32 2 \n",
|
149 |
"#define TFP64 3 \n",
|
150 |
"#define THEONE32I 1073741824 \n",
|
151 |
"#define THEONE32F 1.e0f \n",
|
152 |
"#define THEONE64I 4611686018427387904 \n",
|
153 |
"#define THEONE64F (double)1.e0f \n",
|
154 |
"ulong MainLoop32I(ulong iterations,uint seed_z,uint seed_w,size_t work)",
|
155 |
"{",
|
156 |
" uint z=seed_z+work;",
|
157 |
" uint w=seed_w+work;",
|
158 |
" ulong total=0;",
|
159 |
" for (ulong i=0;i<iterations;i++)",
|
160 |
" {",
|
161 |
" uint x= MWC>>17;",
|
162 |
" uint y= MWC>>17;",
|
163 |
" ulong inside=((x*x+y*y) <= THEONE32I) ? 1:0;",
|
164 |
" total+=inside;",
|
165 |
" }",
|
166 |
" return(total);",
|
167 |
"}",
|
168 |
"ulong MainLoop32F(ulong iterations,uint seed_z,uint seed_w,size_t work)",
|
169 |
"{",
|
170 |
" uint z=seed_z+work;",
|
171 |
" uint w=seed_w+work;",
|
172 |
" ulong total=0;",
|
173 |
" for (ulong i=0;i<iterations;i++)",
|
174 |
" {",
|
175 |
" float x=(float)MWCfp ;",
|
176 |
" float y=(float)MWCfp ;",
|
177 |
" ulong inside=((x*x+y*y) <= THEONE32F) ? 1:0;",
|
178 |
" total+=inside;",
|
179 |
" }",
|
180 |
" return(total);",
|
181 |
"}",
|
182 |
"ulong MainLoop64I(ulong iterations,uint seed_z,uint seed_w,size_t work)",
|
183 |
"{",
|
184 |
" uint z=seed_z+work;",
|
185 |
" uint w=seed_w+work;",
|
186 |
" ulong total=0;",
|
187 |
" for (ulong i=0;i<iterations;i++)",
|
188 |
" {",
|
189 |
" ulong x=(ulong)(MWC>>1);",
|
190 |
" ulong y=(ulong)(MWC>>1);",
|
191 |
" ulong inside=((x*x+y*y) <= THEONE64I) ? 1:0;",
|
192 |
" total+=inside;",
|
193 |
" }",
|
194 |
" return(total);",
|
195 |
"}",
|
196 |
"ulong MainLoop64F(ulong iterations,uint seed_z,uint seed_w,size_t work)",
|
197 |
"{",
|
198 |
" uint z=seed_z+work;",
|
199 |
" uint w=seed_w+work;",
|
200 |
" ulong total=0;",
|
201 |
" for (ulong i=0;i<iterations;i++)",
|
202 |
"{",
|
203 |
" float x=(double)MWCfp ;",
|
204 |
" float y=(double)MWCfp ;",
|
205 |
" ulong inside=((x*x+y*y) <= THEONE64F) ? 1:0;",
|
206 |
" total+=inside;",
|
207 |
"}",
|
208 |
" return(total);",
|
209 |
"}",
|
210 |
"__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
|
211 |
"{",
|
212 |
" ulong total;",
|
213 |
" if (MyType==TFP32) {",
|
214 |
" total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_global_id(0));",
|
215 |
" }",
|
216 |
" else if (MyType==TFP64) {",
|
217 |
" total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_global_id(0));",
|
218 |
" }",
|
219 |
" else if (MyType==TINT32) {",
|
220 |
" total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_global_id(0));",
|
221 |
" }",
|
222 |
" else if (MyType==TINT64) {",
|
223 |
" total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_global_id(0));",
|
224 |
" }",
|
225 |
" barrier(CLK_GLOBAL_MEM_FENCE);",
|
226 |
" s[get_global_id(0)]=(ulong)total;",
|
227 |
"}",
|
228 |
"__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
|
229 |
"{",
|
230 |
" ulong total;",
|
231 |
" if (MyType==TFP32) {",
|
232 |
" total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_local_id(0));",
|
233 |
" }",
|
234 |
" else if (MyType==TFP64) {",
|
235 |
" total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_local_id(0));",
|
236 |
" }",
|
237 |
" else if (MyType==TINT32) {",
|
238 |
" total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_local_id(0));",
|
239 |
" }",
|
240 |
" else if (MyType==TINT64) {",
|
241 |
" total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_local_id(0));",
|
242 |
" }",
|
243 |
" barrier(CLK_LOCAL_MEM_FENCE);",
|
244 |
" s[get_local_id(0)]=(ulong)total;",
|
245 |
"}",
|
246 |
"__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
|
247 |
"{",
|
248 |
" ulong total;",
|
249 |
" if (MyType==TFP32) {",
|
250 |
" total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_global_id(0));",
|
251 |
" }",
|
252 |
" else if (MyType==TFP64) {",
|
253 |
" total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_global_id(0));",
|
254 |
" }",
|
255 |
" else if (MyType==TINT32) {",
|
256 |
" total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_global_id(0));",
|
257 |
" }",
|
258 |
" else if (MyType==TINT64) {",
|
259 |
" total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_global_id(0));",
|
260 |
" }",
|
261 |
" barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE);",
|
262 |
" s[get_global_id(0)]=total;",
|
263 |
"}"
|
264 |
}; |
265 |
|
266 |
int main(int argc, char **argv) |
267 |
{ |
268 |
if ((argc==1)|| |
269 |
(strcmp(argv[1],"-h")==0)|| |
270 |
(strcmp(argv[1],"--help")==0)) |
271 |
{ |
272 |
printf("\nPerforms a Pi estimation by Dart Dash:\n\n"
|
273 |
"\t#1 OpenCL Plateform ID (default 0)\n"
|
274 |
"\t#2 OpenCL Device ID (default 0)\n"
|
275 |
"\t#3 Minimal number of iterations (default 1000000)\n"
|
276 |
"\t#4 Parallel Rate (default 1024)\n"
|
277 |
"\t#5 Loops (default 1)\n"
|
278 |
"\t#6 Type of variable: INT32, INT64, FP32, FP64 (default FP32)\n\n");
|
279 |
DetectOpenCLDevices(); |
280 |
} |
281 |
else
|
282 |
{ |
283 |
|
284 |
int MyPlatform=atoi(argv[1]); |
285 |
int MyDevice=atoi(argv[2]); |
286 |
|
287 |
struct timeval tv1,tv2;
|
288 |
struct timezone tz;
|
289 |
|
290 |
uint64_t Iterations=1000000;
|
291 |
if (argc>3) { |
292 |
Iterations=(uint64_t)atoll(argv[3]);
|
293 |
} |
294 |
|
295 |
uint32_t ParallelRate=64;
|
296 |
if (argc>4) { |
297 |
ParallelRate=(uint32_t)atoi(argv[4]);
|
298 |
} |
299 |
|
300 |
uint32_t Loops=1;
|
301 |
if (argc>5) { |
302 |
Loops=(uint32_t)atoi(argv[5]);
|
303 |
} |
304 |
|
305 |
uint32_t MyType=TFP32; |
306 |
if (argc>6) { |
307 |
if (strcmp(argv[6],"INT32")==0) { |
308 |
MyType=(uint32_t)TINT32; |
309 |
} |
310 |
else if (strcmp(argv[6],"INT64")==0) { |
311 |
MyType=(uint32_t)TINT64; |
312 |
} |
313 |
else if (strcmp(argv[6],"FP32")==0) { |
314 |
MyType=(uint32_t)TFP32; |
315 |
} |
316 |
else if (strcmp(argv[6],"FP64")==0) { |
317 |
MyType=(uint32_t)TFP64; |
318 |
} |
319 |
} |
320 |
|
321 |
printf("MyType %d\n",MyType);
|
322 |
|
323 |
cl_int err; |
324 |
cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; |
325 |
|
326 |
// Detect, scan, get & initialize platform and device
|
327 |
cl_uint platformCount; |
328 |
cl_platform_id* platforms; |
329 |
cl_uint deviceCount; |
330 |
cl_device_id* devices; |
331 |
size_t valueSize; |
332 |
|
333 |
/* Setup OpenCL environment. */
|
334 |
|
335 |
// Get all platforms
|
336 |
err = clGetPlatformIDs(0, NULL, &platformCount); |
337 |
platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
|
338 |
err = clGetPlatformIDs(platformCount, platforms, NULL);
|
339 |
|
340 |
// Get Device defined
|
341 |
err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); |
342 |
devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
|
343 |
err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
|
344 |
|
345 |
// print device name
|
346 |
err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, 0, NULL, &valueSize); |
347 |
char* deviceName=(char*)malloc(valueSize); |
348 |
err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, valueSize, deviceName, NULL);
|
349 |
err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_VENDOR, 0, NULL, &valueSize); |
350 |
char* vendorName=(char*)malloc(valueSize); |
351 |
err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_VENDOR, valueSize, vendorName, NULL);
|
352 |
printf("\nDevice (%d,%d):\n\t- vendor: %s\n\t- device: %s\n\n",MyPlatform,MyDevice, vendorName,deviceName);
|
353 |
free(deviceName); |
354 |
free(vendorName); |
355 |
|
356 |
props[1] = (cl_context_properties)platforms[MyPlatform];
|
357 |
|
358 |
cl_context GPUContext = clCreateContext(props, 1, &devices[MyDevice], NULL, NULL, &err); |
359 |
cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext,devices[MyDevice], 0, &err);
|
360 |
|
361 |
cl_mem GPUInside = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, |
362 |
sizeof(uint64_t) * ParallelRate, NULL, NULL); |
363 |
|
364 |
// 51 is the number of line for OpenCL code
|
365 |
// 66, sans test
|
366 |
cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 130 ,OpenCLSource,NULL,NULL); |
367 |
clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL); |
368 |
cl_kernel OpenCLMainLoopGlobal = clCreateKernel(OpenCLProgram, "MainLoopGlobal", NULL); |
369 |
cl_kernel OpenCLMainLoopLocal = clCreateKernel(OpenCLProgram, "MainLoopLocal", NULL); |
370 |
cl_kernel OpenCLMainLoopHybrid = clCreateKernel(OpenCLProgram, "MainLoopHybrid", NULL); |
371 |
|
372 |
// Divide the total number of iterations by the parallel rate
|
373 |
// Add +1 to the number of per work iterations if division not integer
|
374 |
uint64_t IterationsEach=((Iterations%ParallelRate)==0)?Iterations/ParallelRate:Iterations/ParallelRate+1; |
375 |
// Initialize seeds for MWC RNG generator from Marsaglia
|
376 |
uint32_t seed_w=110271;
|
377 |
uint32_t seed_z=101008;
|
378 |
|
379 |
// Set the values of arguments for OpenCL function call
|
380 |
clSetKernelArg(OpenCLMainLoopGlobal, 0, sizeof(cl_mem),&GPUInside); |
381 |
clSetKernelArg(OpenCLMainLoopGlobal, 1, sizeof(uint64_t),&IterationsEach); |
382 |
clSetKernelArg(OpenCLMainLoopGlobal, 2, sizeof(uint32_t),&seed_w); |
383 |
clSetKernelArg(OpenCLMainLoopGlobal, 3, sizeof(uint32_t),&seed_z); |
384 |
clSetKernelArg(OpenCLMainLoopGlobal, 4, sizeof(uint32_t),&MyType); |
385 |
|
386 |
size_t WorkSize[1] = {ParallelRate}; // one dimensional Range |
387 |
|
388 |
uint64_t HostInside[ParallelRate]; |
389 |
|
390 |
for (uint32_t loop=0;loop<Loops;loop++) { |
391 |
// Set start timer
|
392 |
gettimeofday(&tv1, &tz); |
393 |
|
394 |
// Execute the OpenCL kernel with datas
|
395 |
clEnqueueNDRangeKernel(cqCommandQueue, OpenCLMainLoopGlobal, 1, NULL, |
396 |
WorkSize, NULL, 0, NULL, NULL); |
397 |
// Copy each result for each PR from Device to Host
|
398 |
clEnqueueReadBuffer(cqCommandQueue, GPUInside, CL_TRUE, 0,
|
399 |
ParallelRate * sizeof(uint64_t), HostInside, 0, NULL, NULL); |
400 |
uint64_t inside=0;
|
401 |
|
402 |
for (int i= 0; i < ParallelRate; i++) { |
403 |
inside+=HostInside[i]; |
404 |
} |
405 |
|
406 |
// Set stop timer
|
407 |
gettimeofday(&tv2, &tz); |
408 |
|
409 |
double elapsed=(double)((tv2.tv_sec-tv1.tv_sec) * 1000000L + |
410 |
(tv2.tv_usec-tv1.tv_usec))/1000000;
|
411 |
|
412 |
double itops=(double)(ParallelRate*IterationsEach)/elapsed; |
413 |
|
414 |
printf("Inside/Total %ld %ld\nParallelRate %i\nElapsed Time %.2f\nItops %.0f\nPi estimation %f\n\n",inside,ParallelRate*IterationsEach,ParallelRate,elapsed,itops,(4.*(float)inside/((float)(ParallelRate)*(float)(IterationsEach)))); |
415 |
} |
416 |
printf("\n\n");
|
417 |
|
418 |
clReleaseKernel(OpenCLMainLoopGlobal); |
419 |
clReleaseProgram(OpenCLProgram); |
420 |
clReleaseCommandQueue(cqCommandQueue); |
421 |
clReleaseContext(GPUContext); |
422 |
clReleaseMemObject(GPUInside); |
423 |
|
424 |
|
425 |
return 0; |
426 |
} |
427 |
} |
428 |
|