root / Pi / C / OpenCL / PiOpenCL_CONG.c @ 286
Historique | Voir | Annoter | Télécharger (14,99 ko)
1 |
// Pi Dart Dash in OpenCL in C, illustrative example
|
---|---|
2 |
//
|
3 |
// Emmanuel Quemener <emmanuel.quemener@gmail.com>
|
4 |
//
|
5 |
// CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
|
6 |
// Copyleft Cecill v2
|
7 |
//
|
8 |
// -h : print the documentation and detect devices as (platform,device)
|
9 |
//
|
10 |
// classical use:
|
11 |
// #1 OpenCL Plateform ID: get this information with -h option
|
12 |
// #2 OpenCL Device ID: get this information with -h option
|
13 |
// #3 Minimal number of iterations:
|
14 |
// #4 Parallel Rate: scattering global work in parts executed //ly
|
15 |
// #5 Loops (to improve statistics)
|
16 |
// #6 Type of variables INT32, INT64, FP32, FP64
|
17 |
//
|
18 |
// To compile : gcc -o PiOpenCL_CONG PiOpenCL_CONG.c -lOpenCL -lm
|
19 |
|
20 |
#define CL_TARGET_OPENCL_VERSION 220 |
21 |
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
|
22 |
#include <stdio.h> |
23 |
#include <stdlib.h> |
24 |
#include <string.h> |
25 |
#include <CL/cl.h> |
26 |
#include <stdint.h> |
27 |
#include <math.h> |
28 |
#include <sys/time.h> |
29 |
|
30 |
#define TINT32 0 |
31 |
#define TINT64 1 |
32 |
#define TFP32 2 |
33 |
#define TFP64 3 |
34 |
|
35 |
int DetectOpenCLDevices(void) |
36 |
{ |
37 |
int i, j;
|
38 |
char* value;
|
39 |
size_t valueSize; |
40 |
cl_uint platformCount; |
41 |
cl_platform_id* platforms; |
42 |
cl_uint deviceCount; |
43 |
cl_device_id* devices; |
44 |
cl_uint maxComputeUnits; |
45 |
cl_int maxWorkGroupSize; |
46 |
cl_int maxWorkItemSizes; |
47 |
cl_device_type dev_type; |
48 |
|
49 |
// get all platforms
|
50 |
clGetPlatformIDs(0, NULL, &platformCount); |
51 |
platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
|
52 |
clGetPlatformIDs(platformCount, platforms, NULL);
|
53 |
|
54 |
printf("OpenCL statistics: %d platform(s) detected\n\n",platformCount);
|
55 |
|
56 |
for (i = 0; i < platformCount; i++) { |
57 |
|
58 |
// get all devices
|
59 |
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); |
60 |
devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
|
61 |
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
|
62 |
|
63 |
// for each device print critical attributes
|
64 |
for (j = 0; j < deviceCount; j++) { |
65 |
|
66 |
// print device name
|
67 |
clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize); |
68 |
value = (char*) malloc(valueSize);
|
69 |
clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL);
|
70 |
printf("Device (%d,%d): %s\n",i, j, value);
|
71 |
free(value); |
72 |
|
73 |
// print type device CPU/GPU/ACCELERATOR
|
74 |
clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL); |
75 |
printf("\tDevice Type: ");
|
76 |
if(dev_type & CL_DEVICE_TYPE_GPU)
|
77 |
printf("CL_DEVICE_TYPE_GPU ");
|
78 |
if(dev_type & CL_DEVICE_TYPE_CPU)
|
79 |
printf("CL_DEVICE_TYPE_CPU ");
|
80 |
if(dev_type & CL_DEVICE_TYPE_ACCELERATOR)
|
81 |
printf("CL_DEVICE_TYPE_ACCELERATOR ");
|
82 |
if(dev_type & CL_DEVICE_TYPE_DEFAULT)
|
83 |
printf("CL_DEVICE_TYPE_DEFAULT ");
|
84 |
printf("\n");
|
85 |
|
86 |
// print device vendor
|
87 |
clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, 0, NULL, &valueSize); |
88 |
value = (char*) malloc(valueSize);
|
89 |
clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, valueSize, value, NULL);
|
90 |
printf("\tDevice vendor: %s\n", value);
|
91 |
free(value); |
92 |
|
93 |
// print hardware device version
|
94 |
clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, 0, NULL, &valueSize); |
95 |
value = (char*) malloc(valueSize);
|
96 |
clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, valueSize, value, NULL);
|
97 |
printf("\tHardware version: %s\n", value);
|
98 |
free(value); |
99 |
|
100 |
// print software driver version
|
101 |
clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, 0, NULL, &valueSize); |
102 |
value = (char*) malloc(valueSize);
|
103 |
clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, valueSize, value, NULL);
|
104 |
printf("\tSoftware version: %s\n", value);
|
105 |
free(value); |
106 |
|
107 |
// print c version supported by compiler for device
|
108 |
clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &valueSize); |
109 |
value = (char*) malloc(valueSize);
|
110 |
clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, valueSize, value, NULL);
|
111 |
printf("\tOpenCL C version: %s\n", value);
|
112 |
free(value); |
113 |
|
114 |
// print parallel compute units
|
115 |
clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, |
116 |
sizeof(maxComputeUnits), &maxComputeUnits, NULL); |
117 |
printf("\tParallel compute units: %d\n", maxComputeUnits);
|
118 |
|
119 |
// print max work group size
|
120 |
clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, |
121 |
sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL); |
122 |
printf("\tMaximum Work Group Size: %d\n", maxWorkGroupSize);
|
123 |
|
124 |
// print max work items size
|
125 |
clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, |
126 |
sizeof(maxWorkItemSizes), &maxWorkItemSizes, NULL); |
127 |
printf("\tMaximum Work Item Sizes: %d\n", maxWorkItemSizes);
|
128 |
|
129 |
} |
130 |
printf("\n");
|
131 |
free(devices); |
132 |
} |
133 |
|
134 |
free(platforms); |
135 |
return 0; |
136 |
|
137 |
} |
138 |
|
139 |
const char* OpenCLSource[] = { |
140 |
"#pragma OPENCL EXTENSION cl_khr_fp64: enable \n",
|
141 |
"// Marsaglia RNG very simple implementation \n",
|
142 |
"#define znew ((z=36969*(z&65535)+(z>>16))<<16) \n",
|
143 |
"#define wnew ((w=18000*(w&65535)+(w>>16))&65535) \n",
|
144 |
"#define MWC (znew+wnew) \n",
|
145 |
"#define SHR3 (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5)) \n",
|
146 |
"#define CONG (jcong=69069*jcong+1234567) \n",
|
147 |
"#define KISS ((MWC^CONG)+SHR3) \n",
|
148 |
"#define MWCfp MWC * 2.328306435454494e-10f \n",
|
149 |
"#define KISSfp KISS * 2.328306435454494e-10f \n",
|
150 |
"#define CONGfp CONG * 2.328306435454494e-10f \n",
|
151 |
"#define SHR3fp SHR3 * 2.328306435454494e-10f \n",
|
152 |
"#define TINT32 0 \n",
|
153 |
"#define TINT64 1 \n",
|
154 |
"#define TFP32 2 \n",
|
155 |
"#define TFP64 3 \n",
|
156 |
"#define THEONE32I 1073741824 \n",
|
157 |
"#define THEONE32F 1.e0f \n",
|
158 |
"#define THEONE64I 4611686018427387904 \n",
|
159 |
"#define THEONE64F (double)1.e0f \n",
|
160 |
"ulong MainLoop32I(ulong iterations,uint seed_z,uint seed_w,size_t work)",
|
161 |
"{",
|
162 |
" uint jcong=seed_z+work;",
|
163 |
" ulong total=0;",
|
164 |
" for (ulong i=0;i<iterations;i++)",
|
165 |
" {",
|
166 |
" uint x= CONG>>17;",
|
167 |
" uint y= CONG>>17;",
|
168 |
" ulong inside=((x*x+y*y) <= THEONE32I) ? 1:0;",
|
169 |
" total+=inside;",
|
170 |
" }",
|
171 |
" return(total);",
|
172 |
"}",
|
173 |
"ulong MainLoop32F(ulong iterations,uint seed_z,uint seed_w,size_t work)",
|
174 |
"{",
|
175 |
" uint jcong=seed_z+work;",
|
176 |
" ulong total=0;",
|
177 |
" for (ulong i=0;i<iterations;i++)",
|
178 |
" {",
|
179 |
" float x=(float)CONGfp ;",
|
180 |
" float y=(float)CONGfp ;",
|
181 |
" ulong inside=((x*x+y*y) <= THEONE32F) ? 1:0;",
|
182 |
" total+=inside;",
|
183 |
" }",
|
184 |
" return(total);",
|
185 |
"}",
|
186 |
"ulong MainLoop64I(ulong iterations,uint seed_z,uint seed_w,size_t work)",
|
187 |
"{",
|
188 |
" uint jcong=seed_z+work;",
|
189 |
" ulong total=0;",
|
190 |
" for (ulong i=0;i<iterations;i++)",
|
191 |
" {",
|
192 |
" ulong x=(ulong)(CONG>>1);",
|
193 |
" ulong y=(ulong)(CONG>>1);",
|
194 |
" ulong inside=((x*x+y*y) <= THEONE64I) ? 1:0;",
|
195 |
" total+=inside;",
|
196 |
" }",
|
197 |
" return(total);",
|
198 |
"}",
|
199 |
"ulong MainLoop64F(ulong iterations,uint seed_z,uint seed_w,size_t work)",
|
200 |
"{",
|
201 |
" uint jcong=seed_z+work;",
|
202 |
" ulong total=0;",
|
203 |
" for (ulong i=0;i<iterations;i++)",
|
204 |
"{",
|
205 |
" double x=(double)CONGfp ;",
|
206 |
" double y=(double)CONGfp ;",
|
207 |
" ulong inside=((x*x+y*y) <= THEONE64F) ? 1:0;",
|
208 |
" total+=inside;",
|
209 |
"}",
|
210 |
" return(total);",
|
211 |
"}",
|
212 |
"__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
|
213 |
"{",
|
214 |
" ulong total;",
|
215 |
" if (MyType==TFP32) {",
|
216 |
" total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_global_id(0));",
|
217 |
" }",
|
218 |
" else if (MyType==TFP64) {",
|
219 |
" total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_global_id(0));",
|
220 |
" }",
|
221 |
" else if (MyType==TINT32) {",
|
222 |
" total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_global_id(0));",
|
223 |
" }",
|
224 |
" else if (MyType==TINT64) {",
|
225 |
" total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_global_id(0));",
|
226 |
" }",
|
227 |
" barrier(CLK_GLOBAL_MEM_FENCE);",
|
228 |
" s[get_global_id(0)]=(ulong)total;",
|
229 |
"}",
|
230 |
"__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
|
231 |
"{",
|
232 |
" ulong total;",
|
233 |
" if (MyType==TFP32) {",
|
234 |
" total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_local_id(0));",
|
235 |
" }",
|
236 |
" else if (MyType==TFP64) {",
|
237 |
" total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_local_id(0));",
|
238 |
" }",
|
239 |
" else if (MyType==TINT32) {",
|
240 |
" total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_local_id(0));",
|
241 |
" }",
|
242 |
" else if (MyType==TINT64) {",
|
243 |
" total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_local_id(0));",
|
244 |
" }",
|
245 |
" barrier(CLK_LOCAL_MEM_FENCE);",
|
246 |
" s[get_local_id(0)]=(ulong)total;",
|
247 |
"}",
|
248 |
"__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
|
249 |
"{",
|
250 |
" ulong total;",
|
251 |
" if (MyType==TFP32) {",
|
252 |
" total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_global_id(0));",
|
253 |
" }",
|
254 |
" else if (MyType==TFP64) {",
|
255 |
" total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_global_id(0));",
|
256 |
" }",
|
257 |
" else if (MyType==TINT32) {",
|
258 |
" total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_global_id(0));",
|
259 |
" }",
|
260 |
" else if (MyType==TINT64) {",
|
261 |
" total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_global_id(0));",
|
262 |
" }",
|
263 |
" barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE);",
|
264 |
" s[get_global_id(0)]=total;",
|
265 |
"}"
|
266 |
}; |
267 |
|
268 |
int main(int argc, char **argv) |
269 |
{ |
270 |
if ((argc==1)|| |
271 |
(strcmp(argv[1],"-h")==0)|| |
272 |
(strcmp(argv[1],"--help")==0)) |
273 |
{ |
274 |
printf("\nPerforms a Pi estimation by Dart Dash:\n\n"
|
275 |
"\t#1 OpenCL Plateform ID (default 0)\n"
|
276 |
"\t#2 OpenCL Device ID (default 0)\n"
|
277 |
"\t#3 Minimal number of iterations (default 1000000)\n"
|
278 |
"\t#4 Parallel Rate (default 1024)\n"
|
279 |
"\t#5 Loops (default 1)\n"
|
280 |
"\t#6 Type of variable: INT32, INT64, FP32, FP64 (default FP32)\n\n");
|
281 |
DetectOpenCLDevices(); |
282 |
} |
283 |
else
|
284 |
{ |
285 |
|
286 |
int MyPlatform=atoi(argv[1]); |
287 |
int MyDevice=atoi(argv[2]); |
288 |
|
289 |
struct timeval tv1,tv2;
|
290 |
|
291 |
uint64_t Iterations=1000000;
|
292 |
if (argc>3) { |
293 |
Iterations=(uint64_t)atoll(argv[3]);
|
294 |
} |
295 |
|
296 |
uint32_t ParallelRate=1024;
|
297 |
if (argc>4) { |
298 |
ParallelRate=(uint32_t)atoi(argv[4]);
|
299 |
} |
300 |
|
301 |
uint32_t Loops=1;
|
302 |
if (argc>5) { |
303 |
Loops=(uint32_t)atoi(argv[5]);
|
304 |
} |
305 |
|
306 |
uint32_t MyType=TFP32; |
307 |
if (argc>6) { |
308 |
if (strcmp(argv[6],"INT32")==0) { |
309 |
MyType=(uint32_t)TINT32; |
310 |
} |
311 |
else if (strcmp(argv[6],"INT64")==0) { |
312 |
MyType=(uint32_t)TINT64; |
313 |
} |
314 |
else if (strcmp(argv[6],"FP32")==0) { |
315 |
MyType=(uint32_t)TFP32; |
316 |
} |
317 |
else if (strcmp(argv[6],"FP64")==0) { |
318 |
MyType=(uint32_t)TFP64; |
319 |
} |
320 |
} |
321 |
|
322 |
printf("MyType %d\n",MyType);
|
323 |
|
324 |
cl_int err; |
325 |
cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; |
326 |
|
327 |
// Detect, scan, get & initialize platform and device
|
328 |
cl_uint platformCount; |
329 |
cl_platform_id* platforms; |
330 |
cl_uint deviceCount; |
331 |
cl_device_id* devices; |
332 |
size_t valueSize; |
333 |
|
334 |
/* Setup OpenCL environment. */
|
335 |
|
336 |
// Get all platforms
|
337 |
err = clGetPlatformIDs(0, NULL, &platformCount); |
338 |
platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
|
339 |
err = clGetPlatformIDs(platformCount, platforms, NULL);
|
340 |
|
341 |
// Get Device defined
|
342 |
err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); |
343 |
devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
|
344 |
err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
|
345 |
|
346 |
// print device name
|
347 |
err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, 0, NULL, &valueSize); |
348 |
char* deviceName=(char*)malloc(valueSize); |
349 |
err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, valueSize, deviceName, NULL);
|
350 |
err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_VENDOR, 0, NULL, &valueSize); |
351 |
char* vendorName=(char*)malloc(valueSize); |
352 |
err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_VENDOR, valueSize, vendorName, NULL);
|
353 |
printf("\nDevice (%d,%d):\n\t- vendor: %s\n\t- device: %s\n\n",MyPlatform,MyDevice, vendorName,deviceName);
|
354 |
free(deviceName); |
355 |
free(vendorName); |
356 |
|
357 |
props[1] = (cl_context_properties)platforms[MyPlatform];
|
358 |
|
359 |
cl_context GPUContext = clCreateContext(props, 1, &devices[MyDevice], NULL, NULL, &err); |
360 |
cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext,devices[MyDevice], 0, &err);
|
361 |
|
362 |
cl_mem GPUInside = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, |
363 |
sizeof(uint64_t) * ParallelRate, NULL, NULL); |
364 |
|
365 |
// 126 is the number of lines for OpenCL code (for kernels)
|
366 |
cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 126 ,OpenCLSource,NULL,NULL); |
367 |
clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL); |
368 |
cl_kernel OpenCLMainLoopGlobal = clCreateKernel(OpenCLProgram, "MainLoopGlobal", NULL); |
369 |
cl_kernel OpenCLMainLoopLocal = clCreateKernel(OpenCLProgram, "MainLoopLocal", NULL); |
370 |
cl_kernel OpenCLMainLoopHybrid = clCreateKernel(OpenCLProgram, "MainLoopHybrid", NULL); |
371 |
|
372 |
// Divide the total number of iterations by the parallel rate
|
373 |
// Add +1 to the number of per work iterations if division not integer
|
374 |
uint64_t IterationsEach=((Iterations%ParallelRate)==0)?Iterations/ParallelRate:Iterations/ParallelRate+1; |
375 |
// Initialize seeds for MWC RNG generator from Marsaglia
|
376 |
uint32_t seed_w=110271;
|
377 |
uint32_t seed_z=101008;
|
378 |
|
379 |
// Set the values of arguments for OpenCL function call
|
380 |
clSetKernelArg(OpenCLMainLoopGlobal, 0, sizeof(cl_mem),&GPUInside); |
381 |
clSetKernelArg(OpenCLMainLoopGlobal, 1, sizeof(uint64_t),&IterationsEach); |
382 |
clSetKernelArg(OpenCLMainLoopGlobal, 2, sizeof(uint32_t),&seed_w); |
383 |
clSetKernelArg(OpenCLMainLoopGlobal, 3, sizeof(uint32_t),&seed_z); |
384 |
clSetKernelArg(OpenCLMainLoopGlobal, 4, sizeof(uint32_t),&MyType); |
385 |
|
386 |
size_t WorkSize[1] = {ParallelRate}; // one dimensional Range |
387 |
|
388 |
uint64_t HostInside[ParallelRate]; |
389 |
|
390 |
for (uint32_t loop=0;loop<Loops;loop++) { |
391 |
// Set start timer
|
392 |
gettimeofday(&tv1, NULL);
|
393 |
|
394 |
// Execute the OpenCL kernel with datas
|
395 |
clEnqueueNDRangeKernel(cqCommandQueue, OpenCLMainLoopGlobal, 1, NULL, |
396 |
WorkSize, NULL, 0, NULL, NULL); |
397 |
// Copy each result for each PR from Device to Host
|
398 |
clEnqueueReadBuffer(cqCommandQueue, GPUInside, CL_TRUE, 0,
|
399 |
ParallelRate * sizeof(uint64_t), HostInside, 0, NULL, NULL); |
400 |
uint64_t inside=0;
|
401 |
|
402 |
for (int i= 0; i < ParallelRate; i++) { |
403 |
inside+=HostInside[i]; |
404 |
} |
405 |
|
406 |
// Set stop timer
|
407 |
gettimeofday(&tv2, NULL);
|
408 |
|
409 |
double elapsed=(double)((tv2.tv_sec-tv1.tv_sec) * 1000000L + |
410 |
(tv2.tv_usec-tv1.tv_usec))/1000000;
|
411 |
|
412 |
double itops=(double)(ParallelRate*IterationsEach)/elapsed; |
413 |
|
414 |
printf("Inside/Total %ld %ld\nParallelRate %i\nElapsed Time %.2f\nItops %.0f\nLogItops %.2f\nPi estimation %f\n\n",inside,ParallelRate*IterationsEach,ParallelRate,elapsed,itops,log10(itops),(4.*(float)inside/((float)(ParallelRate)*(float)(IterationsEach)))); |
415 |
} |
416 |
printf("\n\n");
|
417 |
|
418 |
clReleaseKernel(OpenCLMainLoopGlobal); |
419 |
clReleaseProgram(OpenCLProgram); |
420 |
clReleaseCommandQueue(cqCommandQueue); |
421 |
clReleaseContext(GPUContext); |
422 |
clReleaseMemObject(GPUInside); |
423 |
|
424 |
|
425 |
return 0; |
426 |
} |
427 |
} |
428 |
|