Statistiques
| Révision :

root / Pi / C / OpenCL / PiOpenCL.c @ 249

Historique | Voir | Annoter | Télécharger (15,06 ko)

1
// Pi Dart Dash in OpenCL in C, illustrative example
2
//
3
// Emmanuel Quemener <emmanuel.quemener@gmail.com>
4
//
5
// CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com> 
6
// Copyleft Cecill v2
7
//
8
// -h : print the documentation and detect devices as (platform,device)
9
//
10
// classical use:
11
// #1 OpenCL Plateform ID: get this information with -h option
12
// #2 OpenCL Device ID: get this information with -h option
13
// #3 Minimal number of iterations: 
14
// #4 Parallel Rate: scattering global work in parts executed //ly
15
// #5 Loops (to improve statistics)
16
// #6 Type of variables INT32, INT64, FP32, FP64
17
// 
18
// To compile : gcc -o PiOpenCL PiOpenCL.c -lOpenCL -lm
19

    
20
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
21
#include <stdio.h>
22
#include <stdlib.h>
23
#include <string.h>
24
#include <CL/cl.h>
25
#include <stdint.h>
26
#include <math.h>
27
#include <sys/time.h>
28

    
29
#define TINT32 0
30
#define TINT64 1
31
#define TFP32 2
32
#define TFP64 3
33

    
34
int DetectOpenCLDevices(void) 
35
{
36
  int i, j;
37
  char* value;
38
  size_t valueSize;
39
  cl_uint platformCount;
40
  cl_platform_id* platforms;
41
  cl_uint deviceCount;
42
  cl_device_id* devices;
43
  cl_uint maxComputeUnits;
44
  cl_int maxWorkGroupSize;
45
  cl_int maxWorkItemSizes;
46
  cl_device_type dev_type;
47

    
48
  // get all platforms
49
  clGetPlatformIDs(0, NULL, &platformCount);
50
  platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
51
  clGetPlatformIDs(platformCount, platforms, NULL);
52

    
53
  printf("OpenCL statistics: %d platform(s) detected\n\n",platformCount);
54

    
55
  for (i = 0; i < platformCount; i++) {
56

    
57
    // get all devices
58
    clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
59
    devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
60
    clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
61

    
62
    // for each device print critical attributes
63
    for (j = 0; j < deviceCount; j++) {
64
      
65
      // print device name
66
      clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize);
67
      value = (char*) malloc(valueSize);
68
      clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL);
69
      printf("Device (%d,%d): %s\n",i, j, value);
70
      free(value);
71

    
72
      // print type device CPU/GPU/ACCELERATOR
73
      clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL);
74
      printf("\tDevice Type: ");
75
      if(dev_type & CL_DEVICE_TYPE_GPU)
76
        printf("CL_DEVICE_TYPE_GPU ");
77
      if(dev_type & CL_DEVICE_TYPE_CPU)
78
        printf("CL_DEVICE_TYPE_CPU ");
79
      if(dev_type & CL_DEVICE_TYPE_ACCELERATOR)
80
        printf("CL_DEVICE_TYPE_ACCELERATOR ");
81
      if(dev_type & CL_DEVICE_TYPE_DEFAULT)
82
        printf("CL_DEVICE_TYPE_DEFAULT ");
83
      printf("\n");
84

    
85
      // print device vendor
86
      clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, 0, NULL, &valueSize);
87
      value = (char*) malloc(valueSize);
88
      clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, valueSize, value, NULL);
89
      printf("\tDevice vendor: %s\n", value);
90
      free(value);
91

    
92
      // print hardware device version
93
      clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, 0, NULL, &valueSize);
94
      value = (char*) malloc(valueSize);
95
      clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, valueSize, value, NULL);
96
      printf("\tHardware version: %s\n", value);
97
      free(value);
98

    
99
      // print software driver version
100
      clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, 0, NULL, &valueSize);
101
      value = (char*) malloc(valueSize);
102
      clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, valueSize, value, NULL);
103
      printf("\tSoftware version: %s\n", value);
104
      free(value);
105
      
106
      // print c version supported by compiler for device
107
      clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &valueSize);
108
      value = (char*) malloc(valueSize);
109
      clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, valueSize, value, NULL);
110
      printf("\tOpenCL C version: %s\n", value);
111
      free(value);
112

    
113
      // print parallel compute units
114
      clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS,
115
                      sizeof(maxComputeUnits), &maxComputeUnits, NULL);
116
      printf("\tParallel compute units: %d\n", maxComputeUnits);
117
      
118
      // print max work group size
119
      clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE,
120
                      sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL);
121
      printf("\tMaximum Work Group Size: %d\n", maxWorkGroupSize);
122
      
123
      // print max work items size
124
      clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES,
125
                      sizeof(maxWorkItemSizes), &maxWorkItemSizes, NULL);
126
      printf("\tMaximum Work Item Sizes: %d\n", maxWorkItemSizes);
127
      
128
    }
129
    printf("\n");
130
    free(devices);
131
  }
132

    
133
  free(platforms);
134
  return 0;
135

    
136
}
137

    
138
const char* OpenCLSource[] = {
139
  "#pragma OPENCL EXTENSION cl_khr_fp64: enable \n",
140
  "// Marsaglia RNG very simple implementation \n",
141
  "#define znew  ((z=36969*(z&65535)+(z>>16))<<16) \n",
142
  "#define wnew  ((w=18000*(w&65535)+(w>>16))&65535) \n",
143
  "#define MWC   (znew+wnew) \n",
144
  "#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5)) \n",
145
  "#define CONG  (jcong=69069*jcong+1234567) \n",
146
  "#define KISS  ((MWC^CONG)+SHR3) \n",
147
  "#define MWCfp MWC * 2.328306435454494e-10f \n",
148
  "#define KISSfp KISS * 2.328306435454494e-10f \n",
149
  "#define CONGfp CONG * 2.328306435454494e-10f \n",
150
  "#define SHR3fp SHR3 * 2.328306435454494e-10f \n",
151
  "#define TINT32 0 \n",
152
  "#define TINT64 1 \n",
153
  "#define TFP32 2 \n",
154
  "#define TFP64 3 \n",
155
  "#define THEONE32I 1073741824 \n",
156
  "#define THEONE32F 1.e0f \n",
157
  "#define THEONE64I 4611686018427387904 \n",
158
  "#define THEONE64F (double)1.e0f \n",
159
  "ulong MainLoop32I(ulong iterations,uint seed_z,uint seed_w,size_t work)",
160
  "{",
161
  "   uint z=seed_z+work;",
162
  "   uint w=seed_w+work;",
163
  "   ulong total=0;",
164
  "   for (ulong i=0;i<iterations;i++)",
165
  "   {",
166
  "      uint x= MWC>>17;",
167
  "      uint y= MWC>>17;",
168
  "      ulong inside=((x*x+y*y) <= THEONE32I) ? 1:0;",
169
  "      total+=inside;",
170
  "   }",
171
  "   return(total);",
172
  "}",
173
  "ulong MainLoop32F(ulong iterations,uint seed_z,uint seed_w,size_t work)",
174
  "{",
175
  "   uint z=seed_z+work;",
176
  "   uint w=seed_w+work;",
177
  "   ulong total=0;",
178
  "   for (ulong i=0;i<iterations;i++)",
179
  "   {",
180
  "      float x=(float)MWCfp ;",
181
  "      float y=(float)MWCfp ;",
182
  "      ulong inside=((x*x+y*y) <= THEONE32F) ? 1:0;",
183
  "      total+=inside;",
184
  "   }",
185
  "   return(total);",
186
  "}",
187
  "ulong MainLoop64I(ulong iterations,uint seed_z,uint seed_w,size_t work)",
188
  "{",
189
  "   uint z=seed_z+work;",
190
  "   uint w=seed_w+work;",
191
  "   ulong total=0;",
192
  "   for (ulong i=0;i<iterations;i++)",
193
  "   {",
194
  "      ulong x=(ulong)(MWC>>1);",
195
  "      ulong y=(ulong)(MWC>>1);",
196
  "      ulong inside=((x*x+y*y) <= THEONE64I) ? 1:0;",
197
  "      total+=inside;",
198
  "   }",
199
  "   return(total);",
200
  "}",
201
  "ulong MainLoop64F(ulong iterations,uint seed_z,uint seed_w,size_t work)",
202
  "{",
203
  "   uint z=seed_z+work;",
204
  "   uint w=seed_w+work;",
205
  "   ulong total=0;",
206
  "   for (ulong i=0;i<iterations;i++)",
207
  "{",
208
  "        double x=(double)MWCfp ;",
209
  "        double y=(double)MWCfp ;",
210
  "      ulong inside=((x*x+y*y) <= THEONE64F) ? 1:0;",
211
  "      total+=inside;",
212
  "}",
213
  "   return(total);",
214
  "}",
215
  "__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
216
  "{",
217
  "   ulong total;",
218
  "   if (MyType==TFP32) {",
219
  "      total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_global_id(0));",
220
  "   }",
221
  "   else if (MyType==TFP64) {",
222
  "      total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_global_id(0));",
223
  "   }",  
224
  "   else if (MyType==TINT32) {",
225
  "      total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_global_id(0));",
226
  "   }",  
227
  "   else if (MyType==TINT64) {",
228
  "      total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_global_id(0));",
229
  "   }",  
230
  "   barrier(CLK_GLOBAL_MEM_FENCE);",
231
  "   s[get_global_id(0)]=(ulong)total;",
232
  "}",
233
  "__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
234
  "{",
235
  "   ulong total;",
236
  "   if (MyType==TFP32) {",
237
  "      total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_local_id(0));",
238
  "   }",
239
  "   else if (MyType==TFP64) {",
240
  "      total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_local_id(0));",
241
  "   }",  
242
  "   else if (MyType==TINT32) {",
243
  "      total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_local_id(0));",
244
  "   }",  
245
  "   else if (MyType==TINT64) {",
246
  "      total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_local_id(0));",
247
  "   }",  
248
  "   barrier(CLK_LOCAL_MEM_FENCE);",
249
  "   s[get_local_id(0)]=(ulong)total;",
250
  "}",
251
  "__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
252
  "{",
253
  "   ulong total;",
254
  "   if (MyType==TFP32) {",
255
  "      total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_global_id(0));",
256
  "   }",
257
  "   else if (MyType==TFP64) {",
258
  "      total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_global_id(0));",
259
  "   }",  
260
  "   else if (MyType==TINT32) {",
261
  "      total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_global_id(0));",
262
  "   }",  
263
  "   else if (MyType==TINT64) {",
264
  "      total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_global_id(0));",
265
  "   }",  
266
  "   barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE);",
267
  "   s[get_global_id(0)]=total;",
268
  "}"
269
};
270

    
271
int main(int argc, char **argv)
272
{
273
  if ((argc==1)||
274
      (strcmp(argv[1],"-h")==0)||
275
      (strcmp(argv[1],"--help")==0))
276
    {
277
      printf("\nPerforms a Pi estimation by Dart Dash:\n\n"
278
             "\t#1 OpenCL Plateform ID (default 0)\n"
279
             "\t#2 OpenCL Device ID (default 0)\n"
280
             "\t#3 Minimal number of iterations (default 1000000)\n"
281
             "\t#4 Parallel Rate (default 1024)\n"
282
             "\t#5 Loops (default 1)\n"
283
             "\t#6 Type of variable: INT32, INT64, FP32, FP64 (default FP32)\n\n");
284
      DetectOpenCLDevices();
285
    }
286
  else
287
    {
288
      
289
      int MyPlatform=atoi(argv[1]);
290
      int MyDevice=atoi(argv[2]);
291

    
292
      struct timeval tv1,tv2;
293
      struct timezone tz;
294
      
295
      uint64_t Iterations=1000000;
296
      if (argc>3) {
297
        Iterations=(uint64_t)atoll(argv[3]);
298
      }
299
      
300
      uint32_t ParallelRate=1024;
301
      if (argc>4) {
302
        ParallelRate=(uint32_t)atoi(argv[4]);
303
      }
304
      
305
      uint32_t Loops=1;
306
      if (argc>5) {
307
        Loops=(uint32_t)atoi(argv[5]);
308
      }
309
      
310
      uint32_t MyType=TFP32;
311
      if (argc>6) {
312
        if (strcmp(argv[6],"INT32")==0) {
313
          MyType=(uint32_t)TINT32;
314
        }
315
        else if (strcmp(argv[6],"INT64")==0) {
316
          MyType=(uint32_t)TINT64;
317
        }
318
        else if (strcmp(argv[6],"FP32")==0) {
319
          MyType=(uint32_t)TFP32;
320
        }
321
        else if (strcmp(argv[6],"FP64")==0) {
322
          MyType=(uint32_t)TFP64;
323
        }
324
      }
325

    
326
      printf("MyType %d\n",MyType);
327
      
328
      cl_int err;
329
      cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
330
      
331
      // Detect, scan, get & initialize platform and device
332
      cl_uint platformCount;
333
      cl_platform_id* platforms;
334
      cl_uint deviceCount;
335
      cl_device_id* devices;      
336
      size_t valueSize;
337
      
338
      /* Setup OpenCL environment. */
339
     
340
      // Get all platforms
341
      err = clGetPlatformIDs(0, NULL, &platformCount);
342
      platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
343
      err = clGetPlatformIDs(platformCount, platforms, NULL);
344

    
345
      // Get Device defined
346
      err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
347
      devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
348
      err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);  
349

    
350
      // print device name
351
      err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, 0, NULL, &valueSize);
352
      char* deviceName=(char*)malloc(valueSize);
353
      err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, valueSize, deviceName, NULL);
354
      err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_VENDOR, 0, NULL, &valueSize);
355
      char* vendorName=(char*)malloc(valueSize);      
356
      err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_VENDOR, valueSize, vendorName, NULL);
357
      printf("\nDevice (%d,%d):\n\t- vendor: %s\n\t- device: %s\n\n",MyPlatform,MyDevice, vendorName,deviceName);
358
      free(deviceName);
359
      free(vendorName);
360
      
361
      props[1] = (cl_context_properties)platforms[MyPlatform];
362
      
363
      cl_context GPUContext = clCreateContext(props, 1, &devices[MyDevice], NULL, NULL, &err);
364
      cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext,devices[MyDevice], 0, &err);
365

    
366
      cl_mem GPUInside = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY,
367
                                              sizeof(uint64_t) * ParallelRate, NULL, NULL);
368
      
369
      // 51 is the number of line for OpenCL code
370
      // 66, sans test
371
      cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 130 ,OpenCLSource,NULL,NULL);
372
      clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);
373
      cl_kernel OpenCLMainLoopGlobal = clCreateKernel(OpenCLProgram, "MainLoopGlobal", NULL);
374
      cl_kernel OpenCLMainLoopLocal = clCreateKernel(OpenCLProgram, "MainLoopLocal", NULL);
375
      cl_kernel OpenCLMainLoopHybrid = clCreateKernel(OpenCLProgram, "MainLoopHybrid", NULL);
376

    
377
      // Divide the total number of iterations by the parallel rate
378
      // Add +1 to the number of per work iterations if division not integer
379
      uint64_t IterationsEach=((Iterations%ParallelRate)==0)?Iterations/ParallelRate:Iterations/ParallelRate+1;
380
      // Initialize seeds for MWC RNG generator from Marsaglia
381
      uint32_t seed_w=110271;
382
      uint32_t seed_z=101008;
383

    
384
      // Set the values of arguments for OpenCL function call
385
      clSetKernelArg(OpenCLMainLoopGlobal, 0, sizeof(cl_mem),&GPUInside);
386
      clSetKernelArg(OpenCLMainLoopGlobal, 1, sizeof(uint64_t),&IterationsEach);
387
      clSetKernelArg(OpenCLMainLoopGlobal, 2, sizeof(uint32_t),&seed_w);
388
      clSetKernelArg(OpenCLMainLoopGlobal, 3, sizeof(uint32_t),&seed_z);
389
      clSetKernelArg(OpenCLMainLoopGlobal, 4, sizeof(uint32_t),&MyType);
390
      
391
      size_t WorkSize[1] = {ParallelRate}; // one dimensional Range
392

    
393
      uint64_t HostInside[ParallelRate];
394

    
395
      for (uint32_t loop=0;loop<Loops;loop++) {
396
        // Set start timer
397
        gettimeofday(&tv1, &tz);
398
        
399
            // Execute the OpenCL kernel with datas
400
        clEnqueueNDRangeKernel(cqCommandQueue, OpenCLMainLoopGlobal, 1, NULL,
401
                               WorkSize, NULL, 0, NULL, NULL);
402
        // Copy each result for each PR from Device to Host
403
        clEnqueueReadBuffer(cqCommandQueue, GPUInside, CL_TRUE, 0,
404
                            ParallelRate * sizeof(uint64_t), HostInside, 0, NULL, NULL);
405
        uint64_t inside=0;
406

    
407
        for (int i= 0; i < ParallelRate; i++) {
408
          inside+=HostInside[i];
409
        }
410
          
411
        // Set stop timer
412
        gettimeofday(&tv2, &tz);
413

    
414
        double elapsed=(double)((tv2.tv_sec-tv1.tv_sec) * 1000000L +
415
                                (tv2.tv_usec-tv1.tv_usec))/1000000;  
416

    
417
        double itops=(double)(ParallelRate*IterationsEach)/elapsed;
418
      
419
        printf("Inside/Total %ld %ld\nParallelRate %i\nElapsed Time %.2f\nItops %.0f\nLogItops %.2f\nPi estimation %f\n\n",inside,ParallelRate*IterationsEach,ParallelRate,elapsed,itops,log10(itops),(4.*(float)inside/((float)(ParallelRate)*(float)(IterationsEach))));
420
      }
421
      printf("\n\n");
422
      
423
      clReleaseKernel(OpenCLMainLoopGlobal);
424
      clReleaseProgram(OpenCLProgram);
425
      clReleaseCommandQueue(cqCommandQueue);
426
      clReleaseContext(GPUContext);
427
      clReleaseMemObject(GPUInside);
428

    
429
      
430
      return 0;
431
    }
432
}
433