Statistiques
| Révision :

root / Pi / C / OpenCL / PiOpenCL.c @ 191

Historique | Voir | Annoter | Télécharger (15,01 ko)

1 191 equemene
// Pi Dart Dash in OpenCL in C, illustrative example
2 191 equemene
//
3 191 equemene
// Emmanuel Quemener <emmanuel.quemener@gmail.com>
4 191 equemene
//
5 191 equemene
// CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
6 191 equemene
// Copyleft Cecill v2
7 191 equemene
//
8 191 equemene
// -h : print the documentation and detect devices as (platform,device)
9 191 equemene
//
10 191 equemene
// classical use:
11 191 equemene
// #1 OpenCL Platform ID: get this information with -h option
12 191 equemene
// #2 OpenCL Device ID: get this information with -h option
13 191 equemene
// #3 Minimal number of iterations:
14 191 equemene
// #4 Parallel Rate: number of work items the global work is split into (executed in parallel)
15 191 equemene
// #5 Loops (to improve statistics)
16 191 equemene
// #6 Type of variables INT32, INT64, FP32, FP64
17 191 equemene
//
18 191 equemene
// To compile : gcc -o PiOpenCL PiOpenCL.c -lOpenCL
19 191 equemene
20 191 equemene
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
21 191 equemene
#include <stdio.h>
22 191 equemene
#include <stdlib.h>
23 191 equemene
#include <string.h>
24 191 equemene
#include <CL/cl.h>
25 191 equemene
#include <stdint.h>
26 191 equemene
#include <sys/time.h>
27 191 equemene
28 191 equemene
#define TINT32 0
29 191 equemene
#define TINT64 1
30 191 equemene
#define TFP32 2
31 191 equemene
#define TFP64 3
32 191 equemene
33 191 equemene
int DetectOpenCLDevices(void)
34 191 equemene
{
35 191 equemene
  int i, j;
36 191 equemene
  char* value;
37 191 equemene
  size_t valueSize;
38 191 equemene
  cl_uint platformCount;
39 191 equemene
  cl_platform_id* platforms;
40 191 equemene
  cl_uint deviceCount;
41 191 equemene
  cl_device_id* devices;
42 191 equemene
  cl_uint maxComputeUnits;
43 191 equemene
  cl_int maxWorkGroupSize;
44 191 equemene
  cl_int maxWorkItemSizes;
45 191 equemene
  cl_device_type dev_type;
46 191 equemene
47 191 equemene
  // get all platforms
48 191 equemene
  clGetPlatformIDs(0, NULL, &platformCount);
49 191 equemene
  platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
50 191 equemene
  clGetPlatformIDs(platformCount, platforms, NULL);
51 191 equemene
52 191 equemene
  printf("OpenCL statistics: %d platform(s) detected\n\n",platformCount);
53 191 equemene
54 191 equemene
  for (i = 0; i < platformCount; i++) {
55 191 equemene
56 191 equemene
    // get all devices
57 191 equemene
    clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
58 191 equemene
    devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
59 191 equemene
    clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
60 191 equemene
61 191 equemene
    // for each device print critical attributes
62 191 equemene
    for (j = 0; j < deviceCount; j++) {
63 191 equemene
64 191 equemene
      // print device name
65 191 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize);
66 191 equemene
      value = (char*) malloc(valueSize);
67 191 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL);
68 191 equemene
      printf("Device (%d,%d): %s\n",i, j, value);
69 191 equemene
      free(value);
70 191 equemene
71 191 equemene
      // print type device CPU/GPU/ACCELERATOR
72 191 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL);
73 191 equemene
      printf("\tDevice Type: ");
74 191 equemene
      if(dev_type & CL_DEVICE_TYPE_GPU)
75 191 equemene
        printf("CL_DEVICE_TYPE_GPU ");
76 191 equemene
      if(dev_type & CL_DEVICE_TYPE_CPU)
77 191 equemene
        printf("CL_DEVICE_TYPE_CPU ");
78 191 equemene
      if(dev_type & CL_DEVICE_TYPE_ACCELERATOR)
79 191 equemene
        printf("CL_DEVICE_TYPE_ACCELERATOR ");
80 191 equemene
      if(dev_type & CL_DEVICE_TYPE_DEFAULT)
81 191 equemene
        printf("CL_DEVICE_TYPE_DEFAULT ");
82 191 equemene
      printf("\n");
83 191 equemene
84 191 equemene
      // print device vendor
85 191 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, 0, NULL, &valueSize);
86 191 equemene
      value = (char*) malloc(valueSize);
87 191 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, valueSize, value, NULL);
88 191 equemene
      printf("\tDevice vendor: %s\n", value);
89 191 equemene
      free(value);
90 191 equemene
91 191 equemene
      // print hardware device version
92 191 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, 0, NULL, &valueSize);
93 191 equemene
      value = (char*) malloc(valueSize);
94 191 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, valueSize, value, NULL);
95 191 equemene
      printf("\tHardware version: %s\n", value);
96 191 equemene
      free(value);
97 191 equemene
98 191 equemene
      // print software driver version
99 191 equemene
      clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, 0, NULL, &valueSize);
100 191 equemene
      value = (char*) malloc(valueSize);
101 191 equemene
      clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, valueSize, value, NULL);
102 191 equemene
      printf("\tSoftware version: %s\n", value);
103 191 equemene
      free(value);
104 191 equemene
105 191 equemene
      // print c version supported by compiler for device
106 191 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &valueSize);
107 191 equemene
      value = (char*) malloc(valueSize);
108 191 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, valueSize, value, NULL);
109 191 equemene
      printf("\tOpenCL C version: %s\n", value);
110 191 equemene
      free(value);
111 191 equemene
112 191 equemene
      // print parallel compute units
113 191 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS,
114 191 equemene
                      sizeof(maxComputeUnits), &maxComputeUnits, NULL);
115 191 equemene
      printf("\tParallel compute units: %d\n", maxComputeUnits);
116 191 equemene
117 191 equemene
      // print max work group size
118 191 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE,
119 191 equemene
                      sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL);
120 191 equemene
      printf("\tMaximum Work Group Size: %d\n", maxWorkGroupSize);
121 191 equemene
122 191 equemene
      // print max work items size
123 191 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES,
124 191 equemene
                      sizeof(maxWorkItemSizes), &maxWorkItemSizes, NULL);
125 191 equemene
      printf("\tMaximum Work Item Sizes: %d\n", maxWorkItemSizes);
126 191 equemene
127 191 equemene
    }
128 191 equemene
    printf("\n");
129 191 equemene
    free(devices);
130 191 equemene
  }
131 191 equemene
132 191 equemene
  free(platforms);
133 191 equemene
  return 0;
134 191 equemene
135 191 equemene
}
136 191 equemene
137 191 equemene
/*
 * OpenCL C source of the Pi "dart dash" kernels, split into string fragments
 * that are handed as-is to clCreateProgramWithSource. The runtime concatenates
 * the fragments, so preprocessor lines carry their own "\n" while ordinary
 * statements do not need one.
 *
 * The array holds exactly 130 fragments; the count passed to
 * clCreateProgramWithSource must match the number of entries here.
 */
const char* OpenCLSource[] = {
  "#pragma OPENCL EXTENSION cl_khr_fp64: enable \n",
  /* Marsaglia random number generators, operating on locals z, w, jsr, jcong */
  "// Marsaglia RNG very simple implementation \n",
  "#define znew  ((z=36969*(z&65535)+(z>>16))<<16) \n",
  "#define wnew  ((w=18000*(w&65535)+(w>>16))&65535) \n",
  "#define MWC   (znew+wnew) \n",
  "#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5)) \n",
  "#define CONG  (jcong=69069*jcong+1234567) \n",
  "#define KISS  ((MWC^CONG)+SHR3) \n",
  /* Floating-point variants: scale a 32-bit draw by about 2^-32 */
  "#define MWCfp MWC * 2.328306435454494e-10f \n",
  "#define KISSfp KISS * 2.328306435454494e-10f \n",
  "#define CONGfp CONG * 2.328306435454494e-10f \n",
  "#define SHR3fp SHR3 * 2.328306435454494e-10f \n",
  /* Variable type selectors, mirroring the host-side TINT32..TFP64 macros */
  "#define TINT32 0 \n",
  "#define TINT64 1 \n",
  "#define TFP32 2 \n",
  "#define TFP64 3 \n",
  /* Squared radius of the unit circle in each representation (2^30, 1, 2^62, 1) */
  "#define THEONE32I 1073741824 \n",
  "#define THEONE32F 1.e0f \n",
  "#define THEONE64I 4611686018427387904 \n",
  "#define THEONE64F (double)1.e0f \n",
  /* 32-bit integer variant: 15-bit coordinates compared against 2^30 */
  "ulong MainLoop32I(ulong iterations,uint seed_z,uint seed_w,size_t work)",
  "{",
  "   uint z=seed_z+work;",
  "   uint w=seed_w+work;",
  "   ulong total=0;",
  "   for (ulong i=0;i<iterations;i++)",
  "   {",
  "      uint x= MWC>>17;",
  "      uint y= MWC>>17;",
  "      ulong inside=((x*x+y*y) <= THEONE32I) ? 1:0;",
  "      total+=inside;",
  "   }",
  "   return(total);",
  "}",
  /* 32-bit floating-point variant */
  "ulong MainLoop32F(ulong iterations,uint seed_z,uint seed_w,size_t work)",
  "{",
  "   uint z=seed_z+work;",
  "   uint w=seed_w+work;",
  "   ulong total=0;",
  "   for (ulong i=0;i<iterations;i++)",
  "   {",
  "      float x=(float)MWCfp ;",
  "      float y=(float)MWCfp ;",
  "      ulong inside=((x*x+y*y) <= THEONE32F) ? 1:0;",
  "      total+=inside;",
  "   }",
  "   return(total);",
  "}",
  /* 64-bit integer variant: 31-bit coordinates compared against 2^62 */
  "ulong MainLoop64I(ulong iterations,uint seed_z,uint seed_w,size_t work)",
  "{",
  "   uint z=seed_z+work;",
  "   uint w=seed_w+work;",
  "   ulong total=0;",
  "   for (ulong i=0;i<iterations;i++)",
  "   {",
  "      ulong x=(ulong)(MWC>>1);",
  "      ulong y=(ulong)(MWC>>1);",
  "      ulong inside=((x*x+y*y) <= THEONE64I) ? 1:0;",
  "      total+=inside;",
  "   }",
  "   return(total);",
  "}",
  /* 64-bit floating-point variant */
  "ulong MainLoop64F(ulong iterations,uint seed_z,uint seed_w,size_t work)",
  "{",
  "   uint z=seed_z+work;",
  "   uint w=seed_w+work;",
  "   ulong total=0;",
  "   for (ulong i=0;i<iterations;i++)",
  "{",
  "        double x=(double)MWCfp ;",
  "        double y=(double)MWCfp ;",
  "      ulong inside=((x*x+y*y) <= THEONE64F) ? 1:0;",
  "      total+=inside;",
  "}",
  "   return(total);",
  "}",
  /* Kernel indexed by global id; total starts at 0 so an unknown MyType
     stores 0 instead of an indeterminate value */
  "__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
  "{",
  "   ulong total=0;",
  "   if (MyType==TFP32) {",
  "      total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_global_id(0));",
  "   }",
  "   else if (MyType==TFP64) {",
  "      total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_global_id(0));",
  "   }",
  "   else if (MyType==TINT32) {",
  "      total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_global_id(0));",
  "   }",
  "   else if (MyType==TINT64) {",
  "      total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_global_id(0));",
  "   }",
  "   barrier(CLK_GLOBAL_MEM_FENCE);",
  "   s[get_global_id(0)]=(ulong)total;",
  "}",
  /* Kernel indexed by local id */
  "__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
  "{",
  "   ulong total=0;",
  "   if (MyType==TFP32) {",
  "      total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_local_id(0));",
  "   }",
  "   else if (MyType==TFP64) {",
  "      total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_local_id(0));",
  "   }",
  "   else if (MyType==TINT32) {",
  "      total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_local_id(0));",
  "   }",
  "   else if (MyType==TINT64) {",
  "      total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_local_id(0));",
  "   }",
  "   barrier(CLK_LOCAL_MEM_FENCE);",
  "   s[get_local_id(0)]=(ulong)total;",
  "}",
  /* Hybrid kernel: the fence flags are a bit mask and must be combined with
     bitwise '|'; logical '||' collapses the mask to the value 1 */
  "__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
  "{",
  "   ulong total=0;",
  "   if (MyType==TFP32) {",
  "      total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_global_id(0));",
  "   }",
  "   else if (MyType==TFP64) {",
  "      total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_global_id(0));",
  "   }",
  "   else if (MyType==TINT32) {",
  "      total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_global_id(0));",
  "   }",
  "   else if (MyType==TINT64) {",
  "      total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_global_id(0));",
  "   }",
  "   barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);",
  "   s[get_global_id(0)]=total;",
  "}"
};
269 191 equemene
270 191 equemene
int main(int argc, char **argv)
271 191 equemene
{
272 191 equemene
  if ((argc==1)||
273 191 equemene
      (strcmp(argv[1],"-h")==0)||
274 191 equemene
      (strcmp(argv[1],"--help")==0))
275 191 equemene
    {
276 191 equemene
      printf("\nPerforms a Pi estimation by Dart Dash:\n\n"
277 191 equemene
             "\t#1 OpenCL Plateform ID (default 0)\n"
278 191 equemene
             "\t#2 OpenCL Device ID (default 0)\n"
279 191 equemene
             "\t#3 Minimal number of iterations (default 1000000)\n"
280 191 equemene
             "\t#4 Parallel Rate (default 1024)\n"
281 191 equemene
             "\t#5 Loops (default 1)\n"
282 191 equemene
             "\t#6 Type of variable: INT32, INT64, FP32, FP64 (default FP32)\n\n");
283 191 equemene
      DetectOpenCLDevices();
284 191 equemene
    }
285 191 equemene
  else
286 191 equemene
    {
287 191 equemene
288 191 equemene
      int MyPlatform=atoi(argv[1]);
289 191 equemene
      int MyDevice=atoi(argv[2]);
290 191 equemene
291 191 equemene
      struct timeval tv1,tv2;
292 191 equemene
      struct timezone tz;
293 191 equemene
294 191 equemene
      uint64_t Iterations=1000000;
295 191 equemene
      if (argc>3) {
296 191 equemene
        Iterations=(uint64_t)atoll(argv[3]);
297 191 equemene
      }
298 191 equemene
299 191 equemene
      uint32_t ParallelRate=64;
300 191 equemene
      if (argc>4) {
301 191 equemene
        ParallelRate=(uint32_t)atoi(argv[4]);
302 191 equemene
      }
303 191 equemene
304 191 equemene
      uint32_t Loops=1;
305 191 equemene
      if (argc>5) {
306 191 equemene
        Loops=(uint32_t)atoi(argv[5]);
307 191 equemene
      }
308 191 equemene
309 191 equemene
      uint32_t MyType=TFP32;
310 191 equemene
      if (argc>6) {
311 191 equemene
        if (strcmp(argv[6],"INT32")==0) {
312 191 equemene
          MyType=(uint32_t)TINT32;
313 191 equemene
        }
314 191 equemene
        else if (strcmp(argv[6],"INT64")==0) {
315 191 equemene
          MyType=(uint32_t)TINT64;
316 191 equemene
        }
317 191 equemene
        else if (strcmp(argv[6],"FP32")==0) {
318 191 equemene
          MyType=(uint32_t)TFP32;
319 191 equemene
        }
320 191 equemene
        else if (strcmp(argv[6],"FP64")==0) {
321 191 equemene
          MyType=(uint32_t)TFP64;
322 191 equemene
        }
323 191 equemene
      }
324 191 equemene
325 191 equemene
      printf("MyType %d\n",MyType);
326 191 equemene
327 191 equemene
      cl_int err;
328 191 equemene
      cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
329 191 equemene
330 191 equemene
      // Detect, scan, get & initialize platform and device
331 191 equemene
      cl_uint platformCount;
332 191 equemene
      cl_platform_id* platforms;
333 191 equemene
      cl_uint deviceCount;
334 191 equemene
      cl_device_id* devices;
335 191 equemene
      size_t valueSize;
336 191 equemene
337 191 equemene
      /* Setup OpenCL environment. */
338 191 equemene
339 191 equemene
      // Get all platforms
340 191 equemene
      err = clGetPlatformIDs(0, NULL, &platformCount);
341 191 equemene
      platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
342 191 equemene
      err = clGetPlatformIDs(platformCount, platforms, NULL);
343 191 equemene
344 191 equemene
      // Get Device defined
345 191 equemene
      err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
346 191 equemene
      devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
347 191 equemene
      err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
348 191 equemene
349 191 equemene
      // print device name
350 191 equemene
      err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, 0, NULL, &valueSize);
351 191 equemene
      char* deviceName=(char*)malloc(valueSize);
352 191 equemene
      err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, valueSize, deviceName, NULL);
353 191 equemene
      err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_VENDOR, 0, NULL, &valueSize);
354 191 equemene
      char* vendorName=(char*)malloc(valueSize);
355 191 equemene
      err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_VENDOR, valueSize, vendorName, NULL);
356 191 equemene
      printf("\nDevice (%d,%d):\n\t- vendor: %s\n\t- device: %s\n\n",MyPlatform,MyDevice, vendorName,deviceName);
357 191 equemene
      free(deviceName);
358 191 equemene
      free(vendorName);
359 191 equemene
360 191 equemene
      props[1] = (cl_context_properties)platforms[MyPlatform];
361 191 equemene
362 191 equemene
      cl_context GPUContext = clCreateContext(props, 1, &devices[MyDevice], NULL, NULL, &err);
363 191 equemene
      cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext,devices[MyDevice], 0, &err);
364 191 equemene
365 191 equemene
      cl_mem GPUInside = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY,
366 191 equemene
                                              sizeof(uint64_t) * ParallelRate, NULL, NULL);
367 191 equemene
368 191 equemene
      // 51 is the number of line for OpenCL code
369 191 equemene
      // 66, sans test
370 191 equemene
      cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 130 ,OpenCLSource,NULL,NULL);
371 191 equemene
      clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);
372 191 equemene
      cl_kernel OpenCLMainLoopGlobal = clCreateKernel(OpenCLProgram, "MainLoopGlobal", NULL);
373 191 equemene
      cl_kernel OpenCLMainLoopLocal = clCreateKernel(OpenCLProgram, "MainLoopLocal", NULL);
374 191 equemene
      cl_kernel OpenCLMainLoopHybrid = clCreateKernel(OpenCLProgram, "MainLoopHybrid", NULL);
375 191 equemene
376 191 equemene
      // Divide the total number of iterations by the parallel rate
377 191 equemene
      // Add +1 to the number of per work iterations if division not integer
378 191 equemene
      uint64_t IterationsEach=((Iterations%ParallelRate)==0)?Iterations/ParallelRate:Iterations/ParallelRate+1;
379 191 equemene
      // Initialize seeds for MWC RNG generator from Marsaglia
380 191 equemene
      uint32_t seed_w=110271;
381 191 equemene
      uint32_t seed_z=101008;
382 191 equemene
383 191 equemene
      // Set the values of arguments for OpenCL function call
384 191 equemene
      clSetKernelArg(OpenCLMainLoopGlobal, 0, sizeof(cl_mem),&GPUInside);
385 191 equemene
      clSetKernelArg(OpenCLMainLoopGlobal, 1, sizeof(uint64_t),&IterationsEach);
386 191 equemene
      clSetKernelArg(OpenCLMainLoopGlobal, 2, sizeof(uint32_t),&seed_w);
387 191 equemene
      clSetKernelArg(OpenCLMainLoopGlobal, 3, sizeof(uint32_t),&seed_z);
388 191 equemene
      clSetKernelArg(OpenCLMainLoopGlobal, 4, sizeof(uint32_t),&MyType);
389 191 equemene
390 191 equemene
      size_t WorkSize[1] = {ParallelRate}; // one dimensional Range
391 191 equemene
392 191 equemene
      uint64_t HostInside[ParallelRate];
393 191 equemene
394 191 equemene
      for (uint32_t loop=0;loop<Loops;loop++) {
395 191 equemene
        // Set start timer
396 191 equemene
        gettimeofday(&tv1, &tz);
397 191 equemene
398 191 equemene
            // Execute the OpenCL kernel with datas
399 191 equemene
        clEnqueueNDRangeKernel(cqCommandQueue, OpenCLMainLoopGlobal, 1, NULL,
400 191 equemene
                               WorkSize, NULL, 0, NULL, NULL);
401 191 equemene
        // Copy each result for each PR from Device to Host
402 191 equemene
        clEnqueueReadBuffer(cqCommandQueue, GPUInside, CL_TRUE, 0,
403 191 equemene
                            ParallelRate * sizeof(uint64_t), HostInside, 0, NULL, NULL);
404 191 equemene
        uint64_t inside=0;
405 191 equemene
406 191 equemene
        for (int i= 0; i < ParallelRate; i++) {
407 191 equemene
          inside+=HostInside[i];
408 191 equemene
        }
409 191 equemene
410 191 equemene
        // Set stop timer
411 191 equemene
        gettimeofday(&tv2, &tz);
412 191 equemene
413 191 equemene
        double elapsed=(double)((tv2.tv_sec-tv1.tv_sec) * 1000000L +
414 191 equemene
                                (tv2.tv_usec-tv1.tv_usec))/1000000;
415 191 equemene
416 191 equemene
        double itops=(double)(ParallelRate*IterationsEach)/elapsed;
417 191 equemene
418 191 equemene
        printf("Inside/Total %ld %ld\nParallelRate %i\nElapsed Time %.2f\nItops %.0f\nPi estimation %f\n\n",inside,ParallelRate*IterationsEach,ParallelRate,elapsed,itops,(4.*(float)inside/((float)(ParallelRate)*(float)(IterationsEach))));
419 191 equemene
      }
420 191 equemene
      printf("\n\n");
421 191 equemene
422 191 equemene
      clReleaseKernel(OpenCLMainLoopGlobal);
423 191 equemene
      clReleaseProgram(OpenCLProgram);
424 191 equemene
      clReleaseCommandQueue(cqCommandQueue);
425 191 equemene
      clReleaseContext(GPUContext);
426 191 equemene
      clReleaseMemObject(GPUInside);
427 191 equemene
428 191 equemene
429 191 equemene
      return 0;
430 191 equemene
    }
431 191 equemene
}