Statistiques
| Révision :

root / Pi / C / OpenCL / PiOpenCL_KISS.c @ 308

Historique | Voir | Annoter | Télécharger (15,31 ko)

1 255 equemene
// Pi Dart Dash in OpenCL in C, illustrative example
2 255 equemene
//
3 255 equemene
// Emmanuel Quemener <emmanuel.quemener@gmail.com>
4 255 equemene
//
5 255 equemene
// CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
6 255 equemene
// Copyleft Cecill v2
7 255 equemene
//
8 255 equemene
// -h : print the documentation and detect devices as (platform,device)
9 255 equemene
//
10 255 equemene
// classical use:
11 255 equemene
// #1 OpenCL Platform ID: get this information with -h option
12 255 equemene
// #2 OpenCL Device ID: get this information with -h option
13 255 equemene
// #3 Minimal number of iterations:
14 255 equemene
// #4 Parallel Rate: number of parts the global work is split into, executed in parallel
15 255 equemene
// #5 Loops (to improve statistics)
16 255 equemene
// #6 Type of variables INT32, INT64, FP32, FP64
17 255 equemene
//
18 255 equemene
// To compile : gcc -o PiOpenCL_KISS PiOpenCL_KISS.c -lOpenCL -lm
19 255 equemene
20 255 equemene
#define CL_TARGET_OPENCL_VERSION 220
21 255 equemene
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
22 255 equemene
#include <stdio.h>
23 255 equemene
#include <stdlib.h>
24 255 equemene
#include <string.h>
25 255 equemene
#include <CL/cl.h>
26 255 equemene
#include <stdint.h>
27 255 equemene
#include <math.h>
28 255 equemene
#include <sys/time.h>
29 255 equemene
30 255 equemene
#define TINT32 0
31 255 equemene
#define TINT64 1
32 255 equemene
#define TFP32 2
33 255 equemene
#define TFP64 3
34 255 equemene
35 255 equemene
int DetectOpenCLDevices(void)
36 255 equemene
{
37 255 equemene
  int i, j;
38 255 equemene
  char* value;
39 255 equemene
  size_t valueSize;
40 255 equemene
  cl_uint platformCount;
41 255 equemene
  cl_platform_id* platforms;
42 255 equemene
  cl_uint deviceCount;
43 255 equemene
  cl_device_id* devices;
44 255 equemene
  cl_uint maxComputeUnits;
45 255 equemene
  cl_int maxWorkGroupSize;
46 255 equemene
  cl_int maxWorkItemSizes;
47 255 equemene
  cl_device_type dev_type;
48 255 equemene
49 255 equemene
  // get all platforms
50 255 equemene
  clGetPlatformIDs(0, NULL, &platformCount);
51 255 equemene
  platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
52 255 equemene
  clGetPlatformIDs(platformCount, platforms, NULL);
53 255 equemene
54 255 equemene
  printf("OpenCL statistics: %d platform(s) detected\n\n",platformCount);
55 255 equemene
56 255 equemene
  for (i = 0; i < platformCount; i++) {
57 255 equemene
58 255 equemene
    // get all devices
59 255 equemene
    clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
60 255 equemene
    devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
61 255 equemene
    clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
62 255 equemene
63 255 equemene
    // for each device print critical attributes
64 255 equemene
    for (j = 0; j < deviceCount; j++) {
65 255 equemene
66 255 equemene
      // print device name
67 255 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize);
68 255 equemene
      value = (char*) malloc(valueSize);
69 255 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL);
70 255 equemene
      printf("Device (%d,%d): %s\n",i, j, value);
71 255 equemene
      free(value);
72 255 equemene
73 255 equemene
      // print type device CPU/GPU/ACCELERATOR
74 255 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL);
75 255 equemene
      printf("\tDevice Type: ");
76 255 equemene
      if(dev_type & CL_DEVICE_TYPE_GPU)
77 255 equemene
        printf("CL_DEVICE_TYPE_GPU ");
78 255 equemene
      if(dev_type & CL_DEVICE_TYPE_CPU)
79 255 equemene
        printf("CL_DEVICE_TYPE_CPU ");
80 255 equemene
      if(dev_type & CL_DEVICE_TYPE_ACCELERATOR)
81 255 equemene
        printf("CL_DEVICE_TYPE_ACCELERATOR ");
82 255 equemene
      if(dev_type & CL_DEVICE_TYPE_DEFAULT)
83 255 equemene
        printf("CL_DEVICE_TYPE_DEFAULT ");
84 255 equemene
      printf("\n");
85 255 equemene
86 255 equemene
      // print device vendor
87 255 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, 0, NULL, &valueSize);
88 255 equemene
      value = (char*) malloc(valueSize);
89 255 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, valueSize, value, NULL);
90 255 equemene
      printf("\tDevice vendor: %s\n", value);
91 255 equemene
      free(value);
92 255 equemene
93 255 equemene
      // print hardware device version
94 255 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, 0, NULL, &valueSize);
95 255 equemene
      value = (char*) malloc(valueSize);
96 255 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, valueSize, value, NULL);
97 255 equemene
      printf("\tHardware version: %s\n", value);
98 255 equemene
      free(value);
99 255 equemene
100 255 equemene
      // print software driver version
101 255 equemene
      clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, 0, NULL, &valueSize);
102 255 equemene
      value = (char*) malloc(valueSize);
103 255 equemene
      clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, valueSize, value, NULL);
104 255 equemene
      printf("\tSoftware version: %s\n", value);
105 255 equemene
      free(value);
106 255 equemene
107 255 equemene
      // print c version supported by compiler for device
108 255 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &valueSize);
109 255 equemene
      value = (char*) malloc(valueSize);
110 255 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, valueSize, value, NULL);
111 255 equemene
      printf("\tOpenCL C version: %s\n", value);
112 255 equemene
      free(value);
113 255 equemene
114 255 equemene
      // print parallel compute units
115 255 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS,
116 255 equemene
                      sizeof(maxComputeUnits), &maxComputeUnits, NULL);
117 255 equemene
      printf("\tParallel compute units: %d\n", maxComputeUnits);
118 255 equemene
119 255 equemene
      // print max work group size
120 255 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE,
121 255 equemene
                      sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL);
122 255 equemene
      printf("\tMaximum Work Group Size: %d\n", maxWorkGroupSize);
123 255 equemene
124 255 equemene
      // print max work items size
125 255 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES,
126 255 equemene
                      sizeof(maxWorkItemSizes), &maxWorkItemSizes, NULL);
127 255 equemene
      printf("\tMaximum Work Item Sizes: %d\n", maxWorkItemSizes);
128 255 equemene
129 255 equemene
    }
130 255 equemene
    printf("\n");
131 255 equemene
    free(devices);
132 255 equemene
  }
133 255 equemene
134 255 equemene
  free(platforms);
135 255 equemene
  return 0;
136 255 equemene
137 255 equemene
}
138 255 equemene
139 255 equemene
/*
 * OpenCL C source of the Pi "dart dash" kernels, one string per source line.
 *
 * The array holds exactly 138 entries; main() passes that count to
 * clCreateProgramWithSource(), so lines must not be added or removed
 * without updating the caller. Only the preprocessor and comment lines end
 * with '\n' (they must stay on their own line); the remaining strings are
 * concatenated as-is by the OpenCL compiler.
 *
 * Layout:
 *   - Marsaglia RNG macros (MWC, SHR3, CONG, KISS) and type selectors,
 *   - MainLoop32I / MainLoop32F / MainLoop64I / MainLoop64F: count how many
 *     random (x,y) points fall inside the unit quarter disc, for each
 *     variable type,
 *   - MainLoopGlobal / MainLoopLocal / MainLoopHybrid: kernels dispatching
 *     on MyType and storing one counter per work item in s[].
 *
 * In the kernels, `total` is initialised to 0 so that an unexpected MyType
 * stores 0 rather than an indeterminate value.
 */
const char* OpenCLSource[] = {
  "#pragma OPENCL EXTENSION cl_khr_fp64: enable \n",
  "// Marsaglia RNG very simple implementation \n",
  "#define znew  ((z=36969*(z&65535)+(z>>16))<<16) \n",
  "#define wnew  ((w=18000*(w&65535)+(w>>16))&65535) \n",
  "#define MWC   (znew+wnew) \n",
  "#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5)) \n",
  "#define CONG  (jcong=69069*jcong+1234567) \n",
  "#define KISS  ((MWC^CONG)+SHR3) \n",
  "#define MWCfp MWC * 2.328306435454494e-10f \n",
  "#define KISSfp KISS * 2.328306435454494e-10f \n",
  "#define CONGfp CONG * 2.328306435454494e-10f \n",
  "#define SHR3fp SHR3 * 2.328306435454494e-10f \n",
  "#define TINT32 0 \n",
  "#define TINT64 1 \n",
  "#define TFP32 2 \n",
  "#define TFP64 3 \n",
  "#define THEONE32I 1073741824 \n",
  "#define THEONE32F 1.e0f \n",
  "#define THEONE64I 4611686018427387904 \n",
  "#define THEONE64F (double)1.e0f \n",
  /* 32-bit integer variant: 15-bit coordinates compared against 2^30. */
  "ulong MainLoop32I(ulong iterations,uint seed_z,uint seed_w,size_t work)",
  "{",
  "   uint z=seed_z+work;",
  "   uint w=seed_w+work;",
  "   uint jcong=seed_z+work;",
  "   uint jsr=seed_w+work;",
  "   ulong total=0;",
  "   for (ulong i=0;i<iterations;i++)",
  "   {",
  "      uint x= MWC>>17;",
  "      uint y= MWC>>17;",
  "      ulong inside=((x*x+y*y) <= THEONE32I) ? 1:0;",
  "      total+=inside;",
  "   }",
  "   return(total);",
  "}",
  /* Single-precision floating point variant. */
  "ulong MainLoop32F(ulong iterations,uint seed_z,uint seed_w,size_t work)",
  "{",
  "   uint z=seed_z+work;",
  "   uint w=seed_w+work;",
  "   uint jcong=seed_z+work;",
  "   uint jsr=seed_w+work;",
  "   ulong total=0;",
  "   for (ulong i=0;i<iterations;i++)",
  "   {",
  "      float x=(float)MWCfp ;",
  "      float y=(float)MWCfp ;",
  "      ulong inside=((x*x+y*y) <= THEONE32F) ? 1:0;",
  "      total+=inside;",
  "   }",
  "   return(total);",
  "}",
  /* 64-bit integer variant: 31-bit coordinates compared against 2^62. */
  "ulong MainLoop64I(ulong iterations,uint seed_z,uint seed_w,size_t work)",
  "{",
  "   uint z=seed_z+work;",
  "   uint w=seed_w+work;",
  "   uint jcong=seed_z+work;",
  "   uint jsr=seed_w+work;",
  "   ulong total=0;",
  "   for (ulong i=0;i<iterations;i++)",
  "   {",
  "      ulong x=(ulong)(MWC>>1);",
  "      ulong y=(ulong)(MWC>>1);",
  "      ulong inside=((x*x+y*y) <= THEONE64I) ? 1:0;",
  "      total+=inside;",
  "   }",
  "   return(total);",
  "}",
  /* Double-precision floating point variant. */
  "ulong MainLoop64F(ulong iterations,uint seed_z,uint seed_w,size_t work)",
  "{",
  "   uint z=seed_z+work;",
  "   uint w=seed_w+work;",
  "   uint jcong=seed_z+work;",
  "   uint jsr=seed_w+work;",
  "   ulong total=0;",
  "   for (ulong i=0;i<iterations;i++)",
  "{",
  "        double x=(double)MWCfp ;",
  "        double y=(double)MWCfp ;",
  "      ulong inside=((x*x+y*y) <= THEONE64F) ? 1:0;",
  "      total+=inside;",
  "}",
  "   return(total);",
  "}",
  /* Kernel indexed by global id. */
  "__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
  "{",
  "   ulong total=0;",
  "   if (MyType==TFP32) {",
  "      total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_global_id(0));",
  "   }",
  "   else if (MyType==TFP64) {",
  "      total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_global_id(0));",
  "   }",
  "   else if (MyType==TINT32) {",
  "      total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_global_id(0));",
  "   }",
  "   else if (MyType==TINT64) {",
  "      total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_global_id(0));",
  "   }",
  "   barrier(CLK_GLOBAL_MEM_FENCE);",
  "   s[get_global_id(0)]=(ulong)total;",
  "}",
  /* Kernel indexed by local id. */
  "__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
  "{",
  "   ulong total=0;",
  "   if (MyType==TFP32) {",
  "      total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_local_id(0));",
  "   }",
  "   else if (MyType==TFP64) {",
  "      total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_local_id(0));",
  "   }",
  "   else if (MyType==TINT32) {",
  "      total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_local_id(0));",
  "   }",
  "   else if (MyType==TINT64) {",
  "      total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_local_id(0));",
  "   }",
  "   barrier(CLK_LOCAL_MEM_FENCE);",
  "   s[get_local_id(0)]=(ulong)total;",
  "}",
  /* Hybrid kernel: the two fence flags are combined with a bitwise OR
     (a logical OR would collapse them to the single value 1). */
  "__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
  "{",
  "   ulong total=0;",
  "   if (MyType==TFP32) {",
  "      total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_global_id(0));",
  "   }",
  "   else if (MyType==TFP64) {",
  "      total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_global_id(0));",
  "   }",
  "   else if (MyType==TINT32) {",
  "      total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_global_id(0));",
  "   }",
  "   else if (MyType==TINT64) {",
  "      total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_global_id(0));",
  "   }",
  "   barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);",
  "   s[get_global_id(0)]=total;",
  "}"
};
279 255 equemene
280 255 equemene
int main(int argc, char **argv)
281 255 equemene
{
282 255 equemene
  if ((argc==1)||
283 255 equemene
      (strcmp(argv[1],"-h")==0)||
284 255 equemene
      (strcmp(argv[1],"--help")==0))
285 255 equemene
    {
286 255 equemene
      printf("\nPerforms a Pi estimation by Dart Dash:\n\n"
287 255 equemene
             "\t#1 OpenCL Plateform ID (default 0)\n"
288 255 equemene
             "\t#2 OpenCL Device ID (default 0)\n"
289 255 equemene
             "\t#3 Minimal number of iterations (default 1000000)\n"
290 255 equemene
             "\t#4 Parallel Rate (default 1024)\n"
291 255 equemene
             "\t#5 Loops (default 1)\n"
292 255 equemene
             "\t#6 Type of variable: INT32, INT64, FP32, FP64 (default FP32)\n\n");
293 255 equemene
      DetectOpenCLDevices();
294 255 equemene
    }
295 255 equemene
  else
296 255 equemene
    {
297 255 equemene
298 255 equemene
      int MyPlatform=atoi(argv[1]);
299 255 equemene
      int MyDevice=atoi(argv[2]);
300 255 equemene
301 255 equemene
      struct timeval tv1,tv2;
302 255 equemene
303 255 equemene
      uint64_t Iterations=1000000;
304 255 equemene
      if (argc>3) {
305 255 equemene
        Iterations=(uint64_t)atoll(argv[3]);
306 255 equemene
      }
307 255 equemene
308 255 equemene
      uint32_t ParallelRate=1024;
309 255 equemene
      if (argc>4) {
310 255 equemene
        ParallelRate=(uint32_t)atoi(argv[4]);
311 255 equemene
      }
312 255 equemene
313 255 equemene
      uint32_t Loops=1;
314 255 equemene
      if (argc>5) {
315 255 equemene
        Loops=(uint32_t)atoi(argv[5]);
316 255 equemene
      }
317 255 equemene
318 255 equemene
      uint32_t MyType=TFP32;
319 255 equemene
      if (argc>6) {
320 255 equemene
        if (strcmp(argv[6],"INT32")==0) {
321 255 equemene
          MyType=(uint32_t)TINT32;
322 255 equemene
        }
323 255 equemene
        else if (strcmp(argv[6],"INT64")==0) {
324 255 equemene
          MyType=(uint32_t)TINT64;
325 255 equemene
        }
326 255 equemene
        else if (strcmp(argv[6],"FP32")==0) {
327 255 equemene
          MyType=(uint32_t)TFP32;
328 255 equemene
        }
329 255 equemene
        else if (strcmp(argv[6],"FP64")==0) {
330 255 equemene
          MyType=(uint32_t)TFP64;
331 255 equemene
        }
332 255 equemene
      }
333 255 equemene
334 255 equemene
      printf("MyType %d\n",MyType);
335 255 equemene
336 255 equemene
      cl_int err;
337 255 equemene
      cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
338 255 equemene
339 255 equemene
      // Detect, scan, get & initialize platform and device
340 255 equemene
      cl_uint platformCount;
341 255 equemene
      cl_platform_id* platforms;
342 255 equemene
      cl_uint deviceCount;
343 255 equemene
      cl_device_id* devices;
344 255 equemene
      size_t valueSize;
345 255 equemene
346 255 equemene
      /* Setup OpenCL environment. */
347 255 equemene
348 255 equemene
      // Get all platforms
349 255 equemene
      err = clGetPlatformIDs(0, NULL, &platformCount);
350 255 equemene
      platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
351 255 equemene
      err = clGetPlatformIDs(platformCount, platforms, NULL);
352 255 equemene
353 255 equemene
      // Get Device defined
354 255 equemene
      err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
355 255 equemene
      devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
356 255 equemene
      err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
357 255 equemene
358 255 equemene
      // print device name
359 255 equemene
      err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, 0, NULL, &valueSize);
360 255 equemene
      char* deviceName=(char*)malloc(valueSize);
361 255 equemene
      err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, valueSize, deviceName, NULL);
362 255 equemene
      err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_VENDOR, 0, NULL, &valueSize);
363 255 equemene
      char* vendorName=(char*)malloc(valueSize);
364 255 equemene
      err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_VENDOR, valueSize, vendorName, NULL);
365 255 equemene
      printf("\nDevice (%d,%d):\n\t- vendor: %s\n\t- device: %s\n\n",MyPlatform,MyDevice, vendorName,deviceName);
366 255 equemene
      free(deviceName);
367 255 equemene
      free(vendorName);
368 255 equemene
369 255 equemene
      props[1] = (cl_context_properties)platforms[MyPlatform];
370 255 equemene
371 255 equemene
      cl_context GPUContext = clCreateContext(props, 1, &devices[MyDevice], NULL, NULL, &err);
372 255 equemene
      cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext,devices[MyDevice], 0, &err);
373 255 equemene
374 255 equemene
      cl_mem GPUInside = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY,
375 255 equemene
                                              sizeof(uint64_t) * ParallelRate, NULL, NULL);
376 255 equemene
377 255 equemene
      // 138 is the number of line for OpenCL code
378 255 equemene
      cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 138 ,OpenCLSource,NULL,NULL);
379 255 equemene
      clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);
380 255 equemene
      cl_kernel OpenCLMainLoopGlobal = clCreateKernel(OpenCLProgram, "MainLoopGlobal", NULL);
381 255 equemene
      cl_kernel OpenCLMainLoopLocal = clCreateKernel(OpenCLProgram, "MainLoopLocal", NULL);
382 255 equemene
      cl_kernel OpenCLMainLoopHybrid = clCreateKernel(OpenCLProgram, "MainLoopHybrid", NULL);
383 255 equemene
384 255 equemene
      // Divide the total number of iterations by the parallel rate
385 255 equemene
      // Add +1 to the number of per work iterations if division not integer
386 255 equemene
      uint64_t IterationsEach=((Iterations%ParallelRate)==0)?Iterations/ParallelRate:Iterations/ParallelRate+1;
387 255 equemene
      // Initialize seeds for MWC RNG generator from Marsaglia
388 255 equemene
      uint32_t seed_w=110271;
389 255 equemene
      uint32_t seed_z=101008;
390 255 equemene
391 255 equemene
      // Set the values of arguments for OpenCL function call
392 255 equemene
      clSetKernelArg(OpenCLMainLoopGlobal, 0, sizeof(cl_mem),&GPUInside);
393 255 equemene
      clSetKernelArg(OpenCLMainLoopGlobal, 1, sizeof(uint64_t),&IterationsEach);
394 255 equemene
      clSetKernelArg(OpenCLMainLoopGlobal, 2, sizeof(uint32_t),&seed_w);
395 255 equemene
      clSetKernelArg(OpenCLMainLoopGlobal, 3, sizeof(uint32_t),&seed_z);
396 255 equemene
      clSetKernelArg(OpenCLMainLoopGlobal, 4, sizeof(uint32_t),&MyType);
397 255 equemene
398 255 equemene
      size_t WorkSize[1] = {ParallelRate}; // one dimensional Range
399 255 equemene
400 255 equemene
      uint64_t HostInside[ParallelRate];
401 255 equemene
402 255 equemene
      for (uint32_t loop=0;loop<Loops;loop++) {
403 255 equemene
        // Set start timer
404 255 equemene
        gettimeofday(&tv1, NULL);
405 255 equemene
406 255 equemene
            // Execute the OpenCL kernel with datas
407 255 equemene
        clEnqueueNDRangeKernel(cqCommandQueue, OpenCLMainLoopGlobal, 1, NULL,
408 255 equemene
                               WorkSize, NULL, 0, NULL, NULL);
409 255 equemene
        // Copy each result for each PR from Device to Host
410 255 equemene
        clEnqueueReadBuffer(cqCommandQueue, GPUInside, CL_TRUE, 0,
411 255 equemene
                            ParallelRate * sizeof(uint64_t), HostInside, 0, NULL, NULL);
412 255 equemene
        uint64_t inside=0;
413 255 equemene
414 255 equemene
        for (int i= 0; i < ParallelRate; i++) {
415 255 equemene
          inside+=HostInside[i];
416 255 equemene
        }
417 255 equemene
418 255 equemene
        // Set stop timer
419 255 equemene
        gettimeofday(&tv2, NULL);
420 255 equemene
421 255 equemene
        double elapsed=(double)((tv2.tv_sec-tv1.tv_sec) * 1000000L +
422 255 equemene
                                (tv2.tv_usec-tv1.tv_usec))/1000000;
423 255 equemene
424 255 equemene
        double itops=(double)(ParallelRate*IterationsEach)/elapsed;
425 255 equemene
426 255 equemene
        printf("Inside/Total %ld %ld\nParallelRate %i\nElapsed Time %.2f\nItops %.0f\nLogItops %.2f\nPi estimation %f\n\n",inside,ParallelRate*IterationsEach,ParallelRate,elapsed,itops,log10(itops),(4.*(float)inside/((float)(ParallelRate)*(float)(IterationsEach))));
427 255 equemene
      }
428 255 equemene
      printf("\n\n");
429 255 equemene
430 255 equemene
      clReleaseKernel(OpenCLMainLoopGlobal);
431 255 equemene
      clReleaseProgram(OpenCLProgram);
432 255 equemene
      clReleaseCommandQueue(cqCommandQueue);
433 255 equemene
      clReleaseContext(GPUContext);
434 255 equemene
      clReleaseMemObject(GPUInside);
435 255 equemene
436 255 equemene
437 255 equemene
      return 0;
438 255 equemene
    }
439 255 equemene
}