root / Pi / C / OpenCL / PiOpenCL.c @ 308
Historique | Voir | Annoter | Télécharger (15,06 ko)
1 | 191 | equemene | // Pi Dart Dash in OpenCL in C, illustrative example
|
---|---|---|---|
2 | 191 | equemene | //
|
3 | 191 | equemene | // Emmanuel Quemener <emmanuel.quemener@gmail.com>
|
4 | 191 | equemene | //
|
5 | 191 | equemene | // CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
|
6 | 191 | equemene | // Copyleft Cecill v2
|
7 | 191 | equemene | //
|
8 | 191 | equemene | // -h : print the documentation and detect devices as (platform,device)
|
9 | 191 | equemene | //
|
10 | 191 | equemene | // classical use:
|
11 | 191 | equemene | // #1 OpenCL Platform ID: get this information with -h option
|
12 | 191 | equemene | // #2 OpenCL Device ID: get this information with -h option
|
13 | 191 | equemene | // #3 Minimal number of iterations:
|
14 | 191 | equemene | // #4 Parallel Rate: scattering global work in parts executed in parallel
|
15 | 191 | equemene | // #5 Loops (to improve statistics)
|
16 | 191 | equemene | // #6 Type of variables INT32, INT64, FP32, FP64
|
17 | 191 | equemene | //
|
18 | 249 | equemene | // To compile : gcc -o PiOpenCL PiOpenCL.c -lOpenCL -lm
|
19 | 191 | equemene | |
20 | 255 | equemene | #define CL_TARGET_OPENCL_VERSION 220 |
21 | 191 | equemene | #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
|
22 | 191 | equemene | #include <stdio.h> |
23 | 191 | equemene | #include <stdlib.h> |
24 | 191 | equemene | #include <string.h> |
25 | 191 | equemene | #include <CL/cl.h> |
26 | 191 | equemene | #include <stdint.h> |
27 | 248 | equemene | #include <math.h> |
28 | 191 | equemene | #include <sys/time.h> |
29 | 191 | equemene | |
30 | 191 | equemene | #define TINT32 0 |
31 | 191 | equemene | #define TINT64 1 |
32 | 191 | equemene | #define TFP32 2 |
33 | 191 | equemene | #define TFP64 3 |
34 | 191 | equemene | |
35 | 191 | equemene | int DetectOpenCLDevices(void) |
36 | 191 | equemene | { |
37 | 191 | equemene | int i, j;
|
38 | 191 | equemene | char* value;
|
39 | 191 | equemene | size_t valueSize; |
40 | 191 | equemene | cl_uint platformCount; |
41 | 191 | equemene | cl_platform_id* platforms; |
42 | 191 | equemene | cl_uint deviceCount; |
43 | 191 | equemene | cl_device_id* devices; |
44 | 191 | equemene | cl_uint maxComputeUnits; |
45 | 191 | equemene | cl_int maxWorkGroupSize; |
46 | 191 | equemene | cl_int maxWorkItemSizes; |
47 | 191 | equemene | cl_device_type dev_type; |
48 | 191 | equemene | |
49 | 191 | equemene | // get all platforms
|
50 | 191 | equemene | clGetPlatformIDs(0, NULL, &platformCount); |
51 | 191 | equemene | platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
|
52 | 191 | equemene | clGetPlatformIDs(platformCount, platforms, NULL);
|
53 | 191 | equemene | |
54 | 191 | equemene | printf("OpenCL statistics: %d platform(s) detected\n\n",platformCount);
|
55 | 191 | equemene | |
56 | 191 | equemene | for (i = 0; i < platformCount; i++) { |
57 | 191 | equemene | |
58 | 191 | equemene | // get all devices
|
59 | 191 | equemene | clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); |
60 | 191 | equemene | devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
|
61 | 191 | equemene | clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
|
62 | 191 | equemene | |
63 | 191 | equemene | // for each device print critical attributes
|
64 | 191 | equemene | for (j = 0; j < deviceCount; j++) { |
65 | 191 | equemene | |
66 | 191 | equemene | // print device name
|
67 | 191 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize); |
68 | 191 | equemene | value = (char*) malloc(valueSize);
|
69 | 191 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL);
|
70 | 191 | equemene | printf("Device (%d,%d): %s\n",i, j, value);
|
71 | 191 | equemene | free(value); |
72 | 191 | equemene | |
73 | 191 | equemene | // print type device CPU/GPU/ACCELERATOR
|
74 | 191 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL); |
75 | 191 | equemene | printf("\tDevice Type: ");
|
76 | 191 | equemene | if(dev_type & CL_DEVICE_TYPE_GPU)
|
77 | 191 | equemene | printf("CL_DEVICE_TYPE_GPU ");
|
78 | 191 | equemene | if(dev_type & CL_DEVICE_TYPE_CPU)
|
79 | 191 | equemene | printf("CL_DEVICE_TYPE_CPU ");
|
80 | 191 | equemene | if(dev_type & CL_DEVICE_TYPE_ACCELERATOR)
|
81 | 191 | equemene | printf("CL_DEVICE_TYPE_ACCELERATOR ");
|
82 | 191 | equemene | if(dev_type & CL_DEVICE_TYPE_DEFAULT)
|
83 | 191 | equemene | printf("CL_DEVICE_TYPE_DEFAULT ");
|
84 | 191 | equemene | printf("\n");
|
85 | 191 | equemene | |
86 | 191 | equemene | // print device vendor
|
87 | 191 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, 0, NULL, &valueSize); |
88 | 191 | equemene | value = (char*) malloc(valueSize);
|
89 | 191 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, valueSize, value, NULL);
|
90 | 191 | equemene | printf("\tDevice vendor: %s\n", value);
|
91 | 191 | equemene | free(value); |
92 | 191 | equemene | |
93 | 191 | equemene | // print hardware device version
|
94 | 191 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, 0, NULL, &valueSize); |
95 | 191 | equemene | value = (char*) malloc(valueSize);
|
96 | 191 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, valueSize, value, NULL);
|
97 | 191 | equemene | printf("\tHardware version: %s\n", value);
|
98 | 191 | equemene | free(value); |
99 | 191 | equemene | |
100 | 191 | equemene | // print software driver version
|
101 | 191 | equemene | clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, 0, NULL, &valueSize); |
102 | 191 | equemene | value = (char*) malloc(valueSize);
|
103 | 191 | equemene | clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, valueSize, value, NULL);
|
104 | 191 | equemene | printf("\tSoftware version: %s\n", value);
|
105 | 191 | equemene | free(value); |
106 | 191 | equemene | |
107 | 191 | equemene | // print c version supported by compiler for device
|
108 | 191 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &valueSize); |
109 | 191 | equemene | value = (char*) malloc(valueSize);
|
110 | 191 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, valueSize, value, NULL);
|
111 | 191 | equemene | printf("\tOpenCL C version: %s\n", value);
|
112 | 191 | equemene | free(value); |
113 | 191 | equemene | |
114 | 191 | equemene | // print parallel compute units
|
115 | 191 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, |
116 | 191 | equemene | sizeof(maxComputeUnits), &maxComputeUnits, NULL); |
117 | 191 | equemene | printf("\tParallel compute units: %d\n", maxComputeUnits);
|
118 | 191 | equemene | |
119 | 191 | equemene | // print max work group size
|
120 | 191 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, |
121 | 191 | equemene | sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL); |
122 | 191 | equemene | printf("\tMaximum Work Group Size: %d\n", maxWorkGroupSize);
|
123 | 191 | equemene | |
124 | 191 | equemene | // print max work items size
|
125 | 191 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, |
126 | 191 | equemene | sizeof(maxWorkItemSizes), &maxWorkItemSizes, NULL); |
127 | 191 | equemene | printf("\tMaximum Work Item Sizes: %d\n", maxWorkItemSizes);
|
128 | 191 | equemene | |
129 | 191 | equemene | } |
130 | 191 | equemene | printf("\n");
|
131 | 191 | equemene | free(devices); |
132 | 191 | equemene | } |
133 | 191 | equemene | |
134 | 191 | equemene | free(platforms); |
135 | 191 | equemene | return 0; |
136 | 191 | equemene | |
137 | 191 | equemene | } |
138 | 191 | equemene | |
/*
 * OpenCL C program, stored as an array of string fragments that are handed
 * to clCreateProgramWithSource and concatenated by the OpenCL runtime.
 *
 * Layout:
 *   - preprocessor part: each fragment ends with '\n' because a #pragma or
 *     #define must be terminated by a newline;
 *   - MainLoop32I / MainLoop32F / MainLoop64I / MainLoop64F: per-work-item
 *     dart loops, each counting the points that fall inside the quarter disc
 *     using 32/64-bit integer or float/double arithmetic;
 *   - MainLoopGlobal / MainLoopLocal / MainLoopHybrid: kernels dispatching on
 *     MyType and storing the per-work-item count in s[].
 *
 * The number of fragments is kept at 130 because the host code passes that
 * count explicitly when creating the program.
 *
 * `total` is zero-initialised in the kernels so that an unknown MyType stores
 * 0 instead of an indeterminate value. The hybrid kernel combines both fence
 * flags with a bitwise OR; a logical OR would collapse them to the value 1.
 */
const char* OpenCLSource[] = {
  "#pragma OPENCL EXTENSION cl_khr_fp64: enable \n",
  "// Marsaglia RNG very simple implementation \n",
  "#define znew ((z=36969*(z&65535)+(z>>16))<<16) \n",
  "#define wnew ((w=18000*(w&65535)+(w>>16))&65535) \n",
  "#define MWC (znew+wnew) \n",
  "#define SHR3 (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5)) \n",
  "#define CONG (jcong=69069*jcong+1234567) \n",
  "#define KISS ((MWC^CONG)+SHR3) \n",
  "#define MWCfp MWC * 2.328306435454494e-10f \n",
  "#define KISSfp KISS * 2.328306435454494e-10f \n",
  "#define CONGfp CONG * 2.328306435454494e-10f \n",
  "#define SHR3fp SHR3 * 2.328306435454494e-10f \n",
  "#define TINT32 0 \n",
  "#define TINT64 1 \n",
  "#define TFP32 2 \n",
  "#define TFP64 3 \n",
  "#define THEONE32I 1073741824 \n",
  "#define THEONE32F 1.e0f \n",
  "#define THEONE64I 4611686018427387904 \n",
  "#define THEONE64F (double)1.e0f \n",
  "ulong MainLoop32I(ulong iterations,uint seed_z,uint seed_w,size_t work)",
  "{",
  " uint z=seed_z+work;",
  " uint w=seed_w+work;",
  " ulong total=0;",
  " for (ulong i=0;i<iterations;i++)",
  " {",
  " uint x= MWC>>17;",
  " uint y= MWC>>17;",
  " ulong inside=((x*x+y*y) <= THEONE32I) ? 1:0;",
  " total+=inside;",
  " }",
  " return(total);",
  "}",
  "ulong MainLoop32F(ulong iterations,uint seed_z,uint seed_w,size_t work)",
  "{",
  " uint z=seed_z+work;",
  " uint w=seed_w+work;",
  " ulong total=0;",
  " for (ulong i=0;i<iterations;i++)",
  " {",
  " float x=(float)MWCfp ;",
  " float y=(float)MWCfp ;",
  " ulong inside=((x*x+y*y) <= THEONE32F) ? 1:0;",
  " total+=inside;",
  " }",
  " return(total);",
  "}",
  "ulong MainLoop64I(ulong iterations,uint seed_z,uint seed_w,size_t work)",
  "{",
  " uint z=seed_z+work;",
  " uint w=seed_w+work;",
  " ulong total=0;",
  " for (ulong i=0;i<iterations;i++)",
  " {",
  " ulong x=(ulong)(MWC>>1);",
  " ulong y=(ulong)(MWC>>1);",
  " ulong inside=((x*x+y*y) <= THEONE64I) ? 1:0;",
  " total+=inside;",
  " }",
  " return(total);",
  "}",
  "ulong MainLoop64F(ulong iterations,uint seed_z,uint seed_w,size_t work)",
  "{",
  " uint z=seed_z+work;",
  " uint w=seed_w+work;",
  " ulong total=0;",
  " for (ulong i=0;i<iterations;i++)",
  "{",
  " double x=(double)MWCfp ;",
  " double y=(double)MWCfp ;",
  " ulong inside=((x*x+y*y) <= THEONE64F) ? 1:0;",
  " total+=inside;",
  "}",
  " return(total);",
  "}",
  "__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
  "{",
  " ulong total=0;",
  " if (MyType==TFP32) {",
  " total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  " else if (MyType==TFP64) {",
  " total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  " else if (MyType==TINT32) {",
  " total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  " else if (MyType==TINT64) {",
  " total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  " barrier(CLK_GLOBAL_MEM_FENCE);",
  " s[get_global_id(0)]=(ulong)total;",
  "}",
  "__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
  "{",
  " ulong total=0;",
  " if (MyType==TFP32) {",
  " total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_local_id(0));",
  " }",
  " else if (MyType==TFP64) {",
  " total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_local_id(0));",
  " }",
  " else if (MyType==TINT32) {",
  " total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_local_id(0));",
  " }",
  " else if (MyType==TINT64) {",
  " total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_local_id(0));",
  " }",
  " barrier(CLK_LOCAL_MEM_FENCE);",
  " s[get_local_id(0)]=(ulong)total;",
  "}",
  "__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
  "{",
  " ulong total=0;",
  " if (MyType==TFP32) {",
  " total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  " else if (MyType==TFP64) {",
  " total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  " else if (MyType==TINT32) {",
  " total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  " else if (MyType==TINT64) {",
  " total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  " barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);",
  " s[get_global_id(0)]=total;",
  "}"
};
271 | 191 | equemene | |
272 | 191 | equemene | int main(int argc, char **argv) |
273 | 191 | equemene | { |
274 | 191 | equemene | if ((argc==1)|| |
275 | 191 | equemene | (strcmp(argv[1],"-h")==0)|| |
276 | 191 | equemene | (strcmp(argv[1],"--help")==0)) |
277 | 191 | equemene | { |
278 | 191 | equemene | printf("\nPerforms a Pi estimation by Dart Dash:\n\n"
|
279 | 191 | equemene | "\t#1 OpenCL Plateform ID (default 0)\n"
|
280 | 191 | equemene | "\t#2 OpenCL Device ID (default 0)\n"
|
281 | 191 | equemene | "\t#3 Minimal number of iterations (default 1000000)\n"
|
282 | 191 | equemene | "\t#4 Parallel Rate (default 1024)\n"
|
283 | 191 | equemene | "\t#5 Loops (default 1)\n"
|
284 | 191 | equemene | "\t#6 Type of variable: INT32, INT64, FP32, FP64 (default FP32)\n\n");
|
285 | 191 | equemene | DetectOpenCLDevices(); |
286 | 191 | equemene | } |
287 | 191 | equemene | else
|
288 | 191 | equemene | { |
289 | 191 | equemene | |
290 | 191 | equemene | int MyPlatform=atoi(argv[1]); |
291 | 191 | equemene | int MyDevice=atoi(argv[2]); |
292 | 191 | equemene | |
293 | 191 | equemene | struct timeval tv1,tv2;
|
294 | 191 | equemene | |
295 | 191 | equemene | uint64_t Iterations=1000000;
|
296 | 191 | equemene | if (argc>3) { |
297 | 191 | equemene | Iterations=(uint64_t)atoll(argv[3]);
|
298 | 191 | equemene | } |
299 | 191 | equemene | |
300 | 249 | equemene | uint32_t ParallelRate=1024;
|
301 | 191 | equemene | if (argc>4) { |
302 | 191 | equemene | ParallelRate=(uint32_t)atoi(argv[4]);
|
303 | 191 | equemene | } |
304 | 191 | equemene | |
305 | 191 | equemene | uint32_t Loops=1;
|
306 | 191 | equemene | if (argc>5) { |
307 | 191 | equemene | Loops=(uint32_t)atoi(argv[5]);
|
308 | 191 | equemene | } |
309 | 191 | equemene | |
310 | 191 | equemene | uint32_t MyType=TFP32; |
311 | 191 | equemene | if (argc>6) { |
312 | 191 | equemene | if (strcmp(argv[6],"INT32")==0) { |
313 | 191 | equemene | MyType=(uint32_t)TINT32; |
314 | 191 | equemene | } |
315 | 191 | equemene | else if (strcmp(argv[6],"INT64")==0) { |
316 | 191 | equemene | MyType=(uint32_t)TINT64; |
317 | 191 | equemene | } |
318 | 191 | equemene | else if (strcmp(argv[6],"FP32")==0) { |
319 | 191 | equemene | MyType=(uint32_t)TFP32; |
320 | 191 | equemene | } |
321 | 191 | equemene | else if (strcmp(argv[6],"FP64")==0) { |
322 | 191 | equemene | MyType=(uint32_t)TFP64; |
323 | 191 | equemene | } |
324 | 191 | equemene | } |
325 | 191 | equemene | |
326 | 191 | equemene | printf("MyType %d\n",MyType);
|
327 | 191 | equemene | |
328 | 191 | equemene | cl_int err; |
329 | 191 | equemene | cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; |
330 | 191 | equemene | |
331 | 191 | equemene | // Detect, scan, get & initialize platform and device
|
332 | 191 | equemene | cl_uint platformCount; |
333 | 191 | equemene | cl_platform_id* platforms; |
334 | 191 | equemene | cl_uint deviceCount; |
335 | 191 | equemene | cl_device_id* devices; |
336 | 191 | equemene | size_t valueSize; |
337 | 191 | equemene | |
338 | 191 | equemene | /* Setup OpenCL environment. */
|
339 | 191 | equemene | |
340 | 191 | equemene | // Get all platforms
|
341 | 191 | equemene | err = clGetPlatformIDs(0, NULL, &platformCount); |
342 | 191 | equemene | platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
|
343 | 191 | equemene | err = clGetPlatformIDs(platformCount, platforms, NULL);
|
344 | 191 | equemene | |
345 | 191 | equemene | // Get Device defined
|
346 | 191 | equemene | err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); |
347 | 191 | equemene | devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
|
348 | 191 | equemene | err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
|
349 | 191 | equemene | |
350 | 191 | equemene | // print device name
|
351 | 191 | equemene | err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, 0, NULL, &valueSize); |
352 | 191 | equemene | char* deviceName=(char*)malloc(valueSize); |
353 | 191 | equemene | err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, valueSize, deviceName, NULL);
|
354 | 191 | equemene | err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_VENDOR, 0, NULL, &valueSize); |
355 | 191 | equemene | char* vendorName=(char*)malloc(valueSize); |
356 | 191 | equemene | err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_VENDOR, valueSize, vendorName, NULL);
|
357 | 191 | equemene | printf("\nDevice (%d,%d):\n\t- vendor: %s\n\t- device: %s\n\n",MyPlatform,MyDevice, vendorName,deviceName);
|
358 | 191 | equemene | free(deviceName); |
359 | 191 | equemene | free(vendorName); |
360 | 191 | equemene | |
361 | 191 | equemene | props[1] = (cl_context_properties)platforms[MyPlatform];
|
362 | 191 | equemene | |
363 | 191 | equemene | cl_context GPUContext = clCreateContext(props, 1, &devices[MyDevice], NULL, NULL, &err); |
364 | 191 | equemene | cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext,devices[MyDevice], 0, &err);
|
365 | 191 | equemene | |
366 | 191 | equemene | cl_mem GPUInside = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, |
367 | 191 | equemene | sizeof(uint64_t) * ParallelRate, NULL, NULL); |
368 | 191 | equemene | |
369 | 255 | equemene | // 130 is the number of line for OpenCL code
|
370 | 191 | equemene | cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 130 ,OpenCLSource,NULL,NULL); |
371 | 191 | equemene | clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL); |
372 | 191 | equemene | cl_kernel OpenCLMainLoopGlobal = clCreateKernel(OpenCLProgram, "MainLoopGlobal", NULL); |
373 | 191 | equemene | cl_kernel OpenCLMainLoopLocal = clCreateKernel(OpenCLProgram, "MainLoopLocal", NULL); |
374 | 191 | equemene | cl_kernel OpenCLMainLoopHybrid = clCreateKernel(OpenCLProgram, "MainLoopHybrid", NULL); |
375 | 191 | equemene | |
376 | 191 | equemene | // Divide the total number of iterations by the parallel rate
|
377 | 191 | equemene | // Add +1 to the number of per work iterations if division not integer
|
378 | 191 | equemene | uint64_t IterationsEach=((Iterations%ParallelRate)==0)?Iterations/ParallelRate:Iterations/ParallelRate+1; |
379 | 191 | equemene | // Initialize seeds for MWC RNG generator from Marsaglia
|
380 | 191 | equemene | uint32_t seed_w=110271;
|
381 | 191 | equemene | uint32_t seed_z=101008;
|
382 | 191 | equemene | |
383 | 191 | equemene | // Set the values of arguments for OpenCL function call
|
384 | 191 | equemene | clSetKernelArg(OpenCLMainLoopGlobal, 0, sizeof(cl_mem),&GPUInside); |
385 | 191 | equemene | clSetKernelArg(OpenCLMainLoopGlobal, 1, sizeof(uint64_t),&IterationsEach); |
386 | 191 | equemene | clSetKernelArg(OpenCLMainLoopGlobal, 2, sizeof(uint32_t),&seed_w); |
387 | 191 | equemene | clSetKernelArg(OpenCLMainLoopGlobal, 3, sizeof(uint32_t),&seed_z); |
388 | 191 | equemene | clSetKernelArg(OpenCLMainLoopGlobal, 4, sizeof(uint32_t),&MyType); |
389 | 191 | equemene | |
390 | 191 | equemene | size_t WorkSize[1] = {ParallelRate}; // one dimensional Range |
391 | 191 | equemene | |
392 | 191 | equemene | uint64_t HostInside[ParallelRate]; |
393 | 191 | equemene | |
394 | 191 | equemene | for (uint32_t loop=0;loop<Loops;loop++) { |
395 | 191 | equemene | // Set start timer
|
396 | 255 | equemene | gettimeofday(&tv1, NULL);
|
397 | 191 | equemene | |
398 | 191 | equemene | // Execute the OpenCL kernel with datas
|
399 | 191 | equemene | clEnqueueNDRangeKernel(cqCommandQueue, OpenCLMainLoopGlobal, 1, NULL, |
400 | 191 | equemene | WorkSize, NULL, 0, NULL, NULL); |
401 | 191 | equemene | // Copy each result for each PR from Device to Host
|
402 | 191 | equemene | clEnqueueReadBuffer(cqCommandQueue, GPUInside, CL_TRUE, 0,
|
403 | 191 | equemene | ParallelRate * sizeof(uint64_t), HostInside, 0, NULL, NULL); |
404 | 191 | equemene | uint64_t inside=0;
|
405 | 191 | equemene | |
406 | 191 | equemene | for (int i= 0; i < ParallelRate; i++) { |
407 | 191 | equemene | inside+=HostInside[i]; |
408 | 191 | equemene | } |
409 | 191 | equemene | |
410 | 191 | equemene | // Set stop timer
|
411 | 255 | equemene | gettimeofday(&tv2, NULL);
|
412 | 191 | equemene | |
413 | 191 | equemene | double elapsed=(double)((tv2.tv_sec-tv1.tv_sec) * 1000000L + |
414 | 191 | equemene | (tv2.tv_usec-tv1.tv_usec))/1000000;
|
415 | 191 | equemene | |
416 | 191 | equemene | double itops=(double)(ParallelRate*IterationsEach)/elapsed; |
417 | 191 | equemene | |
418 | 248 | equemene | printf("Inside/Total %ld %ld\nParallelRate %i\nElapsed Time %.2f\nItops %.0f\nLogItops %.2f\nPi estimation %f\n\n",inside,ParallelRate*IterationsEach,ParallelRate,elapsed,itops,log10(itops),(4.*(float)inside/((float)(ParallelRate)*(float)(IterationsEach)))); |
419 | 191 | equemene | } |
420 | 191 | equemene | printf("\n\n");
|
421 | 191 | equemene | |
422 | 191 | equemene | clReleaseKernel(OpenCLMainLoopGlobal); |
423 | 191 | equemene | clReleaseProgram(OpenCLProgram); |
424 | 191 | equemene | clReleaseCommandQueue(cqCommandQueue); |
425 | 191 | equemene | clReleaseContext(GPUContext); |
426 | 191 | equemene | clReleaseMemObject(GPUInside); |
427 | 191 | equemene | |
428 | 191 | equemene | |
429 | 191 | equemene | return 0; |
430 | 191 | equemene | } |
431 | 191 | equemene | } |