root / Pi / OpenCL / PiOpenCL.c @ 158
Historique | Voir | Annoter | Télécharger (14,84 ko)
1 | 152 | equemene | // Pi Dart Dash in OpenCL in C, illustrative example
|
---|---|---|---|
2 | 152 | equemene | //
|
3 | 152 | equemene | // Emmanuel Quemener <emmanuel.quemener@ens-lyon.fr>
|
4 | 152 | equemene | // Copyleft Cecill v2
|
5 | 152 | equemene | //
|
6 | 152 | equemene | // -h : print the documentation and detect devices as (platform,device)
|
7 | 152 | equemene | //
|
8 | 152 | equemene | // classical use:
|
9 | 152 | equemene | // #1 OpenCL Platform ID: get this information with -h option
|
10 | 152 | equemene | // #2 OpenCL Device ID: get this information with -h option
|
11 | 152 | equemene | // #3 Minimal number of iterations:
|
12 | 152 | equemene | // #4 Parallel Rate: scattering global work in parts executed in parallel
|
13 | 152 | equemene | // #5 Loops (to improve statistics)
|
14 | 152 | equemene | //
|
15 | 152 | equemene | |
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS

#include <errno.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#include <CL/cl.h>
23 | 152 | equemene | |
// Selectors for the arithmetic type used by the dart-throwing loop.
// The values mirror the TINT32/TINT64/TFP32/TFP64 macros embedded in the
// OpenCL kernel source below and must stay identical to them.
enum {
  TINT32 = 0, // 32-bit integer arithmetic
  TINT64 = 1, // 64-bit integer arithmetic
  TFP32 = 2,  // single-precision floating point
  TFP64 = 3   // double-precision floating point
};
28 | 156 | equemene | |
29 | 152 | equemene | int DetectOpenCLDevices(void) |
30 | 152 | equemene | { |
31 | 152 | equemene | int i, j;
|
32 | 152 | equemene | char* value;
|
33 | 152 | equemene | size_t valueSize; |
34 | 152 | equemene | cl_uint platformCount; |
35 | 152 | equemene | cl_platform_id* platforms; |
36 | 152 | equemene | cl_uint deviceCount; |
37 | 152 | equemene | cl_device_id* devices; |
38 | 152 | equemene | cl_uint maxComputeUnits; |
39 | 152 | equemene | cl_int maxWorkGroupSize; |
40 | 152 | equemene | cl_int maxWorkItemSizes; |
41 | 152 | equemene | cl_device_type dev_type; |
42 | 152 | equemene | |
43 | 152 | equemene | // get all platforms
|
44 | 152 | equemene | clGetPlatformIDs(0, NULL, &platformCount); |
45 | 152 | equemene | platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
|
46 | 152 | equemene | clGetPlatformIDs(platformCount, platforms, NULL);
|
47 | 152 | equemene | |
48 | 152 | equemene | printf("OpenCL statistics: %d platform(s) detected\n\n",platformCount);
|
49 | 152 | equemene | |
50 | 152 | equemene | for (i = 0; i < platformCount; i++) { |
51 | 152 | equemene | |
52 | 152 | equemene | // get all devices
|
53 | 152 | equemene | clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); |
54 | 152 | equemene | devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
|
55 | 152 | equemene | clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
|
56 | 152 | equemene | |
57 | 152 | equemene | // for each device print critical attributes
|
58 | 152 | equemene | for (j = 0; j < deviceCount; j++) { |
59 | 152 | equemene | |
60 | 152 | equemene | // print device name
|
61 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize); |
62 | 152 | equemene | value = (char*) malloc(valueSize);
|
63 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL);
|
64 | 152 | equemene | printf("Device (%d,%d): %s\n",i, j, value);
|
65 | 152 | equemene | free(value); |
66 | 152 | equemene | |
67 | 152 | equemene | // print type device CPU/GPU/ACCELERATOR
|
68 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL); |
69 | 152 | equemene | printf("\tDevice Type: ");
|
70 | 152 | equemene | if(dev_type & CL_DEVICE_TYPE_GPU)
|
71 | 152 | equemene | printf("CL_DEVICE_TYPE_GPU ");
|
72 | 152 | equemene | if(dev_type & CL_DEVICE_TYPE_CPU)
|
73 | 152 | equemene | printf("CL_DEVICE_TYPE_CPU ");
|
74 | 152 | equemene | if(dev_type & CL_DEVICE_TYPE_ACCELERATOR)
|
75 | 152 | equemene | printf("CL_DEVICE_TYPE_ACCELERATOR ");
|
76 | 152 | equemene | if(dev_type & CL_DEVICE_TYPE_DEFAULT)
|
77 | 152 | equemene | printf("CL_DEVICE_TYPE_DEFAULT ");
|
78 | 152 | equemene | printf("\n");
|
79 | 152 | equemene | |
80 | 152 | equemene | // print device vendor
|
81 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, 0, NULL, &valueSize); |
82 | 152 | equemene | value = (char*) malloc(valueSize);
|
83 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, valueSize, value, NULL);
|
84 | 152 | equemene | printf("\tDevice vendor: %s\n", value);
|
85 | 152 | equemene | free(value); |
86 | 152 | equemene | |
87 | 152 | equemene | // print hardware device version
|
88 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, 0, NULL, &valueSize); |
89 | 152 | equemene | value = (char*) malloc(valueSize);
|
90 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, valueSize, value, NULL);
|
91 | 152 | equemene | printf("\tHardware version: %s\n", value);
|
92 | 152 | equemene | free(value); |
93 | 152 | equemene | |
94 | 152 | equemene | // print software driver version
|
95 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, 0, NULL, &valueSize); |
96 | 152 | equemene | value = (char*) malloc(valueSize);
|
97 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, valueSize, value, NULL);
|
98 | 152 | equemene | printf("\tSoftware version: %s\n", value);
|
99 | 152 | equemene | free(value); |
100 | 152 | equemene | |
101 | 152 | equemene | // print c version supported by compiler for device
|
102 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &valueSize); |
103 | 152 | equemene | value = (char*) malloc(valueSize);
|
104 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, valueSize, value, NULL);
|
105 | 152 | equemene | printf("\tOpenCL C version: %s\n", value);
|
106 | 152 | equemene | free(value); |
107 | 152 | equemene | |
108 | 152 | equemene | // print parallel compute units
|
109 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, |
110 | 152 | equemene | sizeof(maxComputeUnits), &maxComputeUnits, NULL); |
111 | 152 | equemene | printf("\tParallel compute units: %d\n", maxComputeUnits);
|
112 | 152 | equemene | |
113 | 152 | equemene | // print max work group size
|
114 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, |
115 | 152 | equemene | sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL); |
116 | 152 | equemene | printf("\tMaximum Work Group Size: %d\n", maxWorkGroupSize);
|
117 | 152 | equemene | |
118 | 152 | equemene | // print max work items size
|
119 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, |
120 | 152 | equemene | sizeof(maxWorkItemSizes), &maxWorkItemSizes, NULL); |
121 | 152 | equemene | printf("\tMaximum Work Item Sizes: %d\n", maxWorkItemSizes);
|
122 | 152 | equemene | |
123 | 152 | equemene | } |
124 | 152 | equemene | printf("\n");
|
125 | 152 | equemene | free(devices); |
126 | 152 | equemene | } |
127 | 152 | equemene | |
128 | 152 | equemene | free(platforms); |
129 | 152 | equemene | return 0; |
130 | 152 | equemene | |
131 | 152 | equemene | } |
132 | 152 | equemene | |
// OpenCL C source of the Pi "dart dash" kernels, one array element per
// source line. The host hands the number of array elements to
// clCreateProgramWithSource, so that count must be kept in sync whenever an
// element is added or removed (this table holds 130 elements).
//
// Layout:
//   - Marsaglia RNG macros (MWC, SHR3, CONG, KISS) and their *fp variants,
//     which scale a 32-bit draw by 2^-32;
//   - MainLoop32I / MainLoop32F / MainLoop64I / MainLoop64F: draw
//     `iterations` (x,y) pairs and count those with x*x+y*y <= 1 in the
//     chosen arithmetic;
//   - MainLoopGlobal / MainLoopLocal / MainLoopHybrid: kernel entry points
//     that dispatch on MyType and store the per-work-item count in s[].
const char* OpenCLSource[] = {
  "#pragma OPENCL EXTENSION cl_khr_fp64: enable \n",
  "// Marsaglia RNG very simple implementation \n",
  "#define znew ((z=36969*(z&65535)+(z>>16))<<16) \n",
  "#define wnew ((w=18000*(w&65535)+(w>>16))&65535) \n",
  "#define MWC (znew+wnew) \n",
  "#define SHR3 (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5)) \n",
  "#define CONG (jcong=69069*jcong+1234567) \n",
  "#define KISS ((MWC^CONG)+SHR3) \n",
  "#define MWCfp MWC * 2.328306435454494e-10f \n",
  "#define KISSfp KISS * 2.328306435454494e-10f \n",
  "#define CONGfp CONG * 2.328306435454494e-10f \n",
  "#define SHR3fp SHR3 * 2.328306435454494e-10f \n",
  "#define TINT32 0 \n",
  "#define TINT64 1 \n",
  "#define TFP32 2 \n",
  "#define TFP64 3 \n",
  "#define THEONE32I 1073741824 \n",
  "#define THEONE32F 1.e0f \n",
  "#define THEONE64I 4611686018427387904 \n",
  "#define THEONE64F (double)1.e0f \n",
  /* 32-bit integer variant: coordinates are 15-bit, radius^2 is 2^30 */
  "ulong MainLoop32I(ulong iterations,uint seed_z,uint seed_w,size_t work)",
  "{",
  " uint z=seed_z+work;",
  " uint w=seed_w+work;",
  " ulong total=0;",
  " for (ulong i=0;i<iterations;i++)",
  " {",
  " uint x= MWC>>17;",
  " uint y= MWC>>17;",
  " ulong inside=((x*x+y*y) <= THEONE32I) ? 1:0;",
  " total+=inside;",
  " }",
  " return(total);",
  "}",
  /* single-precision variant */
  "ulong MainLoop32F(ulong iterations,uint seed_z,uint seed_w,size_t work)",
  "{",
  " uint z=seed_z+work;",
  " uint w=seed_w+work;",
  " ulong total=0;",
  " for (ulong i=0;i<iterations;i++)",
  " {",
  " float x=(float)MWCfp ;",
  " float y=(float)MWCfp ;",
  " ulong inside=((x*x+y*y) <= THEONE32F) ? 1:0;",
  " total+=inside;",
  " }",
  " return(total);",
  "}",
  /* 64-bit integer variant: coordinates are 31-bit, radius^2 is 2^62 */
  "ulong MainLoop64I(ulong iterations,uint seed_z,uint seed_w,size_t work)",
  "{",
  " uint z=seed_z+work;",
  " uint w=seed_w+work;",
  " ulong total=0;",
  " for (ulong i=0;i<iterations;i++)",
  " {",
  " ulong x=(ulong)(MWC>>1);",
  " ulong y=(ulong)(MWC>>1);",
  " ulong inside=((x*x+y*y) <= THEONE64I) ? 1:0;",
  " total+=inside;",
  " }",
  " return(total);",
  "}",
  /* double-precision variant */
  "ulong MainLoop64F(ulong iterations,uint seed_z,uint seed_w,size_t work)",
  "{",
  " uint z=seed_z+work;",
  " uint w=seed_w+work;",
  " ulong total=0;",
  " for (ulong i=0;i<iterations;i++)",
  "{",
  " double x=(double)MWCfp ;",
  " double y=(double)MWCfp ;",
  " ulong inside=((x*x+y*y) <= THEONE64F) ? 1:0;",
  " total+=inside;",
  "}",
  " return(total);",
  "}",
  /* entry point indexed by global id; total starts at 0 so that an
     unrecognised MyType stores 0 instead of an indeterminate value */
  "__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
  "{",
  " ulong total=0;",
  " if (MyType==TFP32) {",
  " total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  " else if (MyType==TFP64) {",
  " total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  " else if (MyType==TINT32) {",
  " total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  " else if (MyType==TINT64) {",
  " total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  " barrier(CLK_GLOBAL_MEM_FENCE);",
  " s[get_global_id(0)]=(ulong)total;",
  "}",
  /* entry point indexed by local id */
  "__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
  "{",
  " ulong total=0;",
  " if (MyType==TFP32) {",
  " total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_local_id(0));",
  " }",
  " else if (MyType==TFP64) {",
  " total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_local_id(0));",
  " }",
  " else if (MyType==TINT32) {",
  " total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_local_id(0));",
  " }",
  " else if (MyType==TINT64) {",
  " total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_local_id(0));",
  " }",
  " barrier(CLK_LOCAL_MEM_FENCE);",
  " s[get_local_id(0)]=(ulong)total;",
  "}",
  /* entry point fencing both memory spaces; the two fence flags are bit
     masks and must be combined with bitwise OR (logical OR collapses them
     to the value 1) */
  "__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
  "{",
  " ulong total=0;",
  " if (MyType==TFP32) {",
  " total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  " else if (MyType==TFP64) {",
  " total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  " else if (MyType==TINT32) {",
  " total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  " else if (MyType==TINT64) {",
  " total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  " barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);",
  " s[get_global_id(0)]=total;",
  "}"
};
265 | 152 | equemene | |
266 | 152 | equemene | int main(int argc, char **argv) |
267 | 152 | equemene | { |
268 | 152 | equemene | if ((argc==1)|| |
269 | 152 | equemene | (strcmp(argv[1],"-h")==0)|| |
270 | 152 | equemene | (strcmp(argv[1],"--help")==0)) |
271 | 152 | equemene | { |
272 | 152 | equemene | printf("\nPerforms a Pi estimation by Dart Dash:\n\n"
|
273 | 156 | equemene | "\t#1 OpenCL Plateform ID (default 0)\n"
|
274 | 156 | equemene | "\t#2 OpenCL Device ID (default 0)\n"
|
275 | 156 | equemene | "\t#3 Minimal number of iterations (default 1000000)\n"
|
276 | 156 | equemene | "\t#4 Parallel Rate (default 1024)\n"
|
277 | 156 | equemene | "\t#5 Loops (default 1)\n"
|
278 | 156 | equemene | "\t#6 Type of variable: INT32, INT64, FP32, FP64 (default FP32)\n\n");
|
279 | 152 | equemene | DetectOpenCLDevices(); |
280 | 152 | equemene | } |
281 | 152 | equemene | else
|
282 | 152 | equemene | { |
283 | 152 | equemene | |
284 | 152 | equemene | int MyPlatform=atoi(argv[1]); |
285 | 152 | equemene | int MyDevice=atoi(argv[2]); |
286 | 152 | equemene | |
287 | 152 | equemene | struct timeval tv1,tv2;
|
288 | 152 | equemene | struct timezone tz;
|
289 | 152 | equemene | |
290 | 152 | equemene | uint64_t Iterations=1000000;
|
291 | 152 | equemene | if (argc>3) { |
292 | 152 | equemene | Iterations=(uint64_t)atoll(argv[3]);
|
293 | 152 | equemene | } |
294 | 152 | equemene | |
295 | 152 | equemene | uint32_t ParallelRate=64;
|
296 | 152 | equemene | if (argc>4) { |
297 | 152 | equemene | ParallelRate=(uint32_t)atoi(argv[4]);
|
298 | 152 | equemene | } |
299 | 152 | equemene | |
300 | 152 | equemene | uint32_t Loops=1;
|
301 | 152 | equemene | if (argc>5) { |
302 | 152 | equemene | Loops=(uint32_t)atoi(argv[5]);
|
303 | 152 | equemene | } |
304 | 152 | equemene | |
305 | 156 | equemene | uint32_t MyType=TFP32; |
306 | 156 | equemene | if (argc>6) { |
307 | 156 | equemene | if (strcmp(argv[6],"INT32")==0) { |
308 | 156 | equemene | MyType=(uint32_t)TINT32; |
309 | 156 | equemene | } |
310 | 156 | equemene | else if (strcmp(argv[6],"INT64")==0) { |
311 | 156 | equemene | MyType=(uint32_t)TINT64; |
312 | 156 | equemene | } |
313 | 156 | equemene | else if (strcmp(argv[6],"FP32")==0) { |
314 | 156 | equemene | MyType=(uint32_t)TFP32; |
315 | 156 | equemene | } |
316 | 156 | equemene | else if (strcmp(argv[6],"FP64")==0) { |
317 | 156 | equemene | MyType=(uint32_t)TFP64; |
318 | 156 | equemene | } |
319 | 156 | equemene | } |
320 | 156 | equemene | |
321 | 156 | equemene | printf("MyType %d\n",MyType);
|
322 | 156 | equemene | |
323 | 152 | equemene | cl_int err; |
324 | 152 | equemene | cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; |
325 | 152 | equemene | |
326 | 152 | equemene | // Detect, scan, get & initialize platform and device
|
327 | 152 | equemene | cl_uint platformCount; |
328 | 152 | equemene | cl_platform_id* platforms; |
329 | 152 | equemene | cl_uint deviceCount; |
330 | 152 | equemene | cl_device_id* devices; |
331 | 152 | equemene | size_t valueSize; |
332 | 152 | equemene | |
333 | 152 | equemene | /* Setup OpenCL environment. */
|
334 | 152 | equemene | |
335 | 152 | equemene | // Get all platforms
|
336 | 152 | equemene | err = clGetPlatformIDs(0, NULL, &platformCount); |
337 | 152 | equemene | platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
|
338 | 152 | equemene | err = clGetPlatformIDs(platformCount, platforms, NULL);
|
339 | 152 | equemene | |
340 | 152 | equemene | // Get Device defined
|
341 | 152 | equemene | err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); |
342 | 152 | equemene | devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
|
343 | 152 | equemene | err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
|
344 | 152 | equemene | |
345 | 152 | equemene | // print device name
|
346 | 152 | equemene | err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, 0, NULL, &valueSize); |
347 | 152 | equemene | char* deviceName=(char*)malloc(valueSize); |
348 | 152 | equemene | err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, valueSize, deviceName, NULL);
|
349 | 152 | equemene | err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_VENDOR, 0, NULL, &valueSize); |
350 | 152 | equemene | char* vendorName=(char*)malloc(valueSize); |
351 | 152 | equemene | err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_VENDOR, valueSize, vendorName, NULL);
|
352 | 152 | equemene | printf("\nDevice (%d,%d):\n\t- vendor: %s\n\t- device: %s\n\n",MyPlatform,MyDevice, vendorName,deviceName);
|
353 | 152 | equemene | free(deviceName); |
354 | 152 | equemene | free(vendorName); |
355 | 152 | equemene | |
356 | 152 | equemene | props[1] = (cl_context_properties)platforms[MyPlatform];
|
357 | 152 | equemene | |
358 | 152 | equemene | cl_context GPUContext = clCreateContext(props, 1, &devices[MyDevice], NULL, NULL, &err); |
359 | 152 | equemene | cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext,devices[MyDevice], 0, &err);
|
360 | 152 | equemene | |
361 | 152 | equemene | cl_mem GPUInside = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, |
362 | 152 | equemene | sizeof(uint64_t) * ParallelRate, NULL, NULL); |
363 | 152 | equemene | |
364 | 152 | equemene | // 51 is the number of line for OpenCL code
|
365 | 156 | equemene | // 66, sans test
|
366 | 156 | equemene | cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 130 ,OpenCLSource,NULL,NULL); |
367 | 152 | equemene | clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL); |
368 | 152 | equemene | cl_kernel OpenCLMainLoopGlobal = clCreateKernel(OpenCLProgram, "MainLoopGlobal", NULL); |
369 | 152 | equemene | cl_kernel OpenCLMainLoopLocal = clCreateKernel(OpenCLProgram, "MainLoopLocal", NULL); |
370 | 152 | equemene | cl_kernel OpenCLMainLoopHybrid = clCreateKernel(OpenCLProgram, "MainLoopHybrid", NULL); |
371 | 152 | equemene | |
372 | 152 | equemene | // Divide the total number of iterations by the parallel rate
|
373 | 152 | equemene | // Add +1 to the number of per work iterations if division not integer
|
374 | 152 | equemene | uint64_t IterationsEach=((Iterations%ParallelRate)==0)?Iterations/ParallelRate:Iterations/ParallelRate+1; |
375 | 152 | equemene | // Initialize seeds for MWC RNG generator from Marsaglia
|
376 | 152 | equemene | uint32_t seed_w=110271;
|
377 | 152 | equemene | uint32_t seed_z=101008;
|
378 | 152 | equemene | |
379 | 152 | equemene | // Set the values of arguments for OpenCL function call
|
380 | 152 | equemene | clSetKernelArg(OpenCLMainLoopGlobal, 0, sizeof(cl_mem),&GPUInside); |
381 | 152 | equemene | clSetKernelArg(OpenCLMainLoopGlobal, 1, sizeof(uint64_t),&IterationsEach); |
382 | 152 | equemene | clSetKernelArg(OpenCLMainLoopGlobal, 2, sizeof(uint32_t),&seed_w); |
383 | 152 | equemene | clSetKernelArg(OpenCLMainLoopGlobal, 3, sizeof(uint32_t),&seed_z); |
384 | 156 | equemene | clSetKernelArg(OpenCLMainLoopGlobal, 4, sizeof(uint32_t),&MyType); |
385 | 152 | equemene | |
386 | 152 | equemene | size_t WorkSize[1] = {ParallelRate}; // one dimensional Range |
387 | 152 | equemene | |
388 | 152 | equemene | uint64_t HostInside[ParallelRate]; |
389 | 152 | equemene | |
390 | 152 | equemene | for (uint32_t loop=0;loop<Loops;loop++) { |
391 | 153 | equemene | // Set start timer
|
392 | 153 | equemene | gettimeofday(&tv1, &tz); |
393 | 153 | equemene | |
394 | 153 | equemene | // Execute the OpenCL kernel with datas
|
395 | 152 | equemene | clEnqueueNDRangeKernel(cqCommandQueue, OpenCLMainLoopGlobal, 1, NULL, |
396 | 152 | equemene | WorkSize, NULL, 0, NULL, NULL); |
397 | 153 | equemene | // Copy each result for each PR from Device to Host
|
398 | 152 | equemene | clEnqueueReadBuffer(cqCommandQueue, GPUInside, CL_TRUE, 0,
|
399 | 152 | equemene | ParallelRate * sizeof(uint64_t), HostInside, 0, NULL, NULL); |
400 | 153 | equemene | uint64_t inside=0;
|
401 | 153 | equemene | |
402 | 153 | equemene | for (int i= 0; i < ParallelRate; i++) { |
403 | 153 | equemene | inside+=HostInside[i]; |
404 | 153 | equemene | } |
405 | 153 | equemene | |
406 | 153 | equemene | // Set stop timer
|
407 | 153 | equemene | gettimeofday(&tv2, &tz); |
408 | 153 | equemene | |
409 | 153 | equemene | double elapsed=(double)((tv2.tv_sec-tv1.tv_sec) * 1000000L + |
410 | 153 | equemene | (tv2.tv_usec-tv1.tv_usec))/1000000;
|
411 | 153 | equemene | |
412 | 153 | equemene | double itops=(double)(ParallelRate*IterationsEach)/elapsed; |
413 | 153 | equemene | |
414 | 153 | equemene | printf("Inside/Total %ld %ld\nParallelRate %i\nElapsed Time %.2f\nItops %.0f\nPi estimation %f\n\n",inside,ParallelRate*IterationsEach,ParallelRate,elapsed,itops,(4.*(float)inside/((float)(ParallelRate)*(float)(IterationsEach)))); |
415 | 152 | equemene | } |
416 | 152 | equemene | printf("\n\n");
|
417 | 152 | equemene | |
418 | 152 | equemene | clReleaseKernel(OpenCLMainLoopGlobal); |
419 | 152 | equemene | clReleaseProgram(OpenCLProgram); |
420 | 152 | equemene | clReleaseCommandQueue(cqCommandQueue); |
421 | 152 | equemene | clReleaseContext(GPUContext); |
422 | 152 | equemene | clReleaseMemObject(GPUInside); |
423 | 152 | equemene | |
424 | 152 | equemene | |
425 | 152 | equemene | return 0; |
426 | 152 | equemene | } |
427 | 152 | equemene | } |