root / Pi / OpenCL / PiOpenCL.c @ 185
Historique | Voir | Annoter | Télécharger (15,01 ko)
1 | 152 | equemene | // Pi Dart Dash in OpenCL in C, illustrative example
|
---|---|---|---|
2 | 152 | equemene | //
|
3 | 174 | equemene | // Emmanuel Quemener <emmanuel.quemener@gmail.com>
|
4 | 174 | equemene | //
|
5 | 174 | equemene | // CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
|
6 | 152 | equemene | // Copyleft Cecill v2
|
7 | 152 | equemene | //
|
8 | 152 | equemene | // -h : print the documentation and detect devices as (platform,device)
|
9 | 152 | equemene | //
|
10 | 152 | equemene | // classical use:
|
11 | 152 | equemene | // #1 OpenCL Platform ID: get this information with -h option
|
12 | 152 | equemene | // #2 OpenCL Device ID: get this information with -h option
|
13 | 152 | equemene | // #3 Minimal number of iterations:
|
14 | 152 | equemene | // #4 Parallel Rate: number of parts the global work is split into, executed in parallel
|
15 | 152 | equemene | // #5 Loops (to improve statistics)
|
16 | 174 | equemene | // #6 Type of variables INT32, INT64, FP32, FP64
|
17 | 152 | equemene | //
|
18 | 174 | equemene | // To compile : gcc -o PiOpenCL PiOpenCL.c -lOpenCL
|
19 | 152 | equemene | |
20 | 152 | equemene | #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
|
21 | 152 | equemene | #include <stdio.h> |
22 | 152 | equemene | #include <stdlib.h> |
23 | 152 | equemene | #include <string.h> |
24 | 152 | equemene | #include <CL/cl.h> |
25 | 152 | equemene | #include <stdint.h> |
26 | 152 | equemene | #include <sys/time.h> |
27 | 152 | equemene | |
28 | 156 | equemene | #define TINT32 0 |
29 | 156 | equemene | #define TINT64 1 |
30 | 156 | equemene | #define TFP32 2 |
31 | 156 | equemene | #define TFP64 3 |
32 | 156 | equemene | |
33 | 152 | equemene | int DetectOpenCLDevices(void) |
34 | 152 | equemene | { |
35 | 152 | equemene | int i, j;
|
36 | 152 | equemene | char* value;
|
37 | 152 | equemene | size_t valueSize; |
38 | 152 | equemene | cl_uint platformCount; |
39 | 152 | equemene | cl_platform_id* platforms; |
40 | 152 | equemene | cl_uint deviceCount; |
41 | 152 | equemene | cl_device_id* devices; |
42 | 152 | equemene | cl_uint maxComputeUnits; |
43 | 152 | equemene | cl_int maxWorkGroupSize; |
44 | 152 | equemene | cl_int maxWorkItemSizes; |
45 | 152 | equemene | cl_device_type dev_type; |
46 | 152 | equemene | |
47 | 152 | equemene | // get all platforms
|
48 | 152 | equemene | clGetPlatformIDs(0, NULL, &platformCount); |
49 | 152 | equemene | platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
|
50 | 152 | equemene | clGetPlatformIDs(platformCount, platforms, NULL);
|
51 | 152 | equemene | |
52 | 152 | equemene | printf("OpenCL statistics: %d platform(s) detected\n\n",platformCount);
|
53 | 152 | equemene | |
54 | 152 | equemene | for (i = 0; i < platformCount; i++) { |
55 | 152 | equemene | |
56 | 152 | equemene | // get all devices
|
57 | 152 | equemene | clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); |
58 | 152 | equemene | devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
|
59 | 152 | equemene | clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
|
60 | 152 | equemene | |
61 | 152 | equemene | // for each device print critical attributes
|
62 | 152 | equemene | for (j = 0; j < deviceCount; j++) { |
63 | 152 | equemene | |
64 | 152 | equemene | // print device name
|
65 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize); |
66 | 152 | equemene | value = (char*) malloc(valueSize);
|
67 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL);
|
68 | 152 | equemene | printf("Device (%d,%d): %s\n",i, j, value);
|
69 | 152 | equemene | free(value); |
70 | 152 | equemene | |
71 | 152 | equemene | // print type device CPU/GPU/ACCELERATOR
|
72 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL); |
73 | 152 | equemene | printf("\tDevice Type: ");
|
74 | 152 | equemene | if(dev_type & CL_DEVICE_TYPE_GPU)
|
75 | 152 | equemene | printf("CL_DEVICE_TYPE_GPU ");
|
76 | 152 | equemene | if(dev_type & CL_DEVICE_TYPE_CPU)
|
77 | 152 | equemene | printf("CL_DEVICE_TYPE_CPU ");
|
78 | 152 | equemene | if(dev_type & CL_DEVICE_TYPE_ACCELERATOR)
|
79 | 152 | equemene | printf("CL_DEVICE_TYPE_ACCELERATOR ");
|
80 | 152 | equemene | if(dev_type & CL_DEVICE_TYPE_DEFAULT)
|
81 | 152 | equemene | printf("CL_DEVICE_TYPE_DEFAULT ");
|
82 | 152 | equemene | printf("\n");
|
83 | 152 | equemene | |
84 | 152 | equemene | // print device vendor
|
85 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, 0, NULL, &valueSize); |
86 | 152 | equemene | value = (char*) malloc(valueSize);
|
87 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, valueSize, value, NULL);
|
88 | 152 | equemene | printf("\tDevice vendor: %s\n", value);
|
89 | 152 | equemene | free(value); |
90 | 152 | equemene | |
91 | 152 | equemene | // print hardware device version
|
92 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, 0, NULL, &valueSize); |
93 | 152 | equemene | value = (char*) malloc(valueSize);
|
94 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, valueSize, value, NULL);
|
95 | 152 | equemene | printf("\tHardware version: %s\n", value);
|
96 | 152 | equemene | free(value); |
97 | 152 | equemene | |
98 | 152 | equemene | // print software driver version
|
99 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, 0, NULL, &valueSize); |
100 | 152 | equemene | value = (char*) malloc(valueSize);
|
101 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, valueSize, value, NULL);
|
102 | 152 | equemene | printf("\tSoftware version: %s\n", value);
|
103 | 152 | equemene | free(value); |
104 | 152 | equemene | |
105 | 152 | equemene | // print c version supported by compiler for device
|
106 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &valueSize); |
107 | 152 | equemene | value = (char*) malloc(valueSize);
|
108 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, valueSize, value, NULL);
|
109 | 152 | equemene | printf("\tOpenCL C version: %s\n", value);
|
110 | 152 | equemene | free(value); |
111 | 152 | equemene | |
112 | 152 | equemene | // print parallel compute units
|
113 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, |
114 | 152 | equemene | sizeof(maxComputeUnits), &maxComputeUnits, NULL); |
115 | 152 | equemene | printf("\tParallel compute units: %d\n", maxComputeUnits);
|
116 | 152 | equemene | |
117 | 152 | equemene | // print max work group size
|
118 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, |
119 | 152 | equemene | sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL); |
120 | 152 | equemene | printf("\tMaximum Work Group Size: %d\n", maxWorkGroupSize);
|
121 | 152 | equemene | |
122 | 152 | equemene | // print max work items size
|
123 | 152 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, |
124 | 152 | equemene | sizeof(maxWorkItemSizes), &maxWorkItemSizes, NULL); |
125 | 152 | equemene | printf("\tMaximum Work Item Sizes: %d\n", maxWorkItemSizes);
|
126 | 152 | equemene | |
127 | 152 | equemene | } |
128 | 152 | equemene | printf("\n");
|
129 | 152 | equemene | free(devices); |
130 | 152 | equemene | } |
131 | 152 | equemene | |
132 | 152 | equemene | free(platforms); |
133 | 152 | equemene | return 0; |
134 | 152 | equemene | |
135 | 152 | equemene | } |
136 | 152 | equemene | |
/*
 * OpenCL C source of the Pi "dart dash" kernels, one string per source line.
 *
 * The array holds exactly 130 strings: the host passes that count to
 * clCreateProgramWithSource(), so keep the number of entries unchanged (or
 * update the host side accordingly).
 *
 * Layout:
 *   - Marsaglia RNG macros (MWC, SHR3, CONG, KISS and their float variants)
 *   - type selectors (TINT32..TFP64) mirroring the host-side #defines
 *   - MainLoop32I / 32F / 64I / 64F: draw `iterations` (x,y) points with MWC
 *     and count those falling inside the unit quarter circle
 *   - MainLoopGlobal / MainLoopLocal / MainLoopHybrid: kernels dispatching on
 *     MyType and storing the per-work-item count into s[]
 */
const char* OpenCLSource[] = {
  "#pragma OPENCL EXTENSION cl_khr_fp64: enable \n",
  "// Marsaglia RNG very simple implementation \n",
  "#define znew ((z=36969*(z&65535)+(z>>16))<<16) \n",
  "#define wnew ((w=18000*(w&65535)+(w>>16))&65535) \n",
  "#define MWC (znew+wnew) \n",
  "#define SHR3 (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5)) \n",
  "#define CONG (jcong=69069*jcong+1234567) \n",
  "#define KISS ((MWC^CONG)+SHR3) \n",
  "#define MWCfp MWC * 2.328306435454494e-10f \n",
  "#define KISSfp KISS * 2.328306435454494e-10f \n",
  "#define CONGfp CONG * 2.328306435454494e-10f \n",
  "#define SHR3fp SHR3 * 2.328306435454494e-10f \n",
  "#define TINT32 0 \n",
  "#define TINT64 1 \n",
  "#define TFP32 2 \n",
  "#define TFP64 3 \n",
  "#define THEONE32I 1073741824 \n",
  "#define THEONE32F 1.e0f \n",
  "#define THEONE64I 4611686018427387904 \n",
  "#define THEONE64F (double)1.e0f \n",
  "ulong MainLoop32I(ulong iterations,uint seed_z,uint seed_w,size_t work)",
  "{",
  " uint z=seed_z+work;",
  " uint w=seed_w+work;",
  " ulong total=0;",
  " for (ulong i=0;i<iterations;i++)",
  " {",
  " uint x= MWC>>17;",
  " uint y= MWC>>17;",
  " ulong inside=((x*x+y*y) <= THEONE32I) ? 1:0;",
  " total+=inside;",
  " }",
  " return(total);",
  "}",
  "ulong MainLoop32F(ulong iterations,uint seed_z,uint seed_w,size_t work)",
  "{",
  " uint z=seed_z+work;",
  " uint w=seed_w+work;",
  " ulong total=0;",
  " for (ulong i=0;i<iterations;i++)",
  " {",
  " float x=(float)MWCfp ;",
  " float y=(float)MWCfp ;",
  " ulong inside=((x*x+y*y) <= THEONE32F) ? 1:0;",
  " total+=inside;",
  " }",
  " return(total);",
  "}",
  "ulong MainLoop64I(ulong iterations,uint seed_z,uint seed_w,size_t work)",
  "{",
  " uint z=seed_z+work;",
  " uint w=seed_w+work;",
  " ulong total=0;",
  " for (ulong i=0;i<iterations;i++)",
  " {",
  " ulong x=(ulong)(MWC>>1);",
  " ulong y=(ulong)(MWC>>1);",
  " ulong inside=((x*x+y*y) <= THEONE64I) ? 1:0;",
  " total+=inside;",
  " }",
  " return(total);",
  "}",
  "ulong MainLoop64F(ulong iterations,uint seed_z,uint seed_w,size_t work)",
  "{",
  " uint z=seed_z+work;",
  " uint w=seed_w+work;",
  " ulong total=0;",
  " for (ulong i=0;i<iterations;i++)",
  "{",
  " double x=(double)MWCfp ;",
  " double y=(double)MWCfp ;",
  " ulong inside=((x*x+y*y) <= THEONE64F) ? 1:0;",
  " total+=inside;",
  "}",
  " return(total);",
  "}",
  "__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
  "{",
  " ulong total=0;",
  " if (MyType==TFP32) {",
  " total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  " else if (MyType==TFP64) {",
  " total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  " else if (MyType==TINT32) {",
  " total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  " else if (MyType==TINT64) {",
  " total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  " barrier(CLK_GLOBAL_MEM_FENCE);",
  " s[get_global_id(0)]=(ulong)total;",
  "}",
  "__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
  "{",
  " ulong total=0;",
  " if (MyType==TFP32) {",
  " total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_local_id(0));",
  " }",
  " else if (MyType==TFP64) {",
  " total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_local_id(0));",
  " }",
  " else if (MyType==TINT32) {",
  " total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_local_id(0));",
  " }",
  " else if (MyType==TINT64) {",
  " total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_local_id(0));",
  " }",
  " barrier(CLK_LOCAL_MEM_FENCE);",
  " s[get_local_id(0)]=(ulong)total;",
  "}",
  "__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
  "{",
  " ulong total=0;",
  " if (MyType==TFP32) {",
  " total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  " else if (MyType==TFP64) {",
  " total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  " else if (MyType==TINT32) {",
  " total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  " else if (MyType==TINT64) {",
  " total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_global_id(0));",
  " }",
  // Fence flags are a bit field: combine them with bitwise OR, not logical OR
  " barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);",
  " s[get_global_id(0)]=total;",
  "}"
};
269 | 152 | equemene | |
270 | 152 | equemene | int main(int argc, char **argv) |
271 | 152 | equemene | { |
272 | 152 | equemene | if ((argc==1)|| |
273 | 152 | equemene | (strcmp(argv[1],"-h")==0)|| |
274 | 152 | equemene | (strcmp(argv[1],"--help")==0)) |
275 | 152 | equemene | { |
276 | 152 | equemene | printf("\nPerforms a Pi estimation by Dart Dash:\n\n"
|
277 | 156 | equemene | "\t#1 OpenCL Plateform ID (default 0)\n"
|
278 | 156 | equemene | "\t#2 OpenCL Device ID (default 0)\n"
|
279 | 156 | equemene | "\t#3 Minimal number of iterations (default 1000000)\n"
|
280 | 156 | equemene | "\t#4 Parallel Rate (default 1024)\n"
|
281 | 156 | equemene | "\t#5 Loops (default 1)\n"
|
282 | 156 | equemene | "\t#6 Type of variable: INT32, INT64, FP32, FP64 (default FP32)\n\n");
|
283 | 152 | equemene | DetectOpenCLDevices(); |
284 | 152 | equemene | } |
285 | 152 | equemene | else
|
286 | 152 | equemene | { |
287 | 152 | equemene | |
288 | 152 | equemene | int MyPlatform=atoi(argv[1]); |
289 | 152 | equemene | int MyDevice=atoi(argv[2]); |
290 | 152 | equemene | |
291 | 152 | equemene | struct timeval tv1,tv2;
|
292 | 152 | equemene | struct timezone tz;
|
293 | 152 | equemene | |
294 | 152 | equemene | uint64_t Iterations=1000000;
|
295 | 152 | equemene | if (argc>3) { |
296 | 152 | equemene | Iterations=(uint64_t)atoll(argv[3]);
|
297 | 152 | equemene | } |
298 | 152 | equemene | |
299 | 152 | equemene | uint32_t ParallelRate=64;
|
300 | 152 | equemene | if (argc>4) { |
301 | 152 | equemene | ParallelRate=(uint32_t)atoi(argv[4]);
|
302 | 152 | equemene | } |
303 | 152 | equemene | |
304 | 152 | equemene | uint32_t Loops=1;
|
305 | 152 | equemene | if (argc>5) { |
306 | 152 | equemene | Loops=(uint32_t)atoi(argv[5]);
|
307 | 152 | equemene | } |
308 | 152 | equemene | |
309 | 156 | equemene | uint32_t MyType=TFP32; |
310 | 156 | equemene | if (argc>6) { |
311 | 156 | equemene | if (strcmp(argv[6],"INT32")==0) { |
312 | 156 | equemene | MyType=(uint32_t)TINT32; |
313 | 156 | equemene | } |
314 | 156 | equemene | else if (strcmp(argv[6],"INT64")==0) { |
315 | 156 | equemene | MyType=(uint32_t)TINT64; |
316 | 156 | equemene | } |
317 | 156 | equemene | else if (strcmp(argv[6],"FP32")==0) { |
318 | 156 | equemene | MyType=(uint32_t)TFP32; |
319 | 156 | equemene | } |
320 | 156 | equemene | else if (strcmp(argv[6],"FP64")==0) { |
321 | 156 | equemene | MyType=(uint32_t)TFP64; |
322 | 156 | equemene | } |
323 | 156 | equemene | } |
324 | 156 | equemene | |
325 | 156 | equemene | printf("MyType %d\n",MyType);
|
326 | 156 | equemene | |
327 | 152 | equemene | cl_int err; |
328 | 152 | equemene | cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; |
329 | 152 | equemene | |
330 | 152 | equemene | // Detect, scan, get & initialize platform and device
|
331 | 152 | equemene | cl_uint platformCount; |
332 | 152 | equemene | cl_platform_id* platforms; |
333 | 152 | equemene | cl_uint deviceCount; |
334 | 152 | equemene | cl_device_id* devices; |
335 | 152 | equemene | size_t valueSize; |
336 | 152 | equemene | |
337 | 152 | equemene | /* Setup OpenCL environment. */
|
338 | 152 | equemene | |
339 | 152 | equemene | // Get all platforms
|
340 | 152 | equemene | err = clGetPlatformIDs(0, NULL, &platformCount); |
341 | 152 | equemene | platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
|
342 | 152 | equemene | err = clGetPlatformIDs(platformCount, platforms, NULL);
|
343 | 152 | equemene | |
344 | 152 | equemene | // Get Device defined
|
345 | 152 | equemene | err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); |
346 | 152 | equemene | devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
|
347 | 152 | equemene | err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
|
348 | 152 | equemene | |
349 | 152 | equemene | // print device name
|
350 | 152 | equemene | err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, 0, NULL, &valueSize); |
351 | 152 | equemene | char* deviceName=(char*)malloc(valueSize); |
352 | 152 | equemene | err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, valueSize, deviceName, NULL);
|
353 | 152 | equemene | err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_VENDOR, 0, NULL, &valueSize); |
354 | 152 | equemene | char* vendorName=(char*)malloc(valueSize); |
355 | 152 | equemene | err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_VENDOR, valueSize, vendorName, NULL);
|
356 | 152 | equemene | printf("\nDevice (%d,%d):\n\t- vendor: %s\n\t- device: %s\n\n",MyPlatform,MyDevice, vendorName,deviceName);
|
357 | 152 | equemene | free(deviceName); |
358 | 152 | equemene | free(vendorName); |
359 | 152 | equemene | |
360 | 152 | equemene | props[1] = (cl_context_properties)platforms[MyPlatform];
|
361 | 152 | equemene | |
362 | 152 | equemene | cl_context GPUContext = clCreateContext(props, 1, &devices[MyDevice], NULL, NULL, &err); |
363 | 152 | equemene | cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext,devices[MyDevice], 0, &err);
|
364 | 152 | equemene | |
365 | 152 | equemene | cl_mem GPUInside = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, |
366 | 152 | equemene | sizeof(uint64_t) * ParallelRate, NULL, NULL); |
367 | 152 | equemene | |
368 | 152 | equemene | // 51 is the number of line for OpenCL code
|
369 | 156 | equemene | // 66, sans test
|
370 | 156 | equemene | cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 130 ,OpenCLSource,NULL,NULL); |
371 | 152 | equemene | clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL); |
372 | 152 | equemene | cl_kernel OpenCLMainLoopGlobal = clCreateKernel(OpenCLProgram, "MainLoopGlobal", NULL); |
373 | 152 | equemene | cl_kernel OpenCLMainLoopLocal = clCreateKernel(OpenCLProgram, "MainLoopLocal", NULL); |
374 | 152 | equemene | cl_kernel OpenCLMainLoopHybrid = clCreateKernel(OpenCLProgram, "MainLoopHybrid", NULL); |
375 | 152 | equemene | |
376 | 152 | equemene | // Divide the total number of iterations by the parallel rate
|
377 | 152 | equemene | // Add +1 to the number of per work iterations if division not integer
|
378 | 152 | equemene | uint64_t IterationsEach=((Iterations%ParallelRate)==0)?Iterations/ParallelRate:Iterations/ParallelRate+1; |
379 | 152 | equemene | // Initialize seeds for MWC RNG generator from Marsaglia
|
380 | 152 | equemene | uint32_t seed_w=110271;
|
381 | 152 | equemene | uint32_t seed_z=101008;
|
382 | 152 | equemene | |
383 | 152 | equemene | // Set the values of arguments for OpenCL function call
|
384 | 152 | equemene | clSetKernelArg(OpenCLMainLoopGlobal, 0, sizeof(cl_mem),&GPUInside); |
385 | 152 | equemene | clSetKernelArg(OpenCLMainLoopGlobal, 1, sizeof(uint64_t),&IterationsEach); |
386 | 152 | equemene | clSetKernelArg(OpenCLMainLoopGlobal, 2, sizeof(uint32_t),&seed_w); |
387 | 152 | equemene | clSetKernelArg(OpenCLMainLoopGlobal, 3, sizeof(uint32_t),&seed_z); |
388 | 156 | equemene | clSetKernelArg(OpenCLMainLoopGlobal, 4, sizeof(uint32_t),&MyType); |
389 | 152 | equemene | |
390 | 152 | equemene | size_t WorkSize[1] = {ParallelRate}; // one dimensional Range |
391 | 152 | equemene | |
392 | 152 | equemene | uint64_t HostInside[ParallelRate]; |
393 | 152 | equemene | |
394 | 152 | equemene | for (uint32_t loop=0;loop<Loops;loop++) { |
395 | 153 | equemene | // Set start timer
|
396 | 153 | equemene | gettimeofday(&tv1, &tz); |
397 | 153 | equemene | |
398 | 153 | equemene | // Execute the OpenCL kernel with datas
|
399 | 152 | equemene | clEnqueueNDRangeKernel(cqCommandQueue, OpenCLMainLoopGlobal, 1, NULL, |
400 | 152 | equemene | WorkSize, NULL, 0, NULL, NULL); |
401 | 153 | equemene | // Copy each result for each PR from Device to Host
|
402 | 152 | equemene | clEnqueueReadBuffer(cqCommandQueue, GPUInside, CL_TRUE, 0,
|
403 | 152 | equemene | ParallelRate * sizeof(uint64_t), HostInside, 0, NULL, NULL); |
404 | 153 | equemene | uint64_t inside=0;
|
405 | 153 | equemene | |
406 | 153 | equemene | for (int i= 0; i < ParallelRate; i++) { |
407 | 153 | equemene | inside+=HostInside[i]; |
408 | 153 | equemene | } |
409 | 153 | equemene | |
410 | 153 | equemene | // Set stop timer
|
411 | 153 | equemene | gettimeofday(&tv2, &tz); |
412 | 153 | equemene | |
413 | 153 | equemene | double elapsed=(double)((tv2.tv_sec-tv1.tv_sec) * 1000000L + |
414 | 153 | equemene | (tv2.tv_usec-tv1.tv_usec))/1000000;
|
415 | 153 | equemene | |
416 | 153 | equemene | double itops=(double)(ParallelRate*IterationsEach)/elapsed; |
417 | 153 | equemene | |
418 | 153 | equemene | printf("Inside/Total %ld %ld\nParallelRate %i\nElapsed Time %.2f\nItops %.0f\nPi estimation %f\n\n",inside,ParallelRate*IterationsEach,ParallelRate,elapsed,itops,(4.*(float)inside/((float)(ParallelRate)*(float)(IterationsEach)))); |
419 | 152 | equemene | } |
420 | 152 | equemene | printf("\n\n");
|
421 | 152 | equemene | |
422 | 152 | equemene | clReleaseKernel(OpenCLMainLoopGlobal); |
423 | 152 | equemene | clReleaseProgram(OpenCLProgram); |
424 | 152 | equemene | clReleaseCommandQueue(cqCommandQueue); |
425 | 152 | equemene | clReleaseContext(GPUContext); |
426 | 152 | equemene | clReleaseMemObject(GPUInside); |
427 | 152 | equemene | |
428 | 152 | equemene | |
429 | 152 | equemene | return 0; |
430 | 152 | equemene | } |
431 | 152 | equemene | } |