Révision 156
Pi/OpenCL/PiOpenCL.c (revision 156) | ||
---|---|---|
21 | 21 |
#include <stdint.h> |
22 | 22 |
#include <sys/time.h> |
23 | 23 |
|
24 |
#define TINT32 0 |
|
25 |
#define TINT64 1 |
|
26 |
#define TFP32 2 |
|
27 |
#define TFP64 3 |
|
28 |
|
|
24 | 29 |
int DetectOpenCLDevices(void) |
25 | 30 |
{ |
26 | 31 |
int i, j; |
... | ... | |
126 | 131 |
} |
127 | 132 |
|
128 | 133 |
const char* OpenCLSource[] = { |
134 |
"#pragma OPENCL EXTENSION cl_khr_fp64: enable \n", |
|
129 | 135 |
"// Marsaglia RNG very simple implementation \n", |
130 | 136 |
"#define znew ((z=36969*(z&65535)+(z>>16))<<16) \n", |
131 | 137 |
"#define wnew ((w=18000*(w&65535)+(w>>16))&65535) \n", |
... | ... | |
137 | 143 |
"#define KISSfp KISS * 2.328306435454494e-10f \n", |
138 | 144 |
"#define CONGfp CONG * 2.328306435454494e-10f \n", |
139 | 145 |
"#define SHR3fp SHR3 * 2.328306435454494e-10f \n", |
140 |
"#define THEONE 1.e0f \n", |
|
141 |
"ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)", |
|
146 |
"#define TINT32 0 \n", |
|
147 |
"#define TINT64 1 \n", |
|
148 |
"#define TFP32 2 \n", |
|
149 |
"#define TFP64 3 \n", |
|
150 |
"#define THEONE32I 1073741824 \n", |
|
151 |
"#define THEONE32F 1.e0f \n", |
|
152 |
"#define THEONE64I 4611686018427387904 \n", |
|
153 |
"#define THEONE64F (double)1.e0f \n", |
|
154 |
"ulong MainLoop32I(ulong iterations,uint seed_z,uint seed_w,size_t work)", |
|
142 | 155 |
"{", |
143 | 156 |
" uint z=seed_z+work;", |
144 | 157 |
" uint w=seed_w+work;", |
145 | 158 |
" ulong total=0;", |
146 | 159 |
" for (ulong i=0;i<iterations;i++)", |
160 |
" {", |
|
161 |
" uint x= MWC>>17;", |
|
162 |
" uint y= MWC>>17;", |
|
163 |
" ulong inside=((x*x+y*y) <= THEONE32I) ? 1:0;", |
|
164 |
" total+=inside;", |
|
165 |
" }", |
|
166 |
" return(total);", |
|
167 |
"}", |
|
168 |
"ulong MainLoop32F(ulong iterations,uint seed_z,uint seed_w,size_t work)", |
|
147 | 169 |
"{", |
148 |
" float x=(float)MWCfp ;", |
|
149 |
" float y=(float)MWCfp ;", |
|
150 |
" ulong inside=((x*x+y*y) <= THEONE) ? 1:0;", |
|
170 |
" uint z=seed_z+work;", |
|
171 |
" uint w=seed_w+work;", |
|
172 |
" ulong total=0;", |
|
173 |
" for (ulong i=0;i<iterations;i++)", |
|
174 |
" {", |
|
175 |
" float x=(float)MWCfp ;", |
|
176 |
" float y=(float)MWCfp ;", |
|
177 |
" ulong inside=((x*x+y*y) <= THEONE32F) ? 1:0;", |
|
151 | 178 |
" total+=inside;", |
179 |
" }", |
|
180 |
" return(total);", |
|
152 | 181 |
"}", |
182 |
"ulong MainLoop64I(ulong iterations,uint seed_z,uint seed_w,size_t work)", |
|
183 |
"{", |
|
184 |
" uint z=seed_z+work;", |
|
185 |
" uint w=seed_w+work;", |
|
186 |
" ulong total=0;", |
|
187 |
" for (ulong i=0;i<iterations;i++)", |
|
188 |
" {", |
|
189 |
" ulong x=(ulong)(MWC>>1);", |
|
190 |
" ulong y=(ulong)(MWC>>1);", |
|
191 |
" ulong inside=((x*x+y*y) <= THEONE64I) ? 1:0;", |
|
192 |
" total+=inside;", |
|
193 |
" }", |
|
153 | 194 |
" return(total);", |
154 | 195 |
"}", |
155 |
"__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)",
|
|
196 |
"ulong MainLoop64F(ulong iterations,uint seed_z,uint seed_w,size_t work)",
|
|
156 | 197 |
"{", |
157 |
" ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));", |
|
198 |
" uint z=seed_z+work;", |
|
199 |
" uint w=seed_w+work;", |
|
200 |
" ulong total=0;", |
|
201 |
" for (ulong i=0;i<iterations;i++)", |
|
202 |
"{", |
|
203 |
" float x=(double)MWCfp ;", |
|
204 |
" float y=(double)MWCfp ;", |
|
205 |
" ulong inside=((x*x+y*y) <= THEONE64F) ? 1:0;", |
|
206 |
" total+=inside;", |
|
207 |
"}", |
|
208 |
" return(total);", |
|
209 |
"}", |
|
210 |
"__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)", |
|
211 |
"{", |
|
212 |
" ulong total;", |
|
213 |
" if (MyType==TFP32) {", |
|
214 |
" total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_global_id(0));", |
|
215 |
" }", |
|
216 |
" else if (MyType==TFP64) {", |
|
217 |
" total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_global_id(0));", |
|
218 |
" }", |
|
219 |
" else if (MyType==TINT32) {", |
|
220 |
" total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_global_id(0));", |
|
221 |
" }", |
|
222 |
" else if (MyType==TINT64) {", |
|
223 |
" total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_global_id(0));", |
|
224 |
" }", |
|
158 | 225 |
" barrier(CLK_GLOBAL_MEM_FENCE);", |
159 | 226 |
" s[get_global_id(0)]=(ulong)total;", |
160 | 227 |
"}", |
161 |
"__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)", |
|
228 |
"__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
|
|
162 | 229 |
"{", |
163 |
" ulong total=2;", |
|
230 |
" ulong total;", |
|
231 |
" if (MyType==TFP32) {", |
|
232 |
" total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_local_id(0));", |
|
233 |
" }", |
|
234 |
" else if (MyType==TFP64) {", |
|
235 |
" total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_local_id(0));", |
|
236 |
" }", |
|
237 |
" else if (MyType==TINT32) {", |
|
238 |
" total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_local_id(0));", |
|
239 |
" }", |
|
240 |
" else if (MyType==TINT64) {", |
|
241 |
" total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_local_id(0));", |
|
242 |
" }", |
|
164 | 243 |
" barrier(CLK_LOCAL_MEM_FENCE);", |
165 |
" s[get_local_id(0)]=total;", |
|
244 |
" s[get_local_id(0)]=(ulong)total;",
|
|
166 | 245 |
"}", |
167 |
"__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)", |
|
246 |
"__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
|
|
168 | 247 |
"{", |
169 |
" ulong total=3;", |
|
248 |
" ulong total;", |
|
249 |
" if (MyType==TFP32) {", |
|
250 |
" total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_global_id(0));", |
|
251 |
" }", |
|
252 |
" else if (MyType==TFP64) {", |
|
253 |
" total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_global_id(0));", |
|
254 |
" }", |
|
255 |
" else if (MyType==TINT32) {", |
|
256 |
" total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_global_id(0));", |
|
257 |
" }", |
|
258 |
" else if (MyType==TINT64) {", |
|
259 |
" total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_global_id(0));", |
|
260 |
" }", |
|
170 | 261 |
" barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE);", |
171 | 262 |
" s[get_global_id(0)]=total;", |
172 | 263 |
"}" |
... | ... | |
179 | 270 |
(strcmp(argv[1],"--help")==0)) |
180 | 271 |
{ |
181 | 272 |
printf("\nPerforms a Pi estimation by Dart Dash:\n\n" |
182 |
"\t#1 OpenCL Plateform ID\n" |
|
183 |
"\t#2 OpenCL Device ID\n" |
|
184 |
"\t#3 Minimal number of iterations\n" |
|
185 |
"\t#4 Parallel Rate\n" |
|
186 |
"\t#5 Loops\n\n"); |
|
273 |
"\t#1 OpenCL Plateform ID (default 0)\n" |
|
274 |
"\t#2 OpenCL Device ID (default 0)\n" |
|
275 |
"\t#3 Minimal number of iterations (default 1000000)\n" |
|
276 |
"\t#4 Parallel Rate (default 1024)\n" |
|
277 |
"\t#5 Loops (default 1)\n" |
|
278 |
"\t#6 Type of variable: INT32, INT64, FP32, FP64 (default FP32)\n\n"); |
|
187 | 279 |
DetectOpenCLDevices(); |
188 | 280 |
} |
189 | 281 |
else |
... | ... | |
210 | 302 |
Loops=(uint32_t)atoi(argv[5]); |
211 | 303 |
} |
212 | 304 |
|
305 |
uint32_t MyType=TFP32; |
|
306 |
if (argc>6) { |
|
307 |
if (strcmp(argv[6],"INT32")==0) { |
|
308 |
MyType=(uint32_t)TINT32; |
|
309 |
} |
|
310 |
else if (strcmp(argv[6],"INT64")==0) { |
|
311 |
MyType=(uint32_t)TINT64; |
|
312 |
} |
|
313 |
else if (strcmp(argv[6],"FP32")==0) { |
|
314 |
MyType=(uint32_t)TFP32; |
|
315 |
} |
|
316 |
else if (strcmp(argv[6],"FP64")==0) { |
|
317 |
MyType=(uint32_t)TFP64; |
|
318 |
} |
|
319 |
} |
|
320 |
|
|
321 |
printf("MyType %d\n",MyType); |
|
322 |
|
|
213 | 323 |
cl_int err; |
214 | 324 |
cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; |
215 | 325 |
|
... | ... | |
252 | 362 |
sizeof(uint64_t) * ParallelRate, NULL, NULL); |
253 | 363 |
|
254 | 364 |
// 51 is the number of lines of OpenCL code |
255 |
cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 44,OpenCLSource,NULL,NULL); |
|
365 |
// 66, without the test |
|
366 |
cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 130 ,OpenCLSource,NULL,NULL); |
|
256 | 367 |
clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL); |
257 | 368 |
cl_kernel OpenCLMainLoopGlobal = clCreateKernel(OpenCLProgram, "MainLoopGlobal", NULL); |
258 | 369 |
cl_kernel OpenCLMainLoopLocal = clCreateKernel(OpenCLProgram, "MainLoopLocal", NULL); |
... | ... | |
270 | 381 |
clSetKernelArg(OpenCLMainLoopGlobal, 1, sizeof(uint64_t),&IterationsEach); |
271 | 382 |
clSetKernelArg(OpenCLMainLoopGlobal, 2, sizeof(uint32_t),&seed_w); |
272 | 383 |
clSetKernelArg(OpenCLMainLoopGlobal, 3, sizeof(uint32_t),&seed_z); |
384 |
clSetKernelArg(OpenCLMainLoopGlobal, 4, sizeof(uint32_t),&MyType); |
|
273 | 385 |
|
274 | 386 |
size_t WorkSize[1] = {ParallelRate}; // one dimensional Range |
275 | 387 |
|
Formats disponibles : Unified diff