Révision 156

Pi/OpenCL/PiOpenCL.c (revision 156)
21 21
#include <stdint.h>
22 22
#include <sys/time.h>
23 23

  
24
/* Host-side codes for the variable type used by the Pi estimation loop.
 * The same four values are declared again inside the OpenCL program text,
 * and the selected code is passed to MainLoopGlobal as kernel argument 4
 * (MyType). */
#define TINT32 0
25
#define TINT64 1
26
#define TFP32 2
27
#define TFP64 3
28

  
24 29
int DetectOpenCLDevices(void) 
25 30
{
26 31
  int i, j;
......
126 131
}
127 132

  
128 133
const char* OpenCLSource[] = {
134
  "#pragma OPENCL EXTENSION cl_khr_fp64: enable \n",
129 135
  "// Marsaglia RNG very simple implementation \n",
130 136
  "#define znew  ((z=36969*(z&65535)+(z>>16))<<16) \n",
131 137
  "#define wnew  ((w=18000*(w&65535)+(w>>16))&65535) \n",
......
137 143
  "#define KISSfp KISS * 2.328306435454494e-10f \n",
138 144
  "#define CONGfp CONG * 2.328306435454494e-10f \n",
139 145
  "#define SHR3fp SHR3 * 2.328306435454494e-10f \n",
140
  "#define THEONE 1.e0f \n",
141
  "ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)",
146
  "#define TINT32 0 \n",
147
  "#define TINT64 1 \n",
148
  "#define TFP32 2 \n",
149
  "#define TFP64 3 \n",
150
  "#define THEONE32I 1073741824 \n",
151
  "#define THEONE32F 1.e0f \n",
152
  "#define THEONE64I 4611686018427387904 \n",
153
  "#define THEONE64F (double)1.e0f \n",
154
  /* OpenCL helper, 32-bit integer variant.
   * Seeds the generator state z and w with seed_z+work and seed_w+work, then
   * for each of `iterations` rounds draws x and y as MWC>>17 and counts the
   * round when x*x+y*y <= THEONE32I (1073741824). Returns that count.
   * The MWC macro itself is defined in a part of the program text not shown
   * here. */
  "ulong MainLoop32I(ulong iterations,uint seed_z,uint seed_w,size_t work)",
142 155
  "{",
143 156
  "   uint z=seed_z+work;",
144 157
  "   uint w=seed_w+work;",
145 158
  "   ulong total=0;",
146 159
  "   for (ulong i=0;i<iterations;i++)",
160
  "   {",
161
  "      uint x= MWC>>17;",
162
  "      uint y= MWC>>17;",
163
  "      ulong inside=((x*x+y*y) <= THEONE32I) ? 1:0;",
164
  "      total+=inside;",
165
  "   }",
166
  "   return(total);",
167
  "}",
168
  /* OpenCL helper, single-precision variant.
   * Seeds z and w with seed_z+work and seed_w+work, then for each of
   * `iterations` rounds draws x and y through MWCfp (cast to float) and
   * counts the round when x*x+y*y <= THEONE32F (1.e0f). Returns that count. */
  "ulong MainLoop32F(ulong iterations,uint seed_z,uint seed_w,size_t work)",
147 169
  "{",
148
  "        float x=(float)MWCfp ;",
149
  "        float y=(float)MWCfp ;",
150
  "      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;",
170
  "   uint z=seed_z+work;",
171
  "   uint w=seed_w+work;",
172
  "   ulong total=0;",
173
  "   for (ulong i=0;i<iterations;i++)",
174
  "   {",
175
  "      float x=(float)MWCfp ;",
176
  "      float y=(float)MWCfp ;",
177
  "      ulong inside=((x*x+y*y) <= THEONE32F) ? 1:0;",
151 178
  "      total+=inside;",
179
  "   }",
180
  "   return(total);",
152 181
  "}",
182
  /* OpenCL helper, 64-bit integer variant.
   * Seeds z and w with seed_z+work and seed_w+work, then for each of
   * `iterations` rounds draws x and y as (ulong)(MWC>>1) and counts the round
   * when x*x+y*y <= THEONE64I (4611686018427387904). Returns that count. */
  "ulong MainLoop64I(ulong iterations,uint seed_z,uint seed_w,size_t work)",
183
  "{",
184
  "   uint z=seed_z+work;",
185
  "   uint w=seed_w+work;",
186
  "   ulong total=0;",
187
  "   for (ulong i=0;i<iterations;i++)",
188
  "   {",
189
  "      ulong x=(ulong)(MWC>>1);",
190
  "      ulong y=(ulong)(MWC>>1);",
191
  "      ulong inside=((x*x+y*y) <= THEONE64I) ? 1:0;",
192
  "      total+=inside;",
193
  "   }",
153 194
  "   return(total);",
154 195
  "}",
155
  "__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)",
196
  /* OpenCL helper, double-precision variant.
   * Seeds z and w with seed_z+work and seed_w+work, then for each of
   * `iterations` rounds draws x and y through MWCfp and counts the round when
   * x*x+y*y <= THEONE64F. Returns that count.
   * x and y are declared double: with a float declaration the (double) cast
   * was discarded on assignment and the test ran in single precision. The
   * program text enables cl_khr_fp64 for this. */
  "ulong MainLoop64F(ulong iterations,uint seed_z,uint seed_w,size_t work)",
156 197
  "{",
157
  "   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));",
198
  "   uint z=seed_z+work;",
199
  "   uint w=seed_w+work;",
200
  "   ulong total=0;",
201
  "   for (ulong i=0;i<iterations;i++)",
202
  "{",
203
  "        double x=(double)MWCfp ;",
204
  "        double y=(double)MWCfp ;",
205
  "      ulong inside=((x*x+y*y) <= THEONE64F) ? 1:0;",
206
  "      total+=inside;",
207
  "}",
208
  "   return(total);",
209
  "}",
210
  /* OpenCL kernel: each work-item runs the loop variant selected by MyType
   * (TFP32, TFP64, TINT32 or TINT64), using get_global_id(0) as the seed
   * offset, and stores its count in s[get_global_id(0)].
   * total starts at 0 so that a MyType matching none of the four codes
   * stores 0 instead of an indeterminate value. */
  "__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
211
  "{",
212
  "   ulong total=0;",
213
  "   if (MyType==TFP32) {",
214
  "      total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_global_id(0));",
215
  "   }",
216
  "   else if (MyType==TFP64) {",
217
  "      total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_global_id(0));",
218
  "   }",
219
  "   else if (MyType==TINT32) {",
220
  "      total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_global_id(0));",
221
  "   }",
222
  "   else if (MyType==TINT64) {",
223
  "      total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_global_id(0));",
224
  "   }",
158 225
  "   barrier(CLK_GLOBAL_MEM_FENCE);",
159 226
  "   s[get_global_id(0)]=(ulong)total;",
160 227
  "}",
161
  "__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)",
228
  /* OpenCL kernel: each work-item runs the loop variant selected by MyType
   * (TFP32, TFP64, TINT32 or TINT64), using get_local_id(0) as the seed
   * offset, and stores its count in s[get_local_id(0)].
   * total starts at 0 so that a MyType matching none of the four codes
   * stores 0 instead of an indeterminate value. */
  "__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
162 229
  "{",
163
  "   ulong total=2;",
230
  "   ulong total=0;",
231
  "   if (MyType==TFP32) {",
232
  "      total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_local_id(0));",
233
  "   }",
234
  "   else if (MyType==TFP64) {",
235
  "      total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_local_id(0));",
236
  "   }",
237
  "   else if (MyType==TINT32) {",
238
  "      total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_local_id(0));",
239
  "   }",
240
  "   else if (MyType==TINT64) {",
241
  "      total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_local_id(0));",
242
  "   }",
164 243
  "   barrier(CLK_LOCAL_MEM_FENCE);",
165
  "   s[get_local_id(0)]=total;",
244
  "   s[get_local_id(0)]=(ulong)total;",
166 245
  "}",
167
  "__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)",
246
  /* OpenCL kernel: each work-item runs the loop variant selected by MyType
   * (TFP32, TFP64, TINT32 or TINT64), using get_global_id(0) as the seed
   * offset, and stores its count in s[get_global_id(0)].
   * total starts at 0 so that a MyType matching none of the four codes
   * stores 0 instead of an indeterminate value.
   * The barrier flags are combined with bitwise '|': the logical '||' used
   * before evaluates to 1 and so did not request both fences. */
  "__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z,uint MyType)",
168 247
  "{",
169
  "   ulong total=3;",
248
  "   ulong total=0;",
249
  "   if (MyType==TFP32) {",
250
  "      total=(ulong)MainLoop32F(iterations,seed_z,seed_w,get_global_id(0));",
251
  "   }",
252
  "   else if (MyType==TFP64) {",
253
  "      total=(ulong)MainLoop64F(iterations,seed_z,seed_w,get_global_id(0));",
254
  "   }",
255
  "   else if (MyType==TINT32) {",
256
  "      total=(ulong)MainLoop32I(iterations,seed_z,seed_w,get_global_id(0));",
257
  "   }",
258
  "   else if (MyType==TINT64) {",
259
  "      total=(ulong)MainLoop64I(iterations,seed_z,seed_w,get_global_id(0));",
260
  "   }",
170 261
  "   barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);",
171 262
  "   s[get_global_id(0)]=total;",
172 263
  "}"
......
179 270
      (strcmp(argv[1],"--help")==0))
180 271
    {
181 272
      printf("\nPerforms a Pi estimation by Dart Dash:\n\n"
182
	     "\t#1 OpenCL Plateform ID\n"
183
	     "\t#2 OpenCL Device ID\n"
184
	     "\t#3 Minimal number of iterations\n"
185
	     "\t#4 Parallel Rate\n"
186
	     "\t#5 Loops\n\n");
273
	     "\t#1 OpenCL Plateform ID (default 0)\n"
274
	     "\t#2 OpenCL Device ID (default 0)\n"
275
	     "\t#3 Minimal number of iterations (default 1000000)\n"
276
	     "\t#4 Parallel Rate (default 1024)\n"
277
	     "\t#5 Loops (default 1)\n"
278
	     "\t#6 Type of variable: INT32, INT64, FP32, FP64 (default FP32)\n\n");
187 279
      DetectOpenCLDevices();
188 280
    }
189 281
  else
......
210 302
	Loops=(uint32_t)atoi(argv[5]);
211 303
      }
212 304
      
305
      uint32_t MyType=TFP32;
306
      if (argc>6) {
307
	if (strcmp(argv[6],"INT32")==0) {
308
	  MyType=(uint32_t)TINT32;
309
	}
310
	else if (strcmp(argv[6],"INT64")==0) {
311
	  MyType=(uint32_t)TINT64;
312
	}
313
	else if (strcmp(argv[6],"FP32")==0) {
314
	  MyType=(uint32_t)TFP32;
315
	}
316
	else if (strcmp(argv[6],"FP64")==0) {
317
	  MyType=(uint32_t)TFP64;
318
	}
319
      }
320

  
321
      printf("MyType %d\n",MyType);
322
      
213 323
      cl_int err;
214 324
      cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
215 325
      
......
252 362
					      sizeof(uint64_t) * ParallelRate, NULL, NULL);
253 363
      
254 364
      // The count passed to clCreateProgramWithSource must equal the number of strings in OpenCLSource
255
      cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 44,OpenCLSource,NULL,NULL);
365
      // 66 without the test (NOTE(review): the call below passes 130 -- confirm which count is meant)
366
      cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 130 ,OpenCLSource,NULL,NULL);
256 367
      clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);
257 368
      cl_kernel OpenCLMainLoopGlobal = clCreateKernel(OpenCLProgram, "MainLoopGlobal", NULL);
258 369
      cl_kernel OpenCLMainLoopLocal = clCreateKernel(OpenCLProgram, "MainLoopLocal", NULL);
......
270 381
      clSetKernelArg(OpenCLMainLoopGlobal, 1, sizeof(uint64_t),&IterationsEach);
271 382
      clSetKernelArg(OpenCLMainLoopGlobal, 2, sizeof(uint32_t),&seed_w);
272 383
      clSetKernelArg(OpenCLMainLoopGlobal, 3, sizeof(uint32_t),&seed_z);
384
      clSetKernelArg(OpenCLMainLoopGlobal, 4, sizeof(uint32_t),&MyType);
273 385
      
274 386
      size_t WorkSize[1] = {ParallelRate}; // one dimensional Range
275 387

  

Formats disponibles : Unified diff