/NBody/NBodyGL.py - Bench4GPU - Forge du Centre Blaise Pascal

root / NBody / NBodyGL.py @ 171

Historique | Voir | Annoter | Télécharger (24,98 ko)

       #!/usr/bin/env python3
       # -*- coding: utf-8 -*-
       """
       Demonstrateur OpenCL d'interaction NCorps
       Emmanuel QUEMENER <emmanuel.quemener@ens-lyon.fr> CeCILLv2
       """
       import getopt
       import sys
       import time
       import numpy as np
       import pyopencl as cl
       import pyopencl.array as cl_array
       from numpy.random import randint as nprnd
       import string, sys
       from OpenGL.GL import *
       from OpenGL.GLUT import *
       def DictionariesAPI():
           Marsaglia={'CONG':0,'SHR3':1,'MWC':2,'KISS':3}
           Computing={'FP32':0,'FP64':1}
           return(Marsaglia,Computing)
       BlobOpenCL= """
       #define TFP32 0
       #define TFP64 1
       #if TYPE == TFP32
       #define MYFLOAT4 float4
       #define MYFLOAT8 float8
       #define MYFLOAT float
       #define DISTANCE fast_distance
       #else
       #define MYFLOAT4 double4
       #define MYFLOAT8 double8
       #define MYFLOAT double
       #define DISTANCE distance
       #if defined(cl_khr_fp64)  // Khronos extension available?
       #pragma OPENCL EXTENSION cl_khr_fp64 : enable
       #endif
       #endif
       #define znew  ((zmwc=36969*(zmwc&65535)+(zmwc>>16))<<16)
       #define wnew  ((wmwc=18000*(wmwc&65535)+(wmwc>>16))&65535)
       #define MWC   (znew+wnew)
       #define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
       #define CONG  (jcong=69069*jcong+1234567)
       #define KISS  ((MWC^CONG)+SHR3)
       #define MWCfp (MYFLOAT)(MWC * 2.3283064365386963e-10f)
       #define KISSfp (MYFLOAT)(KISS * 2.3283064365386963e-10f)
       #define SHR3fp (MYFLOAT)(SHR3 * 2.3283064365386963e-10f)
       #define CONGfp (MYFLOAT)(CONG * 2.3283064365386963e-10f)
       #define PI (MYFLOAT)3.141592653589793238462643197169399375105820974944592307816406286e0f
       #define SMALL_NUM (MYFLOAT)1.e-9f
       #define LENGTH 1.e0f
       // Create my own Distance implementation: distance buggy on Oland AMD chipset
       MYFLOAT MyDistance(MYFLOAT4 n,MYFLOAT4 m)
+      {
           private MYFLOAT x2,y2,z2;
           x2=n.s0-m.s0;
           x2*=x2;
           y2=n.s1-m.s1;
           y2*=y2;
           z2=n.s2-m.s2;
           z2*=z2;
           return(sqrt(x2+y2+z2));
+      }
       // MYFLOAT4 Interaction(MYFLOAT4 m,MYFLOAT4 n)
       // {
       //     private MYFLOAT r=MyDistance((MYFLOAT4)n,(MYFLOAT4)m);
       //
       //     return(((MYFLOAT4)n-(MYFLOAT4)m)/(MYFLOAT)(r*r*r));
       // }
       MYFLOAT4 Interaction(MYFLOAT4 m,MYFLOAT4 n)
+      {
           private MYFLOAT r=MyDistance((MYFLOAT4)n,(MYFLOAT4)m);
           private MYFLOAT r2=r*r;
           private MYFLOAT c=1.e0f/(MYFLOAT)get_global_size(0);
           return(((MYFLOAT4)n-(MYFLOAT4)m)*(MYFLOAT)(1.e0f-exp(-c*r2))/(MYFLOAT)(r*r2));
+      }
       MYFLOAT PairPotential(MYFLOAT4 m,MYFLOAT4 n)
+      {
           return((MYFLOAT)(-1.e0f)/(DISTANCE(n,m)));
       //    return((MYFLOAT)(-1.e0f)/(MyDistance(n,m)));
+      }
       MYFLOAT AtomicPotential(__global MYFLOAT4* clDataX,int gid)
+      {
           private MYFLOAT potential=(MYFLOAT)0.e0f;
           private MYFLOAT4 x=clDataX[gid];
           for (int i=0;i<get_global_size(0);i++)
+          {
               if (gid != i)
               potential+=PairPotential(x,clDataX[i]);
+          }
           barrier(CLK_GLOBAL_MEM_FENCE);
           return(potential);
+      }
       MYFLOAT AtomicPotentialCoM(__global MYFLOAT4* clDataX,__global MYFLOAT4* clCoM,int gid)
+      {
           return(PairPotential(clDataX[gid],clCoM[0]));
+      }
       // Elements from : http://doswa.com/2009/01/02/fourth-order-runge-kutta-numerical-integration.html
       MYFLOAT8 AtomicRungeKutta(__global MYFLOAT4* clDataInX,__global MYFLOAT4* clDataInV,int gid,MYFLOAT dt)
+      {
           private MYFLOAT4 a0,v0,x0,a1,v1,x1,a2,v2,x2,a3,v3,x3,a4,v4,x4,xf,vf;
           MYFLOAT4 DT=dt*(MYFLOAT4)(1.e0f,1.e0f,1.e0f,1.e0f);
           a0=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f);
           v0=(MYFLOAT4)clDataInV[gid];
           x0=(MYFLOAT4)clDataInX[gid];
           int N = get_global_size(0);
           for (private int i=0;i<N;i++)
+          {
               if (gid != i)
               a0+=Interaction(x0,clDataInX[i]);
+          }
           a1=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f);
           v1=a0*dt+v0;
           x1=v0*dt+x0;
           for (private int j=0;j<N;j++)
+          {
               if (gid != j)
               a1+=Interaction(x1,clDataInX[j]);
+          }
           a2=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f);
           v2=a1*(MYFLOAT)(dt/2.e0f)+v0;
           x2=v1*(MYFLOAT)(dt/2.e0f)+x0;
           for (private int k=0;k<N;k++)
+          {
               if (gid != k)
               a2+=Interaction(x2,clDataInX[k]);
+          }
           a3=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f);
           v3=a2*(MYFLOAT)(dt/2.e0f)+v0;
           x3=v2*(MYFLOAT)(dt/2.e0f)+x0;
           for (private int l=0;l<N;l++)
+          {
               if (gid != l)
               a3+=Interaction(x3,clDataInX[l]);
+          }
           a4=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f);
           v4=a3*dt+v0;
           x4=v3*dt+x0;
           for (private int m=0;m<N;m++)
+          {
               if (gid != m)
               a4+=Interaction(x4,clDataInX[m]);
+          }
           xf=x0+dt*(v1+(MYFLOAT)2.e0f*(v2+v3)+v4)/(MYFLOAT)6.e0f;
           vf=v0+dt*(a1+(MYFLOAT)2.e0f*(a2+a3)+a4)/(MYFLOAT)6.e0f;
           return((MYFLOAT8)(xf.s0,xf.s1,xf.s2,0.e0f,vf.s0,vf.s1,vf.s2,0.e0f));
+      }
       MYFLOAT8 AtomicHeun(__global MYFLOAT4* clDataInX,__global MYFLOAT4* clDataInV,int gid,MYFLOAT dt)
+      {
           private MYFLOAT4 x0,v0,a0,x1,v1,a1,xf,vf;
           MYFLOAT4 Dt=dt*(MYFLOAT4)(1.e0f,1.e0f,1.e0f,1.e0f);
           x0=(MYFLOAT4)clDataInX[gid];
           v0=(MYFLOAT4)clDataInV[gid];
           a0=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f);
           for (private int i=0;i<get_global_size(0);i++)
+          {
               if (gid != i)
               a0+=Interaction(x0,clDataInX[i]);
+          }
           a1=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f);
           //v1=v0+dt*a0;
           //x1=x0+dt*v0;
           v1=dt*a0+v0;
           x1=dt*v0+x0;
           for (private int j=0;j<get_global_size(0);j++)
+          {
               if (gid != j)
               a1+=Interaction(x1,clDataInX[j]);
+          }
           vf=v0+dt*(a0+a1)/(MYFLOAT)2.e0f;
           xf=x0+dt*(v0+v1)/(MYFLOAT)2.e0f;
           return((MYFLOAT8)(xf.s0,xf.s1,xf.s2,0.e0f,vf.s0,vf.s1,vf.s2,0.e0f));
+      }
       MYFLOAT8 AtomicImplicitEuler(__global MYFLOAT4* clDataInX,__global MYFLOAT4* clDataInV,int gid,MYFLOAT dt)
+      {
           MYFLOAT4 x0,v0,a,xf,vf;
           x0=(MYFLOAT4)clDataInX[gid];
           v0=(MYFLOAT4)clDataInV[gid];
           a=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f);
           for (private int i=0;i<get_global_size(0);i++)
+          {
               if (gid != i)
                 a+=Interaction(x0,clDataInX[i]);
+          }
           vf=v0+dt*a;
           xf=x0+dt*vf;
           return((MYFLOAT8)(xf.s0,xf.s1,xf.s2,0.e0f,vf.s0,vf.s1,vf.s2,0.e0f));
+      }
       MYFLOAT8 AtomicExplicitEuler(__global MYFLOAT4* clDataInX,__global MYFLOAT4* clDataInV,int gid,MYFLOAT dt)
+      {
           MYFLOAT4 x0,v0,a,xf,vf;
           x0=(MYFLOAT4)clDataInX[gid];
           v0=(MYFLOAT4)clDataInV[gid];
           a=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f);
           for (private int i=0;i<get_global_size(0);i++)
+          {
               if (gid != i)
               a+=Interaction(x0,clDataInX[i]);
+          }
           vf=v0+dt*a;
           xf=x0+dt*v0;
           return((MYFLOAT8)(xf.s0,xf.s1,xf.s2,0.e0f,vf.s0,vf.s1,vf.s2,0.e0f));
+      }
       __kernel void InBallSplutterPoints(__global MYFLOAT4* clDataX, MYFLOAT radius,
                                    uint seed_z,uint seed_w)
+      {
           private int gid=get_global_id(0);
           private uint zmwc=seed_z+gid;
           private uint wmwc=seed_w+(gid+1)%2;
           private MYFLOAT Heat,Radius,Theta,Phi,PosX,PosY,PosZ,SinTheta;
           for (int i=0;i<gid;i++)
+          {
               Heat=MWCfp;
+          }
       // More accurate distribution based on spherical coordonates
       // Disactivated because of AMD Oland GPU crash on launch
       //     Radius=MWCfp*radius;
       //     Theta=(MYFLOAT)acos((float)(-2.e0f*MWCfp+1.0e0f));
       //     Phi=(MYFLOAT)(2.e0f*PI*MWCfp);
       //     SinTheta=sin((float)Theta);
       //     PosX=cos((float)Phi)*Radius*SinTheta;
       //     PosY=sin((float)Phi)*Radius*SinTheta;
       //     PosZ=cos((float)Theta)*Radius;
       //     clDataX[gid]=(MYFLOAT4)(PosX,PosY,PosZ,0.e0f);
           MYFLOAT4 Position=(MYFLOAT4)((MWCfp-0.5e0f)*radius,(MWCfp-0.5e0f)*radius,(MWCfp-0.5e0f)*radius,0.e0f);
           MYFLOAT Length=(MYFLOAT)length((MYFLOAT4)Position);
           clDataX[gid]=Position/Length*fmod(radius,Length);
           barrier(CLK_GLOBAL_MEM_FENCE);
+      }
       __kernel void InBoxSplutterPoints(__global MYFLOAT4* clDataX, MYFLOAT box,
                                    uint seed_z,uint seed_w)
+      {
           int gid=get_global_id(0);
           uint zmwc=seed_z+gid;
           uint wmwc=seed_w-gid;
           private MYFLOAT Heat;
           for (int i=0;i<gid;i++)
+          {
               Heat=MWCfp;
+          }
           clDataX[gid]=(MYFLOAT4)((MWCfp-0.5e0f)*box,(MWCfp-0.5e0f)*box,(MWCfp-0.5e0f)*box,0.e0f);
           barrier(CLK_GLOBAL_MEM_FENCE);
+      }
       __kernel void SplutterStress(__global MYFLOAT4* clDataX,__global MYFLOAT4* clDataV,__global MYFLOAT4* clCoM, MYFLOAT velocity,uint seed_z,uint seed_w)
+      {
           int gid = get_global_id(0);
           MYFLOAT N = (MYFLOAT)get_global_size(0);
           uint zmwc=seed_z+(uint)gid;
           uint wmwc=seed_w-(uint)gid;
           MYFLOAT4 SpeedVector;
           if (velocity<SMALL_NUM) {
              SpeedVector=(MYFLOAT4)normalize(cross(clDataX[gid],clCoM[0]))*sqrt((-AtomicPotential(clDataX,gid)/(MYFLOAT)2.e0f));
+          }
           else
+          {
              // cast to float for sin,cos are NEEDED by Mesa FP64 implementation!
              // Implemention on AMD Oland are probably broken in float
              MYFLOAT theta=acos((float)(1.0e0f-2.e0f*MWCfp));
              MYFLOAT phi=MWCfp*PI*(MYFLOAT)2.e0f;
              MYFLOAT sinTheta=sin((float)theta);
              MYFLOAT sinPhi=sin((float)phi);
              SpeedVector=(MYFLOAT4)((MWCfp-0.5e0f)*velocity,(MWCfp-0.5e0f)*velocity,
                                     (MWCfp-0.5e0f)*velocity,0.e0f);
+          }
           clDataV[gid]=SpeedVector;
           barrier(CLK_GLOBAL_MEM_FENCE);
+      }
       __kernel void RungeKutta(__global MYFLOAT4* clDataX,__global MYFLOAT4* clDataV,MYFLOAT h)
+      {
           private int gid = get_global_id(0);
           private MYFLOAT8 clDataGid;
           clDataGid=AtomicRungeKutta(clDataX,clDataV,gid,h);
           barrier(CLK_GLOBAL_MEM_FENCE);
           clDataX[gid]=clDataGid.s0123;
           clDataV[gid]=clDataGid.s4567;
+      }
       __kernel void Heun(__global MYFLOAT4* clDataX,__global MYFLOAT4* clDataV,MYFLOAT h)
+      {
           private int gid = get_global_id(0);
           private MYFLOAT8 clDataGid;
           clDataGid=AtomicHeun(clDataX,clDataV,gid,h);
           barrier(CLK_GLOBAL_MEM_FENCE);
           clDataX[gid]=clDataGid.s0123;
           clDataV[gid]=clDataGid.s4567;
+      }
       __kernel void ImplicitEuler(__global MYFLOAT4* clDataX,__global MYFLOAT4* clDataV,MYFLOAT h)
+      {
           private int gid = get_global_id(0);
           private MYFLOAT8 clDataGid;
           clDataGid=AtomicImplicitEuler(clDataX,clDataV,gid,h);
           barrier(CLK_GLOBAL_MEM_FENCE);
           clDataX[gid]=clDataGid.s0123;
           clDataV[gid]=clDataGid.s4567;
+      }
       __kernel void ExplicitEuler(__global MYFLOAT4* clDataX,__global MYFLOAT4* clDataV,MYFLOAT h)
+      {
           private int gid = get_global_id(0);
           private MYFLOAT8 clDataGid;
           clDataGid=AtomicExplicitEuler(clDataX,clDataV,gid,h);
           barrier(CLK_GLOBAL_MEM_FENCE);
           clDataX[gid]=clDataGid.s0123;
           clDataV[gid]=clDataGid.s4567;
+      }
       __kernel void CoMPotential(__global MYFLOAT4* clDataX,__global MYFLOAT4* clCoM,__global MYFLOAT* clPotential)
+      {
           int gid = get_global_id(0);
           clPotential[gid]=PairPotential(clDataX[gid],clCoM[0]);
+      }
       __kernel void Potential(__global MYFLOAT4* clDataX,__global MYFLOAT* clPotential)
+      {
           int gid = get_global_id(0);
           MYFLOAT potential=(MYFLOAT)0.e0f;
           MYFLOAT4 x=clDataX[gid];
           for (int i=0;i<get_global_size(0);i++)
+          {
               if (gid != i)
               potential+=PairPotential(x,clDataX[i]);
+          }
           barrier(CLK_GLOBAL_MEM_FENCE);
           clPotential[gid]=potential*(MYFLOAT)5.e-1f;
+      }
       __kernel void CenterOfMass(__global MYFLOAT4* clDataX,__global MYFLOAT4* clCoM,int Size)
+      {
           MYFLOAT4 CoM=clDataX[0];
           for (int i=1;i<Size;i++)
+          {
               CoM+=clDataX[i];
+          }
           barrier(CLK_GLOBAL_MEM_FENCE);
           clCoM[0]=(MYFLOAT4)(CoM.s0,CoM.s1,CoM.s2,0.e0f)/(MYFLOAT)Size;
+      }
       __kernel void Kinetic(__global MYFLOAT4* clDataV,__global MYFLOAT* clKinetic)
+      {
           int gid = get_global_id(0);
           barrier(CLK_GLOBAL_MEM_FENCE);
           MYFLOAT d=(MYFLOAT)length(clDataV[gid]);
           clKinetic[gid]=(MYFLOAT)5.e-1f*(MYFLOAT)(d*d);
+      }
       """
       def MainOpenCL(clDataX,clDataV,Step,Method):
           time_start=time.time()
           if Method=="RungeKutta":
               CLLaunch=MyRoutines.RungeKutta(queue,(Number,1),None,clDataX,clDataV,Step)
           elif Method=="ExplicitEuler":
               CLLaunch=MyRoutines.ExplicitEuler(queue,(Number,1),None,clDataX,clDataV,Step)
           elif Method=="Heun":
               CLLaunch=MyRoutines.Heun(queue,(Number,1),None,clDataX,clDataV,Step)
           else:
               CLLaunch=MyRoutines.ImplicitEuler(queue,(Number,1),None,clDataX,clDataV,Step)
           CLLaunch.wait()
           Elapsed=time.time()-time_start
           return(Elapsed)
       def display(*args):
           global MyDataX,MyDataV,clDataX,clDataV,Step,Method,Number,Iterations,Durations,Verbose,SpeedRendering
           glClearColor(0.0, 0.0, 0.0, 0.0)
           glClear(GL_COLOR_BUFFER_BIT)
           glColor3f(1.0,1.0,1.0)
           Elapsed=MainOpenCL(clDataX,clDataV,Step,Method)
           if SpeedRendering:
               cl.enqueue_copy(queue, MyDataV, clDataV)
               MyDataV.reshape(Number,4)[:,3]=1
               glVertexPointerf(MyDataV.reshape(Number,4))
           else:
               cl.enqueue_copy(queue, MyDataX, clDataX)
               MyDataX.reshape(Number,4)[:,3]=1
               glVertexPointerf(MyDataX.reshape(Number,4))
           if Verbose:
               print("Duration of #%s iteration: %s" % (Iterations,Elapsed))
           else:
               sys.stdout.write('.')
               sys.stdout.flush()
           Durations=np.append(Durations,MainOpenCL(clDataX,clDataV,Step,Method))
           glEnableClientState(GL_VERTEX_ARRAY)
           glDrawArrays(GL_POINTS, 0, Number)
           glDisableClientState(GL_VERTEX_ARRAY)
           glFlush()
           Iterations+=1
           glutSwapBuffers()
       def halt():
           pass
       def keyboard(k,x,y):
           global ViewRZ,SizeOfBox,SpeedRendering
           LC_Z = as_8_bit( 'z' )
           UC_Z = as_8_bit( 'Z' )
           Plus = as_8_bit( '+' )
           Minus = as_8_bit( '-' )
           Switch = as_8_bit( 's' )
           Zoom=1
           if k == LC_Z:
               ViewRZ += 1.0
           elif k == UC_Z:
               ViewRZ -= 1.0
           elif k == Plus:
               Zoom *= 2.0
           elif k == Minus:
               Zoom /= 2.0
           elif k == Switch:
               if SpeedRendering:
                   SpeedRendering=False
               else:
                   SpeedRendering=True
           elif ord(k) == 27: # Escape
               glutLeaveMainLoop()
               return(False)
           else:
               return
           glRotatef(ViewRZ, 0.0, 0.0, 1.0)
           glScalef(Zoom,Zoom,Zoom)
           glutPostRedisplay()
       def special(k,x,y):
           global ViewRX, ViewRY, ViewRZ
           if k == GLUT_KEY_UP:
               ViewRX += 1.0
           elif k == GLUT_KEY_DOWN:
               ViewRX -= 1.0
           elif k == GLUT_KEY_LEFT:
               ViewRY += 1.0
           elif k == GLUT_KEY_RIGHT:
               ViewRY -= 1.0
           else:
               return
           glRotatef(ViewRX, 1.0, 0.0, 0.0)
           glRotatef(ViewRY, 0.0, 1.0, 0.0)
           glutPostRedisplay()
       def setup_viewport():
           global SizeOfBox
           glMatrixMode(GL_PROJECTION)
           glLoadIdentity()
           glOrtho(-SizeOfBox, SizeOfBox, -SizeOfBox, SizeOfBox, -SizeOfBox, SizeOfBox)
           glutPostRedisplay()
       def reshape(w, h):
           glViewport(0, 0, w, h)
           setup_viewport()
       if __name__=='__main__':
           global Number,Step,clDataX,clDataV,MyDataX,MyDataV,Method,SizeOfBox,Iterations,Verbose,Durations
           # ValueType
           ValueType='FP32'
           class MyFloat(np.float32):pass
           #    clType8=cl_array.vec.float8
           # Set defaults values
           np.set_printoptions(precision=2)
           # Id of Device : 1 is for first find !
           Device=0
           # Number of bodies is integer
           Number=2
           # Number of iterations (for standalone execution)
           Iterations=100
           # Size of shape
           SizeOfShape=MyFloat(1.)
           # Initial velocity of particules
           Velocity=MyFloat(1.)
           # Step
           Step=MyFloat(1./32)
           # Method of integration
           Method='ImplicitEuler'
           # InitialRandom
           InitialRandom=False
           # RNG Marsaglia Method
           RNG='MWC'
           # Viriel Distribution of stress
           VirielStress=True
           # Verbose
           Verbose=False
           # OpenGL real time rendering
           OpenGL=False
           # Speed rendering
           SpeedRendering=False
           # Shape to distribute
           Shape='Box'
           HowToUse='%s -h [Help] -r [InitialRandom] -g [OpenGL] -e [VirielStress] -d <DeviceId> -p [SpeedRendering] -n <NumberOfParticules> -i <Iterations> -z <SizeOfBoxOrBall> -v <Velocity> -s <Step> -b <Ball|Box> -m <ImplicitEuler|RungeKutta|ExplicitEuler|Heun> -t <FP32|FP64>'
           try:
               opts, args = getopt.getopt(sys.argv[1:],"rpgehd:n:i:z:v:s:m:t:b:",["random","rendering","opengl","viriel","device=","number=","iterations=","size=","velocity=","step=","method=","valuetype=","shape="])
           except getopt.GetoptError:
               print(HowToUse % sys.argv[0])
               sys.exit(2)
           for opt, arg in opts:
               if opt == '-h':
                   print(HowToUse % sys.argv[0])
                   print("\nInformations about devices detected under OpenCL:")
                   try:
                       Id=0
                       for platform in cl.get_platforms():
                           for device in platform.get_devices():
                               # Failed now because of POCL implementation
                               #deviceType=cl.device_type.to_string(device.type)
                               deviceType="xPU"
                               print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                               Id=Id+1
                       sys.exit()
                   except ImportError:
                       print("Your platform does not seem to support OpenCL")
                       sys.exit()
               elif opt in ("-t", "--valuetype"):
                   if arg=='FP64':
                       class MyFloat(np.float64): pass
                   else:
                       class MyFloat(np.float32):pass
                   ValueType = arg
               elif opt in ("-d", "--device"):
                   Device=int(arg)
               elif opt in ("-m", "--method"):
                   Method=arg
               elif opt in ("-b", "--shape"):
                   Shape=arg
               elif opt in ("-n", "--number"):
                   Number=int(arg)
               elif opt in ("-i", "--iterations"):
                   Iterations=int(arg)
               elif opt in ("-z", "--size"):
                   SizeOfShape=MyFloat(arg)
               elif opt in ("-v", "--velocity"):
                   Velocity=MyFloat(arg)
                   VirielStress=False
               elif opt in ("-s", "--step"):
                   Step=MyFloat(arg)
               elif opt in ("-r", "--random"):
                   InitialRandom=True
               elif opt in ("-c", "--check"):
                   CheckEnergies=True
               elif opt in ("-e", "--viriel"):
                   VirielStress=True
               elif opt in ("-g", "--opengl"):
                   OpenGL=True
               elif opt in ("-p", "--rendering"):
                   SpeedRendering=True
           SizeOfShape=MyFloat(SizeOfShape*Number)
           Velocity=MyFloat(Velocity)
           Step=MyFloat(Step)
           print("Device choosed : %s" % Device)
           print("Number of particules : %s" % Number)
           print("Size of Shape : %s" % SizeOfShape)
           print("Initial velocity : %s" % Velocity)
           print("Step of iteration : %s" % Step)
           print("Number of iterations : %s" % Iterations)
           print("Method of resolution : %s" % Method)
           print("Initial Random for RNG Seed : %s" % InitialRandom)
           print("ValueType is : %s" % ValueType)
           print("Viriel distribution of stress : %s" % VirielStress)
           print("OpenGL real time rendering : %s" % OpenGL)
           print("Speed rendering : %s" % SpeedRendering)
           # Create Numpy array of CL vector with 8 FP32
           MyCoM = np.zeros(4,dtype=MyFloat)
           MyDataX = np.zeros(Number*4, dtype=MyFloat)
           MyDataV = np.zeros(Number*4, dtype=MyFloat)
           MyPotential = np.zeros(Number, dtype=MyFloat)
           MyKinetic = np.zeros(Number, dtype=MyFloat)
           Marsaglia,Computing=DictionariesAPI()
           # Scan the OpenCL arrays
           Id=0
           HasXPU=False
           for platform in cl.get_platforms():
               for device in platform.get_devices():
                   if Id==Device:
                       PlatForm=platform
                       XPU=device
                       print("CPU/GPU selected: ",device.name.lstrip())
                       print("Platform selected: ",platform.name)
                       HasXPU=True
                   Id+=1
           if HasXPU==False:
               print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id-1))
               sys.exit()
           # Create Context
           try:
               ctx = cl.Context([XPU])
               queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
           except:
               print("Crash during context creation")
           # Build all routines used for the computing
           #BuildOptions="-cl-mad-enable -cl-kernel-arg-info -cl-fast-relaxed-math -cl-std=CL1.2 -DTRNG=%i -DTYPE=%i" % (Marsaglia[RNG],Computing[ValueType])
           BuildOptions="-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%i" % (Marsaglia[RNG],Computing[ValueType])
           if 'Intel' in PlatForm.name or 'Experimental' in PlatForm.name or 'Clover' in PlatForm.name or 'Portable' in PlatForm.name :
               MyRoutines = cl.Program(ctx, BlobOpenCL).build(options = BuildOptions)
           else:
               MyRoutines = cl.Program(ctx, BlobOpenCL).build(options = BuildOptions+" -cl-strict-aliasing")
           mf = cl.mem_flags
           # Read/Write approach for buffering
           clDataX = cl.Buffer(ctx, mf.READ_WRITE, MyDataX.nbytes)
           clDataV = cl.Buffer(ctx, mf.READ_WRITE, MyDataV.nbytes)
           clPotential = cl.Buffer(ctx, mf.READ_WRITE, MyPotential.nbytes)
           clKinetic = cl.Buffer(ctx, mf.READ_WRITE, MyKinetic.nbytes)
           clCoM = cl.Buffer(ctx, mf.READ_WRITE, MyCoM.nbytes)
           # Write/HostPointer approach for buffering
           # clDataX = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyDataX)
           # clDataV = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyDataV)
           # clPotential = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyPotential)
           # clKinetic = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyKinetic)
           # clCoM = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyCoM)
           print('All particles superimposed.')
           # Set particles to RNG points
           if InitialRandom:
               seed_w=np.uint32(nprnd(2**32))
               seed_z=np.uint32(nprnd(2**32))
           else:
               seed_w=np.uint32(19710211)
               seed_z=np.uint32(20081010)
           if Shape=='Ball':
               MyRoutines.InBallSplutterPoints(queue,(Number,1),None,clDataX,SizeOfShape,seed_w,seed_z)
           else:
               MyRoutines.InBoxSplutterPoints(queue,(Number,1),None,clDataX,SizeOfShape,seed_w,seed_z)
           print('All particules distributed')
           CLLaunch=MyRoutines.CenterOfMass(queue,(1,1),None,clDataX,clCoM,np.int32(Number))
           CLLaunch.wait()
           cl.enqueue_copy(queue,MyCoM,clCoM)
           print('Center Of Mass estimated: (%s,%s,%s)' % (MyCoM[0],MyCoM[1],MyCoM[2]))
           if VirielStress:
               CLLaunch=MyRoutines.SplutterStress(queue,(Number,1),None,clDataX,clDataV,clCoM,MyFloat(0.),np.uint32(110271),np.uint32(250173))
           else:
               CLLaunch=MyRoutines.SplutterStress(queue,(Number,1),None,clDataX,clDataV,clCoM,Velocity,np.uint32(110271),np.uint32(250173))
           CLLaunch.wait()
           print('All particules stressed')
           CLLaunch=MyRoutines.Potential(queue,(Number,1),None,clDataX,clPotential)
           CLLaunch=MyRoutines.Kinetic(queue,(Number,1),None,clDataV,clKinetic)
           CLLaunch.wait()
           cl.enqueue_copy(queue,MyPotential,clPotential)
           cl.enqueue_copy(queue,MyKinetic,clKinetic)
           print('Energy estimated: Viriel=%s Potential=%s Kinetic=%s\n'% (np.sum(MyPotential)+2*np.sum(MyKinetic),np.sum(MyPotential),np.sum(MyKinetic)))
           if SpeedRendering:
               SizeOfBox=max(2*MyKinetic)
           else:
               SizeOfBox=SizeOfShape
           wall_time_start=time.time()
           Durations=np.array([],dtype=MyFloat)
           print('Starting!')
           if OpenGL:
               global ViewRX,ViewRY,ViewRZ
               Iterations=0
               ViewRX,ViewRY,ViewRZ = 0.,0.,0.
               # Launch OpenGL Loop
               glutInit(sys.argv)
               glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB)
               glutSetOption(GLUT_ACTION_ON_WINDOW_CLOSE,GLUT_ACTION_CONTINUE_EXECUTION)
               glutInitWindowSize(512,512)
               glutCreateWindow(b'NBodyGL')
               setup_viewport()
               glutReshapeFunc(reshape)
               glutDisplayFunc(display)
               glutIdleFunc(display)
               #   glutMouseFunc(mouse)
               glutSpecialFunc(special)
               glutKeyboardFunc(keyboard)
               glutMainLoop()
           else:
               for iteration in range(Iterations):
                   Elapsed=MainOpenCL(clDataX,clDataV,Step,Method)
                   if Verbose:
                       print("Duration of #%s iteration: %s" % (Iterations,Elapsed))
                   else:
                       sys.stdout.write('.')
                       sys.stdout.flush()
                   Durations=np.append(Durations,Elapsed)
           print('\nEnding!')
           MyRoutines.CenterOfMass(queue,(1,1),None,clDataX,clCoM,np.int32(Number))
           CLLaunch=MyRoutines.Potential(queue,(Number,1),None,clDataX,clPotential)
           CLLaunch=MyRoutines.Kinetic(queue,(Number,1),None,clDataV,clKinetic)
           CLLaunch.wait()
           cl.enqueue_copy(queue,MyCoM,clCoM)
           cl.enqueue_copy(queue,MyPotential,clPotential)
           cl.enqueue_copy(queue,MyKinetic,clKinetic)
           print('\nCenter Of Mass estimated: (%s,%s,%s)' % (MyCoM[0],MyCoM[1],MyCoM[2]))
           print('Energy estimated: Viriel=%s Potential=%s Kinetic=%s\n'% (np.sum(MyPotential)+2.*np.sum(MyKinetic),np.sum(MyPotential),np.sum(MyKinetic)))
           print("Duration stats on device %s with %s iterations :\n\tMean:\t%s\n\tMedian:\t%s\n\tStddev:\t%s\n\tMin:\t%s\n\tMax:\t%s\n\n\tVariability:\t%s\n" % (Device,Iterations,np.mean(Durations),np.median(Durations),np.std(Durations),np.min(Durations),np.max(Durations),np.std(Durations)/np.median(Durations)))
           # Contraction of Square*Size*Hertz: Size*Size/Elapsed
           Squertz=np.ones(len(Durations))
           Squertz*=Number*Number
           Squertz/=Durations
           print("Squertz in log10 stats on device %s with %s iterations :\n\tMean:\t%s\n\tMedian:\t%s\n\tStddev:\t%s\n\tMin:\t%s\n\tMax:\t%s\n" % (Device,Iterations,np.log10(np.mean(Squertz)),np.log10(np.median(Squertz)),np.log10(np.std(Squertz)),np.log10(np.min(Squertz)),np.log10(np.max(Squertz))))
           clDataX.release()
           clDataV.release()
           clKinetic.release()
           clPotential.release()

Centre Blaise Pascal » Bench4GPU

root / NBody / NBodyGL.py @ 171