#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Demonstrateur OpenCL d'interaction NCorps

Emmanuel QUEMENER <emmanuel.quemener@ens-lyon.fr> CeCILLv2
"""
import getopt
import sys
import time
import numpy as np
import pyopencl as cl
import pyopencl.array as cl_array
from numpy.random import randint as nprnd
import string, sys
from OpenGL.GL import *
from OpenGL.GLUT import *

def DictionariesAPI():
    Marsaglia={'CONG':0,'SHR3':1,'MWC':2,'KISS':3}
    Computing={'FP32':0,'FP64':1}
    return(Marsaglia,Computing)

BlobOpenCL= """
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define TFP32 0
#define TFP64 1

#define MWCfp MWC * 2.3283064365386963e-10f
#define KISSfp KISS * 2.3283064365386963e-10f
#define SHR3fp SHR3 * 2.3283064365386963e-10f
#define CONGfp CONG * 2.3283064365386963e-10f

#define PI 3.141592653589793238462643197169399375105820974944592307816406286e0f

#define SMALL_NUM 1.e-9f

#define LENGTH 1.e0f

#if TYPE == TFP32
#define MYFLOAT4 float4
#define MYFLOAT8 float8
#define MYFLOAT float
#define DISTANCE fast_distance
#else
#if defined(cl_khr_fp64)  // Khronos extension available?
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#define DOUBLE_SUPPORT_AVAILABLE
#elif defined(cl_amd_fp64)  // AMD extension available?
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#define DOUBLE_SUPPORT_AVAILABLE
#endif
#define MYFLOAT4 double4
#define MYFLOAT8 double8
#define MYFLOAT double
#define DISTANCE distance
#endif


MYFLOAT4 Interaction(MYFLOAT4 m,MYFLOAT4 n)
{
    private MYFLOAT r=DISTANCE(n,m);

    return((n-m)/(MYFLOAT)(r*r*r));
}

MYFLOAT4 InteractionCore(MYFLOAT4 m,MYFLOAT4 n)
{
    private MYFLOAT core=(MYFLOAT)1.e5f;
    private MYFLOAT r=DISTANCE(n,m);
    private MYFLOAT d=r*r+core*core;

    return(core*(n-m)/(MYFLOAT)(d*d));
}

MYFLOAT PairPotential(MYFLOAT4 m,MYFLOAT4 n)
{
    return((MYFLOAT)(-1.e0f)/(DISTANCE(n,m)));
}

MYFLOAT AtomicPotential(__global MYFLOAT4* clDataX,int gid)
{
    private MYFLOAT potential=(MYFLOAT)0.e0f;
    private MYFLOAT4 x=clDataX[gid]; 
    
    for (int i=0;i<get_global_size(0);i++)
    {
        if (gid != i)
        potential+=PairPotential(x,clDataX[i]);
    }

    barrier(CLK_GLOBAL_MEM_FENCE);
    return(potential);
}

MYFLOAT AtomicPotentialCoM(__global MYFLOAT4* clDataX,__global MYFLOAT4* clCoM,int gid)
{
    return(PairPotential(clDataX[gid],clCoM[0]));
}

MYFLOAT8 AtomicRungeKutta(__global MYFLOAT4* clDataInX,__global MYFLOAT4* clDataInV,int gid,MYFLOAT dt)
{
    private MYFLOAT4 a0,v0,x0,a1,v1,x1,a2,v2,x2,a3,v3,x3,a4,v4,x4,xf,vf;

    a0=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f);
    v0=(MYFLOAT4)clDataInV[gid];
    x0=(MYFLOAT4)clDataInX[gid];
    int N = get_global_size(0);    
    
    for (int i=0;i<N;i++)
    {
        if (gid != i)
        a0+=Interaction(x0,clDataInX[i]);
    }
        
    a1=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f);
    v1=v0+a0*dt;
    x1=x0+v0*dt;
    for (int i=0;i<N;i++)
    {
        if (gid != i)
        a1+=Interaction(x1,clDataInX[i]);
    }

    a2=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f);
    v2=v0+a1*dt*(MYFLOAT)5.e-1f;
    x2=x0+v1*dt*(MYFLOAT)5.e-1f;
    for (int i=0;i<N;i++)
    {
        if (gid != i)
        a2+=Interaction(x2,clDataInX[i]);
    }
    
    a3=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f);
    v3=v0+a2*dt*(MYFLOAT)5.e-1f;
    x3=x0+v2*dt*(MYFLOAT)5.e-1f;
    for (int i=0;i<N;i++)
    {
        if (gid != i)
        a3+=Interaction(x3,clDataInX[i]);
    }
    
    a4=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f);
    v4=v0+a3*dt;
    x4=x0+v3*dt;
    for (int i=0;i<N;i++)
    {
        if (gid != i)
        a4+=Interaction(x4,clDataInX[i]);
    }
    
    xf=x0+dt*(v1+(MYFLOAT)2.e0f*(v2+v3)+v4)/(MYFLOAT)6.e0f;
    vf=v0+dt*(a1+(MYFLOAT)2.e0f*(a2+a3)+a4)/(MYFLOAT)6.e0f;
     
    return((MYFLOAT8)(xf.s0,xf.s1,xf.s2,1.e0f,vf.s0,vf.s1,vf.s2,0.e0f));
}

// Elements from : http://doswa.com/2009/01/02/fourth-order-runge-kutta-numerical-integration.html

MYFLOAT8 AtomicHeun(__global MYFLOAT4* clDataInX,__global MYFLOAT4* clDataInV,int gid,MYFLOAT dt)
{
    private MYFLOAT4 x,v,a,xi,vi,ai,xf,vf;

    x=(MYFLOAT4)clDataInX[gid];
    v=(MYFLOAT4)clDataInV[gid];
    a=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f);

    for (int i=0;i<get_global_size(0);i++)
    {
        if (gid != i)
        a+=Interaction(x,clDataInX[i]);
    }

    vi=v+dt*a;
    xi=x+dt*vi;
    ai=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f);

    for (int i=0;i<get_global_size(0);i++)
    {
        if (gid != i)
        ai+=Interaction(xi,clDataInX[i]);
    }

    vf=v+dt*(a+ai)/(MYFLOAT)2.e0f;
    xf=x+dt*(v+vi)/(MYFLOAT)2.e0f;

    return((MYFLOAT8)(xf.s0,xf.s1,xf.s2,1.e0f,vf.s0,vf.s1,vf.s2,0.e0f));
}

MYFLOAT8 AtomicImplicitEuler(__global MYFLOAT4* clDataInX,__global MYFLOAT4* clDataInV,int gid,MYFLOAT dt)
{
    private MYFLOAT4 x,v,a,xf,vf;

    x=(MYFLOAT4)clDataInX[gid];
    v=(MYFLOAT4)clDataInV[gid];
    a=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f);

    for (int i=0;i<get_global_size(0);i++)
    {
        if (gid != i)
        a+=Interaction(x,clDataInX[i]);
    }
       
    vf=v+dt*a;
    xf=x+dt*vf;
 
    return((MYFLOAT8)(xf.s0,xf.s1,xf.s2,1.e0f,vf.s0,vf.s1,vf.s2,0.e0f));
}

MYFLOAT8 AtomicExplicitEuler(__global MYFLOAT4* clDataInX,__global MYFLOAT4* clDataInV,int gid,MYFLOAT dt)
{
    MYFLOAT4 x,v,a,xf,vf;

    x=(MYFLOAT4)clDataInX[gid];
    v=(MYFLOAT4)clDataInV[gid];
    a=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f);

    for (int i=0;i<get_global_size(0);i++)
    {
        if (gid != i)
        a+=Interaction(x,clDataInX[i]);
    }
       
    vf=v+dt*a;
    xf=x+dt*v;
 
    return((MYFLOAT8)(xf.s0,xf.s1,xf.s2,1.e0f,vf.s0,vf.s1,vf.s2,0.e0f));
}

__kernel void SplutterPoints(__global MYFLOAT4* clDataX, MYFLOAT box, 
                             uint seed_z,uint seed_w)
{
    int gid=get_global_id(0);
    uint z=seed_z+gid;
    uint w=seed_w+gid;
     
    for (int i=0;i<gid;i++)
    {
        private MYFLOAT heat=MWCfp;
    }
 
    // Distribute in sphere
    MYFLOAT radius=MWCfp*box;
    MYFLOAT theta=MWCfp*PI;
    MYFLOAT phi=MWCfp*PI*(MYFLOAT)2.e0f;
    // cast to float for sin,cos are NEEDED by Mesa FP64 implementation!
    MYFLOAT sinTheta=sin((float)theta);    
    clDataX[gid].s0=radius*sinTheta*cos((float)phi)/2.e0f;
    clDataX[gid].s1=radius*sinTheta*sin((float)phi)/2.e0f;
    clDataX[gid].s2=radius*cos((float)theta)/2.e0f;
    clDataX[gid].s3=(MYFLOAT)0.e0f;
    barrier(CLK_GLOBAL_MEM_FENCE);

    // Distribute in box
    // MYFLOAT x0=box*(MYFLOAT)(MWCfp-0.5);
    // MYFLOAT y0=box*(MYFLOAT)(MWCfp-0.5);
    // MYFLOAT z0=box*(MYFLOAT)(MWCfp-0.5);
    // clDataX[gid].s0123 = (MYFLOAT4) (x0,y0,z0,0.e0f);
}

__kernel void SplutterStress(__global MYFLOAT4* clDataX,__global MYFLOAT4* clDataV,__global MYFLOAT4* clCoM, MYFLOAT velocity,uint seed_z,uint seed_w)
{
    int gid = get_global_id(0);
    MYFLOAT N = (MYFLOAT)get_global_size(0);
    uint z=seed_z+(uint)gid;
    uint w=seed_w-(uint)gid;

    if (velocity<SMALL_NUM) {
       MYFLOAT4 SpeedVector=(MYFLOAT4)normalize(cross(clDataX[gid],clCoM[0]))*sqrt(-AtomicPotential(clDataX,gid)/(MYFLOAT)2.e0f);
       clDataV[gid]=SpeedVector;
    }
    else
    {    
       // cast to float for sin,cos are NEEDED by Mesa FP64 implementation!
       MYFLOAT theta=MWCfp*PI;
       MYFLOAT phi=MWCfp*PI*(MYFLOAT)2.e0f;
       MYFLOAT sinTheta=sin((float)theta);

       clDataV[gid].s0=velocity*sinTheta*cos((float)phi);
       clDataV[gid].s1=velocity*sinTheta*sin((float)phi);
       clDataV[gid].s2=velocity*cos((float)theta);
       clDataV[gid].s3=(MYFLOAT)0.e0f;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
}

__kernel void RungeKutta(__global MYFLOAT4* clDataX,__global MYFLOAT4* clDataV,MYFLOAT h)
{
    int gid = get_global_id(0);
    
    MYFLOAT8 clDataGid=AtomicRungeKutta(clDataX,clDataV,gid,h);
    barrier(CLK_GLOBAL_MEM_FENCE);
    clDataX[gid]=clDataGid.lo;
    clDataV[gid]=clDataGid.hi;
}

__kernel void ImplicitEuler(__global MYFLOAT4* clDataX,__global MYFLOAT4* clDataV,MYFLOAT h)
{
    int gid = get_global_id(0);
    
    MYFLOAT8 clDataGid=AtomicImplicitEuler(clDataX,clDataV,gid,h);
    barrier(CLK_GLOBAL_MEM_FENCE);
    clDataX[gid]=clDataGid.lo;
    clDataV[gid]=clDataGid.hi;
}

__kernel void Heun(__global MYFLOAT4* clDataX,__global MYFLOAT4* clDataV,MYFLOAT h)
{
    int gid = get_global_id(0);
    
    MYFLOAT8 clDataGid=AtomicHeun(clDataX,clDataV,gid,h);
    barrier(CLK_GLOBAL_MEM_FENCE);
    clDataX[gid]=clDataGid.lo;
    clDataV[gid]=clDataGid.hi;
}

__kernel void ExplicitEuler(__global MYFLOAT4* clDataX,__global MYFLOAT4* clDataV,MYFLOAT h)
{
    int gid = get_global_id(0);
    
    MYFLOAT8 clDataGid=AtomicExplicitEuler(clDataX,clDataV,gid,h);
    barrier(CLK_GLOBAL_MEM_FENCE);
    clDataX[gid]=clDataGid.lo;
    clDataV[gid]=clDataGid.hi;
}

__kernel void CoMPotential(__global MYFLOAT4* clDataX,__global MYFLOAT4* clCoM,__global MYFLOAT* clPotential)
{
    int gid = get_global_id(0);

    clPotential[gid]=PairPotential(clDataX[gid],clCoM[0]);
}

__kernel void Potential(__global MYFLOAT4* clDataX,__global MYFLOAT* clPotential)
{
    int gid = get_global_id(0);

    MYFLOAT potential=(MYFLOAT)0.e0f;
    MYFLOAT4 x=clDataX[gid]; 
    
    for (int i=0;i<get_global_size(0);i++)
    {
        if (gid != i)
        potential+=PairPotential(x,clDataX[i]);
    }
                 
    barrier(CLK_GLOBAL_MEM_FENCE);
    clPotential[gid]=potential*(MYFLOAT)5.e-1f;
}

__kernel void CenterOfMass(__global MYFLOAT4* clDataX,__global MYFLOAT4* clCoM,int Size)
{
    MYFLOAT4 CoM=clDataX[0]; 

    for (int i=1;i<Size;i++)
    {
        CoM+=clDataX[i];
    }

    barrier(CLK_GLOBAL_MEM_FENCE);
    clCoM[0]=(MYFLOAT4)(CoM.s0,CoM.s1,CoM.s2,1.e0f)/(MYFLOAT)Size;
}

__kernel void Kinetic(__global MYFLOAT4* clDataV,__global MYFLOAT* clKinetic)
{
    int gid = get_global_id(0);
    
    barrier(CLK_GLOBAL_MEM_FENCE);
    MYFLOAT d=(MYFLOAT)length(clDataV[gid]);
    clKinetic[gid]=(MYFLOAT)5.e-1f*(MYFLOAT)(d*d);
}
"""

def display(*args):

    global MyDataX,MyDataV,clDataX,clDataV,Step,Method,Number,Iterations
    
    glClearColor(0.0, 0.0, 0.0, 0.0)
    glClear(GL_COLOR_BUFFER_BIT)
    glColor3f(1.0,1.0,1.0)
    
    time_start=time.time()
    if Method=="RungeKutta":            
        CLLaunch=MyRoutines.RungeKutta(queue,(Number,1),None,clDataX,clDataV,Step)
    elif Method=="ExplicitEuler":
        CLLaunch=MyRoutines.ExplicitEuler(queue,(Number,1),None,clDataX,clDataV,Step)
    elif Method=="Heun":
        CLLaunch=MyRoutines.Heun(queue,(Number,1),None,clDataX,clDataV,Step)
    else:
        CLLaunch=MyRoutines.ImplicitEuler(queue,(Number,1),None,clDataX,clDataV,Step)
    CLLaunch.wait()
    print("Duration of #%s iteration: %s" % (Iterations,(time.time()-time_start)))

    cl.enqueue_copy(queue, MyDataX, clDataX)
    # print(MyDataX.reshape(Number,4))
    MyDataX.reshape(Number,4)[:,3]=1
    glVertexPointerf(MyDataX.reshape(Number,4))
    # cl.enqueue_copy(queue, MyDataV, clDataV)
    # print(MyDataV.reshape(Number,4))
    # MyDataV.reshape(Number,4)[:,3]=1
    # glVertexPointerf(MyDataV.reshape(Number,4))
    glEnableClientState(GL_VERTEX_ARRAY)
    glDrawArrays(GL_POINTS, 0, Number)
    glDisableClientState(GL_VERTEX_ARRAY)
    glFlush()
    Iterations+=1
    glutSwapBuffers()

def halt():
    pass

def keyboard(k, x, y):
    global view_rotz
    LC_Z = as_8_bit( 'z' )
    UC_Z = as_8_bit( 'Z' )

    if k == LC_Z:
        view_rotz += 1.0
    elif k == UC_Z:
        view_rotz -= 1.0
    elif ord(k) == 27: # Escape
        glutSetOption(GLUT_ACTION_ON_WINDOW_CLOSE,GLUT_ACTION_CONTINUE_EXECUTION)
        glutSetOption(GLUT_ACTION_GLUTMAINLOOP_RETURNS,GLUT_ACTION_CONTINUE_EXECUTION) 
        glutLeaveMainLoop()
        return(False)
    else:
        return
    glRotatef(view_rotz, 0.0, 0.0, 1.0)
    glutPostRedisplay()

def special(k, x, y):
    global view_rotx, view_roty, view_rotz

    if k == GLUT_KEY_UP:
        view_rotx += 1.0
    elif k == GLUT_KEY_DOWN:
        view_rotx -= 1.0
    elif k == GLUT_KEY_LEFT:
        view_roty += 1.0
    elif k == GLUT_KEY_RIGHT:
        view_roty -= 1.0
    else:
        return
    glRotatef(view_rotx, 1.0, 0.0, 0.0)
    glRotatef(view_roty, 0.0, 1.0, 0.0)
    glutPostRedisplay()

def mouse(button, state, x, y):
    global angle, delta_angle, halted
    if button == GLUT_LEFT_BUTTON:
        angle = angle + delta_angle
    elif button == GLUT_RIGHT_BUTTON:
	angle = angle - delta_angle
    elif button == GLUT_MIDDLE_BUTTON and state == GLUT_DOWN:
        if halted:
	    glutIdleFunc(display)
	    halted = 0
        else:
	    glutIdleFunc(halt)
	    halted = 1

def setup_viewport():
    global SizeOfBox
    glMatrixMode(GL_PROJECTION)
    glLoadIdentity()
    glOrtho(-SizeOfBox, SizeOfBox, -SizeOfBox, SizeOfBox, -SizeOfBox, SizeOfBox)
    
def reshape(w, h):
    glViewport(0, 0, w, h)
    setup_viewport()

if __name__=='__main__':

    global Number,Step,clDataX,clDataV,MyDataX,MyDataV,Method,SizeOfBox
    
    # ValueType
    ValueType='FP32'
    class MyFloat(np.float32):pass
    #    clType8=cl_array.vec.float8
    clType4=cl_array.vec.float4
    # Set defaults values
    np.set_printoptions(precision=2)  
    # Id of Device : 1 is for first find !
    Device=0
    # Iterations is integer
    Number=2
    # Size of box
    SizeOfBox=MyFloat(1.)
    # Initial velocity of particules
    Velocity=MyFloat(1.)
    # Redo the last process
    Iterations=int(np.pi*1024)
    # Step
    Step=MyFloat(1./256)
    # Method of integration
    Method='ImplicitEuler'
    # InitialRandom
    InitialRandom=False
    # RNG Marsaglia Method
    RNG='MWC'
    # CheckEnergies
    CheckEnergies=False
    # Display samples in 3D
    GraphSamples=False
    # Viriel Distribution of stress
    VirielStress=True
    
    HowToUse='%s -h [Help] -r [InitialRandom] -e [VirielStress] -g [GraphSamples] -c [CheckEnergies] -d <DeviceId> -n <NumberOfParticules> -z <SizeOfBox> -v <Velocity> -s <Step> -i <Iterations> -m <ImplicitEuler|RungeKutta|ExplicitEuler|Heun> -t <FP32|FP64>'

    try:
        opts, args = getopt.getopt(sys.argv[1:],"rehgcd:n:z:v:i:s:m:t:",["random","viriel","graph","check","device=","number=","size=","velocity=","iterations=","step=","method=","valuetype="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformations about devices detected under OpenCL:")
            try:
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1
                sys.exit()
            except ImportError:
                print("Your platform does not seem to support OpenCL")
                sys.exit()

        elif opt in ("-t", "--valuetype"):
            if arg=='FP64':
                class MyFloat(np.float64): pass
                clType4=cl_array.vec.double4
            else:
                class MyFloat(np.float32):pass
                clType4=cl_array.vec.float4
            ValueType = arg
        elif opt in ("-d", "--device"):
            Device=int(arg)
        elif opt in ("-m", "--method"):
            Method=arg
        elif opt in ("-n", "--number"):
            Number=int(arg)
        elif opt in ("-z", "--size"):
            SizeOfBox=MyFloat(arg)
        elif opt in ("-v", "--velocity"):
            Velocity=MyFloat(arg)
            VirielStress=False
        elif opt in ("-s", "--step"):
            Step=MyFloat(arg)
        elif opt in ("-i", "--iterations"):
            Iterations=int(arg)
        elif opt in ("-r", "--random"):
            InitialRandom=True
        elif opt in ("-c", "--check"):
            CheckEnergies=True
        elif opt in ("-g", "--graph"):
            GraphSamples=True
        elif opt in ("-e", "--viriel"):
            VirielStress=True
                        
    SizeOfBox=MyFloat(SizeOfBox*Number)
    Velocity=MyFloat(Velocity)
    Step=MyFloat(Step)
                
    print("Device choosed : %s" % Device)
    print("Number of particules : %s" % Number)
    print("Size of Box : %s" % SizeOfBox)
    print("Initial velocity : %s" % Velocity)
    print("Number of iterations : %s" % Iterations)
    print("Step of iteration : %s" % Step)
    print("Method of resolution : %s" % Method)
    print("Initial Random for RNG Seed : %s" % InitialRandom)
    print("Check for Energies : %s" % CheckEnergies)
    print("Graph for Samples : %s" % GraphSamples)
    print("ValueType is : %s" % ValueType)
    print("Viriel distribution of stress %s" % VirielStress)

    # Create Numpy array of CL vector with 8 FP32    
    MyCoM = np.zeros(4,dtype=MyFloat)
    MyDataX = np.zeros(Number*4, dtype=MyFloat)
    MyDataV = np.zeros(Number*4, dtype=MyFloat)
    # MyCoM = np.zeros(1,dtype=clType4)
    # MyDataX = np.zeros(Number, dtype=clType4)
    # MyDataV = np.zeros(Number, dtype=clType4)
    MyPotential = np.zeros(Number, dtype=MyFloat)
    MyKinetic = np.zeros(Number, dtype=MyFloat)

    Marsaglia,Computing=DictionariesAPI()

    # Scan the OpenCL arrays
    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                PlatForm=platform
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                print("Platform selected: ",platform.name)
                HasXPU=True
            Id+=1

    if HasXPU==False:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id-1))
        sys.exit()      

    # Create Context
    try:
        ctx = cl.Context([XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except:
        print("Crash during context creation")

    print(Marsaglia[RNG],Computing[ValueType])
    # Build all routines used for the computing
    BuildOptions="-cl-mad-enable -cl-kernel-arg-info -cl-fast-relaxed-math -cl-std=CL1.2 -DTRNG=%i -DTYPE=%i" % (Marsaglia[RNG],Computing[ValueType])
    
    if 'Intel' in PlatForm.name or 'Experimental' in PlatForm.name or 'Clover' in PlatForm.name or 'Portable' in PlatForm.name :
        MyRoutines = cl.Program(ctx, BlobOpenCL).build(options = BuildOptions)
    else:
        MyRoutines = cl.Program(ctx, BlobOpenCL).build(options = BuildOptions+" -cl-strict-aliasing")
    
    mf = cl.mem_flags
    # Read/Write approach for buffering
    # clDataX = cl.Buffer(ctx, mf.READ_WRITE, MyDataX.nbytes)
    # clDataV = cl.Buffer(ctx, mf.READ_WRITE, MyDataV.nbytes)
    # clPotential = cl.Buffer(ctx, mf.READ_WRITE, MyPotential.nbytes)
    # clKinetic = cl.Buffer(ctx, mf.READ_WRITE, MyKinetic.nbytes)
    # clCoM = cl.Buffer(ctx, mf.READ_WRITE, MyCoM.nbytes)

    # Write/HostPoniter approach for buffering
    clDataX = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyDataX)
    clDataV = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyDataV)
    clPotential = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyPotential)
    clKinetic = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyKinetic)
    clCoM = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyCoM)

    print('All particles superimposed.')

    # Set particles to RNG points
    if InitialRandom:
        MyRoutines.SplutterPoints(queue,(Number,1),None,clDataX,SizeOfBox,np.uint32(nprnd(2**32)),np.uint32(nprnd(2**32)))
    else:
        MyRoutines.SplutterPoints(queue,(Number,1),None,clDataX,SizeOfBox,np.uint32(110271),np.uint32(250173))

    print('All particules distributed')

    CLLaunch=MyRoutines.CenterOfMass(queue,(1,1),None,clDataX,clCoM,np.int32(Number))
    CLLaunch.wait()
    cl.enqueue_copy(queue,MyCoM,clCoM)
    print('Center Of Mass: (%s,%s,%s)' % (MyCoM[0],MyCoM[1],MyCoM[2]))
    
    if VirielStress:
        CLLaunch=MyRoutines.SplutterStress(queue,(Number,1),None,clDataX,clDataV,clCoM,MyFloat(0.),np.uint32(110271),np.uint32(250173))
    else:
        CLLaunch=MyRoutines.SplutterStress(queue,(Number,1),None,clDataX,clDataV,clCoM,Velocity,np.uint32(110271),np.uint32(250173))
    CLLaunch.wait()

    if GraphSamples:
        cl.enqueue_copy(queue, MyDataX, clDataX)
        # t0=np.array([[MyDataX[0][0],MyDataX[0][1],MyDataX[0][2]]])
        # t1=np.array([[MyDataX[1][0],MyDataX[1][1],MyDataX[1][2]]])
        # tL=np.array([[MyDataX[-1][0],MyDataX[-1][1],MyDataX[-1][2]]])

    CLLaunch=MyRoutines.Potential(queue,(Number,1),None,clDataX,clPotential)
    CLLaunch=MyRoutines.Kinetic(queue,(Number,1),None,clDataV,clKinetic)
    CLLaunch.wait()
    cl.enqueue_copy(queue,MyPotential,clPotential)
    cl.enqueue_copy(queue,MyKinetic,clKinetic)
    print('Viriel=%s Potential=%s Kinetic=%s'% (np.sum(MyPotential)+2*np.sum(MyKinetic),np.sum(MyPotential),np.sum(MyKinetic)))
 
    if GraphSamples:
        cl.enqueue_copy(queue, MyDataX, clDataX)
        # t0=np.array([[MyDataX[0][0],MyDataX[0][1],MyDataX[0][2]]])
        # t1=np.array([[MyDataX[1][0],MyDataX[1][1],MyDataX[1][2]]])
        # tL=np.array([[MyDataX[-1][0],MyDataX[-1][1],MyDataX[-1][2]]])

    wall_time_start=time.time()

    print("Use the mouse buttons to control some of the dots.")
    print("Hit any key to quit.")
    global view_rotx,view_roty,view_rotz,Iterations
    (view_rotx,view_roty,view_rotz)=(0.0, 0.0, 0.0)
    Iterations=0
    glutInit(sys.argv)
    glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB)
    glutInitWindowSize(512,512)
    glutCreateWindow(b'NBodyGL')
    setup_viewport()
    glutReshapeFunc(reshape)
    glutDisplayFunc(display)
    glutIdleFunc(display)
#   glutMouseFunc(mouse)
    glutSpecialFunc(special)
    Loop=glutKeyboardFunc(keyboard)
    glutMainLoop()
    
    print("\nWall Duration on %s for each %s\n" % (Device,(time.time()-wall_time_start)/Iterations))
    
    MyRoutines.CenterOfMass(queue,(1,1),None,clDataX,clCoM,np.int32(Number))
    CLLaunch=MyRoutines.Potential(queue,(Number,1),None,clDataX,clPotential)
    CLLaunch=MyRoutines.Kinetic(queue,(Number,1),None,clDataV,clKinetic)
    CLLaunch.wait()
    cl.enqueue_copy(queue,MyCoM,clCoM)
    cl.enqueue_copy(queue,MyPotential,clPotential)
    cl.enqueue_copy(queue,MyKinetic,clKinetic)
    print('Center Of Mass: (%s,%s,%s)' % (MyCoM[0],MyCoM[1],MyCoM[2]))
    print('Viriel=%s Potential=%s Kinetic=%s'% (np.sum(MyPotential)+2.*np.sum(MyKinetic),np.sum(MyPotential),np.sum(MyKinetic)))
    
    if GraphSamples:    
        t0=np.transpose(np.reshape(t0,(Iterations+1,3)))
        t1=np.transpose(np.reshape(t1,(Iterations+1,3)))
        tL=np.transpose(np.reshape(tL,(Iterations+1,3)))
    
        import matplotlib.pyplot as plt
        from mpl_toolkits.mplot3d import Axes3D
    
        fig = plt.figure()
        ax = fig.gca(projection='3d')
        ax.scatter(t0[0],t0[1],t0[2], marker='^',color='blue')
        ax.scatter(t1[0],t1[1],t1[2], marker='o',color='red')
        ax.scatter(tL[0],tL[1],tL[2], marker='D',color='green')
   
        ax.set_xlabel('X Label')
        ax.set_ylabel('Y Label')
        ax.set_zlabel('Z Label')

        plt.show()
    
    clDataX.release()
    clDataV.release()
    clKinetic.release()
    clPotential.release()

