#include #include #include #include using namespace std; //************************************************************************* extern "C" void dvc_AllocateDeviceMemory(void** address, size_t size); //************************************************************************* extern "C" void dvc_CopyToDevice(void* dest, void* source, size_t size); //************************************************************************* extern "C" void dvc_Barrier(); //************************************************************************* extern "C" void dvc_InitD3Q19(char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz, int nblocks, int nthreads, int S); //************************************************************************* extern "C" void dvc_SwapD3Q19(char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz, int nblocks, int nthreads, int S); //************************************************************************* extern "C" void dvc_MRT(char *ID, double *f_even, double *f_odd, double rlxA, double rlxB, double Fx, double Fy, double Fz, int Nx, int Ny, int Nz, int nblocks, int nthreads, int S); //************************************************************************* void Write_Out(double *array, int Nx, int Ny, int Nz){ int value; FILE *output; output = fopen("dist.list","w"); for (int k=0; k> FILENAME; // name of the input file input >> Nz; // number of nodes (x,y,z) input >> nBlocks; input >> nthreads; input >> tau; // relaxation time input >> Fx; // External force components (x,y,z) input >> Fy; input >> Fz; input >> timestepMax; // max no. of timesteps input >> interval; // error interval input >> tol; // error tolerance double rlx_setA = 1.f/tau; double rlx_setB = 8.f*(2.f-rlx_setA)/(8.f-rlx_setA); printf("tau = %f \n", tau); printf("Set A = %f \n", rlx_setA); printf("Set B = %f \n", rlx_setB); printf("Force(x) = %f \n", Fx); printf("Force(y) = %f \n", Fy); printf("Force(z) = %f \n", Fz); Nx = Ny = Nz; // Cubic domain int N = Nx*Ny*Nz; int dist_mem_size = N*sizeof(double); // unsigned int nBlocks = 32; // int nthreads = 128; int S = N/nthreads/nBlocks; // unsigned int nBlocks = N/nthreads + (N%nthreads == 0?0:1); dim3 grid(nBlocks,1,1); printf("Number of blocks = %i \n", nBlocks); printf("Threads per block = %i \n", nthreads); printf("Sweeps per thread = %i \n", S); printf("Number of nodes per side = %i \n", Nx); printf("Total Number of nodes = %i \n", N); //....................................................................... printf("Read input media... \n"); // .......... READ THE INPUT FILE ....................................... int n; char value; char *id; id = new char[N]; int sum = 0; double porosity; ifstream PM(FILENAME.c_str(),ios::binary); for (int k=0;k 0) sum++; } } } PM.close(); printf("File porosity = %f\n", double(sum)/N); //....................................................................... //...........device phase ID................................................. char *ID; dvc_AllocateDeviceMemory((void **) &ID, N); // Allocate device memory // Copy to the device dvc_CopyToDevice(ID, id, N); //........................................................................... //......................device distributions................................. double *f_even,*f_odd; //........................................................................... dvc_AllocateDeviceMemory((void **) &f_even, 10*dist_mem_size); // Allocate device memory dvc_AllocateDeviceMemory((void **) &f_odd, 9*dist_mem_size); // Allocate device memory //........................................................................... //........................................................................... // cudaHostAlloc(&fa,dist_mem_size,cudaHostAllocPortable); // cudaHostAlloc(&fb,dist_mem_size,cudaHostAllocPortable); // cudaHostRegister(fa,dist_mem_size,cudaHostRegisterPortable); // cudaHostRegister(fb,dist_mem_size,cudaHostRegisterPortable); // cudaHostRegister(id,N*sizeof(char),cudaHostAllocPortable); printf("Setting the distributions, size = : %i\n", N); //........................................................................... // INITIALIZE <<< grid, nthreads >>> (ID, f_even, f_odd, Nx, Ny, Nz, S); //........................................................................... dvc_InitD3Q19(ID,f_even,f_odd,Nx,Ny,Nz,nBlocks,nthreads,S); //************************************************************************* int timestep = 0; printf("No. of timesteps: %i \n", timestepMax); //.......create a stream for the LB calculation....... cudaStream_t stream; cudaStreamCreate(&stream); //.......create and start timer............ cudaEvent_t start, stop; float time; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord( start, 0 ); //......................................... //************ MAIN ITERATION LOOP ***************************************/ while (timestep < timestepMax){ //................................................................... //........ Execute the swap kernel (device) ......................... // SWAP <<< grid, nthreads >>> (ID, f_even, f_odd, Nx, Ny, Nz, S); //................................................................... dvc_SwapD3Q19(ID,f_even,f_odd,Nx,Ny,Nz,nBlocks,nthreads,S); //........ Execute the collision kernel (device) .................... // MRT <<< grid, nthreads >>> (ID, f_even, f_odd, Nx, Ny, Nz, S, // rlx_setA, rlx_setB, Fx, Fy, Fz); //............................................................ dvc_MRT(ID, f_even, f_odd, rlx_setA, rlx_setB, Fx, Fy, Fz,Nx,Ny,Nz,nBlocks,nthreads,S); // Iteration completed! timestep++; //................................................................... } //************************************************************************/ // cudaThreadSynchronize(); dvc_Barrier(); //.......... stop and destroy timer............................. cudaEventRecord( stop, stream); cudaEventSynchronize( stop ); cudaEventElapsedTime( &time, start, stop ); printf("CPU time = %f \n", time); float MLUPS = 0.001*float(Nx*Ny*Nz)*timestep/time; printf("MLUPS = %f \n", MLUPS); cudaStreamDestroy(stream); cudaEventDestroy( start ); cudaEventDestroy( stop ); //.............................................................. //.............................................................. /*//.........Compute the velocity and copy result to host ........ double *velocity; velocity = new double[3*N]; //......................device distributions.................................... double *vel; //.............................................................................. dvc_AllocateDeviceMemory((void **) &vel, 3*dist_mem_size); // Allocate device memory //.............................................................................. // Compute_VELOCITY <<< grid, nthreads >>> (ID, f_even, f_odd, vel, Nx, Ny, Nz, S); //.............................................................................. // cudaMemcpy(velocity, vel, 3*dist_mem_size, cudaMemcpyDeviceToHost); //.............................................................................. //............................................................ //....Write the z-velocity to test poiseuille flow............ double vz,vz_avg; vz_avg = 0.0; /* FILE *output; output = fopen("velocity.out","w"); for (int k=0; k<1; k++){ for (int j=0; j<1; j++){ for (int i=0; i