Created preliminary cmake build

This commit is contained in:
Mark Berrill
2013-12-05 11:42:57 -05:00
committed by Mark A. Berrill
parent 9dd97d7f0f
commit 0430704252
39 changed files with 2644 additions and 2657 deletions

6
gpu/exe/CMakeLists.txt Executable file
View File

@@ -0,0 +1,6 @@
# Executables in gpu/exe that are passed to INSTALL_LBPM_EXE (macro defined elsewhere in the build).
# Only lb2_Color_wia_mpi is currently enabled; the other targets are kept commented out.
#INSTALL_LBPM_EXE( lb1_MRT_mpi )
#INSTALL_LBPM_EXE( lb2_Color )
#INSTALL_LBPM_EXE( lb2_Color_mpi )
#INSTALL_LBPM_EXE( lb2_Color_pBC_wia_mpi )
INSTALL_LBPM_EXE( lb2_Color_wia_mpi )

248
gpu/exe/lb1_MRT-swap.cu Normal file
View File

@@ -0,0 +1,248 @@
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <cuda.h>
//#include <cutil.h>
using namespace std;
//*************************************************************************
// Host-callable entry points with C linkage; their definitions are not in this file.
// main() below calls them to initialise the D3Q19 distributions, run the swap step
// and run the MRT collision step. nblocks/nthreads/S receive the values printed by
// main() as "Number of blocks", "Threads per block" and "Sweeps per thread".
// NOTE(review): lb2_Color.cpp declares dvc_InitD3Q19 and dvc_SwapD3Q19 with the launch
// arguments first (nblocks, nthreads, S, ID, ...). C linkage is not type-checked by the
// linker, so confirm which argument order the device library actually implements.
extern "C" void dvc_InitD3Q19(char *ID, double *f_even, double *f_odd, int Nx,
int Ny, int Nz, int nblocks, int nthreads, int S);
//*************************************************************************
extern "C" void dvc_SwapD3Q19(char *ID, double *f_even, double *f_odd, int Nx,
int Ny, int Nz, int nblocks, int nthreads, int S);
//*************************************************************************
// MRT collision step; rlxA / rlxB are the two relaxation rates and (Fx,Fy,Fz) the
// external force components that main() reads from the input file.
extern "C" void dvc_MRT(char *ID, double *f_even, double *f_odd, double rlxA, double rlxB, double Fx, double Fy, double Fz,
int Nx, int Ny, int Nz, int nblocks, int nthreads, int S);
//*************************************************************************
// Write a 3-D array to the text file "dist.list" in the current directory.
//   array      - Nx*Ny*Nz values, indexed as k*Nx*Ny + j*Nx + i
//   Nx, Ny, Nz - extent of the array in each direction
// Every value is truncated to an int before printing. One text line is written
// per (j,k) row, and a line of asterisks follows each k-slice.
// If the file cannot be opened a message is printed to stderr and nothing is written.
void Write_Out(double *array, int Nx, int Ny, int Nz){
    FILE *output = fopen("dist.list","w");
    if (output == NULL){
        // Previously a failed open fell through to fprintf/fclose on a NULL stream
        fprintf(stderr, "Write_Out: unable to open dist.list for writing \n");
        return;
    }
    for (int k=0; k<Nz; k++){
        for (int j=0; j<Ny; j++){
            for (int i=0; i<Nx; i++){
                int index = k*Nx*Ny+j*Nx+i;
                int value = int(array[index]);
                fprintf(output, "| %i",value);
            }
            fprintf(output, " | \n");
        }
        fprintf(output,"************************************** \n");
    }
    fclose(output);
}
//**************************************************************************
// MRT implementation of the LBM using CUDA
//**************************************************************************
// Report a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool CudaOk(cudaError_t err, const char *what){
    if (err == cudaSuccess) return true;
    fprintf(stderr, "CUDA error in %s: %s \n", what, cudaGetErrorString(err));
    return false;
}
//**************************************************************************
// Driver for the single-GPU MRT test case:
//   1. reads the run parameters from "MRT.in"
//   2. reads the binary phase-ID file named there (one char per node, cubic domain)
//   3. initialises the distributions on the device
//   4. runs timestepMax swap + MRT collision steps and prints the elapsed time
//   5. writes the first row of the z-velocity array to "velocity.out"
// Returns 0 on success and 1 when the input cannot be read or a CUDA call fails.
//**************************************************************************
int main(void)
{
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);
    // Device 1 is preferred; fall back to device 0 when it does not exist
    int device = 1;
    if (device >= deviceCount) device = 0;
    printf("Number of devices = %i \n", deviceCount);
    printf("Current device is = %i \n", device);
    if (!CudaOk(cudaSetDevice(device), "cudaSetDevice")) return 1;

    // BGK Model parameters
    string FILENAME;
    unsigned int nBlocks = 0, nthreads = 0;
    int timestepMax = 0, interval = 0;
    double tau = 0.0, Fx = 0.0, Fy = 0.0, Fz = 0.0, tol = 0.0;
    // Domain variables
    int Nx, Ny;
    int Nz = 0;

    ifstream input("MRT.in");
    input >> FILENAME;      // name of the input file
    input >> Nz;            // number of nodes (x,y,z)
    input >> nBlocks;
    input >> nthreads;
    input >> tau;           // relaxation time
    input >> Fx;            // External force components (x,y,z)
    input >> Fy;
    input >> Fz;
    input >> timestepMax;   // max no. of timesteps
    if (!input){
        fprintf(stderr, "Unable to read simulation parameters from MRT.in \n");
        return 1;
    }
    // The last two entries are read to consume the file but are not used below
    input >> interval;      // error interval
    input >> tol;           // error tolerance
    if (Nz <= 0 || nBlocks == 0 || nthreads == 0){
        // nBlocks and nthreads are divisors in the sweep count below
        fprintf(stderr, "Invalid domain size or launch configuration in MRT.in \n");
        return 1;
    }

    double rlx_setA = 1.f/tau;
    double rlx_setB = 8.f*(2.f-rlx_setA)/(8.f-rlx_setA);
    printf("tau = %f \n", tau);
    printf("Set A = %f \n", rlx_setA);
    printf("Set B = %f \n", rlx_setB);
    printf("Force(x) = %f \n", Fx);
    printf("Force(y) = %f \n", Fy);
    printf("Force(z) = %f \n", Fz);

    Nx = Ny = Nz;           // Cubic domain
    int N = Nx*Ny*Nz;
    // Byte counts are kept in size_t: 10*N*sizeof(double) exceeds INT_MAX for large domains
    size_t dist_mem_size = size_t(N)*sizeof(double);
    int S = N/nthreads/nBlocks;

    printf("Number of blocks = %u \n", nBlocks);
    printf("Threads per block = %u \n", nthreads);
    printf("Sweeps per thread = %i \n", S);
    printf("Number of nodes per side = %i \n", Nx);
    printf("Total Number of nodes = %i \n", N);
    //.......................................................................
    printf("Read input media... \n");
    // .......... READ THE INPUT FILE .......................................
    ifstream PM(FILENAME.c_str(),ios::binary);
    if (!PM){
        fprintf(stderr, "Unable to open input media file %s \n", FILENAME.c_str());
        return 1;
    }
    char *id = new char[N];
    char value = 0;
    int sum = 0;            // number of nodes with a positive phase ID
    // The file is read in storage order, n = k*Nx*Ny + j*Nx + i
    for (int n=0; n<N; n++){
        PM.read((char *) (&value), sizeof(value));
        id[n] = value;
        if (value > 0) sum++;
    }
    PM.close();
    printf("File porosity = %f\n", double(sum)/N);
    //.......................................................................
    //...........device phase ID.................................................
    char *ID = NULL;
    if (!CudaOk(cudaMalloc((void **) &ID, N), "cudaMalloc(ID)")) return 1;
    // Copy to the device
    if (!CudaOk(cudaMemcpy(ID, id, N, cudaMemcpyHostToDevice), "cudaMemcpy(ID)")) return 1;
    //......................device distributions.................................
    double *f_even = NULL, *f_odd = NULL;
    if (!CudaOk(cudaMalloc((void **) &f_even, 10*dist_mem_size), "cudaMalloc(f_even)")) return 1;
    if (!CudaOk(cudaMalloc((void **) &f_odd, 9*dist_mem_size), "cudaMalloc(f_odd)")) return 1;
    //...........................................................................
    printf("Setting the distributions, size = : %i\n", N);
    dvc_InitD3Q19(ID,f_even,f_odd,Nx,Ny,Nz,nBlocks,nthreads,S);
    //*************************************************************************
    int timestep = 0;
    printf("No. of timesteps: %i \n", timestepMax);
    //.......create a stream for the LB calculation.......
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    //.......create and start timer............
    cudaEvent_t start, stop;
    float time;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord( start, 0 );
    //************ MAIN ITERATION LOOP ***************************************/
    while (timestep < timestepMax){
        //........ Execute the swap kernel (device) .........................
        dvc_SwapD3Q19(ID,f_even,f_odd,Nx,Ny,Nz,nBlocks,nthreads,S);
        //........ Execute the collision kernel (device) ....................
        dvc_MRT(ID, f_even, f_odd, rlx_setA, rlx_setB, Fx, Fy, Fz,Nx,Ny,Nz,nBlocks,nthreads,S);
        // Iteration completed!
        timestep++;
    }
    //************************************************************************/
    // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize;
    // its return value also reports errors from the preceding device work
    if (!CudaOk(cudaDeviceSynchronize(), "main iteration loop")) return 1;
    //.......... stop and destroy timer.............................
    cudaEventRecord( stop, stream);
    cudaEventSynchronize( stop );
    cudaEventElapsedTime( &time, start, stop );   // elapsed time in milliseconds
    printf("CPU time = %f \n", time);
    float MLUPS = 0.001*float(Nx*Ny*Nz)*timestep/time;
    printf("MLUPS = %f \n", MLUPS);
    cudaStreamDestroy(stream);
    cudaEventDestroy( start );
    cudaEventDestroy( stop );
    //..............................................................
    //.........Compute the velocity and copy result to host ........
    double *velocity = new double[3*size_t(N)];
    double *vel = NULL;
    if (!CudaOk(cudaMalloc((void **) &vel, 3*dist_mem_size), "cudaMalloc(vel)")) return 1;
    // NOTE(review): the Compute_VELOCITY launch is disabled, so nothing writes vel.
    // It is zeroed here so the copy below returns defined values instead of
    // uninitialised device memory.
    // Compute_VELOCITY <<< grid, nthreads >>> (ID, f_even, f_odd, vel, Nx, Ny, Nz, S);
    CudaOk(cudaMemset(vel, 0, 3*dist_mem_size), "cudaMemset(vel)");
    CudaOk(cudaMemcpy(velocity, vel, 3*dist_mem_size, cudaMemcpyDeviceToHost), "cudaMemcpy(velocity)");
    //............................................................
    //....Write the z-velocity to test poiseuille flow............
    double vz,vz_avg;
    vz_avg = 0.0;
    FILE *output = fopen("velocity.out","w");
    if (output == NULL){
        fprintf(stderr, "Unable to open velocity.out for writing \n");
    }
    // Only the first row (j = 0, k = 0) of the third velocity component is sampled
    for (int i=0; i<Nx; i++){
        vz = velocity[2*size_t(N)+i];
        vz_avg += vz;
        if (output != NULL) fprintf(output, " %e",vz);
    }
    if (output != NULL) fclose(output);
    vz = vz_avg/double(sum);
    printf("Average Velocity = %e\n", vz);
    // cleanup
    cudaFree(f_even); cudaFree(f_odd); cudaFree(vel); cudaFree(ID);
    // Both host buffers come from new[], so they must be released with delete[]
    delete [] velocity;
    delete [] id;
    return 0;
}

246
gpu/exe/lb1_MRT.cu Normal file
View File

@@ -0,0 +1,246 @@
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <cuda.h>
using namespace std;
//*************************************************************************
// Host-callable entry points with C linkage; their definitions are not in this file.
// The first three are used by main() below to allocate device buffers, copy host
// data into them and wait for the device after the iteration loop.
extern "C" void dvc_AllocateDeviceMemory(void** address, size_t size);
//*************************************************************************
extern "C" void dvc_CopyToDevice(void* dest, void* source, size_t size);
//*************************************************************************
extern "C" void dvc_Barrier();
//*************************************************************************
// NOTE(review): lb2_Color.cpp declares dvc_InitD3Q19 and dvc_SwapD3Q19 with the launch
// arguments first (nblocks, nthreads, S, ID, ...). C linkage is not type-checked by the
// linker, so confirm which argument order the device library actually implements.
extern "C" void dvc_InitD3Q19(char *ID, double *f_even, double *f_odd, int Nx,
int Ny, int Nz, int nblocks, int nthreads, int S);
//*************************************************************************
extern "C" void dvc_SwapD3Q19(char *ID, double *f_even, double *f_odd, int Nx,
int Ny, int Nz, int nblocks, int nthreads, int S);
//*************************************************************************
// MRT collision step; rlxA / rlxB are the two relaxation rates and (Fx,Fy,Fz) the
// external force components that main() reads from the input file.
extern "C" void dvc_MRT(char *ID, double *f_even, double *f_odd, double rlxA, double rlxB, double Fx, double Fy, double Fz,
int Nx, int Ny, int Nz, int nblocks, int nthreads, int S);
//*************************************************************************
// Write a 3-D array to the text file "dist.list" in the current directory.
//   array      - Nx*Ny*Nz values, indexed as k*Nx*Ny + j*Nx + i
//   Nx, Ny, Nz - extent of the array in each direction
// Every value is truncated to an int before printing. One text line is written
// per (j,k) row, and a line of asterisks follows each k-slice.
// If the file cannot be opened a message is printed to stderr and nothing is written.
void Write_Out(double *array, int Nx, int Ny, int Nz){
    FILE *output = fopen("dist.list","w");
    if (output == NULL){
        // Previously a failed open fell through to fprintf/fclose on a NULL stream
        fprintf(stderr, "Write_Out: unable to open dist.list for writing \n");
        return;
    }
    for (int k=0; k<Nz; k++){
        for (int j=0; j<Ny; j++){
            for (int i=0; i<Nx; i++){
                int index = k*Nx*Ny+j*Nx+i;
                int value = int(array[index]);
                fprintf(output, "| %i",value);
            }
            fprintf(output, " | \n");
        }
        fprintf(output,"************************************** \n");
    }
    fclose(output);
}
//**************************************************************************
// MRT implementation of the LBM using CUDA
//**************************************************************************
// Driver for the single-GPU MRT test case; device memory is handled through the
// dvc_* wrappers declared above:
//   1. reads the run parameters from "MRT.in"
//   2. reads the binary phase-ID file named there (one char per node, cubic domain)
//   3. initialises the distributions on the device
//   4. runs timestepMax swap + MRT collision steps and prints the elapsed time
// Returns 0 on success and 1 when the input cannot be read.
int main(void)
{
    // BGK Model parameters
    string FILENAME;
    unsigned int nBlocks = 0, nthreads = 0;
    int timestepMax = 0, interval = 0;
    double tau = 0.0, Fx = 0.0, Fy = 0.0, Fz = 0.0, tol = 0.0;
    // Domain variables
    int Nx, Ny;
    int Nz = 0;

    ifstream input("MRT.in");
    input >> FILENAME;      // name of the input file
    input >> Nz;            // number of nodes (x,y,z)
    input >> nBlocks;
    input >> nthreads;
    input >> tau;           // relaxation time
    input >> Fx;            // External force components (x,y,z)
    input >> Fy;
    input >> Fz;
    input >> timestepMax;   // max no. of timesteps
    if (!input){
        fprintf(stderr, "Unable to read simulation parameters from MRT.in \n");
        return 1;
    }
    // The last two entries are read to consume the file but are not used below
    input >> interval;      // error interval
    input >> tol;           // error tolerance
    if (Nz <= 0 || nBlocks == 0 || nthreads == 0){
        // nBlocks and nthreads are divisors in the sweep count below
        fprintf(stderr, "Invalid domain size or launch configuration in MRT.in \n");
        return 1;
    }

    double rlx_setA = 1.f/tau;
    double rlx_setB = 8.f*(2.f-rlx_setA)/(8.f-rlx_setA);
    printf("tau = %f \n", tau);
    printf("Set A = %f \n", rlx_setA);
    printf("Set B = %f \n", rlx_setB);
    printf("Force(x) = %f \n", Fx);
    printf("Force(y) = %f \n", Fy);
    printf("Force(z) = %f \n", Fz);

    Nx = Ny = Nz;           // Cubic domain
    int N = Nx*Ny*Nz;
    // Byte counts are kept in size_t: 10*N*sizeof(double) exceeds INT_MAX for large domains
    size_t dist_mem_size = size_t(N)*sizeof(double);
    int S = N/nthreads/nBlocks;

    printf("Number of blocks = %u \n", nBlocks);
    printf("Threads per block = %u \n", nthreads);
    printf("Sweeps per thread = %i \n", S);
    printf("Number of nodes per side = %i \n", Nx);
    printf("Total Number of nodes = %i \n", N);
    //.......................................................................
    printf("Read input media... \n");
    // .......... READ THE INPUT FILE .......................................
    ifstream PM(FILENAME.c_str(),ios::binary);
    if (!PM){
        fprintf(stderr, "Unable to open input media file %s \n", FILENAME.c_str());
        return 1;
    }
    char *id = new char[N];
    char value = 0;
    int sum = 0;            // number of nodes with a positive phase ID
    // The file is read in storage order, n = k*Nx*Ny + j*Nx + i
    for (int n=0; n<N; n++){
        PM.read((char *) (&value), sizeof(value));
        id[n] = value;
        if (value > 0) sum++;
    }
    PM.close();
    printf("File porosity = %f\n", double(sum)/N);
    //.......................................................................
    //...........device phase ID.................................................
    char *ID = NULL;
    dvc_AllocateDeviceMemory((void **) &ID, N);     // Allocate device memory
    // Copy to the device
    dvc_CopyToDevice(ID, id, N);
    //......................device distributions.................................
    double *f_even = NULL, *f_odd = NULL;
    dvc_AllocateDeviceMemory((void **) &f_even, 10*dist_mem_size);  // Allocate device memory
    dvc_AllocateDeviceMemory((void **) &f_odd, 9*dist_mem_size);    // Allocate device memory
    //...........................................................................
    printf("Setting the distributions, size = : %i\n", N);
    dvc_InitD3Q19(ID,f_even,f_odd,Nx,Ny,Nz,nBlocks,nthreads,S);
    //*************************************************************************
    int timestep = 0;
    printf("No. of timesteps: %i \n", timestepMax);
    //.......create a stream for the LB calculation.......
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    //.......create and start timer............
    cudaEvent_t start, stop;
    float time;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord( start, 0 );
    //************ MAIN ITERATION LOOP ***************************************/
    while (timestep < timestepMax){
        //........ Execute the swap kernel (device) .........................
        dvc_SwapD3Q19(ID,f_even,f_odd,Nx,Ny,Nz,nBlocks,nthreads,S);
        //........ Execute the collision kernel (device) ....................
        dvc_MRT(ID, f_even, f_odd, rlx_setA, rlx_setB, Fx, Fy, Fz,Nx,Ny,Nz,nBlocks,nthreads,S);
        // Iteration completed!
        timestep++;
    }
    //************************************************************************/
    dvc_Barrier();
    //.......... stop and destroy timer.............................
    cudaEventRecord( stop, stream);
    cudaEventSynchronize( stop );
    cudaEventElapsedTime( &time, start, stop );   // elapsed time in milliseconds
    printf("CPU time = %f \n", time);
    float MLUPS = 0.001*float(Nx*Ny*Nz)*timestep/time;
    printf("MLUPS = %f \n", MLUPS);
    cudaStreamDestroy(stream);
    cudaEventDestroy( start );
    cudaEventDestroy( stop );
    //..............................................................
    // The velocity output is disabled in this driver. It is kept under #if 0
    // (instead of the former nested block comment) so it can be re-enabled once a
    // velocity kernel and a device-to-host copy are available.
#if 0
    //.........Compute the velocity and copy result to host ........
    double *velocity;
    velocity = new double[3*N];
    //......................device distributions....................................
    double *vel;
    dvc_AllocateDeviceMemory((void **) &vel, 3*dist_mem_size);  // Allocate device memory
    // Compute_VELOCITY <<< grid, nthreads >>> (ID, f_even, f_odd, vel, Nx, Ny, Nz, S);
    // cudaMemcpy(velocity, vel, 3*dist_mem_size, cudaMemcpyDeviceToHost);
    //....Write the z-velocity to test poiseuille flow............
    double vz,vz_avg;
    vz_avg = 0.0;
    FILE *output;
    output = fopen("velocity.out","w");
    for (int k=0; k<1; k++){
        for (int j=0; j<1; j++){
            for (int i=0; i<Nx; i++){
                int n = k*Nx*Ny+j*Nx+i;
                //.....print value........
                vz = velocity[2*N+n];
                vz_avg += vz;
                fprintf(output, " %e",vz);
            }
        }
    }
    fclose(output);
    vz = vz_avg/double(sum);
    printf("Average Velocity = %e\n", vz);
    delete [] velocity;
#endif
    // cleanup
    // NOTE(review): no device-free wrapper is declared in this file, so the device
    // buffers (ID, f_even, f_odd) are left for the runtime to reclaim at exit.
    delete [] id;
    return 0;
}

1338
gpu/exe/lb1_MRT_mpi.cpp Normal file

File diff suppressed because it is too large Load Diff

1836
gpu/exe/lb1_MRT_mpi.cu Normal file

File diff suppressed because it is too large Load Diff

372
gpu/exe/lb2_Color.cpp Normal file
View File

@@ -0,0 +1,372 @@
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>
#include <fstream>
#include <string>
//*************************************************************************
// Functions defined in Color.cu
//*************************************************************************
extern "C" void dvc_InitDenColor( int nblocks, int nthreads, int S,
char *ID, double *Den, double *Phi, double das, double dbs, int N);
//*************************************************************************
extern "C" void dvc_ComputeColorGradient(int nBlocks, int nthreads, int S,
char *ID, double *Phi, double *ColorGrad, int Nx, int Ny, int Nz);
//*************************************************************************
extern "C" void dvc_ColorCollide(int nBlocks, int nthreads, int S,
char *ID, double *f_even, double *f_odd, double *ColorGrad, double *Velocity,
double rlxA, double rlxB,double alpha, double beta, double Fx, double Fy, double Fz,
int Nx, int Ny, int Nz, bool pBC);
//*************************************************************************
extern "C" void dvc_DensityStreamD3Q7(int nBlocks, int nthreads, int S,
char *ID, double *Den, double *Copy, double *Phi, double *ColorGrad, double *Velocity,
double beta, int Nx, int Ny, int Nz, bool pBC);
//*************************************************************************
extern "C" void dvc_ComputePhi(int nBlocks, int nthreads, int S,
char *ID, double *Phi, double *Copy, double *Den, int N);
//*************************************************************************
//*************************************************************************
// Functions defined in D3Q19.cu
//*************************************************************************
extern "C" void dvc_InitD3Q19(int nblocks, int nthreads, int S, char *ID, double *f_even, double *f_odd, int Nx,
int Ny, int Nz);
//*************************************************************************
extern "C" void dvc_SwapD3Q19(int nblocks, int nthreads, int S,
char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz);
//*************************************************************************
extern "C" void dvc_PackDist(int grid, int threads, int q, int *SendList, int start,
int sendCount, double *sendbuf, double *Dist, int N);
//*************************************************************************
extern "C" void dvc_UnpackDist(int grid, int threads, int q, int Cqx, int Cqy, int Cqz, int *RecvList, int start,
int recvCount, double *recvbuf, double *Dist, int Nx, int Ny, int Nz);
//*************************************************************************
//***************************************************************************************
// Functions defined in D3Q7.cu
//***************************************************************************************
extern "C" void dvc_PackDenD3Q7(int grid, int threads, int *list, int count, double *sendbuf,
int number, double *Data, int N);
//***************************************************************************************
extern "C" void dvc_UnpackDenD3Q7(int grid, int threads, int *list, int count, double *recvbuf,
int number, double *Data, int N);
//***************************************************************************************
extern "C" void dvc_PackValues(int grid, int threads, int *list, int count, double *sendbuf,
double *Data, int N);
//***************************************************************************************
extern "C" void dvc_UnpackValues(int grid, int threads, int *list, int count, double *recvbuf,
double *Data, int N);
//***************************************************************************************
//*************************************************************************
// Functions defined in CudaExtras.cu
//*************************************************************************
extern "C" void dvc_AllocateDeviceMemory(void** address, size_t size);
//*************************************************************************
extern "C" void dvc_CopyToDevice(void* dest, void* source, size_t size);
//*************************************************************************
extern "C" void dvc_CopyToHost(void* dest, void* source, size_t size);
//*************************************************************************
extern "C" void dvc_Barrier();
//*************************************************************************
//*************************************************************************
// Implementation of Two-Phase Immiscible LBM using CUDA
//*************************************************************************
using namespace std;
inline void PackID(int *list, int count, char *sendbuf, char *ID){
    // Gather step of the phase-ID exchange between processors:
    // for each of the count entries in list, copy ID[list[idx]] into sendbuf[idx].
    const int *site = list;
    char *out = sendbuf;
    for (int remaining = count; remaining > 0; --remaining){
        *out++ = ID[*site++];
    }
}
//***************************************************************************************
inline void UnpackID(int *list, int count, char *recvbuf, char *ID){
    // Scatter step of the phase-ID exchange between processors:
    // once the values have been received, store recvbuf[idx] into ID[list[idx]]
    // for each of the count entries in list.
    const int *site = list;
    const char *in = recvbuf;
    for (int remaining = count; remaining > 0; --remaining){
        ID[*site++] = *in++;
    }
}
//***************************************************************************************
// Single-rank driver for the two-phase colour LBM:
//   1. reads the model parameters from "Color.in" and the processor grid from "Domain.in"
//   2. reads the phase-ID file "ID.00000" into the interior of a domain that is
//      padded by one layer of zero-valued nodes on every side
//   3. allocates and initialises the device arrays through the dvc_* wrappers
//   4. runs timestepMax iterations and prints the lattice update rate
//   5. copies the phase indicator field back and writes it to "Phase.00000"
// Returns 0 on success and 1 when a required input or output file cannot be used.
int main(int argc, char **argv)
{
    int rank = 0;
    int nprocs =1;
    // Initialised so the consistency check below is well defined if Domain.in is missing
    int nprocx = 0, nprocy = 0, nprocz = 0;
    if (rank == 0){
        printf("********************************************************\n");
        printf("Running Hybrid Implementation of Color LBM \n");
        printf("********************************************************\n");
    }
    // Color Model parameters
    string FILENAME;
    unsigned int nBlocks = 0, nthreads = 0;
    int Nx, Ny;
    int Nz = 0;
    int timestepMax = 0, interval = 0;
    double tau = 0.0, Fx = 0.0, Fy = 0.0, Fz = 0.0, tol = 0.0;
    double alpha = 0.0, beta = 0.0;
    double das = 0.0, dbs = 0.0;
    double din = 0.0, dout = 0.0;
    bool pBC = false;
    int i,j,k,n;
    if (rank==0){
        //.............................................................
        // READ SIMULATION PARAMETERS FROM INPUT FILE
        //.............................................................
        ifstream input("Color.in");
        // Line 1: Name of the phase indicator file (s=0,w=1,n=2)
        input >> FILENAME;
        // Line 2: domain size (Nx, Ny, Nz)
        input >> Nz;            // number of nodes (x,y,z)
        input >> nBlocks;
        input >> nthreads;
        // Line 3: model parameters (tau, alpha, beta, das, dbs)
        input >> tau;
        input >> alpha;
        input >> beta;
        input >> das;
        input >> dbs;
        // Line 4: External force components (Fx,Fy, Fz)
        input >> Fx;
        input >> Fy;
        input >> Fz;
        // Line 5: Pressure Boundary conditions
        input >> pBC;
        input >> din;
        input >> dout;
        // Line 6: time-stepping criteria
        input >> timestepMax;   // max no. of timesteps
        if (!input){
            fprintf(stderr, "Unable to read simulation parameters from Color.in \n");
            return 1;
        }
        // The last two entries are read to consume the file but are not used below
        input >> interval;      // error interval
        input >> tol;           // error tolerance
        //.............................................................
        ifstream domain("Domain.in");
        domain >> nprocx;
        domain >> nprocy;
        domain >> nprocz;
    }
    if (Nz <= 0 || nBlocks == 0 || nthreads == 0){
        // nBlocks and nthreads are divisors in the sweep count below
        fprintf(stderr, "Invalid domain size or launch configuration in Color.in \n");
        return 1;
    }
    double rlxA = 1.f/tau;
    double rlxB = 8.f*(2.f-rlxA)/(8.f-rlxA);
    // NOTE(review): the message says "fatal" but the run has always continued past
    // this check; that behaviour is kept here - confirm whether it should abort.
    if (nprocs != nprocx*nprocy*nprocz){
        printf("Fatal error in processor number! \n");
        printf("nprocx = %i \n",nprocx);
        printf("nprocy = %i \n",nprocy);
        printf("nprocz = %i \n",nprocz);
    }
    if (rank==0){
        printf("********************************************************\n");
        printf("tau = %f \n", tau);
        printf("alpha = %f \n", alpha);
        printf("beta = %f \n", beta);
        // These two lines previously printed beta instead of das / dbs
        printf("das = %f \n", das);
        printf("dbs = %f \n", dbs);
        printf("Force(x) = %f \n", Fx);
        printf("Force(y) = %f \n", Fy);
        printf("Force(z) = %f \n", Fz);
        printf("Sub-domain size = %i x %i x %i\n",Nz,Nz,Nz);
        printf("Parallel domain size = %i x %i x %i\n",nprocx,nprocy,nprocz);
        printf("********************************************************\n");
    }
    Nz += 2;                // one extra layer of nodes on each side of the domain
    Nx = Ny = Nz;           // Cubic domain
    int N = Nx*Ny*Nz;
    // Byte counts are kept in size_t: 10*N*sizeof(double) exceeds INT_MAX for large domains
    size_t dist_mem_size = size_t(N)*sizeof(double);
    int S = N/nthreads/nBlocks;
    if (rank==0) printf("Number of blocks = %u \n", nBlocks);
    if (rank==0) printf("Threads per block = %u \n", nthreads);
    if (rank==0) printf("Sweeps per thread = %i \n", S);
    if (rank==0) printf("Number of nodes per side = %i \n", Nx);
    if (rank==0) printf("Total Number of nodes = %i \n", N);
    if (rank==0) printf("********************************************************\n");
    //.......................................................................
    if (rank == 0) printf("Read input media... \n");
    //.......................................................................
    char LocalRankString[8];
    char LocalRankFilename[40];
    sprintf(LocalRankString,"%05d",rank);
    sprintf(LocalRankFilename,"%s%s","ID.",LocalRankString);
    // .......... READ THE INPUT FILE .......................................
    ifstream PM(LocalRankFilename,ios::binary);
    if (!PM){
        fprintf(stderr, "Unable to open input media file %s \n", LocalRankFilename);
        return 1;
    }
    char value = 0;
    char *id = new char[N];
    int sum = 0;            // number of interior nodes with a positive phase ID
    // Zero the whole array first so the outer layer keeps ID 0
    for (n=0; n<N; n++) id[n] = 0;
    // The file holds the interior nodes only
    for ( k=1;k<Nz-1;k++){
        for ( j=1;j<Ny-1;j++){
            for ( i=1;i<Nx-1;i++){
                PM.read((char *) (&value), sizeof(value));
                n = k*Nx*Ny+j*Nx+i;
                id[n] = value;
                if (value > 0) sum++;
            }
        }
    }
    PM.close();
    //...........device phase ID.................................................
    if (rank==0) printf ("Copying phase ID to device \n");
    char *ID = NULL;
    dvc_AllocateDeviceMemory((void **) &ID, N);     // Allocate device memory
    // Copy to the device
    dvc_CopyToDevice(ID, id, N);
    //...........................................................................
    if (rank==0) printf ("Allocating distributions \n");
    //......................device distributions.................................
    double *f_even = NULL, *f_odd = NULL;
    dvc_AllocateDeviceMemory((void **) &f_even, 10*dist_mem_size);  // Allocate device memory
    dvc_AllocateDeviceMemory((void **) &f_odd, 9*dist_mem_size);    // Allocate device memory
    //...........................................................................
    //              MAIN  VARIABLES ALLOCATED HERE
    //...........................................................................
    double *Phi = NULL, *Den = NULL, *Copy = NULL;
    double *ColorGrad = NULL, *Velocity = NULL;
    dvc_AllocateDeviceMemory((void **) &Phi, dist_mem_size);
    dvc_AllocateDeviceMemory((void **) &Den, 2*dist_mem_size);
    dvc_AllocateDeviceMemory((void **) &Copy, 2*dist_mem_size);
    dvc_AllocateDeviceMemory((void **) &Velocity, 3*dist_mem_size);
    dvc_AllocateDeviceMemory((void **) &ColorGrad, 3*dist_mem_size);
    //...........................................................................
    if (rank==0) printf("Setting the distributions, size = : %i\n", N);
    //...........................................................................
    dvc_InitD3Q19(nBlocks, nthreads, S, ID, f_even, f_odd, Nx, Ny, Nz);
    dvc_InitDenColor(nBlocks, nthreads, S, ID, Den, Phi, das, dbs, N);
    //...........................................................................
    dvc_ComputePhi(nBlocks, nthreads, S,ID, Phi, Copy, Den, N);
    //...........................................................................
    int timestep = 0;
    if (rank==0) printf("********************************************************\n");
    if (rank==0) printf("No. of timesteps: %i \n", timestepMax);
    //.......create and start timer............
    double start,stop;
    double walltime;
    start = clock();
    //************ MAIN ITERATION LOOP ***************************************/
    while (timestep < timestepMax){
        // Compute the color gradient
        dvc_ComputeColorGradient(nBlocks, nthreads, S,
                ID, Phi, ColorGrad, Nx, Ny, Nz);
        // Perform collision step for the momentum transport
        dvc_ColorCollide(nBlocks, nthreads, S,
                ID, f_even, f_odd, ColorGrad, Velocity,
                rlxA, rlxB,alpha, beta, Fx, Fy, Fz, Nx, Ny, Nz, pBC);
        // Carry out the density streaming step for mass transport
        dvc_DensityStreamD3Q7(nBlocks, nthreads, S,
                ID, Den, Copy, Phi, ColorGrad, Velocity,beta, Nx, Ny, Nz, pBC);
        // Swap the distributions for momentum transport
        dvc_SwapD3Q19(nBlocks, nthreads, S, ID, f_even, f_odd, Nx, Ny, Nz);
        // Compute the phase indicator field and reset Copy, Den
        dvc_ComputePhi(nBlocks, nthreads, S,ID, Phi, Copy, Den, N);
        // Iteration completed!
        timestep++;
    }
    //************************************************************************/
    dvc_Barrier();
    stop = clock();
    walltime = (stop - start)/CLOCKS_PER_SEC;
    // The product is formed in double: N*timestep overflows int for realistic runs
    double MLUPS = double(N)*double(timestep)/walltime/1000000;
    if (rank==0) printf("********************************************************\n");
    if (rank==0) printf("CPU time = %f \n", walltime);
    if (rank==0) printf("Lattice update rate (per core)= %f MLUPS \n", MLUPS);
    MLUPS *= nprocs;
    if (rank==0) printf("Lattice update rate (total)= %f MLUPS \n", MLUPS);
    if (rank==0) printf("********************************************************\n");
    //************************************************************************/
    // Write out the phase indicator field
    //************************************************************************/
    sprintf(LocalRankFilename,"%s%s","Phase.",LocalRankString);
    double *phiOut = new double[N];
    dvc_CopyToHost(phiOut,Phi,N*sizeof(double));
    int status = 0;
    FILE *PHASE = fopen(LocalRankFilename,"wb");
    if (PHASE == NULL){
        fprintf(stderr, "Unable to open %s for writing \n", LocalRankFilename);
        status = 1;
    }
    else{
        fwrite(phiOut,sizeof(double),N,PHASE);
        fclose(PHASE);
    }
    //************************************************************************/
    // NOTE(review): no device-free wrapper is declared in this file, so the device
    // arrays are left for the runtime to reclaim at exit.
    delete [] phiOut;
    delete [] id;
    return status;
}

423
gpu/exe/lb2_Color.cu Normal file
View File

@@ -0,0 +1,423 @@
#ifdef useMPI
#include <mpi.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include <math.h>
#include <cuda.h>
using namespace std;
//*************************************************************************
// HokieSpeed
//nvcc -Xcompiler -fopenmp -lgomp -O3 -arch sm_20 -o hybridATLKR lb2_ATLKR_hybrid.cu
// -I$VT_MPI_INC -L$VT_MPI_LIB -lmpi
//*************************************************************************
//*************************************************************************
// Implementation of Two-Phase Immiscible LBM using CUDA
//*************************************************************************
//*************************************************************************
// Prototypes for device-side routines that are defined outside this file and
// linked in with C linkage. The notes below describe only how main() in this
// file uses them; see the defining source for the actual kernel behaviour.
//
// Called once before the timestep loop with the device phase-ID array and the
// two distribution buffers (f_even, f_odd).
extern "C" void dvc_InitD3Q19(int nblocks, int nthreads, int S,
char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz);
//*************************************************************************
// Called once before the timestep loop with Den, Phi and the das/dbs values
// read from Color.in.
extern "C" void dvc_InitDenColor( int nblocks, int nthreads, int S,
char *ID, double *Den, double *Phi, double das, double dbs, int N);
//*************************************************************************
// First call of every timestep; receives Phi and ColorGrad.
extern "C" void dvc_ComputeColorGradient(int nBlocks, int nthreads, int S,
char *ID, double *Phi, double *ColorGrad, int Nx, int Ny, int Nz);
//*************************************************************************
// Second call of every timestep; receives the distributions, ColorGrad,
// Velocity, the relaxation rates, alpha/beta, the body force and the pBC flag.
extern "C" void dvc_ColorCollide(int nBlocks, int nthreads, int S,
char *ID, double *f_even, double *f_odd, double *ColorGrad, double *Velocity,
double rlxA, double rlxB,double alpha, double beta, double Fx, double Fy, double Fz,
int Nx, int Ny, int Nz, bool pBC);
//*************************************************************************
// Third call of every timestep; receives Den, Copy, Phi, ColorGrad, Velocity.
extern "C" void dvc_DensityStreamD3Q7(int nBlocks, int nthreads, int S,
char *ID, double *Den, double *Copy, double *Phi, double *ColorGrad, double *Velocity,
double beta, int Nx, int Ny, int Nz, bool pBC);
//*************************************************************************
// Called once after initialisation and again as the last kernel call of every
// timestep; receives Phi, Copy and Den.
extern "C" void dvc_ComputePhi(int nBlocks, int nthreads, int S,
char *ID, double *Phi, double *Copy, double *Den, int N);
//*************************************************************************
// Declared but not called in this file (main uses cudaMalloc directly).
extern "C" void dvc_AllocateDeviceMemory(void** address, size_t size);
//*************************************************************************
// Declared but not called in this file (main uses cudaMemcpy directly).
extern "C" void dvc_CopyToDevice(void* dest, void* source, size_t size);
//*************************************************************************
// Called at the end of every timestep and once more after the loop, before
// the timer is stopped. Presumably a device synchronisation - TODO confirm
// against its definition.
extern "C" void dvc_Barrier();
//*************************************************************************
// Fourth call of every timestep, after the density streaming step; receives
// the two distribution buffers.
extern "C" void dvc_SwapD3Q19(int nblocks, int nthreads, int S,
char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz);
//*************************************************************************
// Declared but not called in this file.
extern "C" void dvc_PackDist(int grid, int threads, int q, int *SendList, int start,
int sendCount, double *sendbuf, double *Dist, int N);
//*************************************************************************
// Declared but not called in this file.
extern "C" void dvc_UnpackDist(int grid, int threads, int q, int Cqx, int Cqy, int Cqz, int *RecvList, int start,
int recvCount, double *recvbuf, double *Dist, int Nx, int Ny, int Nz);
//*************************************************************************
// Print a fatal error message and terminate the run. When built with useMPI
// the whole communicator is aborted so that no rank is left waiting in a
// collective call. Must only be used after MPI_Init.
static void AbortRun(const char *message)
{
	fprintf(stderr, "ERROR: %s\n", message);
#ifdef useMPI
	MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
#endif
	exit(EXIT_FAILURE);
}

// Terminate the run if a CUDA runtime call did not return cudaSuccess.
// `what` names the failed operation in the error message.
static void CheckCuda(cudaError_t status, const char *what)
{
	if (status != cudaSuccess){
		fprintf(stderr, "CUDA failure (%s): %s\n", what, cudaGetErrorString(status));
		AbortRun("unrecoverable CUDA error");
	}
}

// Driver for the two-phase colour lattice-Boltzmann model.
//
// Rank 0 reads the simulation parameters from "Color.in" and the phase-ID
// field from the file named there; with useMPI both are broadcast to every
// rank. Each rank then allocates the device arrays, runs timestepMax
// iterations of (colour gradient, collide, density stream, swap, compute phi)
// and writes the phase indicator field to "Phase.out" and the colour gradient
// to "ColorGrad.out".
//
// Returns 0 on success; terminates through AbortRun on unusable input or on a
// failed CUDA allocation/copy.
int main(int argc, char *argv[])
{
	//********** Initialize MPI ****************
	int numprocs,rank;
#ifdef useMPI
	MPI_Init(&argc,&argv);
	MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
	MPI_Comm_rank(MPI_COMM_WORLD,&rank);
#else
	numprocs = 1;
	rank = 0;
#endif
	//******************************************
	if (rank == 0){
		printf("********************************************************\n");
		printf("Running Hybrid Implementation of Color LBM \n");
		printf("********************************************************\n");
	}

	// Color Model parameters. Everything is zero-initialised so that a
	// truncated input file yields well-defined values that the validation
	// below can reject, instead of indeterminate ones.
	string FILENAME;
	unsigned int nBlocks = 0, nthreads = 0;
	int Nx,Ny,Nz = 0;
	int timestepMax = 0, interval = 0;
	double tau = 0.0, Fx = 0.0, Fy = 0.0, Fz = 0.0, tol = 0.0;
	double alpha = 0.0, beta = 0.0;
	double das = 0.0, dbs = 0.0;
	double din = 0.0, dout = 0.0;
	bool pBC = false;

	if (rank==0){
		//.............................................................
		//		READ SIMULATION PARAMETERS FROM INPUT FILE
		//.............................................................
		ifstream input("Color.in");
		if (!input) AbortRun("could not open input file Color.in");
		// Line 1: Name of the phase indicator file (s=0,w=1,n=2)
		input >> FILENAME;
		// Line 2: domain size (cubic, so only Nz is read) and launch configuration
		input >> Nz;
		input >> nBlocks;
		input >> nthreads;
		// Line 3: model parameters (tau, alpha, beta, das, dbs)
		input >> tau;
		input >> alpha;
		input >> beta;
		input >> das;
		input >> dbs;
		// Line 4: External force components (Fx,Fy, Fz)
		input >> Fx;
		input >> Fy;
		input >> Fz;
		// Line 5: Pressure Boundary conditions
		input >> pBC;
		input >> din;
		input >> dout;
		// Line 6: time-stepping criteria
		input >> timestepMax;	// max no. of timesteps
		input >> interval;		// error interval
		input >> tol;			// error tolerance
		//.............................................................
	}
#ifdef useMPI
	// **************************************************************
	// Broadcast simulation parameters from rank 0 to all other procs
	MPI_Barrier(MPI_COMM_WORLD);
	//.................................................
	MPI_Bcast(&Nz,1,MPI_INT,0,MPI_COMM_WORLD);
	// nBlocks / nthreads are unsigned, so use the matching MPI datatype.
	MPI_Bcast(&nBlocks,1,MPI_UNSIGNED,0,MPI_COMM_WORLD);
	MPI_Bcast(&nthreads,1,MPI_UNSIGNED,0,MPI_COMM_WORLD);
	MPI_Bcast(&Fx,1,MPI_DOUBLE,0,MPI_COMM_WORLD);
	MPI_Bcast(&Fy,1,MPI_DOUBLE,0,MPI_COMM_WORLD);
	MPI_Bcast(&Fz,1,MPI_DOUBLE,0,MPI_COMM_WORLD);
	MPI_Bcast(&tau,1,MPI_DOUBLE,0,MPI_COMM_WORLD);
	MPI_Bcast(&alpha,1,MPI_DOUBLE,0,MPI_COMM_WORLD);
	MPI_Bcast(&beta,1,MPI_DOUBLE,0,MPI_COMM_WORLD);
	MPI_Bcast(&das,1,MPI_DOUBLE,0,MPI_COMM_WORLD);
	MPI_Bcast(&dbs,1,MPI_DOUBLE,0,MPI_COMM_WORLD);
	// MPI_LOGICAL describes a Fortran LOGICAL, which need not have the size
	// of a C++ bool; ship the flag as an int instead.
	int pBCflag = pBC ? 1 : 0;
	MPI_Bcast(&pBCflag,1,MPI_INT,0,MPI_COMM_WORLD);
	pBC = (pBCflag != 0);
	MPI_Bcast(&din,1,MPI_DOUBLE,0,MPI_COMM_WORLD);
	MPI_Bcast(&dout,1,MPI_DOUBLE,0,MPI_COMM_WORLD);
	MPI_Bcast(&timestepMax,1,MPI_INT,0,MPI_COMM_WORLD);
	MPI_Bcast(&interval,1,MPI_INT,0,MPI_COMM_WORLD);
	MPI_Bcast(&tol,1,MPI_DOUBLE,0,MPI_COMM_WORLD);
	//.................................................
	MPI_Barrier(MPI_COMM_WORLD);
	// **************************************************************
#endif

	// The sweep count below divides by nthreads and nBlocks, and the domain
	// size drives every allocation, so reject unusable values up front.
	if (Nz <= 0 || nBlocks == 0 || nthreads == 0)
		AbortRun("Color.in must provide positive values for Nz, nBlocks and nthreads");

	double rlxA = 1.f/tau;
	double rlxB = 8.f*(2.f-rlxA)/(8.f-rlxA);

	if (pBC && rank == 0){
		printf("Assigning presusre boundary conditions \n");
		printf("Inlet density = %f \n", din);
		printf("Outlet density = %f \n", dout);
	}

	if (rank==0){
		printf("....Parameters................\n");
		printf("tau = %f \n", tau);
		printf("alpha = %f \n", alpha);
		printf("beta = %f \n", beta);
		printf("das = %f \n", das);
		printf("dbs = %f \n", dbs);
		printf("Force(x) = %f \n", Fx);
		printf("Force(y) = %f \n", Fy);
		printf("Force(z) = %f \n", Fz);
		printf("Nz = %i \n", Nz);
		printf("timestepMax = %i \n", timestepMax);
		printf("...............................\n");
	}

	// Identical cubic sub-domains
	Nx = Ny = Nz;
	int N = Nx*Ny*Nz;
	// Byte size of one scalar field. Kept in size_t because the distribution
	// buffers are up to 10x this size and would overflow an int for large N.
	size_t dist_mem_size = size_t(N)*sizeof(double);

	// Number of sites each thread has to sweep so that nBlocks*nthreads
	// threads cover all N sites (rounded up).
	int S = N/nthreads/nBlocks;
	if (nBlocks*nthreads*S < N)	S++;

	// Report the launch configuration from rank 0 like every other banner
	// (it was keyed on rank 1 and therefore never shown in a serial build).
	if (rank==0){
		printf("Number of blocks = %i \n", nBlocks);
		printf("Threads per block = %i \n", nthreads);
		printf("Sweeps per thread = %i \n", S);
		printf("Number of nodes per side = %i \n", Nx);
		printf("Total Number of nodes = %i \n", N);
		printf("...............................\n");
	}

	//.......................................................................
	// .......... READ THE INPUT FILE .......................................
	int n;
	char value = 0;
	char *id;
	id = new char[N];
	int sum = 0;
	// RANK 0 READS THE INPUT FILE
	if (rank==0){
		printf("Read input media... \n");
		ifstream PM(FILENAME.c_str(),ios::binary);
		if (!PM) AbortRun("could not open the phase indicator file named in Color.in");
		bool shortRead = false;
		for (int k=0;k<Nz;k++){
			for (int j=0;j<Ny;j++){
				for (int i=0;i<Nx;i++){
					PM.read((char *) (&value), sizeof(value));
					if (!PM) shortRead = true;
					n = k*Nx*Ny+j*Nx+i;
					if (value>0){
						if (pBC) value=2;	// Saturate with NWP
						if (k<8){
							value=1;
						}
					}
					id[n] = value;
					if (value > 0) sum++;
				}
			}
		}
		PM.close();
		if (shortRead)
			fprintf(stderr, "WARNING: phase indicator file holds fewer than %i values\n", N);
		printf("File porosity = %f\n", double(sum)/N);
	}

	//......... for pressure BC only............................
	// Void the first / last four layers if pressure BC are to be used.
	// The two slabs are written by separate bounded loops; the outlet slab is
	// written last so it wins where the slabs overlap in a very thin domain.
	if (pBC){
		const int layer = Nx*Ny;
		const int inletEnd = (Nz < 4) ? Nz : 4;
		for (int k=0; k<inletEnd; k++){
			for (int m=0; m<layer; m++) id[k*layer+m] = 1;
		}
		const int outletBegin = (Nz > 4) ? Nz-4 : 0;
		for (int k=outletBegin; k<Nz; k++){
			for (int m=0; m<layer; m++) id[k*layer+m] = 2;
		}
	}

#ifdef useMPI	//............................................................
	MPI_Barrier(MPI_COMM_WORLD);
	MPI_Bcast(&id[0],N,MPI_CHAR,0,MPI_COMM_WORLD);
	MPI_Barrier(MPI_COMM_WORLD);
#endif
	if (rank == 0) printf("Domain set.\n");
	//...........................................................................

	if (pBC){
		int SBC = Nx*Ny/nthreads/nBlocks+1;
		printf("Number of sweeps for inlet / outlet: %i \n", SBC);
	}

	//...........................................................................
	//...........device phase ID.................................................
	char *ID;
	CheckCuda(cudaMalloc((void **) &ID, N), "allocate ID");
	// Copy to the device
	CheckCuda(cudaMemcpy(ID, id, N, cudaMemcpyHostToDevice), "copy ID to device");
	//...........................................................................
	//......................device distributions.................................
	double *f_even,*f_odd;
	CheckCuda(cudaMalloc((void **) &f_even, 10*dist_mem_size), "allocate f_even");
	CheckCuda(cudaMalloc((void **) &f_odd, 9*dist_mem_size), "allocate f_odd");
	//...........................................................................
	//				MAIN VARIABLES ALLOCATED HERE
	//...........................................................................
	double *Phi,*Den,*Copy;
	double *ColorGrad, *Velocity;
	CheckCuda(cudaMalloc((void **) &Phi, dist_mem_size), "allocate Phi");
	CheckCuda(cudaMalloc((void **) &Den, 2*dist_mem_size), "allocate Den");
	CheckCuda(cudaMalloc((void **) &Copy, 2*dist_mem_size), "allocate Copy");
	CheckCuda(cudaMalloc((void **) &Velocity, 3*dist_mem_size), "allocate Velocity");
	CheckCuda(cudaMalloc((void **) &ColorGrad, 3*dist_mem_size), "allocate ColorGrad");
	//...........................................................................
	if (rank==0)	printf("Setting the distributions, size = : %i\n", N);
	//...........................................................................
	dvc_InitD3Q19(nBlocks, nthreads, S, ID, f_even, f_odd, Nx, Ny, Nz);
	dvc_InitDenColor(nBlocks, nthreads, S, ID, Den, Phi, das, dbs, N);
	//...........................................................................
	dvc_ComputePhi(nBlocks, nthreads, S,ID, Phi, Copy, Den, N);
	//...........................................................................

	int timestep = 0;
	if (rank==0) printf("No. of timesteps: %i \n", timestepMax);

	//.......create and start timer............
	// Both events are recorded on stream 0 so that the elapsed time is
	// measured between two points of the same stream.
	cudaEvent_t start, stop;
	float elapsedMs = 0.f;
	CheckCuda(cudaEventCreate(&start), "create start event");
	CheckCuda(cudaEventCreate(&stop), "create stop event");
	CheckCuda(cudaEventRecord( start, 0 ), "record start event");
	//.........................................

	//************ MAIN TIMESTEP LOOP ***************************************/
	while (timestep < timestepMax){
		// Compute the color gradient
		dvc_ComputeColorGradient(nBlocks, nthreads, S,
				ID, Phi, ColorGrad, Nx, Ny, Nz);
		// Perform collision step for the momentum transport
		dvc_ColorCollide(nBlocks, nthreads, S,
				ID, f_even, f_odd, ColorGrad, Velocity,
				rlxA, rlxB,alpha, beta, Fx, Fy, Fz, Nx, Ny, Nz, pBC);
		// Carry out the density streaming step for mass transport
		dvc_DensityStreamD3Q7(nBlocks, nthreads, S,
				ID, Den, Copy, Phi, ColorGrad, Velocity,beta, Nx, Ny, Nz, pBC);
		// Swap the distributions for momentum transport
		dvc_SwapD3Q19(nBlocks, nthreads, S, ID, f_even, f_odd, Nx, Ny, Nz);
		// Compute the phase indicator field and reset Copy, Den
		dvc_ComputePhi(nBlocks, nthreads, S,ID, Phi, Copy, Den, N);

		dvc_Barrier();
		timestep++;
	}
	//************************************************************************/
	dvc_Barrier();

	// Surface any error left behind by the kernel launches; the run is allowed
	// to continue so that whatever was computed is still written out.
	cudaError_t loopStatus = cudaGetLastError();
	if (loopStatus != cudaSuccess)
		fprintf(stderr, "WARNING: CUDA error during time stepping: %s\n",
				cudaGetErrorString(loopStatus));

	//.......... stop and destroy timer.............................
	CheckCuda(cudaEventRecord( stop, 0 ), "record stop event");
	CheckCuda(cudaEventSynchronize( stop ), "synchronize stop event");
	// cudaEventElapsedTime reports milliseconds, hence the 0.001 factor below
	// when converting to million lattice updates per second.
	CheckCuda(cudaEventElapsedTime( &elapsedMs, start, stop ), "read elapsed time");
	printf("CPU time = %f \n", elapsedMs);
	float MLUPS = 0.001*float(N)*timestep/elapsedMs;
	printf("MLUPS = %f \n", MLUPS);
	cudaEventDestroy( start );
	cudaEventDestroy( stop );

	// Host staging buffer, sized for the largest field written (3 components).
	double *Data = new double[size_t(3)*N];

	// Write out the Phase Indicator Field
	CheckCuda(cudaMemcpy(Data, Phi, dist_mem_size, cudaMemcpyDeviceToHost), "copy Phi to host");
	FILE *phase = fopen("Phase.out","wb");
	if (phase == NULL){
		fprintf(stderr, "WARNING: could not open Phase.out for writing\n");
	}
	else{
		fwrite(Data,sizeof(double),N,phase);
		fclose(phase);
	}

	// Write out the Color Gradient
	CheckCuda(cudaMemcpy(Data, ColorGrad, 3*dist_mem_size, cudaMemcpyDeviceToHost), "copy ColorGrad to host");
	FILE *CG = fopen("ColorGrad.out","wb");
	if (CG == NULL){
		fprintf(stderr, "WARNING: could not open ColorGrad.out for writing\n");
	}
	else{
		fwrite(Data,sizeof(double),size_t(3)*N,CG);
		fclose(CG);
	}

	// cleanup: every device array is released exactly once, and the host
	// arrays obtained with new[] are released with delete[].
	cudaFree(ID);
	cudaFree(f_even);	cudaFree(f_odd);
	cudaFree(Velocity);
	cudaFree(ColorGrad);
	cudaFree(Den);		cudaFree(Copy);
	cudaFree(Phi);
	delete [] Data;
	delete [] id;

	//***********Finish up!*********************************
#ifdef useMPI
	MPI_Finalize();
#endif
	return 0;
}

1621
gpu/exe/lb2_Color_mpi.cpp Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff