clean up

2023-10-23 04:19:15 -04:00 · 2023-10-23 04:19:15 -04:00 · 12a9496a7e
commit 12a9496a7e
parent 1ed3428ef6
9 changed files with 0 additions and 10384 deletions
--- a/cuda/exe/CMakeLists.txt
+++ b/cuda/exe/CMakeLists.txt
@ -1,11 +0,0 @@
-#INSTALL_LBPM_EXE( lb1_MRT_mpi )
-INSTALL_LBPM_EXE( lb2_Color )
-INSTALL_LBPM_EXE( lb2_Color_mpi )
-#INSTALL_LBPM_EXE( lb2_Color_pBC_wia_mpi )
-INSTALL_LBPM_EXE( lb2_Color_wia_mpi )
-
-
-# Run the serial ConstrainedBubble inputs as a weekly test
-CONFIGURE_FILE( ${LBPM_SOURCE_DIR}/example/ConstrainedBubble/Color.in ${CMAKE_CURRENT_BINARY_DIR}/Color.in COPYONLY )
-CONFIGURE_FILE( ${LBPM_SOURCE_DIR}/example/ConstrainedBubble/Domain.in ${CMAKE_CURRENT_BINARY_DIR}/Domain.in COPYONLY )
-ADD_LBPM_WEEKLY_TEST( lb2_Color_wia_mpi 1 )
--- a/cuda/exe/lb1_MRT-swap.cu
+++ b/cuda/exe/lb1_MRT-swap.cu
@ -1,248 +0,0 @@
-#include <stdio.h>
-#include <iostream>
-#include <fstream>
-#include <cuda.h>
-//#include <cutil.h>
-
-using namespace std;
-
-//*************************************************************************
-extern "C" void dvc_InitD3Q19(char *ID, double *f_even, double *f_odd, int Nx,
-							  int Ny, int Nz, int nblocks, int nthreads, int S);
-//*************************************************************************
-extern "C" void dvc_SwapD3Q19(char *ID, double *f_even, double *f_odd, int Nx,
-							  int Ny, int Nz, int nblocks, int nthreads, int S);
-//*************************************************************************
-extern "C" void dvc_MRT(char *ID, double *f_even, double *f_odd, double rlxA, double rlxB, double Fx, double Fy, double Fz,
-		int Nx, int Ny, int Nz, int nblocks, int nthreads, int S);
-//*************************************************************************
-
-void Write_Out(double *array, int Nx, int Ny, int Nz){
-	int value;
-	FILE *output;
-	output = fopen("dist.list","w");
-	for (int k=0; k<Nz; k++){
-		for (int j=0; j<Ny; j++){
-			for (int i=0; i<Nx; i++){
-				int index = k*Nx*Ny+j*Nx+i;
-				value = int(array[index]);
-				fprintf(output, "| %i",value);
-			}
-			fprintf(output, " | \n");
-		}
-		fprintf(output,"************************************** \n");	
-	}
-	fclose(output);
-}
-
-//**************************************************************************
-// MRT implementation of the LBM using CUDA
-//**************************************************************************
-
-int main(void)
-{
-
-	int deviceCount;
-	cudaGetDeviceCount(&deviceCount);
-	int device = 1;
-	printf("Number of devices = %i \n", deviceCount);
-	printf("Current device is = %i \n", device);
-	cudaSetDevice(device);
-	
-	// BGK Model parameters
-	string FILENAME;	
-	unsigned int nBlocks, nthreads;
-	int timestepMax, interval;
-	double tau,Fx,Fy,Fz,tol;
-	// Domain variables
-	int Nx,Ny,Nz;
-
-	ifstream input("MRT.in");
-	input >> FILENAME;		// name of the input file
-	input >> Nz;			// number of nodes (x,y,z)
-	input >> nBlocks;				
-	input >> nthreads;				
-	input >> tau;				// relaxation time 
-	input >> Fx;			// External force components (x,y,z)
-	input >> Fy;
-	input >> Fz;
-	input >> timestepMax;			// max no. of timesteps
-	input >> interval;			// error interval
-	input >> tol;				// error tolerance
-	
-	double rlx_setA = 1.f/tau;
-	double rlx_setB = 8.f*(2.f-rlx_setA)/(8.f-rlx_setA);
-	
-	printf("tau = %f \n", tau);
-	printf("Set A = %f \n", rlx_setA);
-	printf("Set B = %f \n", rlx_setB);
-	printf("Force(x) = %f \n", Fx);
-	printf("Force(y) = %f \n", Fy);
-	printf("Force(z) = %f \n", Fz);
-
-	Nx = Ny = Nz;	// Cubic domain
-	
-	int N = Nx*Ny*Nz;
-	int dist_mem_size = N*sizeof(double);
-	
-//	unsigned int nBlocks = 32;
-//	int nthreads = 128;
-	int S = N/nthreads/nBlocks;
-	
-//	unsigned int nBlocks = N/nthreads + (N%nthreads == 0?0:1);
-	dim3 grid(nBlocks,1,1);
-		
-	printf("Number of blocks = %i \n", nBlocks);
-	printf("Threads per block = %i \n", nthreads);
-	printf("Sweeps per thread = %i \n", S);
-	printf("Number of nodes per side = %i \n", Nx);
-	printf("Total Number of nodes = %i \n", N);
-	
-	//.......................................................................
-	printf("Read input media... \n");
-	// .......... READ THE INPUT FILE .......................................
-	int n;
-	char value;
-	char *id;
-	id = new char[N];	
-	int sum = 0;
-	double porosity;
-	ifstream PM(FILENAME.c_str(),ios::binary);
-	for (int k=0;k<Nz;k++){
-		for (int j=0;j<Ny;j++){
-			for (int i=0;i<Nx;i++){
-				PM.read((char *) (&value), sizeof(value));
-				n = k*Nx*Ny+j*Nx+i;
-				id[n] = value;
-				if (value > 0) sum++;
-			}
-		}
-	}
-	PM.close();
-	printf("File porosity = %f\n", double(sum)/N);
-	//.......................................................................
-	//...........device phase ID.................................................
-	char *ID;
-	cudaMalloc((void **) &ID, N);						// Allocate device memory
-	// Copy to the device
-	cudaMemcpy(ID, id, N, cudaMemcpyHostToDevice);
-	//...........................................................................
-	
-	//......................device distributions.................................
-	double *f_even,*f_odd;
-	//...........................................................................
-	cudaMalloc((void **) &f_even, 10*dist_mem_size);	// Allocate device memory
-	cudaMalloc((void **) &f_odd, 9*dist_mem_size);		// Allocate device memory
-	//...........................................................................
-
-	//...........................................................................
-//	cudaHostAlloc(&fa,dist_mem_size,cudaHostAllocPortable);
-//	cudaHostAlloc(&fb,dist_mem_size,cudaHostAllocPortable);
-//	cudaHostRegister(fa,dist_mem_size,cudaHostRegisterPortable);
-//	cudaHostRegister(fb,dist_mem_size,cudaHostRegisterPortable);
-//	cudaHostRegister(id,N*sizeof(char),cudaHostAllocPortable);
-
-	printf("Setting the distributions, size = : %i\n", N);
-	//...........................................................................
-//	INITIALIZE <<< grid, nthreads >>>  (ID, f_even, f_odd, Nx, Ny, Nz, S);
-	//...........................................................................
-	dvc_InitD3Q19(ID,f_even,f_odd,Nx,Ny,Nz,nBlocks,nthreads,S);
-	//*************************************************************************
-
-	int timestep = 0;
-	printf("No. of timesteps: %i \n", timestepMax);
-	
-	//.......create a stream for the LB calculation.......
-	cudaStream_t stream;
-	cudaStreamCreate(&stream);
-	
-	//.......create and start timer............
-	cudaEvent_t start, stop; 
-	float time; 
- 
-	cudaEventCreate(&start); 
-	cudaEventCreate(&stop); 
-	cudaEventRecord( start, 0 ); 
-	//.........................................
-	
-	//************ MAIN ITERATION LOOP ***************************************/
-	while (timestep < timestepMax){
-	
-		//...................................................................
-		//........ Execute the swap kernel (device) .........................
-//		SWAP <<< grid, nthreads >>> (ID, f_even, f_odd, Nx, Ny, Nz, S);
-		//...................................................................
-		dvc_SwapD3Q19(ID,f_even,f_odd,Nx,Ny,Nz,nBlocks,nthreads,S);
-
-		//........ Execute the collision kernel (device) ....................
-//		MRT <<< grid, nthreads >>> (ID, f_even, f_odd, Nx, Ny, Nz, S,
-//									rlx_setA, rlx_setB, Fx, Fy, Fz);
-		//............................................................
-		dvc_MRT(ID, f_even, f_odd, rlx_setA, rlx_setB, Fx, Fy, Fz,Nx,Ny,Nz,nBlocks,nthreads,S);
-		// Iteration completed!
-
-		timestep++;
-		//...................................................................
-		
-	}
-	//************************************************************************/
-	
-	cudaThreadSynchronize();
-	//.......... stop and destroy timer.............................
-	cudaEventRecord( stop, stream); 
-	cudaEventSynchronize( stop ); 
- 
-	cudaEventElapsedTime( &time, start, stop ); 
-	printf("CPU time = %f \n", time);
-	
-	float MLUPS = 0.001*float(Nx*Ny*Nz)*timestep/time;
-	printf("MLUPS = %f \n", MLUPS);
-
-	cudaStreamDestroy(stream);
-	cudaEventDestroy( start ); 
-	cudaEventDestroy( stop ); 
-	//..............................................................
-	
-	//..............................................................
-	//.........Compute the velocity and copy result to host ........
-	double *velocity;
-	velocity = new double[3*N];
-	//......................device distributions....................................
-	double *vel;
-	//..............................................................................
-	cudaMalloc((void **) &vel, 3*dist_mem_size);	// Allocate device memory
-	//..............................................................................
-//	Compute_VELOCITY <<< grid, nthreads >>>  (ID, f_even, f_odd, vel, Nx, Ny, Nz, S);
-	//..............................................................................
-	cudaMemcpy(velocity, vel, 3*dist_mem_size, cudaMemcpyDeviceToHost);
-	//..............................................................................
-
-	//............................................................	
-	//....Write the z-velocity to test poiseuille flow............
-	double vz,vz_avg;	
-	vz_avg = 0.0;
-
-	FILE *output;
-	output = fopen("velocity.out","w");
-	for (int k=0; k<1; k++){
-		for (int j=0; j<1; j++){
-			for (int i=0; i<Nx; i++){
-				int n = k*Nx*Ny+j*Nx+i;
-				//.....print value........
-				vz = velocity[2*N+n];
-				vz_avg += vz;
-				fprintf(output, " %e",vz);
-			}
-		}
-	}
-	fclose(output);
-	
-	vz = vz_avg/double(sum);
-	printf("Average Velocity = %e\n", vz);
-
-
-	// cleanup	
-	cudaFree(f_even);	cudaFree(f_odd);	cudaFree(vel);	cudaFree(ID);
-	free (velocity);	free(id);
-	
-}
--- a/cuda/exe/lb1_MRT.cu
+++ b/cuda/exe/lb1_MRT.cu
@ -1,246 +0,0 @@
-#include <stdio.h>
-#include <iostream>
-#include <fstream>
-#include <cuda.h>
-
-using namespace std;
-
-//*************************************************************************
-extern "C" void dvc_AllocateDeviceMemory(void** address, size_t size);
-//*************************************************************************
-extern "C" void dvc_CopyToDevice(void* dest, void* source, size_t size);
-//*************************************************************************
-extern "C" void dvc_Barrier();
-//*************************************************************************
-extern "C" void dvc_InitD3Q19(char *ID, double *f_even, double *f_odd, int Nx,
-							  int Ny, int Nz, int nblocks, int nthreads, int S);
-//*************************************************************************
-extern "C" void dvc_SwapD3Q19(char *ID, double *f_even, double *f_odd, int Nx,
-							  int Ny, int Nz, int nblocks, int nthreads, int S);
-//*************************************************************************
-extern "C" void dvc_MRT(char *ID, double *f_even, double *f_odd, double rlxA, double rlxB, double Fx, double Fy, double Fz,
-		int Nx, int Ny, int Nz, int nblocks, int nthreads, int S);
-//*************************************************************************
-
-void Write_Out(double *array, int Nx, int Ny, int Nz){
-	int value;
-	FILE *output;
-	output = fopen("dist.list","w");
-	for (int k=0; k<Nz; k++){
-		for (int j=0; j<Ny; j++){
-			for (int i=0; i<Nx; i++){
-				int index = k*Nx*Ny+j*Nx+i;
-				value = int(array[index]);
-				fprintf(output, "| %i",value);
-			}
-			fprintf(output, " | \n");
-		}
-		fprintf(output,"************************************** \n");	
-	}
-	fclose(output);
-}
-
-//**************************************************************************
-// MRT implementation of the LBM using CUDA
-//**************************************************************************
-
-int main(void)
-{
-
-	// BGK Model parameters
-	string FILENAME;	
-	unsigned int nBlocks, nthreads;
-	int timestepMax, interval;
-	double tau,Fx,Fy,Fz,tol;
-	// Domain variables
-	int Nx,Ny,Nz;
-
-	ifstream input("MRT.in");
-	input >> FILENAME;		// name of the input file
-	input >> Nz;			// number of nodes (x,y,z)
-	input >> nBlocks;				
-	input >> nthreads;				
-	input >> tau;				// relaxation time 
-	input >> Fx;			// External force components (x,y,z)
-	input >> Fy;
-	input >> Fz;
-	input >> timestepMax;			// max no. of timesteps
-	input >> interval;			// error interval
-	input >> tol;				// error tolerance
-	
-	double rlx_setA = 1.f/tau;
-	double rlx_setB = 8.f*(2.f-rlx_setA)/(8.f-rlx_setA);
-	
-	printf("tau = %f \n", tau);
-	printf("Set A = %f \n", rlx_setA);
-	printf("Set B = %f \n", rlx_setB);
-	printf("Force(x) = %f \n", Fx);
-	printf("Force(y) = %f \n", Fy);
-	printf("Force(z) = %f \n", Fz);
-
-	Nx = Ny = Nz;	// Cubic domain
-	
-	int N = Nx*Ny*Nz;
-	int dist_mem_size = N*sizeof(double);
-	
-//	unsigned int nBlocks = 32;
-//	int nthreads = 128;
-	int S = N/nthreads/nBlocks;
-	
-//	unsigned int nBlocks = N/nthreads + (N%nthreads == 0?0:1);
-	dim3 grid(nBlocks,1,1);
-		
-	printf("Number of blocks = %i \n", nBlocks);
-	printf("Threads per block = %i \n", nthreads);
-	printf("Sweeps per thread = %i \n", S);
-	printf("Number of nodes per side = %i \n", Nx);
-	printf("Total Number of nodes = %i \n", N);
-	
-	//.......................................................................
-	printf("Read input media... \n");
-	// .......... READ THE INPUT FILE .......................................
-	int n;
-	char value;
-	char *id;
-	id = new char[N];	
-	int sum = 0;
-	double porosity;
-	ifstream PM(FILENAME.c_str(),ios::binary);
-	for (int k=0;k<Nz;k++){
-		for (int j=0;j<Ny;j++){
-			for (int i=0;i<Nx;i++){
-				PM.read((char *) (&value), sizeof(value));
-				n = k*Nx*Ny+j*Nx+i;
-				id[n] = value;
-				if (value > 0) sum++;
-			}
-		}
-	}
-	PM.close();
-	printf("File porosity = %f\n", double(sum)/N);
-	//.......................................................................
-	//...........device phase ID.................................................
-	char *ID;
-	 dvc_AllocateDeviceMemory((void **) &ID, N);						// Allocate device memory
-	// Copy to the device
-	dvc_CopyToDevice(ID, id, N);
-	//...........................................................................
-	
-	//......................device distributions.................................
-	double *f_even,*f_odd;
-	//...........................................................................
-	 dvc_AllocateDeviceMemory((void **) &f_even, 10*dist_mem_size);	// Allocate device memory
-	 dvc_AllocateDeviceMemory((void **) &f_odd, 9*dist_mem_size);		// Allocate device memory
-	//...........................................................................
-
-	//...........................................................................
-//	cudaHostAlloc(&fa,dist_mem_size,cudaHostAllocPortable);
-//	cudaHostAlloc(&fb,dist_mem_size,cudaHostAllocPortable);
-//	cudaHostRegister(fa,dist_mem_size,cudaHostRegisterPortable);
-//	cudaHostRegister(fb,dist_mem_size,cudaHostRegisterPortable);
-//	cudaHostRegister(id,N*sizeof(char),cudaHostAllocPortable);
-
-	printf("Setting the distributions, size = : %i\n", N);
-	//...........................................................................
-//	INITIALIZE <<< grid, nthreads >>>  (ID, f_even, f_odd, Nx, Ny, Nz, S);
-	//...........................................................................
-	dvc_InitD3Q19(ID,f_even,f_odd,Nx,Ny,Nz,nBlocks,nthreads,S);
-	//*************************************************************************
-
-	int timestep = 0;
-	printf("No. of timesteps: %i \n", timestepMax);
-	
-	//.......create a stream for the LB calculation.......
-	cudaStream_t stream;
-	cudaStreamCreate(&stream);
-	
-	//.......create and start timer............
-	cudaEvent_t start, stop; 
-	float time; 
- 
-	cudaEventCreate(&start); 
-	cudaEventCreate(&stop); 
-	cudaEventRecord( start, 0 ); 
-	//.........................................
-	
-	//************ MAIN ITERATION LOOP ***************************************/
-	while (timestep < timestepMax){
-	
-		//...................................................................
-		//........ Execute the swap kernel (device) .........................
-//		SWAP <<< grid, nthreads >>> (ID, f_even, f_odd, Nx, Ny, Nz, S);
-		//...................................................................
-		dvc_SwapD3Q19(ID,f_even,f_odd,Nx,Ny,Nz,nBlocks,nthreads,S);
-
-		//........ Execute the collision kernel (device) ....................
-//		MRT <<< grid, nthreads >>> (ID, f_even, f_odd, Nx, Ny, Nz, S,
-//									rlx_setA, rlx_setB, Fx, Fy, Fz);
-		//............................................................
-		dvc_MRT(ID, f_even, f_odd, rlx_setA, rlx_setB, Fx, Fy, Fz,Nx,Ny,Nz,nBlocks,nthreads,S);
-		// Iteration completed!
-
-		timestep++;
-		//...................................................................
-		
-	}
-	//************************************************************************/
-	
-//	cudaThreadSynchronize();
-	dvc_Barrier();
-	//.......... stop and destroy timer.............................
-	cudaEventRecord( stop, stream); 
-	cudaEventSynchronize( stop ); 
- 
-	cudaEventElapsedTime( &time, start, stop ); 
-	printf("CPU time = %f \n", time);
-	
-	float MLUPS = 0.001*float(Nx*Ny*Nz)*timestep/time;
-	printf("MLUPS = %f \n", MLUPS);
-
-	cudaStreamDestroy(stream);
-	cudaEventDestroy( start ); 
-	cudaEventDestroy( stop ); 
-	//..............................................................
-	
-	//..............................................................
-	/*//.........Compute the velocity and copy result to host ........
-	double *velocity;
-	velocity = new double[3*N];
-	//......................device distributions....................................
-	double *vel;
-	//..............................................................................
-	 dvc_AllocateDeviceMemory((void **) &vel, 3*dist_mem_size);	// Allocate device memory
-	//..............................................................................
-//	Compute_VELOCITY <<< grid, nthreads >>>  (ID, f_even, f_odd, vel, Nx, Ny, Nz, S);
-	//..............................................................................
-//	cudaMemcpy(velocity, vel, 3*dist_mem_size, cudaMemcpyDeviceToHost);
-	//..............................................................................
-
-	//............................................................	
-	//....Write the z-velocity to test poiseuille flow............
-	double vz,vz_avg;	
-	vz_avg = 0.0;
-
-/*	FILE *output;
-	output = fopen("velocity.out","w");
-	for (int k=0; k<1; k++){
-		for (int j=0; j<1; j++){
-			for (int i=0; i<Nx; i++){
-				int n = k*Nx*Ny+j*Nx+i;
-				//.....print value........
-				vz = velocity[2*N+n];
-				vz_avg += vz;
-				fprintf(output, " %e",vz);
-			}
-		}
-	}
-	fclose(output);
-	
-	vz = vz_avg/double(sum);
-	printf("Average Velocity = %e\n", vz);
-*/
-	// cleanup	
-//	cudaFree(f_even);	cudaFree(f_odd);	cudaFree(vel);	cudaFree(ID);
-//	free (velocity);	free(id);
-	
-}
--- a/cuda/exe/lb1_MRT_mpi.cpp
+++ b/cuda/exe/lb1_MRT_mpi.cpp
--- a/cuda/exe/lb1_MRT_mpi.cu
+++ b/cuda/exe/lb1_MRT_mpi.cu
--- a/cuda/exe/lb2_Color.cpp
+++ b/cuda/exe/lb2_Color.cpp
@ -1,400 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <iostream>
-#include <fstream>
-
-//*************************************************************************
-// Functions defined in Color.cu
-//*************************************************************************
-extern "C" void dvc_InitDenColor(int nblocks, int nthreads, int S, char *ID,
-                                 double *Den, double *Phi, double das,
-                                 double dbs, int N);
-//*************************************************************************
-extern "C" void dvc_ComputeColorGradient(int nBlocks, int nthreads, int S,
-                                         char *ID, double *Phi,
-                                         double *ColorGrad, int Nx, int Ny,
-                                         int Nz);
-//*************************************************************************
-extern "C" void dvc_ColorCollide(int nBlocks, int nthreads, int S, char *ID,
-                                 double *f_even, double *f_odd,
-                                 double *ColorGrad, double *Velocity,
-                                 double rlxA, double rlxB, double alpha,
-                                 double beta, double Fx, double Fy, double Fz,
-                                 int Nx, int Ny, int Nz, bool pBC);
-//*************************************************************************
-extern "C" void dvc_DensityStreamD3Q7(int nBlocks, int nthreads, int S,
-                                      char *ID, double *Den, double *Copy,
-                                      double *Phi, double *ColorGrad,
-                                      double *Velocity, double beta, int Nx,
-                                      int Ny, int Nz, bool pBC);
-//*************************************************************************
-extern "C" void dvc_ComputePhi(int nBlocks, int nthreads, int S, char *ID,
-                               double *Phi, double *Copy, double *Den, int N);
-//*************************************************************************
-//*************************************************************************
-// Functions defined in D3Q19.cu
-//*************************************************************************
-extern "C" void dvc_InitD3Q19(int nblocks, int nthreads, int S, char *ID,
-                              double *f_even, double *f_odd, int Nx, int Ny,
-                              int Nz);
-//*************************************************************************
-extern "C" void dvc_SwapD3Q19(int nblocks, int nthreads, int S, char *ID,
-                              double *f_even, double *f_odd, int Nx, int Ny,
-                              int Nz);
-//*************************************************************************
-extern "C" void dvc_PackDist(int grid, int threads, int q, int *SendList,
-                             int start, int sendCount, double *sendbuf,
-                             double *Dist, int N);
-//*************************************************************************
-extern "C" void dvc_UnpackDist(int grid, int threads, int q, int Cqx, int Cqy,
-                               int Cqz, int *RecvList, int start, int recvCount,
-                               double *recvbuf, double *Dist, int Nx, int Ny,
-                               int Nz);
-//*************************************************************************
-//***************************************************************************************
-// Functions defined in D3Q7.cu
-//***************************************************************************************
-extern "C" void dvc_PackDenD3Q7(int grid, int threads, int *list, int count,
-                                double *sendbuf, int number, double *Data,
-                                int N);
-//***************************************************************************************
-extern "C" void dvc_UnpackDenD3Q7(int grid, int threads, int *list, int count,
-                                  double *recvbuf, int number, double *Data,
-                                  int N);
-//***************************************************************************************
-extern "C" void dvc_PackValues(int grid, int threads, int *list, int count,
-                               double *sendbuf, double *Data, int N);
-//***************************************************************************************
-extern "C" void dvc_UnpackValues(int grid, int threads, int *list, int count,
-                                 double *recvbuf, double *Data, int N);
-//***************************************************************************************
-//*************************************************************************
-// Functions defined in CudaExtras.cu
-//*************************************************************************
-extern "C" void dvc_AllocateDeviceMemory(void **address, size_t size);
-//*************************************************************************
-extern "C" void dvc_CopyToDevice(void *dest, void *source, size_t size);
-//*************************************************************************
-extern "C" void dvc_CopyToHost(void *dest, void *source, size_t size);
-//*************************************************************************
-extern "C" void dvc_Barrier();
-//*************************************************************************
-
-//*************************************************************************
-// Implementation of Two-Phase Immiscible LBM using CUDA
-//*************************************************************************
-
-using namespace std;
-
-inline void PackID(int *list, int count, char *sendbuf, char *ID) {
-    // Fill in the phase ID values from neighboring processors
-    // This packs up the values that need to be sent from one processor to another
-    int idx, n;
-
-    for (idx = 0; idx < count; idx++) {
-        n = list[idx];
-        sendbuf[idx] = ID[n];
-    }
-}
-//***************************************************************************************
-inline void UnpackID(int *list, int count, char *recvbuf, char *ID) {
-    // Fill in the phase ID values from neighboring processors
-    // This unpacks the values once they have been recieved from neighbors
-    int idx, n;
-
-    for (idx = 0; idx < count; idx++) {
-        n = list[idx];
-        ID[n] = recvbuf[idx];
-    }
-}
-//***************************************************************************************
-
-int main(int argc, char **argv) {
-
-    int rank = 0;
-    int nprocs = 1;
-    int nprocx, nprocy, nprocz;
-    int iproc, jproc, kproc;
-    if (rank == 0) {
-        printf("********************************************************\n");
-        printf("Running Hybrid Implementation of Color LBM	\n");
-        printf("********************************************************\n");
-    }
-    // Color Model parameters
-    string FILENAME;
-    unsigned int nBlocks, nthreads;
-    int Nx, Ny, Nz;
-    int timestepMax, interval;
-    double tau, Fx, Fy, Fz, tol;
-    double alpha, beta;
-    double das, dbs;
-    double din, dout;
-    bool pBC;
-    int i, j, k, n;
-
-    if (rank == 0) {
-        //.............................................................
-        //		READ SIMULATION PARMAETERS FROM INPUT FILE
-        //.............................................................
-        ifstream input("Color.in");
-        // Line 1: Name of the phase indicator file (s=0,w=1,n=2)
-        input >> FILENAME;
-        // Line 2: domain size (Nx, Ny, Nz)
-        input >> Nz; // number of nodes (x,y,z)
-        input >> nBlocks;
-        input >> nthreads;
-        // Line 3: model parameters (tau, alpha, beta, das, dbs)
-        input >> tau;
-        input >> alpha;
-        input >> beta;
-        input >> das;
-        input >> dbs;
-        // Line 4: External force components (Fx,Fy, Fz)
-        input >> Fx;
-        input >> Fy;
-        input >> Fz;
-        // Line 5: Pressure Boundary conditions
-        input >> pBC;
-        input >> din;
-        input >> dout;
-        // Line 6: time-stepping criteria
-        input >> timestepMax; // max no. of timesteps
-        input >> interval;    // error interval
-        input >> tol;         // error tolerance
-        //.............................................................
-
-        ifstream domain("Domain.in");
-        domain >> nprocx;
-        domain >> nprocy;
-        domain >> nprocz;
-    }
-
-    double rlxA = 1.f / tau;
-    double rlxB = 8.f * (2.f - rlxA) / (8.f - rlxA);
-
-    if (nprocs != nprocx * nprocy * nprocz) {
-        printf("Fatal error in processor number! \n");
-        printf("nprocx =  %i \n", nprocx);
-        printf("nprocy =  %i \n", nprocy);
-        printf("nprocz =  %i \n", nprocz);
-    }
-
-    if (rank == 0) {
-        printf("********************************************************\n");
-        printf("tau = %f \n", tau);
-        printf("alpha = %f \n", alpha);
-        printf("beta = %f \n", beta);
-        printf("das = %f \n", beta);
-        printf("dbs = %f \n", beta);
-        printf("Force(x) = %f \n", Fx);
-        printf("Force(y) = %f \n", Fy);
-        printf("Force(z) = %f \n", Fz);
-        printf("Sub-domain size = %i x %i x %i\n", Nz, Nz, Nz);
-        printf("Parallel domain size = %i x %i x %i\n", nprocx, nprocy, nprocz);
-        printf("********************************************************\n");
-    }
-
-    Nz += 2;
-    Nx = Ny = Nz; // Cubic domain
-
-    int N = Nx * Ny * Nz;
-    int dist_mem_size = N * sizeof(double);
-
-    //	unsigned int nBlocks = 32;
-    //	int nthreads = 128;
-    int S = N / nthreads / nBlocks;
-
-    //	unsigned int nBlocks = N/nthreads + (N%nthreads == 0?0:1);
-    //	dim3 grid(nBlocks,1,1);
-
-    if (rank == 0)
-        printf("Number of blocks = %i \n", nBlocks);
-    if (rank == 0)
-        printf("Threads per block = %i \n", nthreads);
-    if (rank == 0)
-        printf("Sweeps per thread = %i \n", S);
-    if (rank == 0)
-        printf("Number of nodes per side = %i \n", Nx);
-    if (rank == 0)
-        printf("Total Number of nodes = %i \n", N);
-    if (rank == 0)
-        printf("********************************************************\n");
-
-    //.......................................................................
-    if (rank == 0)
-        printf("Read input media... \n");
-    //.......................................................................
-    char LocalRankString[8];
-    char LocalRankFilename[40];
-    sprintf(LocalRankString, "%05d", rank);
-    sprintf(LocalRankFilename, "%s%s", "ID.", LocalRankString);
-    //	printf("Local File Name =  %s \n",LocalRankFilename);
-    // .......... READ THE INPUT FILE .......................................
-    char value;
-    char *id;
-    id = new char[N];
-    int sum = 0;
-    //	double porosity;
-    //.......................................................................
-    ifstream PM(LocalRankFilename, ios::binary);
-    for (k = 0; k < Nz; k++) {
-        for (j = 0; j < Ny; j++) {
-            for (i = 0; i < Nx; i++) {
-                n = k * Nx * Ny + j * Nx + i;
-                id[n] = 0;
-            }
-        }
-    }
-    for (k = 1; k < Nz - 1; k++) {
-        for (j = 1; j < Ny - 1; j++) {
-            for (i = 1; i < Nx - 1; i++) {
-                PM.read((char *)(&value), sizeof(value));
-                n = k * Nx * Ny + j * Nx + i;
-                id[n] = value;
-                if (value > 0)
-                    sum++;
-            }
-        }
-    }
-    PM.close();
-    //	printf("File porosity = %f\n", double(sum)/N);
-
-    //...........device phase ID.................................................
-    if (rank == 0)
-        printf("Copying phase ID to device \n");
-    char *ID;
-    dvc_AllocateDeviceMemory((void **)&ID, N); // Allocate device memory
-    // Copy to the device
-    dvc_CopyToDevice(ID, id, N);
-    //...........................................................................
-
-    if (rank == 0)
-        printf("Allocating distributions \n");
-    //......................device distributions.................................
-    double *f_even, *f_odd;
-    //...........................................................................
-    dvc_AllocateDeviceMemory((void **)&f_even,
-                             10 * dist_mem_size); // Allocate device memory
-    dvc_AllocateDeviceMemory((void **)&f_odd,
-                             9 * dist_mem_size); // Allocate device memory
-    //...........................................................................
-    //...........................................................................
-    //				MAIN  VARIABLES ALLOCATED HERE
-    //...........................................................................
-    double *Phi, *Den, *Copy;
-    double *ColorGrad, *Velocity;
-    //...........................................................................
-    dvc_AllocateDeviceMemory((void **)&Phi, dist_mem_size);
-    dvc_AllocateDeviceMemory((void **)&Den, 2 * dist_mem_size);
-    dvc_AllocateDeviceMemory((void **)&Copy, 2 * dist_mem_size);
-    dvc_AllocateDeviceMemory((void **)&Velocity, 3 * dist_mem_size);
-    dvc_AllocateDeviceMemory((void **)&ColorGrad, 3 * dist_mem_size);
-    //...........................................................................
-    if (rank == 0)
-        printf("Setting the distributions, size = : %i\n", N);
-    //...........................................................................
-    dvc_InitD3Q19(nBlocks, nthreads, S, ID, f_even, f_odd, Nx, Ny, Nz);
-    dvc_InitDenColor(nBlocks, nthreads, S, ID, Den, Phi, das, dbs, N);
-    //...........................................................................
-    dvc_ComputePhi(nBlocks, nthreads, S, ID, Phi, Copy, Den, N);
-    //...........................................................................
-
-    //...........................................................................
-    // Grids used to pack faces on the GPU for MPI
-    int faceGrid, edgeGrid, packThreads;
-    packThreads = 512;
-    edgeGrid = 1;
-    faceGrid = Nx * Ny / packThreads;
-
-    int timestep = 0;
-    if (rank == 0)
-        printf("********************************************************\n");
-    if (rank == 0)
-        printf("No. of timesteps: %i \n", timestepMax);
-
-    //.......create a stream for the LB calculation.......
-    //	cudaStream_t stream;
-    //	cudaStreamCreate(&stream);
-
-    //.......create and start timer............
-    double start, stop;
-    double walltime;
-    start = clock();
-
-    //************ MAIN ITERATION LOOP ***************************************/
-    while (timestep < timestepMax) {
-
-        //*************************************************************************
-        // 		Compute the color gradient
-        //*************************************************************************
-        dvc_ComputeColorGradient(nBlocks, nthreads, S, ID, Phi, ColorGrad, Nx,
-                                 Ny, Nz);
-        //*************************************************************************
-
-        //*************************************************************************
-        // 		Perform collision step for the momentum transport
-        //*************************************************************************
-        dvc_ColorCollide(nBlocks, nthreads, S, ID, f_even, f_odd, ColorGrad,
-                         Velocity, rlxA, rlxB, alpha, beta, Fx, Fy, Fz, Nx, Ny,
-                         Nz, pBC);
-        //*************************************************************************
-
-        //*************************************************************************
-        // 		Carry out the density streaming step for mass transport
-        //*************************************************************************
-        dvc_DensityStreamD3Q7(nBlocks, nthreads, S, ID, Den, Copy, Phi,
-                              ColorGrad, Velocity, beta, Nx, Ny, Nz, pBC);
-        //*************************************************************************
-
-        //*************************************************************************
-        // 		Swap the distributions for momentum transport
-        //*************************************************************************
-        dvc_SwapD3Q19(nBlocks, nthreads, S, ID, f_even, f_odd, Nx, Ny, Nz);
-        //*************************************************************************
-
-        //*************************************************************************
-        // 		Compute the phase indicator field and reset Copy, Den
-        //*************************************************************************
-        dvc_ComputePhi(nBlocks, nthreads, S, ID, Phi, Copy, Den, N);
-        //*************************************************************************
-
-        // Iteration completed!
-        timestep++;
-
-        //...................................................................
-    }
-    //************************************************************************/
-    dvc_Barrier();
-    stop = clock();
-
-    //	cout << "CPU time: " << (stoptime - starttime) << " seconds" << endl;
-    walltime = (stop - start) / CLOCKS_PER_SEC;
-    //	cout << "Lattice update rate: "<< double(Nx*Ny*Nz*timestep)/cputime/1000000 <<  " MLUPS" << endl;
-    double MLUPS = double(Nx * Ny * Nz * timestep) / walltime / 1000000;
-    if (rank == 0)
-        printf("********************************************************\n");
-    if (rank == 0)
-        printf("CPU time = %f \n", walltime);
-    if (rank == 0)
-        printf("Lattice update rate (per core)= %f MLUPS \n", MLUPS);
-    MLUPS *= nprocs;
-    if (rank == 0)
-        printf("Lattice update rate (total)= %f MLUPS \n", MLUPS);
-    if (rank == 0)
-        printf("********************************************************\n");
-
-    //************************************************************************/
-    // Write out the phase indicator field
-    //************************************************************************/
-    sprintf(LocalRankFilename, "%s%s", "Phase.", LocalRankString);
-    //	printf("Local File Name =  %s \n",LocalRankFilename);
-    double *phiOut;
-    phiOut = new double[N];
-    dvc_CopyToHost(phiOut, Phi, N * sizeof(double));
-
-    FILE *PHASE;
-    PHASE = fopen(LocalRankFilename, "wb");
-    fwrite(phiOut, 8, N, PHASE);
-    fclose(PHASE);
-    //************************************************************************/
-}
--- a/cuda/exe/lb2_Color.cu
+++ b/cuda/exe/lb2_Color.cu
@ -1,425 +0,0 @@
-#ifdef useMPI
-#include <mpi.h>
-#endif
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <iostream>
-#include <fstream>
-#include <math.h>
-#include <cuda.h>
-
-using namespace std;
-//*************************************************************************
-// HokieSpeed
-//nvcc -Xcompiler -fopenmp -lgomp -O3 -arch sm_20 -o hybridATLKR lb2_ATLKR_hybrid.cu
-// -I$VT_MPI_INC -L$VT_MPI_LIB -lmpi
-//*************************************************************************
-
-//*************************************************************************
-// Implementation of Two-Phase Immiscible LBM using CUDA
-//*************************************************************************
-
-//*************************************************************************
-extern "C" void dvc_InitD3Q19(int nblocks, int nthreads, int S,
-		char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz);
-//*************************************************************************
-extern "C" void dvc_InitDenColor( int nblocks, int nthreads, int S,
-		char *ID, double *Den, double *Phi, double das, double dbs, int N);
-//*************************************************************************
-extern "C" void dvc_ComputeColorGradient(int nBlocks, int nthreads, int S,
-		char *ID, double *Phi, double *ColorGrad, int Nx, int Ny, int Nz);
-//*************************************************************************
-extern "C" void dvc_ColorCollide(int nBlocks, int nthreads, int S,
-		char *ID, double *f_even, double *f_odd, double *ColorGrad, double *Velocity,
-		double rlxA, double rlxB,double alpha, double beta, double Fx, double Fy, double Fz,
-		int Nx, int Ny, int Nz, bool pBC);
-//*************************************************************************
-extern "C" void dvc_DensityStreamD3Q7(int nBlocks, int nthreads, int S,
-		char *ID, double *Den, double *Copy, double *Phi, double *ColorGrad, double *Velocity,
-		double beta, int Nx, int Ny, int Nz, bool pBC);
-//*************************************************************************
-extern "C" void dvc_ComputePhi(int nBlocks, int nthreads, int S,
-		char *ID, double *Phi, double *Copy, double *Den, int N);
-//*************************************************************************
-extern "C" void dvc_AllocateDeviceMemory(void** address, size_t size);
-//*************************************************************************
-extern "C" void dvc_CopyToDevice(void* dest, void* source, size_t size);
-//*************************************************************************
-extern "C" void dvc_Barrier();
-//*************************************************************************
-extern "C" void dvc_SwapD3Q19(int nblocks, int nthreads, int S,
-		char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz);
-//*************************************************************************
-extern "C" void dvc_PackDist(int grid, int threads, int q, int *SendList, int start,
-		int sendCount, double *sendbuf, double *Dist, int N);
-//*************************************************************************
-extern "C" void dvc_UnpackDist(int grid, int threads, int q, int Cqx, int Cqy, int Cqz, int *RecvList, int start,
-		int recvCount, double *recvbuf, double *Dist, int Nx, int Ny, int Nz);
-//*************************************************************************
-
-int main(int argc, char *argv[])
-{
-	
-	//********** Initialize MPI ****************
-	int numprocs,rank;
-#ifdef useMPI
-	MPI_Status stat;
-	MPI_Init(&argc,&argv);
-    MPI_Comm comm = MPI_COMM_WORLD;
-	MPI_Comm_size(comm,&numprocs);
-	MPI_Comm_rank(comm,&rank);
-#else
-    MPI_Comm comm = MPI_COMM_WORLD;
-	numprocs = 1;
-	rank = 0;
-#endif
-	//******************************************
-	
-	if (rank == 0){
-		printf("********************************************************\n");
-		printf("Running Hybrid Implementation of Color LBM	\n");
-		printf("********************************************************\n");
-	}
-	// Color Model parameters
-	string FILENAME;
-	unsigned int nBlocks, nthreads;
-	int Nx,Ny,Nz;
-	int timestepMax, interval;
-	double tau,Fx,Fy,Fz,tol;
-	double alpha, beta;
-	double das, dbs;
-	double din,dout;
-	bool pBC;
-
-	if (rank==0){
-		//.............................................................
-		//		READ SIMULATION PARMAETERS FROM INPUT FILE 
-		//.............................................................
-		ifstream input("Color.in");
-		// Line 1: Name of the phase indicator file (s=0,w=1,n=2)
-		input >> FILENAME;				
-		// Line 2: domain size (Nx, Ny, Nz)
-		input >> Nz;				// number of nodes (x,y,z)
-		input >> nBlocks;				
-		input >> nthreads;			
-		// Line 3: model parameters (tau, alpha, beta, das, dbs)
-		input >> tau;
-		input >> alpha;
-		input >> beta;
-		input >> das;	
-		input >> dbs;
-		// Line 4: External force components (Fx,Fy, Fz)
-		input >> Fx;				
-		input >> Fy;
-		input >> Fz;
-		// Line 5: Pressure Boundary conditions
-		input >> pBC;
-		input >> din;
-		input >> dout;
-		// Line 6: time-stepping criteria
-		input >> timestepMax;			// max no. of timesteps
-		input >> interval;			// error interval
-		input >> tol;				// error tolerance
-		//.............................................................
-	}
-#ifdef useMPI
-	// **************************************************************
-	// Broadcast simulation parameters from rank 0 to all other procs
-	MPI_Barrier(comm);
-	//.................................................
-	MPI_Bcast(&Nz,1,MPI_INT,0,comm);
-	MPI_Bcast(&nBlocks,1,MPI_INT,0,comm);
-	MPI_Bcast(&nthreads,1,MPI_INT,0,comm);
-	MPI_Bcast(&Fx,1,MPI_DOUBLE,0,comm);
-	MPI_Bcast(&Fy,1,MPI_DOUBLE,0,comm);
-	MPI_Bcast(&Fz,1,MPI_DOUBLE,0,comm);
-	MPI_Bcast(&tau,1,MPI_DOUBLE,0,comm);
-	MPI_Bcast(&alpha,1,MPI_DOUBLE,0,comm);
-	MPI_Bcast(&beta,1,MPI_DOUBLE,0,comm);
-	MPI_Bcast(&das,1,MPI_DOUBLE,0,comm);
-	MPI_Bcast(&dbs,1,MPI_DOUBLE,0,comm);
-	MPI_Bcast(&pBC,1,MPI_LOGICAL,0,comm);
-	MPI_Bcast(&din,1,MPI_DOUBLE,0,comm);
-	MPI_Bcast(&dout,1,MPI_DOUBLE,0,comm);
-	MPI_Bcast(&timestepMax,1,MPI_INT,0,comm);
-	MPI_Bcast(&interval,1,MPI_INT,0,comm);
-	MPI_Bcast(&tol,1,MPI_DOUBLE,0,comm);
-	//.................................................
-	MPI_Barrier(comm);
-	// **************************************************************
-#endif
-	
-	double rlxA = 1.f/tau;
-	double rlxB = 8.f*(2.f-rlxA)/(8.f-rlxA);
-
-	if (pBC && rank == 0){
-		printf("Assigning presusre boundary conditions \n");
-		printf("Inlet density = %f \n", din);
-		printf("Outlet density = %f \n", dout);
-	}
-
-	if (rank==0){
-		printf("....Parameters................\n");
-		printf("tau = %f \n", tau);
-		printf("alpha = %f \n", alpha);
-		printf("beta = %f \n", beta);
-		printf("das = %f \n", das);
-		printf("dbs = %f \n", dbs);
-		printf("Force(x) = %f \n", Fx);
-		printf("Force(y) = %f \n", Fy);
-		printf("Force(z) = %f \n", Fz);
-		printf("Nz = %i \n", Nz);
-		printf("timestepMax = %i \n", timestepMax);
-		printf("...............................\n");
-	}
-	
-	// Identical cubic sub-domains
-	Nx = Ny = Nz;// = 16*s;		// Cubic domain
-	int N = Nx*Ny*Nz;
-	int dist_mem_size = N*sizeof(double);
-
-//	unsigned int nBlocks = 32;
-//	int nthreads = 128;
-	int S = N/nthreads/nBlocks;
-	if (nBlocks*nthreads*S < N)	S++;
-//	int S = 1;
-	
-//	unsigned int nBlocks = N/nthreads + (N%nthreads == 0?0:1);
-//	dim3 grid(nBlocks,1,1);
-	if (rank==1){
-		printf("Number of blocks = %i \n", nBlocks);
-		printf("Threads per block = %i \n", nthreads);
-		printf("Sweeps per thread = %i \n", S);
-		printf("Number of nodes per side = %i \n", Nx);
-		printf("Total Number of nodes = %i \n", N);
-		printf("...............................\n");
-	}
-	
-	//.......................................................................
-	// .......... READ THE INPUT FILE .......................................
-	int n;
-	char value;
-	char *id;
-	id = new char[N];	
-	int sum = 0;
-	// RANK 0 READS THE INPUT FILE	
-	if (rank==0){
-		printf("Read input media... \n");
-		ifstream PM(FILENAME.c_str(),ios::binary);
-		for (int k=0;k<Nz;k++){
-			for (int j=0;j<Ny;j++){
-				for (int i=0;i<Nx;i++){
-					PM.read((char *) (&value), sizeof(value));
-					n = k*Nx*Ny+j*Nx+i;
-
-					if (value>0){
-						if (pBC) value=2; 	// Saturate with NWP
-						if (k<8){
-							value=1;
-						}
-					}
-
-					id[n] = value;
-					if (value > 0) sum++;
-				}
-			}
-		}
-		PM.close();
-		printf("File porosity = %f\n", double(sum)/N);
-	}
-	//......... for pressure BC only............................
-	// Void the first / last rows if pressure BC are to be used
-	if (pBC){
-		for (int k=0;k<Nz;k++){
-			for (int j=0;j<Ny;j++){
-				for (int i=0;i<Nx;i++){
-					n = k*Nx*Ny+j*Nx+i;
-					if (k<4) id[n] = 1;
-					if (k>Nz-5) id[n] = 2;
-				}
-			}
-			// Skip the non-boundary values
-			if (k==4)	k=Nz-5;
-		}
-	}
-#ifdef useMPI	//............................................................
-	MPI_Barrier(comm);
-	MPI_Bcast(&id[0],N,MPI_CHAR,0,comm);
-	MPI_Barrier(comm);
-#endif
-	if (rank == 0) printf("Domain set.\n");
-	//...........................................................................
-
-	int SBC;
-	int outlet = N-Nx*Ny;
-	if (pBC){
-		SBC = Nx*Ny/nthreads/nBlocks+1;
-		printf("Number of sweeps for inlet / outlet: %i \n", SBC);
-	}
-	//...........................................................................
-	
-	//...........................................................................
-	//...........device phase ID.................................................
-	char *ID;
-	cudaMalloc((void **) &ID, N);						// Allocate device memory
-	// Copy to the device
-	cudaMemcpy(ID, id, N, cudaMemcpyHostToDevice);
-	//...........................................................................
-	
-	//......................device distributions.................................
-	double *f_even,*f_odd;
-	//...........................................................................
-	cudaMalloc((void **) &f_even, 10*dist_mem_size);	// Allocate device memory
-	cudaMalloc((void **) &f_odd, 9*dist_mem_size);		// Allocate device memory
-//	f_even = new double[10*N];
-//	f_odd = new double[9*N];
-	//...........................................................................
-
-	//...........................................................................
-	//				MAIN  VARIABLES ALLOCATED HERE
-	//...........................................................................
-	double *Phi,*Den,*Copy;
-	double *ColorGrad, *Velocity;
-	//...........................................................................
-	cudaMalloc((void **) &Phi, dist_mem_size);
-	cudaMalloc((void **) &Den, 2*dist_mem_size);
-	cudaMalloc((void **) &Copy, 2*dist_mem_size);
-	cudaMalloc((void **) &Velocity, 3*dist_mem_size);
-	cudaMalloc((void **) &ColorGrad, 3*dist_mem_size);
-	//...........................................................................
-	
-	//...........................................................................
-	if (rank==0)	printf("Setting the distributions, size = : %i\n", N);
-	//...........................................................................
-	dvc_InitD3Q19(nBlocks, nthreads, S, ID, f_even, f_odd, Nx, Ny, Nz);
-	dvc_InitDenColor(nBlocks, nthreads, S, ID, Den, Phi,  das, dbs, N);
-	//...........................................................................
-	dvc_ComputePhi(nBlocks, nthreads, S,ID, Phi, Copy, Den, N);
-	//...........................................................................
-
-	int timestep;
-//	double starttime,stoptime;
-	if (rank==0)	printf("No. of timesteps: %i \n", timestepMax);
-	timestep = 0;
-	//.......create and start timer............
-	cudaEvent_t start, stop;
-	float time;
-	//.......create a stream for the LB calculation.......
-	cudaStream_t stream;
-	cudaStreamCreate(&stream);
-
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord( start, 0 );
-	//.........................................
-	//************ MAIN TIMESTEP LOOP ***************************************/
-	while (timestep < timestepMax){
-
-		//*************************************************************************
-		// 		Compute the color gradient
-		//*************************************************************************
-		dvc_ComputeColorGradient(nBlocks, nthreads, S,
-				ID, Phi, ColorGrad, Nx, Ny, Nz);
-		//*************************************************************************
-
-		//*************************************************************************
-		// 		Perform collision step for the momentum transport
-		//*************************************************************************
-		dvc_ColorCollide(nBlocks, nthreads, S,
-				ID, f_even, f_odd, ColorGrad, Velocity,
-				rlxA, rlxB,alpha, beta, Fx, Fy, Fz, Nx, Ny, Nz, pBC);
-		//*************************************************************************
-
-		//*************************************************************************
-		// 		Carry out the density streaming step for mass transport
-		//*************************************************************************
-		dvc_DensityStreamD3Q7(nBlocks, nthreads, S,
-				ID, Den, Copy, Phi, ColorGrad, Velocity,beta, Nx, Ny, Nz, pBC);
-		//*************************************************************************
-
-		//*************************************************************************
-		// 		Swap the distributions for momentum transport
-		//*************************************************************************
-		dvc_SwapD3Q19(nBlocks, nthreads, S, ID, f_even, f_odd, Nx, Ny, Nz);
-		//*************************************************************************
-
-		//*************************************************************************
-		// 		Compute the phase indicator field and reset Copy, Den
-		//*************************************************************************
-		dvc_ComputePhi(nBlocks, nthreads, S,ID, Phi, Copy, Den, N);
-		//*************************************************************************
-
-		dvc_Barrier();
-		timestep++;
-		//.............................................................................
-	}
-	//************************************************************************/
-	dvc_Barrier();
-	//.......... stop and destroy timer.............................
-	cudaEventRecord( stop, stream);
-	cudaEventSynchronize( stop );
-
-	cudaEventElapsedTime( &time, start, stop );
-	printf("CPU time = %f \n", time);
-
-	float MLUPS = 0.001*float(Nx*Ny*Nz)*timestep/time;
-	printf("MLUPS = %f \n", MLUPS);
-
-	cudaEventDestroy( start );
-	cudaEventDestroy( stop );
-
-	double *Data;
-	Data = new double[3*N];
-
-	cudaMemcpy(Data, Phi, dist_mem_size, cudaMemcpyDeviceToHost);
-
-	// Write out the Phase Indicator Field
-	FILE *phase;
-	phase = fopen("Phase.out","wb");
-	fwrite(Data,8,N,phase);
-	fclose(phase);
-
-	//....................................................
-	// Write out the pressure - (reuse Phi arrays since we're done with those)
-//	ComputeDensity<<< grid, nthreads>>> (ID, f_even, f_odd, Phi, Nx, Ny, Nz, S);
-//	cudaMemcpy(Data, Phi, dist_mem_size, cudaMemcpyDeviceToHost);
-//	FILE *PRESSURE;
-//	PRESSURE = fopen("Pressure.out","wb");
-//	fwrite(Phi,8,N,PRESSURE);
-//	fclose(PRESSURE);
-	//....................................................
-
-	// Write out the Color Gradient
-
-	cudaMemcpy(Data, ColorGrad, 3*dist_mem_size, cudaMemcpyDeviceToHost);
-
-	FILE *CG;
-	CG = fopen("ColorGrad.out","wb");
-	fwrite(Data,8,3*N,CG);
-	fclose(CG);
-	
-	// Write out the Velocity
-//	FILE *VEL;
-//	VEL = fopen("Velocity.out","wb");
-//	fwrite(Velocity,8,3*N,VEL);
-//	fclose(VEL);
-
-	// cleanup	
-	cudaFree(ID);
-	cudaFree(f_even);	cudaFree(f_odd);	
-	cudaFree(Velocity);
-	cudaFree(Phi);
-	
-	cudaFree (ColorGrad);
-	cudaFree (Den);		cudaFree(Copy);
-	cudaFree (Phi);
-	free(id);
-	
-	//***********Finish up!*********************************
-#ifdef useMPI
-	MPI_Finalize();
-#endif
-	return 0;
-	
-}
--- a/cuda/exe/lb2_Color_mpi.cpp
+++ b/cuda/exe/lb2_Color_mpi.cpp
--- a/cuda/exe/lb2_Color_pBC_wia_mpi.cpp
+++ b/cuda/exe/lb2_Color_pBC_wia_mpi.cpp