clean up

2023-10-23 04:19:15 -04:00 · 2023-10-23 04:19:15 -04:00 · 12a9496a7e
commit 12a9496a7e
parent 1ed3428ef6
9 changed files with 0 additions and 10384 deletions
--- a/cuda/exe/CMakeLists.txt
+++ b/cuda/exe/CMakeLists.txt
@ -1,11 +0,0 @@
 # Executables declared for the cuda/exe directory. Entries that are commented
 # out (lb1_MRT_mpi, lb2_Color_pBC_wia_mpi) are not passed to INSTALL_LBPM_EXE.
 # NOTE(review): INSTALL_LBPM_EXE and ADD_LBPM_WEEKLY_TEST are project macros
 # defined elsewhere — presumably they build/install and register a test.
 #INSTALL_LBPM_EXE( lb1_MRT_mpi )
 INSTALL_LBPM_EXE( lb2_Color )
 INSTALL_LBPM_EXE( lb2_Color_mpi )
 #INSTALL_LBPM_EXE( lb2_Color_pBC_wia_mpi )
 INSTALL_LBPM_EXE( lb2_Color_wia_mpi )
 # Run the serial ConstrainedBubble inputs as a weekly test
 # COPYONLY copies the example inputs into the build directory without
 # variable substitution, so the test finds Color.in and Domain.in next to it.
 CONFIGURE_FILE( ${LBPM_SOURCE_DIR}/example/ConstrainedBubble/Color.in ${CMAKE_CURRENT_BINARY_DIR}/Color.in COPYONLY )
 CONFIGURE_FILE( ${LBPM_SOURCE_DIR}/example/ConstrainedBubble/Domain.in ${CMAKE_CURRENT_BINARY_DIR}/Domain.in COPYONLY )
 # NOTE(review): the trailing 1 is presumably the process count — confirm
 # against the ADD_LBPM_WEEKLY_TEST definition.
 ADD_LBPM_WEEKLY_TEST( lb2_Color_wia_mpi 1 )
--- a/cuda/exe/lb1_MRT-swap.cu
+++ b/cuda/exe/lb1_MRT-swap.cu
@ -1,248 +0,0 @@
 #include <stdio.h>
 #include <iostream>
 #include <fstream>
 #include <cuda.h>
 //#include <cutil.h>
 using namespace std;
 //*************************************************************************
 // Forward declarations of the solver entry points, given C linkage. Their
 // definitions are not in this file. main() below passes each of them the
 // phase-indicator array, the even/odd distribution arrays, the grid extents
 // and the launch configuration (nblocks, nthreads, S).
 // NOTE(review): lb2_Color.cpp declares dvc_InitD3Q19 and dvc_SwapD3Q19 with
 // the launch configuration FIRST (nblocks, nthreads, S, ID, ...). Because of
 // the extern "C" linkage the parameter list is not part of the symbol name,
 // so linking against the other definition would not be diagnosed — confirm
 // which definition this file is linked with.
 extern "C" void dvc_InitD3Q19(char *ID, double *f_even, double *f_odd, int Nx,
 							  int Ny, int Nz, int nblocks, int nthreads, int S);
 //*************************************************************************
 extern "C" void dvc_SwapD3Q19(char *ID, double *f_even, double *f_odd, int Nx,
 							  int Ny, int Nz, int nblocks, int nthreads, int S);
 //*************************************************************************
 // rlxA / rlxB receive the two relaxation rates that main() derives from tau;
 // Fx, Fy, Fz receive the external force components read from the input file.
 extern "C" void dvc_MRT(char *ID, double *f_even, double *f_odd, double rlxA, double rlxB, double Fx, double Fy, double Fz,
 		int Nx, int Ny, int Nz, int nblocks, int nthreads, int S);
 //*************************************************************************
 /**
  * Dump a scalar field to the text file "dist.list" in the working directory.
  *
  * The field is read as a flat array with x varying fastest
  * (index = k*Nx*Ny + j*Nx + i). Each value is truncated to int and written as
  * "| <value>"; every x-row ends with " | " and every z-slice is followed by a
  * line of asterisks.
  *
  * @param array  field values, at least Nx*Ny*Nz entries
  * @param Nx,Ny,Nz  grid extents
  *
  * If the file cannot be opened an error is reported on stderr and nothing is
  * written.
  */
 void Write_Out(double *array, int Nx, int Ny, int Nz){
 	FILE *output = fopen("dist.list","w");
 	if (output == NULL){
 		// writing through a NULL FILE* is undefined behaviour, so bail out
 		fprintf(stderr, "Write_Out: unable to open dist.list for writing \n");
 		return;
 	}
 	for (int k=0; k<Nz; k++){
 		for (int j=0; j<Ny; j++){
 			// size_t arithmetic keeps the flat offset from overflowing int
 			size_t row = (size_t(k)*size_t(Ny) + size_t(j))*size_t(Nx);
 			for (int i=0; i<Nx; i++){
 				int value = int(array[row+i]);
 				fprintf(output, "| %i",value);
 			}
 			fprintf(output, " | \n");
 		}
 		fprintf(output,"************************************** \n");	
 	}
 	fclose(output);
 }
 //**************************************************************************
 // MRT implementation of the LBM using CUDA
 //**************************************************************************
 /**
  * Single-GPU driver for the D3Q19 MRT lattice Boltzmann solver.
  *
  * Reads the run parameters from "MRT.in", loads the binary phase-indicator
  * file named there (one char per node, cubic domain), runs timestepMax
  * swap + collision steps through dvc_SwapD3Q19 / dvc_MRT, prints the elapsed
  * time and MLUPS, and writes one x-row of z-velocity values to
  * "velocity.out".
  *
  * Returns 0 on success, 1 when an input file is missing or invalid, or when
  * a device allocation fails.
  */
 int main(void)
 {
 	int deviceCount = 0;
 	cudaGetDeviceCount(&deviceCount);
 	// Device 1 is the historical default of this driver; fall back to device 0
 	// on machines without a second GPU instead of requesting an invalid ordinal
 	int device = 1;
 	if (device >= deviceCount) device = 0;
 	printf("Number of devices = %i \n", deviceCount);
 	printf("Current device is = %i \n", device);
 	cudaSetDevice(device);
 	// BGK Model parameters
 	string FILENAME;	
 	unsigned int nBlocks = 0, nthreads = 0;
 	int timestepMax = 0, interval = 0;
 	double tau = 0.0, Fx = 0.0, Fy = 0.0, Fz = 0.0, tol = 0.0;
 	// Domain variables
 	int Nx = 0, Ny = 0, Nz = 0;
 	ifstream input("MRT.in");
 	if (!input){
 		fprintf(stderr, "Unable to open MRT.in \n");
 		return 1;
 	}
 	input >> FILENAME;		// name of the input file
 	input >> Nz;			// number of nodes (x,y,z)
 	input >> nBlocks;				
 	input >> nthreads;				
 	input >> tau;				// relaxation time 
 	input >> Fx;			// External force components (x,y,z)
 	input >> Fy;
 	input >> Fz;
 	input >> timestepMax;			// max no. of timesteps
 	// Everything the run depends on has been read at this point. Zero blocks
 	// or threads would divide by zero when computing S below.
 	if (!input || Nz <= 0 || nBlocks == 0 || nthreads == 0){
 		fprintf(stderr, "Invalid or incomplete parameters in MRT.in \n");
 		return 1;
 	}
 	// interval and tol are read for file-format compatibility but are not
 	// used by this driver, so a file that omits them is still accepted
 	input >> interval;			// error interval
 	input >> tol;				// error tolerance
 	double rlx_setA = 1.f/tau;
 	double rlx_setB = 8.f*(2.f-rlx_setA)/(8.f-rlx_setA);
 	printf("tau = %f \n", tau);
 	printf("Set A = %f \n", rlx_setA);
 	printf("Set B = %f \n", rlx_setB);
 	printf("Force(x) = %f \n", Fx);
 	printf("Force(y) = %f \n", Fy);
 	printf("Force(z) = %f \n", Fz);
 	Nx = Ny = Nz;	// Cubic domain
 	int N = Nx*Ny*Nz;
 	// size_t: 10*N*sizeof(double) no longer fits in int for grids of about 300^3
 	size_t dist_mem_size = size_t(N)*sizeof(double);
 //	unsigned int nBlocks = 32;
 //	int nthreads = 128;
 	int S = N/nthreads/nBlocks;
 //	unsigned int nBlocks = N/nthreads + (N%nthreads == 0?0:1);
 	dim3 grid(nBlocks,1,1);		// only referenced by the commented-out direct launches
 	printf("Number of blocks = %u \n", nBlocks);
 	printf("Threads per block = %u \n", nthreads);
 	printf("Sweeps per thread = %i \n", S);
 	printf("Number of nodes per side = %i \n", Nx);
 	printf("Total Number of nodes = %i \n", N);
 	//.......................................................................
 	printf("Read input media... \n");
 	// .......... READ THE INPUT FILE .......................................
 	ifstream PM(FILENAME.c_str(),ios::binary);
 	if (!PM){
 		fprintf(stderr, "Unable to open media file %s \n", FILENAME.c_str());
 		return 1;
 	}
 	char value = 0;		// stays defined even if a read fails
 	char *id = new char[N];	
 	int sum = 0;		// number of nodes with a positive phase id
 	for (int k=0;k<Nz;k++){
 		for (int j=0;j<Ny;j++){
 			for (int i=0;i<Nx;i++){
 				PM.read((char *) (&value), sizeof(value));
 				int n = k*Nx*Ny+j*Nx+i;
 				id[n] = value;
 				if (value > 0) sum++;
 			}
 		}
 	}
 	if (!PM){
 		fprintf(stderr, "Warning: media file %s holds fewer than %i values \n", FILENAME.c_str(), N);
 	}
 	PM.close();
 	printf("File porosity = %f\n", double(sum)/N);
 	//.......................................................................
 	//...........device phase ID and distributions...............................
 	char *ID = NULL;
 	double *f_even = NULL, *f_odd = NULL;
 	if (cudaMalloc((void **) &ID, N) != cudaSuccess ||
 		cudaMalloc((void **) &f_even, 10*dist_mem_size) != cudaSuccess ||
 		cudaMalloc((void **) &f_odd, 9*dist_mem_size) != cudaSuccess){
 		fprintf(stderr, "Device memory allocation failed \n");
 		// cudaFree accepts a null pointer, so partial allocations are released safely
 		cudaFree(f_odd);	cudaFree(f_even);	cudaFree(ID);
 		delete [] id;
 		return 1;
 	}
 	// Copy to the device
 	cudaMemcpy(ID, id, N, cudaMemcpyHostToDevice);
 	//...........................................................................
 //	cudaHostAlloc(&fa,dist_mem_size,cudaHostAllocPortable);
 //	cudaHostAlloc(&fb,dist_mem_size,cudaHostAllocPortable);
 //	cudaHostRegister(fa,dist_mem_size,cudaHostRegisterPortable);
 //	cudaHostRegister(fb,dist_mem_size,cudaHostRegisterPortable);
 //	cudaHostRegister(id,N*sizeof(char),cudaHostAllocPortable);
 	printf("Setting the distributions, size = : %i\n", N);
 	//...........................................................................
 //	INITIALIZE <<< grid, nthreads >>>  (ID, f_even, f_odd, Nx, Ny, Nz, S);
 	//...........................................................................
 	dvc_InitD3Q19(ID,f_even,f_odd,Nx,Ny,Nz,nBlocks,nthreads,S);
 	//*************************************************************************
 	int timestep = 0;
 	printf("No. of timesteps: %i \n", timestepMax);
 	//.......create a stream for the LB calculation.......
 	cudaStream_t stream;
 	cudaStreamCreate(&stream);
 	//.......create and start timer............
 	cudaEvent_t start, stop; 
 	float time = 0.f; 
 	cudaEventCreate(&start); 
 	cudaEventCreate(&stop); 
 	cudaEventRecord( start, 0 ); 
 	//.........................................
 	//************ MAIN ITERATION LOOP ***************************************/
 	while (timestep < timestepMax){
 		//........ Execute the swap kernel (device) .........................
 //		SWAP <<< grid, nthreads >>> (ID, f_even, f_odd, Nx, Ny, Nz, S);
 		dvc_SwapD3Q19(ID,f_even,f_odd,Nx,Ny,Nz,nBlocks,nthreads,S);
 		//........ Execute the collision kernel (device) ....................
 //		MRT <<< grid, nthreads >>> (ID, f_even, f_odd, Nx, Ny, Nz, S,
 //									rlx_setA, rlx_setB, Fx, Fy, Fz);
 		dvc_MRT(ID, f_even, f_odd, rlx_setA, rlx_setB, Fx, Fy, Fz,Nx,Ny,Nz,nBlocks,nthreads,S);
 		// Iteration completed!
 		timestep++;
 	}
 	//************************************************************************/
 	// cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() replaces it
 	cudaDeviceSynchronize();
 	//.......... stop and destroy timer.............................
 	cudaEventRecord( stop, stream); 
 	cudaEventSynchronize( stop ); 
 	// elapsed time between the two events, in milliseconds (the label below
 	// is kept unchanged for anything that parses the output)
 	cudaEventElapsedTime( &time, start, stop ); 
 	printf("CPU time = %f \n", time);
 	float MLUPS = 0.001*float(Nx*Ny*Nz)*timestep/time;
 	printf("MLUPS = %f \n", MLUPS);
 	cudaStreamDestroy(stream);
 	cudaEventDestroy( start ); 
 	cudaEventDestroy( stop ); 
 	//..............................................................
 	//.........Compute the velocity and copy result to host ........
 	double *velocity = new double[size_t(3)*N];
 	//......................device velocity.........................
 	double *vel = NULL;
 	if (cudaMalloc((void **) &vel, 3*dist_mem_size) != cudaSuccess){
 		fprintf(stderr, "Device memory allocation failed \n");
 		cudaFree(f_even);	cudaFree(f_odd);	cudaFree(ID);
 		delete [] velocity;	delete [] id;
 		return 1;
 	}
 	// NOTE(review): the velocity kernel below is commented out, so nothing on
 	// the device writes vel. Zero it so the copy-back does not read
 	// uninitialised device memory; the reported velocities are therefore 0
 	// until the kernel call is restored.
 	cudaMemset(vel, 0, 3*dist_mem_size);
 //	Compute_VELOCITY <<< grid, nthreads >>>  (ID, f_even, f_odd, vel, Nx, Ny, Nz, S);
 	//..............................................................................
 	cudaMemcpy(velocity, vel, 3*dist_mem_size, cudaMemcpyDeviceToHost);
 	//............................................................	
 	//....Write the z-velocity to test poiseuille flow............
 	double vz,vz_avg;	
 	vz_avg = 0.0;
 	FILE *output = fopen("velocity.out","w");
 	if (output == NULL){
 		fprintf(stderr, "Unable to open velocity.out for writing \n");
 	}
 	// only the first x-row (j = 0, k = 0) is sampled
 	for (int k=0; k<1; k++){
 		for (int j=0; j<1; j++){
 			for (int i=0; i<Nx; i++){
 				int n = k*Nx*Ny+j*Nx+i;
 				//.....print value........
 				vz = velocity[2*N+n];	// third component block of the velocity array
 				vz_avg += vz;
 				if (output != NULL) fprintf(output, " %e",vz);
 			}
 		}
 	}
 	if (output != NULL) fclose(output);
 	vz = vz_avg/double(sum);
 	printf("Average Velocity = %e\n", vz);
 	// cleanup	
 	cudaFree(f_even);	cudaFree(f_odd);	cudaFree(vel);	cudaFree(ID);
 	// both host arrays come from new[], so they must be released with delete[]
 	delete [] velocity;	delete [] id;
 	return 0;
 }
--- a/cuda/exe/lb1_MRT.cu
+++ b/cuda/exe/lb1_MRT.cu
@ -1,246 +0,0 @@
 #include <stdio.h>
 #include <iostream>
 #include <fstream>
 #include <cuda.h>
 using namespace std;
 //*************************************************************************
 // Forward declarations with C linkage; the definitions are not in this file.
 // main() below uses the first three as its only interface for device
 // allocation, host-to-device copies and synchronisation.
 // size is passed in bytes by main() (N chars, or multiples of N*sizeof(double)).
 extern "C" void dvc_AllocateDeviceMemory(void** address, size_t size);
 //*************************************************************************
 extern "C" void dvc_CopyToDevice(void* dest, void* source, size_t size);
 //*************************************************************************
 extern "C" void dvc_Barrier();
 //*************************************************************************
 // NOTE(review): lb2_Color.cpp declares dvc_InitD3Q19 and dvc_SwapD3Q19 with
 // the launch configuration FIRST (nblocks, nthreads, S, ID, ...). Because of
 // the extern "C" linkage the parameter list is not part of the symbol name,
 // so linking against the other definition would not be diagnosed — confirm
 // which definition this file is linked with.
 extern "C" void dvc_InitD3Q19(char *ID, double *f_even, double *f_odd, int Nx,
 							  int Ny, int Nz, int nblocks, int nthreads, int S);
 //*************************************************************************
 extern "C" void dvc_SwapD3Q19(char *ID, double *f_even, double *f_odd, int Nx,
 							  int Ny, int Nz, int nblocks, int nthreads, int S);
 //*************************************************************************
 // rlxA / rlxB receive the two relaxation rates that main() derives from tau;
 // Fx, Fy, Fz receive the external force components read from the input file.
 extern "C" void dvc_MRT(char *ID, double *f_even, double *f_odd, double rlxA, double rlxB, double Fx, double Fy, double Fz,
 		int Nx, int Ny, int Nz, int nblocks, int nthreads, int S);
 //*************************************************************************
 /**
  * Dump a scalar field to the text file "dist.list" in the working directory.
  *
  * The field is read as a flat array with x varying fastest
  * (index = k*Nx*Ny + j*Nx + i). Each value is truncated to int and written as
  * "| <value>"; every x-row ends with " | " and every z-slice is followed by a
  * line of asterisks.
  *
  * @param array  field values, at least Nx*Ny*Nz entries
  * @param Nx,Ny,Nz  grid extents
  *
  * If the file cannot be opened an error is reported on stderr and nothing is
  * written.
  */
 void Write_Out(double *array, int Nx, int Ny, int Nz){
 	FILE *output = fopen("dist.list","w");
 	if (output == NULL){
 		// writing through a NULL FILE* is undefined behaviour, so bail out
 		fprintf(stderr, "Write_Out: unable to open dist.list for writing \n");
 		return;
 	}
 	for (int k=0; k<Nz; k++){
 		for (int j=0; j<Ny; j++){
 			// size_t arithmetic keeps the flat offset from overflowing int
 			size_t row = (size_t(k)*size_t(Ny) + size_t(j))*size_t(Nx);
 			for (int i=0; i<Nx; i++){
 				int value = int(array[row+i]);
 				fprintf(output, "| %i",value);
 			}
 			fprintf(output, " | \n");
 		}
 		fprintf(output,"************************************** \n");	
 	}
 	fclose(output);
 }
 //**************************************************************************
 // MRT implementation of the LBM using CUDA
 //**************************************************************************
 /**
  * Single-GPU driver for the D3Q19 MRT lattice Boltzmann solver, written
  * against the dvc_ wrapper routines for allocation, copy and barrier.
  *
  * Reads the run parameters from "MRT.in", loads the binary phase-indicator
  * file named there (one char per node, cubic domain), runs timestepMax
  * swap + collision steps and prints the elapsed time and MLUPS. The
  * velocity output is disabled in this driver.
  *
  * Returns 0 on success and 1 when an input file is missing or invalid.
  */
 int main(void)
 {
 	// BGK Model parameters
 	string FILENAME;	
 	unsigned int nBlocks = 0, nthreads = 0;
 	int timestepMax = 0, interval = 0;
 	double tau = 0.0, Fx = 0.0, Fy = 0.0, Fz = 0.0, tol = 0.0;
 	// Domain variables
 	int Nx = 0, Ny = 0, Nz = 0;
 	ifstream input("MRT.in");
 	if (!input){
 		fprintf(stderr, "Unable to open MRT.in \n");
 		return 1;
 	}
 	input >> FILENAME;		// name of the input file
 	input >> Nz;			// number of nodes (x,y,z)
 	input >> nBlocks;				
 	input >> nthreads;				
 	input >> tau;				// relaxation time 
 	input >> Fx;			// External force components (x,y,z)
 	input >> Fy;
 	input >> Fz;
 	input >> timestepMax;			// max no. of timesteps
 	// Everything the run depends on has been read at this point. Zero blocks
 	// or threads would divide by zero when computing S below.
 	if (!input || Nz <= 0 || nBlocks == 0 || nthreads == 0){
 		fprintf(stderr, "Invalid or incomplete parameters in MRT.in \n");
 		return 1;
 	}
 	// interval and tol are read for file-format compatibility but are not
 	// used by this driver, so a file that omits them is still accepted
 	input >> interval;			// error interval
 	input >> tol;				// error tolerance
 	double rlx_setA = 1.f/tau;
 	double rlx_setB = 8.f*(2.f-rlx_setA)/(8.f-rlx_setA);
 	printf("tau = %f \n", tau);
 	printf("Set A = %f \n", rlx_setA);
 	printf("Set B = %f \n", rlx_setB);
 	printf("Force(x) = %f \n", Fx);
 	printf("Force(y) = %f \n", Fy);
 	printf("Force(z) = %f \n", Fz);
 	Nx = Ny = Nz;	// Cubic domain
 	int N = Nx*Ny*Nz;
 	// size_t: 10*N*sizeof(double) no longer fits in int for grids of about
 	// 300^3, and the allocation wrapper takes a size_t anyway
 	size_t dist_mem_size = size_t(N)*sizeof(double);
 //	unsigned int nBlocks = 32;
 //	int nthreads = 128;
 	int S = N/nthreads/nBlocks;
 //	unsigned int nBlocks = N/nthreads + (N%nthreads == 0?0:1);
 	dim3 grid(nBlocks,1,1);		// only referenced by the commented-out direct launches
 	printf("Number of blocks = %u \n", nBlocks);
 	printf("Threads per block = %u \n", nthreads);
 	printf("Sweeps per thread = %i \n", S);
 	printf("Number of nodes per side = %i \n", Nx);
 	printf("Total Number of nodes = %i \n", N);
 	//.......................................................................
 	printf("Read input media... \n");
 	// .......... READ THE INPUT FILE .......................................
 	ifstream PM(FILENAME.c_str(),ios::binary);
 	if (!PM){
 		fprintf(stderr, "Unable to open media file %s \n", FILENAME.c_str());
 		return 1;
 	}
 	char value = 0;		// stays defined even if a read fails
 	char *id = new char[N];	
 	int sum = 0;		// number of nodes with a positive phase id
 	for (int k=0;k<Nz;k++){
 		for (int j=0;j<Ny;j++){
 			for (int i=0;i<Nx;i++){
 				PM.read((char *) (&value), sizeof(value));
 				int n = k*Nx*Ny+j*Nx+i;
 				id[n] = value;
 				if (value > 0) sum++;
 			}
 		}
 	}
 	if (!PM){
 		fprintf(stderr, "Warning: media file %s holds fewer than %i values \n", FILENAME.c_str(), N);
 	}
 	PM.close();
 	printf("File porosity = %f\n", double(sum)/N);
 	//.......................................................................
 	//...........device phase ID.................................................
 	char *ID = NULL;
 	dvc_AllocateDeviceMemory((void **) &ID, N);						// Allocate device memory
 	// Copy to the device
 	dvc_CopyToDevice(ID, id, N);
 	//...........................................................................
 	//......................device distributions.................................
 	double *f_even = NULL, *f_odd = NULL;
 	dvc_AllocateDeviceMemory((void **) &f_even, 10*dist_mem_size);	// Allocate device memory
 	dvc_AllocateDeviceMemory((void **) &f_odd, 9*dist_mem_size);		// Allocate device memory
 	//...........................................................................
 //	cudaHostAlloc(&fa,dist_mem_size,cudaHostAllocPortable);
 //	cudaHostAlloc(&fb,dist_mem_size,cudaHostAllocPortable);
 //	cudaHostRegister(fa,dist_mem_size,cudaHostRegisterPortable);
 //	cudaHostRegister(fb,dist_mem_size,cudaHostRegisterPortable);
 //	cudaHostRegister(id,N*sizeof(char),cudaHostAllocPortable);
 	printf("Setting the distributions, size = : %i\n", N);
 	//...........................................................................
 //	INITIALIZE <<< grid, nthreads >>>  (ID, f_even, f_odd, Nx, Ny, Nz, S);
 	//...........................................................................
 	dvc_InitD3Q19(ID,f_even,f_odd,Nx,Ny,Nz,nBlocks,nthreads,S);
 	//*************************************************************************
 	int timestep = 0;
 	printf("No. of timesteps: %i \n", timestepMax);
 	//.......create a stream for the LB calculation.......
 	cudaStream_t stream;
 	cudaStreamCreate(&stream);
 	//.......create and start timer............
 	cudaEvent_t start, stop; 
 	float time = 0.f; 
 	cudaEventCreate(&start); 
 	cudaEventCreate(&stop); 
 	cudaEventRecord( start, 0 ); 
 	//.........................................
 	//************ MAIN ITERATION LOOP ***************************************/
 	while (timestep < timestepMax){
 		//........ Execute the swap kernel (device) .........................
 //		SWAP <<< grid, nthreads >>> (ID, f_even, f_odd, Nx, Ny, Nz, S);
 		dvc_SwapD3Q19(ID,f_even,f_odd,Nx,Ny,Nz,nBlocks,nthreads,S);
 		//........ Execute the collision kernel (device) ....................
 //		MRT <<< grid, nthreads >>> (ID, f_even, f_odd, Nx, Ny, Nz, S,
 //									rlx_setA, rlx_setB, Fx, Fy, Fz);
 		dvc_MRT(ID, f_even, f_odd, rlx_setA, rlx_setB, Fx, Fy, Fz,Nx,Ny,Nz,nBlocks,nthreads,S);
 		// Iteration completed!
 		timestep++;
 	}
 	//************************************************************************/
 //	cudaThreadSynchronize();
 	dvc_Barrier();
 	//.......... stop and destroy timer.............................
 	cudaEventRecord( stop, stream); 
 	cudaEventSynchronize( stop ); 
 	// elapsed time between the two events, in milliseconds (the label below
 	// is kept unchanged for anything that parses the output)
 	cudaEventElapsedTime( &time, start, stop ); 
 	printf("CPU time = %f \n", time);
 	float MLUPS = 0.001*float(Nx*Ny*Nz)*timestep/time;
 	printf("MLUPS = %f \n", MLUPS);
 	cudaStreamDestroy(stream);
 	cudaEventDestroy( start ); 
 	cudaEventDestroy( stop ); 
 	//..............................................................
 	// Velocity output is disabled in this driver. The original code is kept
 	// below as line comments (it was previously inside a block comment that
 	// itself contained a nested comment opener).
 //	//.........Compute the velocity and copy result to host ........
 //	double *velocity;
 //	velocity = new double[3*N];
 //	//......................device distributions....................
 //	double *vel;
 //	dvc_AllocateDeviceMemory((void **) &vel, 3*dist_mem_size);	// Allocate device memory
 //	Compute_VELOCITY <<< grid, nthreads >>>  (ID, f_even, f_odd, vel, Nx, Ny, Nz, S);
 //	cudaMemcpy(velocity, vel, 3*dist_mem_size, cudaMemcpyDeviceToHost);
 //	//....Write the z-velocity to test poiseuille flow............
 //	double vz,vz_avg;	
 //	vz_avg = 0.0;
 //	FILE *output;
 //	output = fopen("velocity.out","w");
 //	for (int k=0; k<1; k++){
 //		for (int j=0; j<1; j++){
 //			for (int i=0; i<Nx; i++){
 //				int n = k*Nx*Ny+j*Nx+i;
 //				//.....print value........
 //				vz = velocity[2*N+n];
 //				vz_avg += vz;
 //				fprintf(output, " %e",vz);
 //			}
 //		}
 //	}
 //	fclose(output);
 //	vz = vz_avg/double(sum);
 //	printf("Average Velocity = %e\n", vz);
 	// cleanup	
 	// No device-side free wrapper is declared in this file, so the device
 	// buffers are left to be reclaimed at process exit, as before.
 //	cudaFree(f_even);	cudaFree(f_odd);	cudaFree(vel);	cudaFree(ID);
 	// id comes from new[], so it is released with delete[]
 	delete [] id;
 	return 0;
 }
--- a/cuda/exe/lb1_MRT_mpi.cpp
+++ b/cuda/exe/lb1_MRT_mpi.cpp
--- a/cuda/exe/lb1_MRT_mpi.cu
+++ b/cuda/exe/lb1_MRT_mpi.cu
--- a/cuda/exe/lb2_Color.cpp
+++ b/cuda/exe/lb2_Color.cpp
@ -1,400 +0,0 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <iostream>
 #include <fstream>
 //*************************************************************************
 // Functions defined in Color.cu
 //*************************************************************************
 // Forward declarations with C linkage for the colour-model routines; the
 // section header above names Color.cu as the file that defines them. Every
 // routine takes the launch configuration (block count, threads per block, S)
 // as its first three arguments, followed by the phase-indicator array ID.
 // NOTE(review): main() passes pointers obtained from dvc_AllocateDeviceMemory
 // for the array arguments — presumably all of them must be device pointers.
 extern "C" void dvc_InitDenColor(int nblocks, int nthreads, int S, char *ID,
                                 double *Den, double *Phi, double das,
                                 double dbs, int N);
 //*************************************************************************
 extern "C" void dvc_ComputeColorGradient(int nBlocks, int nthreads, int S,
                                         char *ID, double *Phi,
                                         double *ColorGrad, int Nx, int Ny,
                                         int Nz);
 //*************************************************************************
 // rlxA / rlxB receive the relaxation rates main() derives from tau; pBC is
 // the pressure-boundary-condition flag read from Color.in.
 extern "C" void dvc_ColorCollide(int nBlocks, int nthreads, int S, char *ID,
                                 double *f_even, double *f_odd,
                                 double *ColorGrad, double *Velocity,
                                 double rlxA, double rlxB, double alpha,
                                 double beta, double Fx, double Fy, double Fz,
                                 int Nx, int Ny, int Nz, bool pBC);
 //*************************************************************************
 extern "C" void dvc_DensityStreamD3Q7(int nBlocks, int nthreads, int S,
                                      char *ID, double *Den, double *Copy,
                                      double *Phi, double *ColorGrad,
                                      double *Velocity, double beta, int Nx,
                                      int Ny, int Nz, bool pBC);
 //*************************************************************************
 extern "C" void dvc_ComputePhi(int nBlocks, int nthreads, int S, char *ID,
                               double *Phi, double *Copy, double *Den, int N);
 //*************************************************************************
 //*************************************************************************
 // Functions defined in D3Q19.cu
 //*************************************************************************
 // Forward declarations with C linkage; the section header above names
 // D3Q19.cu as the file that defines them.
 // NOTE(review): lb1_MRT.cu and lb1_MRT-swap.cu declare dvc_InitD3Q19 and
 // dvc_SwapD3Q19 with a different parameter order (ID, f_even, f_odd, Nx, Ny,
 // Nz, nblocks, nthreads, S). Because of the extern "C" linkage the parameter
 // list is not part of the symbol name, so a mismatch between declaration and
 // definition would not be diagnosed at link time — confirm which order the
 // linked definition uses.
 extern "C" void dvc_InitD3Q19(int nblocks, int nthreads, int S, char *ID,
                              double *f_even, double *f_odd, int Nx, int Ny,
                              int Nz);
 //*************************************************************************
 extern "C" void dvc_SwapD3Q19(int nblocks, int nthreads, int S, char *ID,
                              double *f_even, double *f_odd, int Nx, int Ny,
                              int Nz);
 //*************************************************************************
 // Pack/unpack routines take their own launch configuration (grid, threads)
 // rather than the (nblocks, nthreads, S) triple used above.
 // NOTE(review): presumably q selects the distribution component and
 // (Cqx, Cqy, Cqz) its lattice direction — confirm against D3Q19.cu.
 extern "C" void dvc_PackDist(int grid, int threads, int q, int *SendList,
                             int start, int sendCount, double *sendbuf,
                             double *Dist, int N);
 //*************************************************************************
 extern "C" void dvc_UnpackDist(int grid, int threads, int q, int Cqx, int Cqy,
                               int Cqz, int *RecvList, int start, int recvCount,
                               double *recvbuf, double *Dist, int Nx, int Ny,
                               int Nz);
 //*************************************************************************
 //***************************************************************************************
 // Functions defined in D3Q7.cu
 //***************************************************************************************
 // Forward declarations with C linkage; the section header above names
 // D3Q7.cu as the file that defines them. Each routine takes a launch
 // configuration (grid, threads), an index list with its length, a
 // send/receive buffer and the field array it operates on.
 // NOTE(review): "number" is presumably the count of values stored per node
 // in Data — confirm against D3Q7.cu.
 extern "C" void dvc_PackDenD3Q7(int grid, int threads, int *list, int count,
                                double *sendbuf, int number, double *Data,
                                int N);
 //***************************************************************************************
 extern "C" void dvc_UnpackDenD3Q7(int grid, int threads, int *list, int count,
                                  double *recvbuf, int number, double *Data,
                                  int N);
 //***************************************************************************************
 extern "C" void dvc_PackValues(int grid, int threads, int *list, int count,
                               double *sendbuf, double *Data, int N);
 //***************************************************************************************
 extern "C" void dvc_UnpackValues(int grid, int threads, int *list, int count,
                                 double *recvbuf, double *Data, int N);
 //***************************************************************************************
 //*************************************************************************
 // Functions defined in CudaExtras.cu
 //*************************************************************************
 // Forward declarations with C linkage; the section header above names
 // CudaExtras.cu as the file that defines them. They are the only interface
 // this .cpp file has for device allocation, copies and synchronisation.
 // main() passes all sizes in bytes.
 extern "C" void dvc_AllocateDeviceMemory(void **address, size_t size);
 //*************************************************************************
 extern "C" void dvc_CopyToDevice(void *dest, void *source, size_t size);
 //*************************************************************************
 extern "C" void dvc_CopyToHost(void *dest, void *source, size_t size);
 //*************************************************************************
 // NOTE(review): presumably blocks until outstanding device work has
 // finished — confirm against CudaExtras.cu.
 extern "C" void dvc_Barrier();
 //*************************************************************************
 //*************************************************************************
 // Implementation of Two-Phase Immiscible LBM using CUDA
 //*************************************************************************
 using namespace std;
 // Gather phase IDs into a contiguous send buffer: for each of the first
 // `count` entries of `list`, sendbuf[k] receives ID[list[k]]. Nothing is
 // written when count is zero or negative.
 inline void PackID(int *list, int count, char *sendbuf, char *ID) {
    const int *node = list;
    char *out = sendbuf;
    for (int remaining = count; remaining > 0; --remaining) {
        *out++ = ID[*node++];
    }
 }
 //***************************************************************************************
 // Scatter received phase IDs back into the ID array: for each of the first
 // `count` entries of `list`, ID[list[k]] receives recvbuf[k]. Entries are
 // applied in order, so a node listed twice keeps the later value.
 inline void UnpackID(int *list, int count, char *recvbuf, char *ID) {
    int pos = 0;
    while (pos < count) {
        ID[list[pos]] = recvbuf[pos];
        ++pos;
    }
 }
 //***************************************************************************************
 int main(int argc, char **argv) {
    int rank = 0;
    int nprocs = 1;
    int nprocx, nprocy, nprocz;
    int iproc, jproc, kproc;
    if (rank == 0) {
        printf("********************************************************\n");
        printf("Running Hybrid Implementation of Color LBM	\n");
        printf("********************************************************\n");
    }
    // Color Model parameters
    string FILENAME;
    unsigned int nBlocks, nthreads;
    int Nx, Ny, Nz;
    int timestepMax, interval;
    double tau, Fx, Fy, Fz, tol;
    double alpha, beta;
    double das, dbs;
    double din, dout;
    bool pBC;
    int i, j, k, n;
    if (rank == 0) {
        //.............................................................
        //		READ SIMULATION PARMAETERS FROM INPUT FILE
        //.............................................................
        ifstream input("Color.in");
        // Line 1: Name of the phase indicator file (s=0,w=1,n=2)
        input >> FILENAME;
        // Line 2: domain size (Nx, Ny, Nz)
        input >> Nz; // number of nodes (x,y,z)
        input >> nBlocks;
        input >> nthreads;
        // Line 3: model parameters (tau, alpha, beta, das, dbs)
        input >> tau;
        input >> alpha;
        input >> beta;
        input >> das;
        input >> dbs;
        // Line 4: External force components (Fx,Fy, Fz)
        input >> Fx;
        input >> Fy;
        input >> Fz;
        // Line 5: Pressure Boundary conditions
        input >> pBC;
        input >> din;
        input >> dout;
        // Line 6: time-stepping criteria
        input >> timestepMax; // max no. of timesteps
        input >> interval;    // error interval
        input >> tol;         // error tolerance
        //.............................................................
        ifstream domain("Domain.in");
        domain >> nprocx;
        domain >> nprocy;
        domain >> nprocz;
    }
    double rlxA = 1.f / tau;
    double rlxB = 8.f * (2.f - rlxA) / (8.f - rlxA);
    if (nprocs != nprocx * nprocy * nprocz) {
        printf("Fatal error in processor number! \n");
        printf("nprocx =  %i \n", nprocx);
        printf("nprocy =  %i \n", nprocy);
        printf("nprocz =  %i \n", nprocz);
    }
    if (rank == 0) {
        printf("********************************************************\n");
        printf("tau = %f \n", tau);
        printf("alpha = %f \n", alpha);
        printf("beta = %f \n", beta);
        printf("das = %f \n", beta);
        printf("dbs = %f \n", beta);
        printf("Force(x) = %f \n", Fx);
        printf("Force(y) = %f \n", Fy);
        printf("Force(z) = %f \n", Fz);
        printf("Sub-domain size = %i x %i x %i\n", Nz, Nz, Nz);
        printf("Parallel domain size = %i x %i x %i\n", nprocx, nprocy, nprocz);
        printf("********************************************************\n");
    }
    Nz += 2;
    Nx = Ny = Nz; // Cubic domain
    int N = Nx * Ny * Nz;
    int dist_mem_size = N * sizeof(double);
    //	unsigned int nBlocks = 32;
    //	int nthreads = 128;
    int S = N / nthreads / nBlocks;
    //	unsigned int nBlocks = N/nthreads + (N%nthreads == 0?0:1);
    //	dim3 grid(nBlocks,1,1);
    if (rank == 0)
        printf("Number of blocks = %i \n", nBlocks);
    if (rank == 0)
        printf("Threads per block = %i \n", nthreads);
    if (rank == 0)
        printf("Sweeps per thread = %i \n", S);
    if (rank == 0)
        printf("Number of nodes per side = %i \n", Nx);
    if (rank == 0)
        printf("Total Number of nodes = %i \n", N);
    if (rank == 0)
        printf("********************************************************\n");
    //.......................................................................
    if (rank == 0)
        printf("Read input media... \n");
    //.......................................................................
    char LocalRankString[8];
    char LocalRankFilename[40];
    sprintf(LocalRankString, "%05d", rank);
    sprintf(LocalRankFilename, "%s%s", "ID.", LocalRankString);
    //	printf("Local File Name =  %s \n",LocalRankFilename);
    // .......... READ THE INPUT FILE .......................................
    char value;
    char *id;
    id = new char[N];
    int sum = 0;
    //	double porosity;
    //.......................................................................
    ifstream PM(LocalRankFilename, ios::binary);
    for (k = 0; k < Nz; k++) {
        for (j = 0; j < Ny; j++) {
            for (i = 0; i < Nx; i++) {
                n = k * Nx * Ny + j * Nx + i;
                id[n] = 0;
            }
        }
    }
    for (k = 1; k < Nz - 1; k++) {
        for (j = 1; j < Ny - 1; j++) {
            for (i = 1; i < Nx - 1; i++) {
                PM.read((char *)(&value), sizeof(value));
                n = k * Nx * Ny + j * Nx + i;
                id[n] = value;
                if (value > 0)
                    sum++;
            }
        }
    }
    PM.close();
    //	printf("File porosity = %f\n", double(sum)/N);
    //...........device phase ID.................................................
    if (rank == 0)
        printf("Copying phase ID to device \n");
    char *ID;
    dvc_AllocateDeviceMemory((void **)&ID, N); // Allocate device memory
    // Copy to the device
    dvc_CopyToDevice(ID, id, N);
    //...........................................................................
    if (rank == 0)
        printf("Allocating distributions \n");
    //......................device distributions.................................
    double *f_even, *f_odd;
    //...........................................................................
    dvc_AllocateDeviceMemory((void **)&f_even,
                             10 * dist_mem_size); // Allocate device memory
    dvc_AllocateDeviceMemory((void **)&f_odd,
                             9 * dist_mem_size); // Allocate device memory
    //...........................................................................
    //...........................................................................
    //				MAIN  VARIABLES ALLOCATED HERE
    //...........................................................................
    double *Phi, *Den, *Copy;
    double *ColorGrad, *Velocity;
    //...........................................................................
    dvc_AllocateDeviceMemory((void **)&Phi, dist_mem_size);
    dvc_AllocateDeviceMemory((void **)&Den, 2 * dist_mem_size);
    dvc_AllocateDeviceMemory((void **)&Copy, 2 * dist_mem_size);
    dvc_AllocateDeviceMemory((void **)&Velocity, 3 * dist_mem_size);
    dvc_AllocateDeviceMemory((void **)&ColorGrad, 3 * dist_mem_size);
    //...........................................................................
    if (rank == 0)
        printf("Setting the distributions, size = : %i\n", N);
    //...........................................................................
    dvc_InitD3Q19(nBlocks, nthreads, S, ID, f_even, f_odd, Nx, Ny, Nz);
    dvc_InitDenColor(nBlocks, nthreads, S, ID, Den, Phi, das, dbs, N);
    //...........................................................................
    dvc_ComputePhi(nBlocks, nthreads, S, ID, Phi, Copy, Den, N);
    //...........................................................................
    //...........................................................................
    // Grids used to pack faces on the GPU for MPI
    int faceGrid, edgeGrid, packThreads;
    packThreads = 512;
    edgeGrid = 1;
    faceGrid = Nx * Ny / packThreads;
    int timestep = 0;
    if (rank == 0)
        printf("********************************************************\n");
    if (rank == 0)
        printf("No. of timesteps: %i \n", timestepMax);
    //.......create a stream for the LB calculation.......
    //	cudaStream_t stream;
    //	cudaStreamCreate(&stream);
    //.......create and start timer............
    double start, stop;
    double walltime;
    start = clock();
    //************ MAIN ITERATION LOOP ***************************************/
    while (timestep < timestepMax) {
        //*************************************************************************
        // 		Compute the color gradient
        //*************************************************************************
        dvc_ComputeColorGradient(nBlocks, nthreads, S, ID, Phi, ColorGrad, Nx,
                                 Ny, Nz);
        //*************************************************************************
        //*************************************************************************
        // 		Perform collision step for the momentum transport
        //*************************************************************************
        dvc_ColorCollide(nBlocks, nthreads, S, ID, f_even, f_odd, ColorGrad,
                         Velocity, rlxA, rlxB, alpha, beta, Fx, Fy, Fz, Nx, Ny,
                         Nz, pBC);
        //*************************************************************************
        //*************************************************************************
        // 		Carry out the density streaming step for mass transport
        //*************************************************************************
        dvc_DensityStreamD3Q7(nBlocks, nthreads, S, ID, Den, Copy, Phi,
                              ColorGrad, Velocity, beta, Nx, Ny, Nz, pBC);
        //*************************************************************************
        //*************************************************************************
        // 		Swap the distributions for momentum transport
        //*************************************************************************
        dvc_SwapD3Q19(nBlocks, nthreads, S, ID, f_even, f_odd, Nx, Ny, Nz);
        //*************************************************************************
        //*************************************************************************
        // 		Compute the phase indicator field and reset Copy, Den
        //*************************************************************************
        dvc_ComputePhi(nBlocks, nthreads, S, ID, Phi, Copy, Den, N);
        //*************************************************************************
        // Iteration completed!
        timestep++;
        //...................................................................
    }
    //************************************************************************/
    dvc_Barrier();
    stop = clock();
    //	cout << "CPU time: " << (stoptime - starttime) << " seconds" << endl;
    walltime = (stop - start) / CLOCKS_PER_SEC;
    //	cout << "Lattice update rate: "<< double(Nx*Ny*Nz*timestep)/cputime/1000000 <<  " MLUPS" << endl;
    double MLUPS = double(Nx * Ny * Nz * timestep) / walltime / 1000000;
    if (rank == 0)
        printf("********************************************************\n");
    if (rank == 0)
        printf("CPU time = %f \n", walltime);
    if (rank == 0)
        printf("Lattice update rate (per core)= %f MLUPS \n", MLUPS);
    MLUPS *= nprocs;
    if (rank == 0)
        printf("Lattice update rate (total)= %f MLUPS \n", MLUPS);
    if (rank == 0)
        printf("********************************************************\n");
    //************************************************************************/
    // Write out the phase indicator field
    //************************************************************************/
    sprintf(LocalRankFilename, "%s%s", "Phase.", LocalRankString);
    //	printf("Local File Name =  %s \n",LocalRankFilename);
    double *phiOut;
    phiOut = new double[N];
    dvc_CopyToHost(phiOut, Phi, N * sizeof(double));
    FILE *PHASE;
    PHASE = fopen(LocalRankFilename, "wb");
    fwrite(phiOut, 8, N, PHASE);
    fclose(PHASE);
    //************************************************************************/
 }
--- a/cuda/exe/lb2_Color.cu
+++ b/cuda/exe/lb2_Color.cu
@ -1,425 +0,0 @@
 #ifdef useMPI
 #include <mpi.h>
 #endif
 #include <stdio.h>
 #include <stdlib.h>
 #include <iostream>
 #include <fstream>
 #include <math.h>
 #include <cuda.h>
 using namespace std;
 //*************************************************************************
 // HokieSpeed
 //nvcc -Xcompiler -fopenmp -lgomp -O3 -arch sm_20 -o hybridATLKR lb2_ATLKR_hybrid.cu
 // -I$VT_MPI_INC -L$VT_MPI_LIB -lmpi
 //*************************************************************************
 //*************************************************************************
 // Implementation of Two-Phase Immiscible LBM using CUDA
 //*************************************************************************
 //*************************************************************************
 extern "C" void dvc_InitD3Q19(int nblocks, int nthreads, int S,
 		char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz);
 //*************************************************************************
 extern "C" void dvc_InitDenColor( int nblocks, int nthreads, int S,
 		char *ID, double *Den, double *Phi, double das, double dbs, int N);
 //*************************************************************************
 extern "C" void dvc_ComputeColorGradient(int nBlocks, int nthreads, int S,
 		char *ID, double *Phi, double *ColorGrad, int Nx, int Ny, int Nz);
 //*************************************************************************
 extern "C" void dvc_ColorCollide(int nBlocks, int nthreads, int S,
 		char *ID, double *f_even, double *f_odd, double *ColorGrad, double *Velocity,
 		double rlxA, double rlxB,double alpha, double beta, double Fx, double Fy, double Fz,
 		int Nx, int Ny, int Nz, bool pBC);
 //*************************************************************************
 extern "C" void dvc_DensityStreamD3Q7(int nBlocks, int nthreads, int S,
 		char *ID, double *Den, double *Copy, double *Phi, double *ColorGrad, double *Velocity,
 		double beta, int Nx, int Ny, int Nz, bool pBC);
 //*************************************************************************
 extern "C" void dvc_ComputePhi(int nBlocks, int nthreads, int S,
 		char *ID, double *Phi, double *Copy, double *Den, int N);
 //*************************************************************************
 extern "C" void dvc_AllocateDeviceMemory(void** address, size_t size);
 //*************************************************************************
 extern "C" void dvc_CopyToDevice(void* dest, void* source, size_t size);
 //*************************************************************************
 extern "C" void dvc_Barrier();
 //*************************************************************************
 extern "C" void dvc_SwapD3Q19(int nblocks, int nthreads, int S,
 		char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz);
 //*************************************************************************
 extern "C" void dvc_PackDist(int grid, int threads, int q, int *SendList, int start,
 		int sendCount, double *sendbuf, double *Dist, int N);
 //*************************************************************************
 extern "C" void dvc_UnpackDist(int grid, int threads, int q, int Cqx, int Cqy, int Cqz, int *RecvList, int start,
 		int recvCount, double *recvbuf, double *Dist, int Nx, int Ny, int Nz);
 //*************************************************************************
 int main(int argc, char *argv[])
 {
 	//********** Initialize MPI ****************
 	int numprocs,rank;
 #ifdef useMPI
 	MPI_Status stat;
 	MPI_Init(&argc,&argv);
    MPI_Comm comm = MPI_COMM_WORLD;
 	MPI_Comm_size(comm,&numprocs);
 	MPI_Comm_rank(comm,&rank);
 #else
    MPI_Comm comm = MPI_COMM_WORLD;
 	numprocs = 1;
 	rank = 0;
 #endif
 	//******************************************
 	if (rank == 0){
 		printf("********************************************************\n");
 		printf("Running Hybrid Implementation of Color LBM	\n");
 		printf("********************************************************\n");
 	}
 	// Color Model parameters
 	string FILENAME;
 	unsigned int nBlocks, nthreads;
 	int Nx,Ny,Nz;
 	int timestepMax, interval;
 	double tau,Fx,Fy,Fz,tol;
 	double alpha, beta;
 	double das, dbs;
 	double din,dout;
 	bool pBC;
 	if (rank==0){
 		//.............................................................
 		//		READ SIMULATION PARMAETERS FROM INPUT FILE 
 		//.............................................................
 		ifstream input("Color.in");
 		// Line 1: Name of the phase indicator file (s=0,w=1,n=2)
 		input >> FILENAME;				
 		// Line 2: domain size (Nx, Ny, Nz)
 		input >> Nz;				// number of nodes (x,y,z)
 		input >> nBlocks;				
 		input >> nthreads;			
 		// Line 3: model parameters (tau, alpha, beta, das, dbs)
 		input >> tau;
 		input >> alpha;
 		input >> beta;
 		input >> das;	
 		input >> dbs;
 		// Line 4: External force components (Fx,Fy, Fz)
 		input >> Fx;				
 		input >> Fy;
 		input >> Fz;
 		// Line 5: Pressure Boundary conditions
 		input >> pBC;
 		input >> din;
 		input >> dout;
 		// Line 6: time-stepping criteria
 		input >> timestepMax;			// max no. of timesteps
 		input >> interval;			// error interval
 		input >> tol;				// error tolerance
 		//.............................................................
 	}
 #ifdef useMPI
 	// **************************************************************
 	// Broadcast simulation parameters from rank 0 to all other procs
 	MPI_Barrier(comm);
 	//.................................................
 	MPI_Bcast(&Nz,1,MPI_INT,0,comm);
 	MPI_Bcast(&nBlocks,1,MPI_INT,0,comm);
 	MPI_Bcast(&nthreads,1,MPI_INT,0,comm);
 	MPI_Bcast(&Fx,1,MPI_DOUBLE,0,comm);
 	MPI_Bcast(&Fy,1,MPI_DOUBLE,0,comm);
 	MPI_Bcast(&Fz,1,MPI_DOUBLE,0,comm);
 	MPI_Bcast(&tau,1,MPI_DOUBLE,0,comm);
 	MPI_Bcast(&alpha,1,MPI_DOUBLE,0,comm);
 	MPI_Bcast(&beta,1,MPI_DOUBLE,0,comm);
 	MPI_Bcast(&das,1,MPI_DOUBLE,0,comm);
 	MPI_Bcast(&dbs,1,MPI_DOUBLE,0,comm);
 	MPI_Bcast(&pBC,1,MPI_LOGICAL,0,comm);
 	MPI_Bcast(&din,1,MPI_DOUBLE,0,comm);
 	MPI_Bcast(&dout,1,MPI_DOUBLE,0,comm);
 	MPI_Bcast(&timestepMax,1,MPI_INT,0,comm);
 	MPI_Bcast(&interval,1,MPI_INT,0,comm);
 	MPI_Bcast(&tol,1,MPI_DOUBLE,0,comm);
 	//.................................................
 	MPI_Barrier(comm);
 	// **************************************************************
 #endif
 	double rlxA = 1.f/tau;
 	double rlxB = 8.f*(2.f-rlxA)/(8.f-rlxA);
 	if (pBC && rank == 0){
 		printf("Assigning presusre boundary conditions \n");
 		printf("Inlet density = %f \n", din);
 		printf("Outlet density = %f \n", dout);
 	}
 	if (rank==0){
 		printf("....Parameters................\n");
 		printf("tau = %f \n", tau);
 		printf("alpha = %f \n", alpha);
 		printf("beta = %f \n", beta);
 		printf("das = %f \n", das);
 		printf("dbs = %f \n", dbs);
 		printf("Force(x) = %f \n", Fx);
 		printf("Force(y) = %f \n", Fy);
 		printf("Force(z) = %f \n", Fz);
 		printf("Nz = %i \n", Nz);
 		printf("timestepMax = %i \n", timestepMax);
 		printf("...............................\n");
 	}
 	// Identical cubic sub-domains
 	Nx = Ny = Nz;// = 16*s;		// Cubic domain
 	int N = Nx*Ny*Nz;
 	int dist_mem_size = N*sizeof(double);
 //	unsigned int nBlocks = 32;
 //	int nthreads = 128;
 	int S = N/nthreads/nBlocks;
 	if (nBlocks*nthreads*S < N)	S++;
 //	int S = 1;
 //	unsigned int nBlocks = N/nthreads + (N%nthreads == 0?0:1);
 //	dim3 grid(nBlocks,1,1);
 	if (rank==1){
 		printf("Number of blocks = %i \n", nBlocks);
 		printf("Threads per block = %i \n", nthreads);
 		printf("Sweeps per thread = %i \n", S);
 		printf("Number of nodes per side = %i \n", Nx);
 		printf("Total Number of nodes = %i \n", N);
 		printf("...............................\n");
 	}
 	//.......................................................................
 	// .......... READ THE INPUT FILE .......................................
 	int n;
 	char value;
 	char *id;
 	id = new char[N];	
 	int sum = 0;
 	// RANK 0 READS THE INPUT FILE	
 	if (rank==0){
 		printf("Read input media... \n");
 		ifstream PM(FILENAME.c_str(),ios::binary);
 		for (int k=0;k<Nz;k++){
 			for (int j=0;j<Ny;j++){
 				for (int i=0;i<Nx;i++){
 					PM.read((char *) (&value), sizeof(value));
 					n = k*Nx*Ny+j*Nx+i;
 					if (value>0){
 						if (pBC) value=2; 	// Saturate with NWP
 						if (k<8){
 							value=1;
 						}
 					}
 					id[n] = value;
 					if (value > 0) sum++;
 				}
 			}
 		}
 		PM.close();
 		printf("File porosity = %f\n", double(sum)/N);
 	}
 	//......... for pressure BC only............................
 	// Void the first / last rows if pressure BC are to be used
 	if (pBC){
 		for (int k=0;k<Nz;k++){
 			for (int j=0;j<Ny;j++){
 				for (int i=0;i<Nx;i++){
 					n = k*Nx*Ny+j*Nx+i;
 					if (k<4) id[n] = 1;
 					if (k>Nz-5) id[n] = 2;
 				}
 			}
 			// Skip the non-boundary values
 			if (k==4)	k=Nz-5;
 		}
 	}
 #ifdef useMPI	//............................................................
 	MPI_Barrier(comm);
 	MPI_Bcast(&id[0],N,MPI_CHAR,0,comm);
 	MPI_Barrier(comm);
 #endif
 	if (rank == 0) printf("Domain set.\n");
 	//...........................................................................
 	int SBC;
 	int outlet = N-Nx*Ny;
 	if (pBC){
 		SBC = Nx*Ny/nthreads/nBlocks+1;
 		printf("Number of sweeps for inlet / outlet: %i \n", SBC);
 	}
 	//...........................................................................
 	//...........................................................................
 	//...........device phase ID.................................................
 	char *ID;
 	cudaMalloc((void **) &ID, N);						// Allocate device memory
 	// Copy to the device
 	cudaMemcpy(ID, id, N, cudaMemcpyHostToDevice);
 	//...........................................................................
 	//......................device distributions.................................
 	double *f_even,*f_odd;
 	//...........................................................................
 	cudaMalloc((void **) &f_even, 10*dist_mem_size);	// Allocate device memory
 	cudaMalloc((void **) &f_odd, 9*dist_mem_size);		// Allocate device memory
 //	f_even = new double[10*N];
 //	f_odd = new double[9*N];
 	//...........................................................................
 	//...........................................................................
 	//				MAIN  VARIABLES ALLOCATED HERE
 	//...........................................................................
 	double *Phi,*Den,*Copy;
 	double *ColorGrad, *Velocity;
 	//...........................................................................
 	cudaMalloc((void **) &Phi, dist_mem_size);
 	cudaMalloc((void **) &Den, 2*dist_mem_size);
 	cudaMalloc((void **) &Copy, 2*dist_mem_size);
 	cudaMalloc((void **) &Velocity, 3*dist_mem_size);
 	cudaMalloc((void **) &ColorGrad, 3*dist_mem_size);
 	//...........................................................................
 	//...........................................................................
 	if (rank==0)	printf("Setting the distributions, size = : %i\n", N);
 	//...........................................................................
 	dvc_InitD3Q19(nBlocks, nthreads, S, ID, f_even, f_odd, Nx, Ny, Nz);
 	dvc_InitDenColor(nBlocks, nthreads, S, ID, Den, Phi,  das, dbs, N);
 	//...........................................................................
 	dvc_ComputePhi(nBlocks, nthreads, S,ID, Phi, Copy, Den, N);
 	//...........................................................................
 	int timestep;
 //	double starttime,stoptime;
 	if (rank==0)	printf("No. of timesteps: %i \n", timestepMax);
 	timestep = 0;
 	//.......create and start timer............
 	cudaEvent_t start, stop;
 	float time;
 	//.......create a stream for the LB calculation.......
 	cudaStream_t stream;
 	cudaStreamCreate(&stream);
 	cudaEventCreate(&start);
 	cudaEventCreate(&stop);
 	cudaEventRecord( start, 0 );
 	//.........................................
 	//************ MAIN TIMESTEP LOOP ***************************************/
 	while (timestep < timestepMax){
 		//*************************************************************************
 		// 		Compute the color gradient
 		//*************************************************************************
 		dvc_ComputeColorGradient(nBlocks, nthreads, S,
 				ID, Phi, ColorGrad, Nx, Ny, Nz);
 		//*************************************************************************
 		//*************************************************************************
 		// 		Perform collision step for the momentum transport
 		//*************************************************************************
 		dvc_ColorCollide(nBlocks, nthreads, S,
 				ID, f_even, f_odd, ColorGrad, Velocity,
 				rlxA, rlxB,alpha, beta, Fx, Fy, Fz, Nx, Ny, Nz, pBC);
 		//*************************************************************************
 		//*************************************************************************
 		// 		Carry out the density streaming step for mass transport
 		//*************************************************************************
 		dvc_DensityStreamD3Q7(nBlocks, nthreads, S,
 				ID, Den, Copy, Phi, ColorGrad, Velocity,beta, Nx, Ny, Nz, pBC);
 		//*************************************************************************
 		//*************************************************************************
 		// 		Swap the distributions for momentum transport
 		//*************************************************************************
 		dvc_SwapD3Q19(nBlocks, nthreads, S, ID, f_even, f_odd, Nx, Ny, Nz);
 		//*************************************************************************
 		//*************************************************************************
 		// 		Compute the phase indicator field and reset Copy, Den
 		//*************************************************************************
 		dvc_ComputePhi(nBlocks, nthreads, S,ID, Phi, Copy, Den, N);
 		//*************************************************************************
 		dvc_Barrier();
 		timestep++;
 		//.............................................................................
 	}
 	//************************************************************************/
 	dvc_Barrier();
 	//.......... stop and destroy timer.............................
 	cudaEventRecord( stop, stream);
 	cudaEventSynchronize( stop );
 	cudaEventElapsedTime( &time, start, stop );
 	printf("CPU time = %f \n", time);
 	float MLUPS = 0.001*float(Nx*Ny*Nz)*timestep/time;
 	printf("MLUPS = %f \n", MLUPS);
 	cudaEventDestroy( start );
 	cudaEventDestroy( stop );
 	double *Data;
 	Data = new double[3*N];
 	cudaMemcpy(Data, Phi, dist_mem_size, cudaMemcpyDeviceToHost);
 	// Write out the Phase Indicator Field
 	FILE *phase;
 	phase = fopen("Phase.out","wb");
 	fwrite(Data,8,N,phase);
 	fclose(phase);
 	//....................................................
 	// Write out the pressure - (reuse Phi arrays since we're done with those)
 //	ComputeDensity<<< grid, nthreads>>> (ID, f_even, f_odd, Phi, Nx, Ny, Nz, S);
 //	cudaMemcpy(Data, Phi, dist_mem_size, cudaMemcpyDeviceToHost);
 //	FILE *PRESSURE;
 //	PRESSURE = fopen("Pressure.out","wb");
 //	fwrite(Phi,8,N,PRESSURE);
 //	fclose(PRESSURE);
 	//....................................................
 	// Write out the Color Gradient
 	cudaMemcpy(Data, ColorGrad, 3*dist_mem_size, cudaMemcpyDeviceToHost);
 	FILE *CG;
 	CG = fopen("ColorGrad.out","wb");
 	fwrite(Data,8,3*N,CG);
 	fclose(CG);
 	// Write out the Velocity
 //	FILE *VEL;
 //	VEL = fopen("Velocity.out","wb");
 //	fwrite(Velocity,8,3*N,VEL);
 //	fclose(VEL);
 	// cleanup	
 	cudaFree(ID);
 	cudaFree(f_even);	cudaFree(f_odd);	
 	cudaFree(Velocity);
 	cudaFree(Phi);
 	cudaFree (ColorGrad);
 	cudaFree (Den);		cudaFree(Copy);
 	cudaFree (Phi);
 	free(id);
 	//***********Finish up!*********************************
 #ifdef useMPI
 	MPI_Finalize();
 #endif
 	return 0;
 }
--- a/cuda/exe/lb2_Color_mpi.cpp
+++ b/cuda/exe/lb2_Color_mpi.cpp
--- a/cuda/exe/lb2_Color_pBC_wia_mpi.cpp
+++ b/cuda/exe/lb2_Color_pBC_wia_mpi.cpp