Created preliminary cmake build

2013-12-05 11:42:57 -05:00
parent 9dd97d7f0f
commit 0430704252
39 changed files with 2644 additions and 2657 deletions
--- a/gpu/exe/CMakeLists.txt
+++ b/gpu/exe/CMakeLists.txt
@@ -0,0 +1,6 @@
+#INSTALL_LBPM_EXE( lb1_MRT_mpi )
+#INSTALL_LBPM_EXE( lb2_Color )
+#INSTALL_LBPM_EXE( lb2_Color_mpi )
+#INSTALL_LBPM_EXE( lb2_Color_pBC_wia_mpi )
+INSTALL_LBPM_EXE( lb2_Color_wia_mpi )
+
--- a/gpu/exe/lb1_MRT-swap.cu
+++ b/gpu/exe/lb1_MRT-swap.cu
@@ -0,0 +1,248 @@
+#include <stdio.h>
+#include <iostream>
+#include <fstream>
+#include <cuda.h>
+//#include <cutil.h>
+
+using namespace std;
+
+//*************************************************************************
+extern "C" void dvc_InitD3Q19(char *ID, double *f_even, double *f_odd, int Nx,
+							  int Ny, int Nz, int nblocks, int nthreads, int S);
+//*************************************************************************
+extern "C" void dvc_SwapD3Q19(char *ID, double *f_even, double *f_odd, int Nx,
+							  int Ny, int Nz, int nblocks, int nthreads, int S);
+//*************************************************************************
+extern "C" void dvc_MRT(char *ID, double *f_even, double *f_odd, double rlxA, double rlxB, double Fx, double Fy, double Fz,
+		int Nx, int Ny, int Nz, int nblocks, int nthreads, int S);
+//*************************************************************************
+
+// Dump a 3-D array to the text file "dist.list".
+//   array    - host array holding at least Nx*Ny*Nz doubles, addressed as
+//              array[k*Nx*Ny + j*Nx + i]
+//   Nx,Ny,Nz - extents of the array in x, y and z
+// Each value is truncated to int and written as "| value"; every (j,k) row is
+// terminated by " | " and every k-slab by a line of asterisks.
+// If the file cannot be opened an error is printed to stderr and nothing is written.
+void Write_Out(double *array, int Nx, int Ny, int Nz){
+	int value;
+	FILE *output;
+	output = fopen("dist.list","w");
+	if (output == NULL){
+		// A failed fopen used to fall through to fprintf/fclose on a NULL stream
+		fprintf(stderr, "Write_Out: unable to open dist.list for writing \n");
+		return;
+	}
+	for (int k=0; k<Nz; k++){
+		for (int j=0; j<Ny; j++){
+			for (int i=0; i<Nx; i++){
+				int index = k*Nx*Ny+j*Nx+i;
+				value = int(array[index]);
+				fprintf(output, "| %i",value);
+			}
+			fprintf(output, " | \n");
+		}
+		fprintf(output,"************************************** \n");
+	}
+	fclose(output);
+}
+
+//**************************************************************************
+// MRT implementation of the LBM using CUDA
+//**************************************************************************
+
+// Single-GPU driver for the D3Q19 MRT lattice Boltzmann test case.
+// Reads the run parameters from "MRT.in" and the phase ID field from the binary
+// file named there, initializes the distributions on the device, runs
+// timestepMax swap + collision steps, reports the elapsed time and update rate,
+// and writes one line of z-velocities to "velocity.out".
+// Returns 0 on success and 1 when no device or a required input is unavailable.
+int main(void)
+{
+
+	int deviceCount = 0;
+	cudaGetDeviceCount(&deviceCount);
+	if (deviceCount < 1){
+		fprintf(stderr, "Error: no CUDA capable device found \n");
+		return 1;
+	}
+	int device = 1;
+	// The preferred device is 1; fall back to device 0 on single-GPU machines
+	if (device >= deviceCount) device = 0;
+	printf("Number of devices = %i \n", deviceCount);
+	printf("Current device is = %i \n", device);
+	cudaSetDevice(device);
+
+	// BGK Model parameters
+	string FILENAME;
+	unsigned int nBlocks, nthreads;
+	int timestepMax;
+	int interval = 0;		// read from the input file but not used by this driver
+	double tau,Fx,Fy,Fz;
+	double tol = 0.0;		// read from the input file but not used by this driver
+	// Domain variables
+	int Nx,Ny,Nz;
+
+	ifstream input("MRT.in");
+	input >> FILENAME;		// name of the input file
+	input >> Nz;			// number of nodes (x,y,z)
+	input >> nBlocks;
+	input >> nthreads;
+	input >> tau;				// relaxation time
+	input >> Fx;			// External force components (x,y,z)
+	input >> Fy;
+	input >> Fz;
+	input >> timestepMax;			// max no. of timesteps
+	// Everything read so far is required; stop rather than run on uninitialized values
+	if (!input){
+		fprintf(stderr, "Error: could not read simulation parameters from MRT.in \n");
+		return 1;
+	}
+	input >> interval;			// error interval
+	input >> tol;				// error tolerance
+	// nthreads and nBlocks are used as divisors below
+	if (Nz < 1 || nBlocks == 0 || nthreads == 0){
+		fprintf(stderr, "Error: domain size, nBlocks and nthreads must be positive \n");
+		return 1;
+	}
+
+	double rlx_setA = 1.f/tau;
+	double rlx_setB = 8.f*(2.f-rlx_setA)/(8.f-rlx_setA);
+
+	printf("tau = %f \n", tau);
+	printf("Set A = %f \n", rlx_setA);
+	printf("Set B = %f \n", rlx_setB);
+	printf("Force(x) = %f \n", Fx);
+	printf("Force(y) = %f \n", Fy);
+	printf("Force(z) = %f \n", Fz);
+
+	Nx = Ny = Nz;	// Cubic domain
+
+	int N = Nx*Ny*Nz;
+	// size_t: 9x/10x this value exceeds the range of int for moderately large domains
+	size_t dist_mem_size = N*sizeof(double);
+
+//	unsigned int nBlocks = 32;
+//	int nthreads = 128;
+	int S = N/nthreads/nBlocks;
+
+//	unsigned int nBlocks = N/nthreads + (N%nthreads == 0?0:1);
+	dim3 grid(nBlocks,1,1);
+
+	printf("Number of blocks = %i \n", nBlocks);
+	printf("Threads per block = %i \n", nthreads);
+	printf("Sweeps per thread = %i \n", S);
+	printf("Number of nodes per side = %i \n", Nx);
+	printf("Total Number of nodes = %i \n", N);
+
+	//.......................................................................
+	printf("Read input media... \n");
+	// .......... READ THE INPUT FILE .......................................
+	int n;
+	char value;
+	char *id;
+	id = new char[N];
+	int sum = 0;
+	ifstream PM(FILENAME.c_str(),ios::binary);
+	if (!PM){
+		fprintf(stderr, "Error: could not open input media file %s \n", FILENAME.c_str());
+		delete [] id;
+		return 1;
+	}
+	for (int k=0;k<Nz;k++){
+		for (int j=0;j<Ny;j++){
+			for (int i=0;i<Nx;i++){
+				PM.read((char *) (&value), sizeof(value));
+				n = k*Nx*Ny+j*Nx+i;
+				id[n] = value;
+				if (value > 0) sum++;
+			}
+		}
+	}
+	PM.close();
+	printf("File porosity = %f\n", double(sum)/N);
+	//.......................................................................
+	//...........device phase ID.................................................
+	char *ID;
+	cudaMalloc((void **) &ID, N);						// Allocate device memory
+	// Copy to the device
+	cudaMemcpy(ID, id, N, cudaMemcpyHostToDevice);
+	//...........................................................................
+
+	//......................device distributions.................................
+	double *f_even,*f_odd;
+	//...........................................................................
+	cudaMalloc((void **) &f_even, 10*dist_mem_size);	// Allocate device memory
+	cudaMalloc((void **) &f_odd, 9*dist_mem_size);		// Allocate device memory
+	//...........................................................................
+
+	//...........................................................................
+//	cudaHostAlloc(&fa,dist_mem_size,cudaHostAllocPortable);
+//	cudaHostAlloc(&fb,dist_mem_size,cudaHostAllocPortable);
+//	cudaHostRegister(fa,dist_mem_size,cudaHostRegisterPortable);
+//	cudaHostRegister(fb,dist_mem_size,cudaHostRegisterPortable);
+//	cudaHostRegister(id,N*sizeof(char),cudaHostAllocPortable);
+
+	printf("Setting the distributions, size = : %i\n", N);
+	//...........................................................................
+//	INITIALIZE <<< grid, nthreads >>>  (ID, f_even, f_odd, Nx, Ny, Nz, S);
+	//...........................................................................
+	dvc_InitD3Q19(ID,f_even,f_odd,Nx,Ny,Nz,nBlocks,nthreads,S);
+	//*************************************************************************
+
+	int timestep = 0;
+	printf("No. of timesteps: %i \n", timestepMax);
+
+	//.......create a stream for the LB calculation.......
+	cudaStream_t stream;
+	cudaStreamCreate(&stream);
+
+	//.......create and start timer............
+	cudaEvent_t start, stop;
+	float time;
+
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+	cudaEventRecord( start, 0 );
+	//.........................................
+
+	//************ MAIN ITERATION LOOP ***************************************/
+	while (timestep < timestepMax){
+
+		//...................................................................
+		//........ Execute the swap kernel (device) .........................
+//		SWAP <<< grid, nthreads >>> (ID, f_even, f_odd, Nx, Ny, Nz, S);
+		//...................................................................
+		dvc_SwapD3Q19(ID,f_even,f_odd,Nx,Ny,Nz,nBlocks,nthreads,S);
+
+		//........ Execute the collision kernel (device) ....................
+//		MRT <<< grid, nthreads >>> (ID, f_even, f_odd, Nx, Ny, Nz, S,
+//									rlx_setA, rlx_setB, Fx, Fy, Fz);
+		//............................................................
+		dvc_MRT(ID, f_even, f_odd, rlx_setA, rlx_setB, Fx, Fy, Fz,Nx,Ny,Nz,nBlocks,nthreads,S);
+		// Iteration completed!
+
+		timestep++;
+		//...................................................................
+
+	}
+	//************************************************************************/
+
+	// cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its replacement
+	if (cudaDeviceSynchronize() != cudaSuccess){
+		fprintf(stderr, "Error: CUDA failure detected after the main iteration loop \n");
+	}
+	//.......... stop and destroy timer.............................
+	// Record stop on the same (default) stream as start so the interval is well defined
+	cudaEventRecord( stop, 0 );
+	cudaEventSynchronize( stop );
+
+	// Elapsed time between the two events, reported by CUDA in milliseconds
+	cudaEventElapsedTime( &time, start, stop );
+	printf("CPU time = %f \n", time);
+
+	float MLUPS = 0.001*float(Nx*Ny*Nz)*timestep/time;
+	printf("MLUPS = %f \n", MLUPS);
+
+	cudaStreamDestroy(stream);
+	cudaEventDestroy( start );
+	cudaEventDestroy( stop );
+	//..............................................................
+
+	//..............................................................
+	//.........Compute the velocity and copy result to host ........
+	double *velocity;
+	velocity = new double[3*N];
+	//......................device distributions....................................
+	double *vel;
+	//..............................................................................
+	cudaMalloc((void **) &vel, 3*dist_mem_size);	// Allocate device memory
+	// The velocity kernel below is disabled, so clear the buffer instead of
+	// copying back device memory that was never written
+	cudaMemset(vel, 0, 3*dist_mem_size);
+	//..............................................................................
+//	Compute_VELOCITY <<< grid, nthreads >>>  (ID, f_even, f_odd, vel, Nx, Ny, Nz, S);
+	//..............................................................................
+	cudaMemcpy(velocity, vel, 3*dist_mem_size, cudaMemcpyDeviceToHost);
+	//..............................................................................
+
+	//............................................................
+	//....Write the z-velocity to test poiseuille flow............
+	double vz,vz_avg;
+	vz_avg = 0.0;
+
+	FILE *output;
+	output = fopen("velocity.out","w");
+	if (output == NULL){
+		fprintf(stderr, "Warning: unable to open velocity.out for writing \n");
+	}
+	for (int k=0; k<1; k++){
+		for (int j=0; j<1; j++){
+			for (int i=0; i<Nx; i++){
+				int n = k*Nx*Ny+j*Nx+i;
+				//.....print value........
+				vz = velocity[2*N+n];
+				vz_avg += vz;
+				if (output != NULL) fprintf(output, " %e",vz);
+			}
+		}
+	}
+	if (output != NULL) fclose(output);
+
+	vz = vz_avg/double(sum);
+	printf("Average Velocity = %e\n", vz);
+
+
+	// cleanup
+	cudaFree(f_even);	cudaFree(f_odd);	cudaFree(vel);	cudaFree(ID);
+	// Both host arrays come from new[], so they must be released with delete[]
+	delete [] velocity;	delete [] id;
+
+	return 0;
+}
--- a/gpu/exe/lb1_MRT.cu
+++ b/gpu/exe/lb1_MRT.cu
@@ -0,0 +1,246 @@
+#include <stdio.h>
+#include <iostream>
+#include <fstream>
+#include <cuda.h>
+
+using namespace std;
+
+//*************************************************************************
+extern "C" void dvc_AllocateDeviceMemory(void** address, size_t size);
+//*************************************************************************
+extern "C" void dvc_CopyToDevice(void* dest, void* source, size_t size);
+//*************************************************************************
+extern "C" void dvc_Barrier();
+//*************************************************************************
+extern "C" void dvc_InitD3Q19(char *ID, double *f_even, double *f_odd, int Nx,
+							  int Ny, int Nz, int nblocks, int nthreads, int S);
+//*************************************************************************
+extern "C" void dvc_SwapD3Q19(char *ID, double *f_even, double *f_odd, int Nx,
+							  int Ny, int Nz, int nblocks, int nthreads, int S);
+//*************************************************************************
+extern "C" void dvc_MRT(char *ID, double *f_even, double *f_odd, double rlxA, double rlxB, double Fx, double Fy, double Fz,
+		int Nx, int Ny, int Nz, int nblocks, int nthreads, int S);
+//*************************************************************************
+
+// Dump a 3-D array to the text file "dist.list".
+//   array    - host array holding at least Nx*Ny*Nz doubles, addressed as
+//              array[k*Nx*Ny + j*Nx + i]
+//   Nx,Ny,Nz - extents of the array in x, y and z
+// Each value is truncated to int and written as "| value"; every (j,k) row is
+// terminated by " | " and every k-slab by a line of asterisks.
+// If the file cannot be opened an error is printed to stderr and nothing is written.
+void Write_Out(double *array, int Nx, int Ny, int Nz){
+	int value;
+	FILE *output;
+	output = fopen("dist.list","w");
+	if (output == NULL){
+		// A failed fopen used to fall through to fprintf/fclose on a NULL stream
+		fprintf(stderr, "Write_Out: unable to open dist.list for writing \n");
+		return;
+	}
+	for (int k=0; k<Nz; k++){
+		for (int j=0; j<Ny; j++){
+			for (int i=0; i<Nx; i++){
+				int index = k*Nx*Ny+j*Nx+i;
+				value = int(array[index]);
+				fprintf(output, "| %i",value);
+			}
+			fprintf(output, " | \n");
+		}
+		fprintf(output,"************************************** \n");
+	}
+	fclose(output);
+}
+
+//**************************************************************************
+// MRT implementation of the LBM using CUDA
+//**************************************************************************
+
+// Single-GPU driver for the D3Q19 MRT lattice Boltzmann test case, using the
+// dvc_* wrappers for device allocation, copies and synchronization.
+// Reads the run parameters from "MRT.in" and the phase ID field from the binary
+// file named there, initializes the distributions, runs timestepMax
+// swap + collision steps and reports the elapsed time and update rate.
+// Returns 0 on success and 1 when a required input is unavailable.
+int main(void)
+{
+
+	// BGK Model parameters
+	string FILENAME;
+	unsigned int nBlocks, nthreads;
+	int timestepMax;
+	int interval = 0;		// read from the input file but not used by this driver
+	double tau,Fx,Fy,Fz;
+	double tol = 0.0;		// read from the input file but not used by this driver
+	// Domain variables
+	int Nx,Ny,Nz;
+
+	ifstream input("MRT.in");
+	input >> FILENAME;		// name of the input file
+	input >> Nz;			// number of nodes (x,y,z)
+	input >> nBlocks;
+	input >> nthreads;
+	input >> tau;				// relaxation time
+	input >> Fx;			// External force components (x,y,z)
+	input >> Fy;
+	input >> Fz;
+	input >> timestepMax;			// max no. of timesteps
+	// Everything read so far is required; stop rather than run on uninitialized values
+	if (!input){
+		fprintf(stderr, "Error: could not read simulation parameters from MRT.in \n");
+		return 1;
+	}
+	input >> interval;			// error interval
+	input >> tol;				// error tolerance
+	// nthreads and nBlocks are used as divisors below
+	if (Nz < 1 || nBlocks == 0 || nthreads == 0){
+		fprintf(stderr, "Error: domain size, nBlocks and nthreads must be positive \n");
+		return 1;
+	}
+
+	double rlx_setA = 1.f/tau;
+	double rlx_setB = 8.f*(2.f-rlx_setA)/(8.f-rlx_setA);
+
+	printf("tau = %f \n", tau);
+	printf("Set A = %f \n", rlx_setA);
+	printf("Set B = %f \n", rlx_setB);
+	printf("Force(x) = %f \n", Fx);
+	printf("Force(y) = %f \n", Fy);
+	printf("Force(z) = %f \n", Fz);
+
+	Nx = Ny = Nz;	// Cubic domain
+
+	int N = Nx*Ny*Nz;
+	// size_t: 9x/10x this value exceeds the range of int for moderately large domains
+	size_t dist_mem_size = N*sizeof(double);
+
+//	unsigned int nBlocks = 32;
+//	int nthreads = 128;
+	int S = N/nthreads/nBlocks;
+
+//	unsigned int nBlocks = N/nthreads + (N%nthreads == 0?0:1);
+	dim3 grid(nBlocks,1,1);
+
+	printf("Number of blocks = %i \n", nBlocks);
+	printf("Threads per block = %i \n", nthreads);
+	printf("Sweeps per thread = %i \n", S);
+	printf("Number of nodes per side = %i \n", Nx);
+	printf("Total Number of nodes = %i \n", N);
+
+	//.......................................................................
+	printf("Read input media... \n");
+	// .......... READ THE INPUT FILE .......................................
+	int n;
+	char value;
+	char *id;
+	id = new char[N];
+	int sum = 0;
+	ifstream PM(FILENAME.c_str(),ios::binary);
+	if (!PM){
+		fprintf(stderr, "Error: could not open input media file %s \n", FILENAME.c_str());
+		delete [] id;
+		return 1;
+	}
+	for (int k=0;k<Nz;k++){
+		for (int j=0;j<Ny;j++){
+			for (int i=0;i<Nx;i++){
+				PM.read((char *) (&value), sizeof(value));
+				n = k*Nx*Ny+j*Nx+i;
+				id[n] = value;
+				if (value > 0) sum++;
+			}
+		}
+	}
+	PM.close();
+	printf("File porosity = %f\n", double(sum)/N);
+	//.......................................................................
+	//...........device phase ID.................................................
+	char *ID;
+	 dvc_AllocateDeviceMemory((void **) &ID, N);						// Allocate device memory
+	// Copy to the device
+	dvc_CopyToDevice(ID, id, N);
+	//...........................................................................
+
+	//......................device distributions.................................
+	double *f_even,*f_odd;
+	//...........................................................................
+	 dvc_AllocateDeviceMemory((void **) &f_even, 10*dist_mem_size);	// Allocate device memory
+	 dvc_AllocateDeviceMemory((void **) &f_odd, 9*dist_mem_size);		// Allocate device memory
+	//...........................................................................
+
+	//...........................................................................
+//	cudaHostAlloc(&fa,dist_mem_size,cudaHostAllocPortable);
+//	cudaHostAlloc(&fb,dist_mem_size,cudaHostAllocPortable);
+//	cudaHostRegister(fa,dist_mem_size,cudaHostRegisterPortable);
+//	cudaHostRegister(fb,dist_mem_size,cudaHostRegisterPortable);
+//	cudaHostRegister(id,N*sizeof(char),cudaHostAllocPortable);
+
+	printf("Setting the distributions, size = : %i\n", N);
+	//...........................................................................
+//	INITIALIZE <<< grid, nthreads >>>  (ID, f_even, f_odd, Nx, Ny, Nz, S);
+	//...........................................................................
+	dvc_InitD3Q19(ID,f_even,f_odd,Nx,Ny,Nz,nBlocks,nthreads,S);
+	//*************************************************************************
+
+	int timestep = 0;
+	printf("No. of timesteps: %i \n", timestepMax);
+
+	//.......create a stream for the LB calculation.......
+	cudaStream_t stream;
+	cudaStreamCreate(&stream);
+
+	//.......create and start timer............
+	cudaEvent_t start, stop;
+	float time;
+
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+	cudaEventRecord( start, 0 );
+	//.........................................
+
+	//************ MAIN ITERATION LOOP ***************************************/
+	while (timestep < timestepMax){
+
+		//...................................................................
+		//........ Execute the swap kernel (device) .........................
+//		SWAP <<< grid, nthreads >>> (ID, f_even, f_odd, Nx, Ny, Nz, S);
+		//...................................................................
+		dvc_SwapD3Q19(ID,f_even,f_odd,Nx,Ny,Nz,nBlocks,nthreads,S);
+
+		//........ Execute the collision kernel (device) ....................
+//		MRT <<< grid, nthreads >>> (ID, f_even, f_odd, Nx, Ny, Nz, S,
+//									rlx_setA, rlx_setB, Fx, Fy, Fz);
+		//............................................................
+		dvc_MRT(ID, f_even, f_odd, rlx_setA, rlx_setB, Fx, Fy, Fz,Nx,Ny,Nz,nBlocks,nthreads,S);
+		// Iteration completed!
+
+		timestep++;
+		//...................................................................
+
+	}
+	//************************************************************************/
+
+//	cudaThreadSynchronize();
+	dvc_Barrier();
+	//.......... stop and destroy timer.............................
+	// Record stop on the same (default) stream as start so the interval is well defined
+	cudaEventRecord( stop, 0 );
+	cudaEventSynchronize( stop );
+
+	// Elapsed time between the two events, reported by CUDA in milliseconds
+	cudaEventElapsedTime( &time, start, stop );
+	printf("CPU time = %f \n", time);
+
+	float MLUPS = 0.001*float(Nx*Ny*Nz)*timestep/time;
+	printf("MLUPS = %f \n", MLUPS);
+
+	cudaStreamDestroy(stream);
+	cudaEventDestroy( start );
+	cudaEventDestroy( stop );
+	//..............................................................
+
+	//..............................................................
+	/*//.........Compute the velocity and copy result to host ........
+	double *velocity;
+	velocity = new double[3*N];
+	//......................device distributions....................................
+	double *vel;
+	//..............................................................................
+	 dvc_AllocateDeviceMemory((void **) &vel, 3*dist_mem_size);	// Allocate device memory
+	//..............................................................................
+//	Compute_VELOCITY <<< grid, nthreads >>>  (ID, f_even, f_odd, vel, Nx, Ny, Nz, S);
+	//..............................................................................
+//	cudaMemcpy(velocity, vel, 3*dist_mem_size, cudaMemcpyDeviceToHost);
+	//..............................................................................
+
+	//............................................................
+	//....Write the z-velocity to test poiseuille flow............
+	double vz,vz_avg;
+	vz_avg = 0.0;
+
+/*	FILE *output;
+	output = fopen("velocity.out","w");
+	for (int k=0; k<1; k++){
+		for (int j=0; j<1; j++){
+			for (int i=0; i<Nx; i++){
+				int n = k*Nx*Ny+j*Nx+i;
+				//.....print value........
+				vz = velocity[2*N+n];
+				vz_avg += vz;
+				fprintf(output, " %e",vz);
+			}
+		}
+	}
+	fclose(output);
+
+	vz = vz_avg/double(sum);
+	printf("Average Velocity = %e\n", vz);
+*/
+	// cleanup
+//	cudaFree(f_even);	cudaFree(f_odd);	cudaFree(vel);	cudaFree(ID);
+//	free (velocity);	free(id);
+	// NOTE(review): the device buffers obtained through dvc_AllocateDeviceMemory are
+	// still not released here (no matching free wrapper is declared in this file).
+	// The host copy of the phase ID comes from new[] and is released with delete[].
+	delete [] id;
+
+	return 0;
+}
--- a/gpu/exe/lb1_MRT_mpi.cpp
+++ b/gpu/exe/lb1_MRT_mpi.cpp
--- a/gpu/exe/lb1_MRT_mpi.cu
+++ b/gpu/exe/lb1_MRT_mpi.cu
--- a/gpu/exe/lb2_Color.cpp
+++ b/gpu/exe/lb2_Color.cpp
@@ -0,0 +1,372 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include <fstream>
+
+//*************************************************************************
+// Functions defined in Color.cu
+//*************************************************************************
+extern "C" void dvc_InitDenColor( int nblocks, int nthreads, int S,
+		char *ID, double *Den, double *Phi, double das, double dbs, int N);
+//*************************************************************************
+extern "C" void dvc_ComputeColorGradient(int nBlocks, int nthreads, int S,
+		char *ID, double *Phi, double *ColorGrad, int Nx, int Ny, int Nz);
+//*************************************************************************
+extern "C" void dvc_ColorCollide(int nBlocks, int nthreads, int S,
+		char *ID, double *f_even, double *f_odd, double *ColorGrad, double *Velocity,
+		double rlxA, double rlxB,double alpha, double beta, double Fx, double Fy, double Fz,
+		int Nx, int Ny, int Nz, bool pBC);
+//*************************************************************************
+extern "C" void dvc_DensityStreamD3Q7(int nBlocks, int nthreads, int S,
+		char *ID, double *Den, double *Copy, double *Phi, double *ColorGrad, double *Velocity,
+		double beta, int Nx, int Ny, int Nz, bool pBC);
+//*************************************************************************
+extern "C" void dvc_ComputePhi(int nBlocks, int nthreads, int S,
+		char *ID, double *Phi, double *Copy, double *Den, int N);
+//*************************************************************************
+//*************************************************************************
+// Functions defined in D3Q19.cu
+//*************************************************************************
+extern "C" void dvc_InitD3Q19(int nblocks, int nthreads, int S, char *ID, double *f_even, double *f_odd, int Nx,
+							  int Ny, int Nz);
+//*************************************************************************
+extern "C" void dvc_SwapD3Q19(int nblocks, int nthreads, int S,
+		char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz);
+//*************************************************************************
+extern "C" void dvc_PackDist(int grid, int threads, int q, int *SendList, int start,
+		int sendCount, double *sendbuf, double *Dist, int N);
+//*************************************************************************
+extern "C" void dvc_UnpackDist(int grid, int threads, int q, int Cqx, int Cqy, int Cqz, int *RecvList, int start,
+		int recvCount, double *recvbuf, double *Dist, int Nx, int Ny, int Nz);
+//*************************************************************************
+//***************************************************************************************
+// Functions defined in D3Q7.cu
+//***************************************************************************************
+extern "C" void dvc_PackDenD3Q7(int grid, int threads, int *list, int count, double *sendbuf,
+		int number, double *Data, int N);
+//***************************************************************************************
+extern "C" void dvc_UnpackDenD3Q7(int grid, int threads, int *list, int count, double *recvbuf,
+		int number, double *Data, int N);
+//***************************************************************************************
+extern "C" void dvc_PackValues(int grid, int threads, int *list, int count, double *sendbuf,
+		double *Data, int N);
+//***************************************************************************************
+extern "C" void dvc_UnpackValues(int grid, int threads, int *list, int count, double *recvbuf,
+		double *Data, int N);
+//***************************************************************************************
+//*************************************************************************
+// Functions defined in CudaExtras.cu
+//*************************************************************************
+extern "C" void dvc_AllocateDeviceMemory(void** address, size_t size);
+//*************************************************************************
+extern "C" void dvc_CopyToDevice(void* dest, void* source, size_t size);
+//*************************************************************************
+extern "C" void dvc_CopyToHost(void* dest, void* source, size_t size);
+//*************************************************************************
+extern "C" void dvc_Barrier();
+//*************************************************************************
+
+//*************************************************************************
+// Implementation of Two-Phase Immiscible LBM using CUDA
+//*************************************************************************
+
+using namespace std;
+
+
+inline void PackID(int *list, int count, char *sendbuf, char *ID){
+	// Gather step for the phase ID exchange: the site indices in list select
+	// which ID values are placed, in order, into the outgoing buffer
+	int slot = 0;
+	while (slot < count){
+		const int site = list[slot];
+		sendbuf[slot] = ID[site];
+		++slot;
+	}
+}
+//***************************************************************************************
+inline void UnpackID(int *list, int count, char *recvbuf, char *ID){
+	// Scatter step for the phase ID exchange: once values have been received,
+	// entry slot of recvbuf is stored at the site index given by list[slot]
+	int slot = 0;
+	while (slot < count){
+		const int site = list[slot];
+		ID[site] = recvbuf[slot];
+		++slot;
+	}
+}
+//***************************************************************************************
+
+// Serial (single rank) host driver for the two-phase color lattice Boltzmann model.
+// Reads the model parameters from "Color.in" and the processor grid from
+// "Domain.in", loads the phase ID field from "ID.00000" into the interior of a
+// domain padded by one layer on each side, runs timestepMax iterations of the
+// color model through the dvc_* device wrappers, reports the update rate and
+// writes the final phase indicator field to "Phase.00000".
+// Returns 0 on success and 1 when a required input is unavailable.
+int main(int argc, char **argv)
+{
+
+	int rank = 0;
+	int nprocs =1;
+	// Default to a 1x1x1 processor grid so the values are defined without Domain.in
+	int nprocx = 1, nprocy = 1, nprocz = 1;
+	int iproc,jproc,kproc;
+	if (rank == 0){
+		printf("********************************************************\n");
+		printf("Running Hybrid Implementation of Color LBM	\n");
+		printf("********************************************************\n");
+	}
+	// Color Model parameters
+	string FILENAME;
+	unsigned int nBlocks, nthreads;
+	int Nx,Ny,Nz;
+	int timestepMax;
+	int interval = 0;		// read from the input file but not used by this driver
+	double tau,Fx,Fy,Fz;
+	double tol = 0.0;		// read from the input file but not used by this driver
+	double alpha, beta;
+	double das, dbs;
+	double din,dout;
+	bool pBC;
+	int i,j,k,n;
+
+	if (rank==0){
+		//.............................................................
+		//		READ SIMULATION PARAMETERS FROM INPUT FILE
+		//.............................................................
+		ifstream input("Color.in");
+		// Line 1: Name of the phase indicator file (s=0,w=1,n=2)
+		input >> FILENAME;
+		// Line 2: domain size (Nx, Ny, Nz)
+		input >> Nz;				// number of nodes (x,y,z)
+		input >> nBlocks;
+		input >> nthreads;
+		// Line 3: model parameters (tau, alpha, beta, das, dbs)
+		input >> tau;
+		input >> alpha;
+		input >> beta;
+		input >> das;
+		input >> dbs;
+		// Line 4: External force components (Fx,Fy, Fz)
+		input >> Fx;
+		input >> Fy;
+		input >> Fz;
+		// Line 5: Pressure Boundary conditions
+		input >> pBC;
+		input >> din;
+		input >> dout;
+		// Line 6: time-stepping criteria
+		input >> timestepMax;			// max no. of timesteps
+		// Everything read so far is required; stop rather than run on uninitialized values
+		if (!input){
+			fprintf(stderr, "Error: could not read simulation parameters from Color.in \n");
+			return 1;
+		}
+		input >> interval;			// error interval
+		input >> tol;				// error tolerance
+		//.............................................................
+		// nthreads and nBlocks are used as divisors below
+		if (Nz < 1 || nBlocks == 0 || nthreads == 0){
+			fprintf(stderr, "Error: domain size, nBlocks and nthreads must be positive \n");
+			return 1;
+		}
+
+		ifstream domain("Domain.in");
+		domain >> nprocx;
+		domain >> nprocy;
+		domain >> nprocz;
+	}
+
+	double rlxA = 1.f/tau;
+	double rlxB = 8.f*(2.f-rlxA)/(8.f-rlxA);
+
+	// The run continues after this message, as it did before; the processor
+	// grid is only reported, not used, by this serial driver
+	if (nprocs != nprocx*nprocy*nprocz){
+		printf("Fatal error in processor number! \n");
+		printf("nprocx =  %i \n",nprocx);
+		printf("nprocy =  %i \n",nprocy);
+		printf("nprocz =  %i \n",nprocz);
+	}
+
+	if (rank==0){
+		printf("********************************************************\n");
+		printf("tau = %f \n", tau);
+		printf("alpha = %f \n", alpha);
+		printf("beta = %f \n", beta);
+		// das and dbs used to print beta by mistake
+		printf("das = %f \n", das);
+		printf("dbs = %f \n", dbs);
+		printf("Force(x) = %f \n", Fx);
+		printf("Force(y) = %f \n", Fy);
+		printf("Force(z) = %f \n", Fz);
+		printf("Sub-domain size = %i x %i x %i\n",Nz,Nz,Nz);
+		printf("Parallel domain size = %i x %i x %i\n",nprocx,nprocy,nprocz);
+		printf("********************************************************\n");
+
+	}
+
+	Nz += 2;
+	Nx = Ny = Nz;	// Cubic domain
+
+	int N = Nx*Ny*Nz;
+	// size_t: 9x/10x this value exceeds the range of int for moderately large domains
+	size_t dist_mem_size = N*sizeof(double);
+
+//	unsigned int nBlocks = 32;
+//	int nthreads = 128;
+	int S = N/nthreads/nBlocks;
+
+//	unsigned int nBlocks = N/nthreads + (N%nthreads == 0?0:1);
+//	dim3 grid(nBlocks,1,1);
+
+	if (rank==0) printf("Number of blocks = %i \n", nBlocks);
+	if (rank==0) printf("Threads per block = %i \n", nthreads);
+	if (rank==0) printf("Sweeps per thread = %i \n", S);
+	if (rank==0) printf("Number of nodes per side = %i \n", Nx);
+	if (rank==0) printf("Total Number of nodes = %i \n", N);
+	if (rank==0) printf("********************************************************\n");
+
+	//.......................................................................
+	if (rank == 0)	printf("Read input media... \n");
+	//.......................................................................
+	char LocalRankString[8];
+	char LocalRankFilename[40];
+	sprintf(LocalRankString,"%05d",rank);
+	sprintf(LocalRankFilename,"%s%s","ID.",LocalRankString);
+//	printf("Local File Name =  %s \n",LocalRankFilename);
+	// .......... READ THE INPUT FILE .......................................
+	char value;
+	char *id;
+	id = new char[N];
+	int sum = 0;
+//	double porosity;
+	//.......................................................................
+	ifstream PM(LocalRankFilename,ios::binary);
+	if (!PM){
+		fprintf(stderr, "Error: could not open input media file %s \n", LocalRankFilename);
+		delete [] id;
+		return 1;
+	}
+	// Zero the whole padded array first so the one-node outer layer holds 0
+	for (k=0;k<Nz;k++){
+		for (j=0;j<Ny;j++){
+			for (i=0;i<Nx;i++){
+				n = k*Nx*Ny+j*Nx+i;
+				id[n] = 0;
+			}
+		}
+	}
+	// The file supplies the interior nodes only
+	for ( k=1;k<Nz-1;k++){
+		for ( j=1;j<Ny-1;j++){
+			for ( i=1;i<Nx-1;i++){
+				PM.read((char *) (&value), sizeof(value));
+				n = k*Nx*Ny+j*Nx+i;
+				id[n] = value;
+				if (value > 0) sum++;
+			}
+		}
+	}
+	PM.close();
+//	printf("File porosity = %f\n", double(sum)/N);
+
+	//...........device phase ID.................................................
+	if (rank==0)	printf ("Copying phase ID to device \n");
+	char *ID;
+	dvc_AllocateDeviceMemory((void **) &ID, N);						// Allocate device memory
+	// Copy to the device
+	dvc_CopyToDevice(ID, id, N);
+	//...........................................................................
+
+	if (rank==0)	printf ("Allocating distributions \n");
+	//......................device distributions.................................
+	double *f_even,*f_odd;
+	//...........................................................................
+	dvc_AllocateDeviceMemory((void **) &f_even, 10*dist_mem_size);	// Allocate device memory
+	dvc_AllocateDeviceMemory((void **) &f_odd, 9*dist_mem_size);	// Allocate device memory
+	//...........................................................................
+	//...........................................................................
+	//				MAIN  VARIABLES ALLOCATED HERE
+	//...........................................................................
+	double *Phi,*Den,*Copy;
+	double *ColorGrad, *Velocity;
+	//...........................................................................
+	dvc_AllocateDeviceMemory((void **) &Phi, dist_mem_size);
+	dvc_AllocateDeviceMemory((void **) &Den, 2*dist_mem_size);
+	dvc_AllocateDeviceMemory((void **) &Copy, 2*dist_mem_size);
+	dvc_AllocateDeviceMemory((void **) &Velocity, 3*dist_mem_size);
+	dvc_AllocateDeviceMemory((void **) &ColorGrad, 3*dist_mem_size);
+	//...........................................................................
+	if (rank==0)	printf("Setting the distributions, size = : %i\n", N);
+	//...........................................................................
+	dvc_InitD3Q19(nBlocks, nthreads, S, ID, f_even, f_odd, Nx, Ny, Nz);
+	dvc_InitDenColor(nBlocks, nthreads, S, ID, Den, Phi,  das, dbs, N);
+	//...........................................................................
+	dvc_ComputePhi(nBlocks, nthreads, S,ID, Phi, Copy, Den, N);
+	//...........................................................................
+
+	//...........................................................................
+	// Grids used to pack faces on the GPU for MPI
+	int faceGrid,edgeGrid,packThreads;
+	packThreads=512;
+	edgeGrid=1;
+	faceGrid=Nx*Ny/packThreads;
+
+
+	int timestep = 0;
+	if (rank==0) printf("********************************************************\n");
+	if (rank==0)	printf("No. of timesteps: %i \n", timestepMax);
+
+	//.......create a stream for the LB calculation.......
+//	cudaStream_t stream;
+//	cudaStreamCreate(&stream);
+
+	//.......create and start timer............
+	double start,stop;
+	double walltime;
+	start = clock();
+
+
+	//************ MAIN ITERATION LOOP ***************************************/
+	while (timestep < timestepMax){
+
+		//*************************************************************************
+		// 		Compute the color gradient
+		//*************************************************************************
+		dvc_ComputeColorGradient(nBlocks, nthreads, S,
+				ID, Phi, ColorGrad, Nx, Ny, Nz);
+		//*************************************************************************
+
+		//*************************************************************************
+		// 		Perform collision step for the momentum transport
+		//*************************************************************************
+		dvc_ColorCollide(nBlocks, nthreads, S,
+				ID, f_even, f_odd, ColorGrad, Velocity,
+				rlxA, rlxB,alpha, beta, Fx, Fy, Fz, Nx, Ny, Nz, pBC);
+		//*************************************************************************
+
+		//*************************************************************************
+		// 		Carry out the density streaming step for mass transport
+		//*************************************************************************
+		dvc_DensityStreamD3Q7(nBlocks, nthreads, S,
+				ID, Den, Copy, Phi, ColorGrad, Velocity,beta, Nx, Ny, Nz, pBC);
+		//*************************************************************************
+
+		//*************************************************************************
+		// 		Swap the distributions for momentum transport
+		//*************************************************************************
+		dvc_SwapD3Q19(nBlocks, nthreads, S, ID, f_even, f_odd, Nx, Ny, Nz);
+		//*************************************************************************
+
+		//*************************************************************************
+		// 		Compute the phase indicator field and reset Copy, Den
+		//*************************************************************************
+		dvc_ComputePhi(nBlocks, nthreads, S,ID, Phi, Copy, Den, N);
+		//*************************************************************************
+
+		// Iteration completed!
+		timestep++;
+
+		//...................................................................
+	}
+	//************************************************************************/
+	dvc_Barrier();
+	stop = clock();
+
+//	cout << "CPU time: " << (stoptime - starttime) << " seconds" << endl;
+	walltime = (stop - start)/CLOCKS_PER_SEC;
+//	cout << "Lattice update rate: "<< double(Nx*Ny*Nz*timestep)/cputime/1000000 <<  " MLUPS" << endl;
+	// Multiply in double: N*timestep exceeds the range of int for realistic runs
+	double MLUPS = double(N)*double(timestep)/walltime/1000000;
+	if (rank==0) printf("********************************************************\n");
+	if (rank==0) printf("CPU time = %f \n", walltime);
+	if (rank==0) printf("Lattice update rate (per core)= %f MLUPS \n", MLUPS);
+	MLUPS *= nprocs;
+	if (rank==0) printf("Lattice update rate (total)= %f MLUPS \n", MLUPS);
+	if (rank==0) printf("********************************************************\n");
+
+	//************************************************************************/
+	// Write out the phase indicator field
+	//************************************************************************/
+	sprintf(LocalRankFilename,"%s%s","Phase.",LocalRankString);
+	//	printf("Local File Name =  %s \n",LocalRankFilename);
+	double *phiOut;
+	phiOut = new double[N];
+	dvc_CopyToHost(phiOut,Phi,N*sizeof(double));
+
+	FILE *PHASE;
+	PHASE = fopen(LocalRankFilename,"wb");
+	if (PHASE == NULL){
+		fprintf(stderr, "Error: could not open %s for writing \n", LocalRankFilename);
+	}
+	else{
+		fwrite(phiOut,sizeof(double),N,PHASE);
+		fclose(PHASE);
+	}
+	//************************************************************************/
+
+	// Release the host buffers allocated with new[]
+	// NOTE(review): the device buffers obtained through dvc_AllocateDeviceMemory are
+	// not released here (no matching free wrapper is declared in this file).
+	delete [] phiOut;
+	delete [] id;
+
+	return 0;
+}
--- a/gpu/exe/lb2_Color.cu
+++ b/gpu/exe/lb2_Color.cu
@@ -0,0 +1,423 @@
+#ifdef useMPI
+#include <mpi.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include <fstream>
+#include <math.h>
+#include <cuda.h>
+
+using namespace std;
+//*************************************************************************
+// HokieSpeed
+//nvcc -Xcompiler -fopenmp -lgomp -O3 -arch sm_20 -o hybridATLKR lb2_ATLKR_hybrid.cu
+// -I$VT_MPI_INC -L$VT_MPI_LIB -lmpi
+//*************************************************************************
+
+//*************************************************************************
+// Implementation of Two-Phase Immiscible LBM using CUDA
+//*************************************************************************
+
+//*************************************************************************
+// Declarations of the dvc_* routines used by main(). All have C linkage and
+// are defined outside this file. For the routines main() calls, the leading
+// launch arguments are passed as: nblocks / nBlocks = number of blocks,
+// nthreads = threads per block, S = sweeps per thread.
+// NOTE(review): dvc_AllocateDeviceMemory, dvc_CopyToDevice, dvc_PackDist and
+// dvc_UnpackDist are declared here but are never called in this file.
+//*************************************************************************
+extern "C" void dvc_InitD3Q19(int nblocks, int nthreads, int S,
+		char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz);
+//*************************************************************************
+extern "C" void dvc_InitDenColor( int nblocks, int nthreads, int S,
+		char *ID, double *Den, double *Phi, double das, double dbs, int N);
+//*************************************************************************
+extern "C" void dvc_ComputeColorGradient(int nBlocks, int nthreads, int S,
+		char *ID, double *Phi, double *ColorGrad, int Nx, int Ny, int Nz);
+//*************************************************************************
+extern "C" void dvc_ColorCollide(int nBlocks, int nthreads, int S,
+		char *ID, double *f_even, double *f_odd, double *ColorGrad, double *Velocity,
+		double rlxA, double rlxB,double alpha, double beta, double Fx, double Fy, double Fz,
+		int Nx, int Ny, int Nz, bool pBC);
+//*************************************************************************
+extern "C" void dvc_DensityStreamD3Q7(int nBlocks, int nthreads, int S,
+		char *ID, double *Den, double *Copy, double *Phi, double *ColorGrad, double *Velocity,
+		double beta, int Nx, int Ny, int Nz, bool pBC);
+//*************************************************************************
+extern "C" void dvc_ComputePhi(int nBlocks, int nthreads, int S,
+		char *ID, double *Phi, double *Copy, double *Den, int N);
+//*************************************************************************
+extern "C" void dvc_AllocateDeviceMemory(void** address, size_t size);
+//*************************************************************************
+extern "C" void dvc_CopyToDevice(void* dest, void* source, size_t size);
+//*************************************************************************
+extern "C" void dvc_Barrier();
+//*************************************************************************
+extern "C" void dvc_SwapD3Q19(int nblocks, int nthreads, int S,
+		char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz);
+//*************************************************************************
+extern "C" void dvc_PackDist(int grid, int threads, int q, int *SendList, int start,
+		int sendCount, double *sendbuf, double *Dist, int N);
+//*************************************************************************
+extern "C" void dvc_UnpackDist(int grid, int threads, int q, int Cqx, int Cqy, int Cqz, int *RecvList, int start,
+		int recvCount, double *recvbuf, double *Dist, int Nx, int Ny, int Nz);
+//*************************************************************************
+
+
+//*************************************************************************
+// Print an error message and terminate the run. When built with useMPI the
+// whole job is aborted so that no rank is left waiting in a collective.
+//*************************************************************************
+static void abortRun(const char *message)
+{
+	fprintf(stderr, "ERROR: %s\n", message);
+#ifdef useMPI
+	MPI_Abort(MPI_COMM_WORLD, 1);
+#endif
+	exit(EXIT_FAILURE);
+}
+
+//*************************************************************************
+// Terminate the run if a CUDA runtime call did not return cudaSuccess.
+//   status - value returned by the CUDA runtime call
+//   what   - short label for the call, used in the error message
+//*************************************************************************
+static void checkCuda(cudaError_t status, const char *what)
+{
+	if (status != cudaSuccess){
+		fprintf(stderr, "CUDA failure in %s: %s\n", what, cudaGetErrorString(status));
+		abortRun("CUDA runtime call failed");
+	}
+}
+
+//*************************************************************************
+// Driver for the two-phase color LBM.
+//
+// Reads the run parameters from "Color.in" (rank 0 only), in this order:
+//   FILENAME
+//   Nz nBlocks nthreads
+//   tau alpha beta das dbs
+//   Fx Fy Fz
+//   pBC din dout
+//   timestepMax interval tol
+// The domain is cubic (Nx = Ny = Nz). FILENAME is read as a binary file of
+// Nx*Ny*Nz one-byte phase labels. After timestepMax iterations the phase
+// indicator field is written to "Phase.out" and the color gradient to
+// "ColorGrad.out" as raw doubles.
+//
+// Returns 0 on success; terminates through abortRun() on a failed file open,
+// an invalid launch configuration or a failed CUDA runtime call.
+//*************************************************************************
+int main(int argc, char *argv[])
+{
+	
+	//********** Initialize MPI ****************
+	int numprocs,rank;
+#ifdef useMPI
+	MPI_Init(&argc,&argv);
+	MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
+	MPI_Comm_rank(MPI_COMM_WORLD,&rank);
+#else
+	numprocs = 1;
+	rank = 0;
+#endif
+	//******************************************
+	
+	if (rank == 0){
+		printf("********************************************************\n");
+		printf("Running Hybrid Implementation of Color LBM	\n");
+		printf("********************************************************\n");
+	}
+	// Color Model parameters
+	string FILENAME;
+	unsigned int nBlocks, nthreads;
+	int Nx,Ny,Nz;
+	int timestepMax, interval;
+	double tau,Fx,Fy,Fz,tol;
+	double alpha, beta;
+	double das, dbs;
+	double din,dout;
+	// Initialised so that non-root ranks never read an indeterminate bool
+	// before the broadcast below.
+	bool pBC = false;
+
+	if (rank==0){
+		//.............................................................
+		//		READ SIMULATION PARAMETERS FROM INPUT FILE 
+		//.............................................................
+		ifstream input("Color.in");
+		if (!input.is_open()) abortRun("unable to open input file Color.in");
+		// Line 1: Name of the phase indicator file (s=0,w=1,n=2)
+		input >> FILENAME;				
+		// Line 2: domain size and launch configuration (Nz, nBlocks, nthreads)
+		input >> Nz;				// number of nodes per side (cubic domain)
+		input >> nBlocks;				
+		input >> nthreads;			
+		// Line 3: model parameters (tau, alpha, beta, das, dbs)
+		input >> tau;
+		input >> alpha;
+		input >> beta;
+		input >> das;	
+		input >> dbs;
+		// Line 4: External force components (Fx,Fy, Fz)
+		input >> Fx;				
+		input >> Fy;
+		input >> Fz;
+		// Line 5: Pressure Boundary conditions
+		input >> pBC;
+		input >> din;
+		input >> dout;
+		// Line 6: time-stepping criteria
+		input >> timestepMax;			// max no. of timesteps
+		input >> interval;			// error interval
+		input >> tol;				// error tolerance
+		//.............................................................
+	}
+#ifdef useMPI
+	// **************************************************************
+	// Broadcast simulation parameters from rank 0 to all other procs
+	MPI_Barrier(MPI_COMM_WORLD);
+	//.................................................
+	MPI_Bcast(&Nz,1,MPI_INT,0,MPI_COMM_WORLD);
+	// nBlocks / nthreads are unsigned int, so use the matching MPI datatype
+	MPI_Bcast(&nBlocks,1,MPI_UNSIGNED,0,MPI_COMM_WORLD);
+	MPI_Bcast(&nthreads,1,MPI_UNSIGNED,0,MPI_COMM_WORLD);
+	MPI_Bcast(&Fx,1,MPI_DOUBLE,0,MPI_COMM_WORLD);
+	MPI_Bcast(&Fy,1,MPI_DOUBLE,0,MPI_COMM_WORLD);
+	MPI_Bcast(&Fz,1,MPI_DOUBLE,0,MPI_COMM_WORLD);
+	MPI_Bcast(&tau,1,MPI_DOUBLE,0,MPI_COMM_WORLD);
+	MPI_Bcast(&alpha,1,MPI_DOUBLE,0,MPI_COMM_WORLD);
+	MPI_Bcast(&beta,1,MPI_DOUBLE,0,MPI_COMM_WORLD);
+	MPI_Bcast(&das,1,MPI_DOUBLE,0,MPI_COMM_WORLD);
+	MPI_Bcast(&dbs,1,MPI_DOUBLE,0,MPI_COMM_WORLD);
+	// MPI_LOGICAL describes a Fortran LOGICAL, not a C++ bool; send the flag
+	// as an int instead.
+	int pBCflag = pBC ? 1 : 0;
+	MPI_Bcast(&pBCflag,1,MPI_INT,0,MPI_COMM_WORLD);
+	pBC = (pBCflag != 0);
+	MPI_Bcast(&din,1,MPI_DOUBLE,0,MPI_COMM_WORLD);
+	MPI_Bcast(&dout,1,MPI_DOUBLE,0,MPI_COMM_WORLD);
+	MPI_Bcast(&timestepMax,1,MPI_INT,0,MPI_COMM_WORLD);
+	MPI_Bcast(&interval,1,MPI_INT,0,MPI_COMM_WORLD);
+	MPI_Bcast(&tol,1,MPI_DOUBLE,0,MPI_COMM_WORLD);
+	//.................................................
+	MPI_Barrier(MPI_COMM_WORLD);
+	// **************************************************************
+#endif
+
+	// The sweep count below divides by nthreads and nBlocks, and every array
+	// is sized from Nz, so reject values that cannot describe a run.
+	if (Nz <= 0 || nBlocks == 0 || nthreads == 0){
+		abortRun("Nz, nBlocks and nthreads must all be positive");
+	}
+	
+	double rlxA = 1.f/tau;
+	double rlxB = 8.f*(2.f-rlxA)/(8.f-rlxA);
+
+	if (pBC && rank == 0){
+		printf("Assigning presusre boundary conditions \n");
+		printf("Inlet density = %f \n", din);
+		printf("Outlet density = %f \n", dout);
+	}
+
+	if (rank==0){
+		printf("....Parameters................\n");
+		printf("tau = %f \n", tau);
+		printf("alpha = %f \n", alpha);
+		printf("beta = %f \n", beta);
+		printf("das = %f \n", das);
+		printf("dbs = %f \n", dbs);
+		printf("Force(x) = %f \n", Fx);
+		printf("Force(y) = %f \n", Fy);
+		printf("Force(z) = %f \n", Fz);
+		printf("Nz = %i \n", Nz);
+		printf("timestepMax = %i \n", timestepMax);
+		printf("...............................\n");
+	}
+	
+	// Identical cubic sub-domains
+	Nx = Ny = Nz;// = 16*s;		// Cubic domain
+	int N = Nx*Ny*Nz;
+	// Byte count of one double per node; kept in size_t so that the product
+	// is not truncated to int.
+	size_t dist_mem_size = size_t(N)*sizeof(double);
+
+	// Sweeps per thread: smallest S with nBlocks*nthreads*S >= N
+	int S = N/nthreads/nBlocks;
+	if (nBlocks*nthreads*S < N)	S++;
+	
+	// Report the launch configuration from rank 0 (the only rank that exists
+	// in a build without useMPI).
+	if (rank==0){
+		printf("Number of blocks = %i \n", nBlocks);
+		printf("Threads per block = %i \n", nthreads);
+		printf("Sweeps per thread = %i \n", S);
+		printf("Number of nodes per side = %i \n", Nx);
+		printf("Total Number of nodes = %i \n", N);
+		printf("...............................\n");
+	}
+	
+	//.......................................................................
+	// .......... READ THE INPUT FILE .......................................
+	int n;
+	char value;
+	char *id;
+	id = new char[N];	
+	int sum = 0;
+	// RANK 0 READS THE INPUT FILE	
+	if (rank==0){
+		printf("Read input media... \n");
+		ifstream PM(FILENAME.c_str(),ios::binary);
+		if (!PM.is_open()) abortRun("unable to open the phase indicator file");
+		for (int k=0;k<Nz;k++){
+			for (int j=0;j<Ny;j++){
+				for (int i=0;i<Nx;i++){
+					PM.read((char *) (&value), sizeof(value));
+					n = k*Nx*Ny+j*Nx+i;
+
+					if (value>0){
+						if (pBC) value=2; 	// Saturate with NWP
+						if (k<8){
+							value=1;
+						}
+					}
+
+					id[n] = value;
+					if (value > 0) sum++;
+				}
+			}
+		}
+		PM.close();
+		printf("File porosity = %f\n", double(sum)/N);
+	}
+	//......... for pressure BC only............................
+	// Void the first / last rows if pressure BC are to be used
+	if (pBC){
+		for (int k=0;k<Nz;k++){
+			for (int j=0;j<Ny;j++){
+				for (int i=0;i<Nx;i++){
+					n = k*Nx*Ny+j*Nx+i;
+					if (k<4) id[n] = 1;
+					if (k>Nz-5) id[n] = 2;
+				}
+			}
+			// Skip the non-boundary values. Only jump forward: for a very
+			// small domain (Nz == 8) the target Nz-5 lies behind k and an
+			// unconditional jump would never terminate.
+			if (k==4 && Nz-5 > k)	k=Nz-5;
+		}
+	}
+#ifdef useMPI	//............................................................
+	MPI_Barrier(MPI_COMM_WORLD);
+	MPI_Bcast(&id[0],N,MPI_CHAR,0,MPI_COMM_WORLD);
+	MPI_Barrier(MPI_COMM_WORLD);
+#endif
+	if (rank == 0) printf("Domain set.\n");
+	//...........................................................................
+
+	if (pBC){
+		int SBC = Nx*Ny/nthreads/nBlocks+1;
+		if (rank==0) printf("Number of sweeps for inlet / outlet: %i \n", SBC);
+	}
+	//...........................................................................
+	
+	//...........................................................................
+	//...........device phase ID.................................................
+	char *ID;
+	checkCuda(cudaMalloc((void **) &ID, N), "cudaMalloc(ID)");	// Allocate device memory
+	// Copy to the device
+	checkCuda(cudaMemcpy(ID, id, N, cudaMemcpyHostToDevice), "cudaMemcpy(ID)");
+	//...........................................................................
+	
+	//......................device distributions.................................
+	double *f_even,*f_odd;
+	//...........................................................................
+	checkCuda(cudaMalloc((void **) &f_even, 10*dist_mem_size), "cudaMalloc(f_even)");
+	checkCuda(cudaMalloc((void **) &f_odd, 9*dist_mem_size), "cudaMalloc(f_odd)");
+	//...........................................................................
+
+	//...........................................................................
+	//				MAIN  VARIABLES ALLOCATED HERE
+	//...........................................................................
+	double *Phi,*Den,*Copy;
+	double *ColorGrad, *Velocity;
+	//...........................................................................
+	checkCuda(cudaMalloc((void **) &Phi, dist_mem_size), "cudaMalloc(Phi)");
+	checkCuda(cudaMalloc((void **) &Den, 2*dist_mem_size), "cudaMalloc(Den)");
+	checkCuda(cudaMalloc((void **) &Copy, 2*dist_mem_size), "cudaMalloc(Copy)");
+	checkCuda(cudaMalloc((void **) &Velocity, 3*dist_mem_size), "cudaMalloc(Velocity)");
+	checkCuda(cudaMalloc((void **) &ColorGrad, 3*dist_mem_size), "cudaMalloc(ColorGrad)");
+	//...........................................................................
+	
+	//...........................................................................
+	if (rank==0)	printf("Setting the distributions, size = : %i\n", N);
+	//...........................................................................
+	dvc_InitD3Q19(nBlocks, nthreads, S, ID, f_even, f_odd, Nx, Ny, Nz);
+	dvc_InitDenColor(nBlocks, nthreads, S, ID, Den, Phi,  das, dbs, N);
+	//...........................................................................
+	dvc_ComputePhi(nBlocks, nthreads, S,ID, Phi, Copy, Den, N);
+	//...........................................................................
+
+	int timestep;
+	if (rank==0)	printf("No. of timesteps: %i \n", timestepMax);
+	timestep = 0;
+	//.......create and start timer............
+	// Both events are recorded on stream 0 so that they bracket the same
+	// stream; no separate stream is created.
+	cudaEvent_t start, stop;
+	float time;
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+	cudaEventRecord( start, 0 );
+	//.........................................
+	//************ MAIN TIMESTEP LOOP ***************************************/
+	while (timestep < timestepMax){
+
+		//*************************************************************************
+		// 		Compute the color gradient
+		//*************************************************************************
+		dvc_ComputeColorGradient(nBlocks, nthreads, S,
+				ID, Phi, ColorGrad, Nx, Ny, Nz);
+		//*************************************************************************
+
+		//*************************************************************************
+		// 		Perform collision step for the momentum transport
+		//*************************************************************************
+		dvc_ColorCollide(nBlocks, nthreads, S,
+				ID, f_even, f_odd, ColorGrad, Velocity,
+				rlxA, rlxB,alpha, beta, Fx, Fy, Fz, Nx, Ny, Nz, pBC);
+		//*************************************************************************
+
+		//*************************************************************************
+		// 		Carry out the density streaming step for mass transport
+		//*************************************************************************
+		dvc_DensityStreamD3Q7(nBlocks, nthreads, S,
+				ID, Den, Copy, Phi, ColorGrad, Velocity,beta, Nx, Ny, Nz, pBC);
+		//*************************************************************************
+
+		//*************************************************************************
+		// 		Swap the distributions for momentum transport
+		//*************************************************************************
+		dvc_SwapD3Q19(nBlocks, nthreads, S, ID, f_even, f_odd, Nx, Ny, Nz);
+		//*************************************************************************
+
+		//*************************************************************************
+		// 		Compute the phase indicator field and reset Copy, Den
+		//*************************************************************************
+		dvc_ComputePhi(nBlocks, nthreads, S,ID, Phi, Copy, Den, N);
+		//*************************************************************************
+
+		dvc_Barrier();
+		timestep++;
+		//.............................................................................
+	}
+	//************************************************************************/
+	dvc_Barrier();
+	//.......... stop and destroy timer.............................
+	cudaEventRecord( stop, 0 );
+	cudaEventSynchronize( stop );
+
+	// cudaEventElapsedTime reports milliseconds; the 0.001 factor below
+	// turns nodes*steps per millisecond into millions of updates per second.
+	cudaEventElapsedTime( &time, start, stop );
+	printf("CPU time = %f \n", time);
+
+	float MLUPS = 0.001*float(N)*timestep/time;
+	printf("MLUPS = %f \n", MLUPS);
+
+	cudaEventDestroy( start );
+	cudaEventDestroy( stop );
+
+	// Host staging buffer, sized for the largest field copied back (3 values per node)
+	double *Data;
+	Data = new double[3*N];
+
+	checkCuda(cudaMemcpy(Data, Phi, dist_mem_size, cudaMemcpyDeviceToHost), "cudaMemcpy(Phi)");
+
+	// Write out the Phase Indicator Field
+	// NOTE(review): the output files are written by every rank under the same
+	// name when built with useMPI - confirm this is intended.
+	FILE *phase;
+	phase = fopen("Phase.out","wb");
+	if (phase == NULL) abortRun("unable to open Phase.out for writing");
+	fwrite(Data,8,N,phase);
+	fclose(phase);
+
+	//....................................................
+	// Write out the pressure - (reuse Phi arrays since we're done with those)
+//	ComputeDensity<<< grid, nthreads>>> (ID, f_even, f_odd, Phi, Nx, Ny, Nz, S);
+//	cudaMemcpy(Data, Phi, dist_mem_size, cudaMemcpyDeviceToHost);
+//	FILE *PRESSURE;
+//	PRESSURE = fopen("Pressure.out","wb");
+//	fwrite(Phi,8,N,PRESSURE);
+//	fclose(PRESSURE);
+	//....................................................
+
+	// Write out the Color Gradient
+
+	checkCuda(cudaMemcpy(Data, ColorGrad, 3*dist_mem_size, cudaMemcpyDeviceToHost), "cudaMemcpy(ColorGrad)");
+
+	FILE *CG;
+	CG = fopen("ColorGrad.out","wb");
+	if (CG == NULL) abortRun("unable to open ColorGrad.out for writing");
+	fwrite(Data,8,3*N,CG);
+	fclose(CG);
+	
+	// Write out the Velocity
+//	FILE *VEL;
+//	VEL = fopen("Velocity.out","wb");
+//	fwrite(Velocity,8,3*N,VEL);
+//	fclose(VEL);
+
+	// cleanup: each device buffer is released exactly once
+	cudaFree(ID);
+	cudaFree(f_even);	cudaFree(f_odd);	
+	cudaFree(Velocity);
+	cudaFree(Phi);
+	
+	cudaFree (ColorGrad);
+	cudaFree (Den);		cudaFree(Copy);
+	// Host buffers were allocated with new[], so release them with delete[]
+	delete [] Data;
+	delete [] id;
+	
+	//***********Finish up!*********************************
+#ifdef useMPI
+	MPI_Finalize();
+#endif
+	return 0;
+	
+}
--- a/gpu/exe/lb2_Color_mpi.cpp
+++ b/gpu/exe/lb2_Color_mpi.cpp
--- a/gpu/exe/lb2_Color_pBC_wia_mpi.cpp
+++ b/gpu/exe/lb2_Color_pBC_wia_mpi.cpp
--- a/gpu/exe/lb2_Color_wia_mpi.cpp
+++ b/gpu/exe/lb2_Color_wia_mpi.cpp